1 /* $NetBSD: tcp_input.c,v 1.136 2002/03/12 04:36:47 itojun Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 34 * 35 * NRL grants permission for redistribution and use in source and binary 36 * forms, with or without modification, of the software and documentation 37 * created at NRL provided that the following conditions are met: 38 * 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgements: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * This product includes software developed at the Information 49 * Technology Division, US Naval Research Laboratory. 50 * 4. Neither the name of the NRL nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 65 * 66 * The views and conclusions contained in the software and documentation 67 * are those of the authors and should not be interpreted as representing 68 * official policies, either expressed or implied, of the US Naval 69 * Research Laboratory (NRL). 70 */ 71 72 /*- 73 * Copyright (c) 1997, 1998, 1999, 2001 The NetBSD Foundation, Inc. 74 * All rights reserved. 75 * 76 * This code is derived from software contributed to The NetBSD Foundation 77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 78 * Facility, NASA Ames Research Center. 79 * 80 * Redistribution and use in source and binary forms, with or without 81 * modification, are permitted provided that the following conditions 82 * are met: 83 * 1. Redistributions of source code must retain the above copyright 84 * notice, this list of conditions and the following disclaimer. 85 * 2. Redistributions in binary form must reproduce the above copyright 86 * notice, this list of conditions and the following disclaimer in the 87 * documentation and/or other materials provided with the distribution. 88 * 3. All advertising materials mentioning features or use of this software 89 * must display the following acknowledgement: 90 * This product includes software developed by the NetBSD 91 * Foundation, Inc. and its contributors. 92 * 4. Neither the name of The NetBSD Foundation nor the names of its 93 * contributors may be used to endorse or promote products derived 94 * from this software without specific prior written permission. 95 * 96 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 97 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 98 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 99 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 100 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 101 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 102 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 103 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 104 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 105 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 106 * POSSIBILITY OF SUCH DAMAGE. 107 */ 108 109 /* 110 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 111 * The Regents of the University of California. All rights reserved. 112 * 113 * Redistribution and use in source and binary forms, with or without 114 * modification, are permitted provided that the following conditions 115 * are met: 116 * 1. Redistributions of source code must retain the above copyright 117 * notice, this list of conditions and the following disclaimer. 118 * 2. Redistributions in binary form must reproduce the above copyright 119 * notice, this list of conditions and the following disclaimer in the 120 * documentation and/or other materials provided with the distribution. 121 * 3. All advertising materials mentioning features or use of this software 122 * must display the following acknowledgement: 123 * This product includes software developed by the University of 124 * California, Berkeley and its contributors. 125 * 4. Neither the name of the University nor the names of its contributors 126 * may be used to endorse or promote products derived from this software 127 * without specific prior written permission. 128 * 129 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 130 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 131 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 132 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 133 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 134 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 135 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 136 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 137 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 138 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 139 * SUCH DAMAGE. 140 * 141 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 142 */ 143 144 /* 145 * TODO list for SYN cache stuff: 146 * 147 * Find room for a "state" field, which is needed to keep a 148 * compressed state for TIME_WAIT TCBs. It's been noted already 149 * that this is fairly important for very high-volume web and 150 * mail servers, which use a large number of short-lived 151 * connections. 152 */ 153 154 #include <sys/cdefs.h> 155 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.136 2002/03/12 04:36:47 itojun Exp $"); 156 157 #include "opt_inet.h" 158 #include "opt_ipsec.h" 159 #include "opt_inet_csum.h" 160 #include "opt_tcp_debug.h" 161 162 #include <sys/param.h> 163 #include <sys/systm.h> 164 #include <sys/malloc.h> 165 #include <sys/mbuf.h> 166 #include <sys/protosw.h> 167 #include <sys/socket.h> 168 #include <sys/socketvar.h> 169 #include <sys/errno.h> 170 #include <sys/syslog.h> 171 #include <sys/pool.h> 172 #include <sys/domain.h> 173 #include <sys/kernel.h> 174 175 #include <net/if.h> 176 #include <net/route.h> 177 #include <net/if_types.h> 178 179 #include <netinet/in.h> 180 #include <netinet/in_systm.h> 181 #include <netinet/ip.h> 182 #include <netinet/in_pcb.h> 183 #include <netinet/ip_var.h> 184 185 #ifdef INET6 186 #ifndef INET 187 #include <netinet/in.h> 188 #endif 189 #include <netinet/ip6.h> 190 #include <netinet6/ip6_var.h> 191 #include <netinet6/in6_pcb.h> 192 #include <netinet6/ip6_var.h> 193 #include <netinet6/in6_var.h> 194 #include <netinet/icmp6.h> 195 #include <netinet6/nd6.h> 196 #endif 197 198 #ifdef PULLDOWN_TEST 199 #ifndef INET6 200 /* always need ip6.h for IP6_EXTHDR_GET */ 201 #include <netinet/ip6.h> 202 #endif 203 #endif 204 205 #include <netinet/tcp.h> 206 #include <netinet/tcp_fsm.h> 207 #include <netinet/tcp_seq.h> 208 #include <netinet/tcp_timer.h> 209 #include <netinet/tcp_var.h> 210 #include <netinet/tcpip.h> 211 #include <netinet/tcp_debug.h> 212 213 #include <machine/stdarg.h> 214 215 #ifdef IPSEC 216 #include <netinet6/ipsec.h> 217 #include <netkey/key.h> 218 #endif /*IPSEC*/ 219 #ifdef INET6 220 #include "faith.h" 221 #if defined(NFAITH) && NFAITH > 0 222 #include <net/if_faith.h> 223 #endif 224 #endif 225 226 int tcprexmtthresh = 3; 227 int tcp_log_refused; 228 229 static int tcp_rst_ppslim_count = 0; 230 static struct timeval tcp_rst_ppslim_last; 231 232 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 233 234 /* for modulo comparisons of timestamps */ 235 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 236 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 237 238 /* 239 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 240 */ 241 #ifdef INET6 242 #define ND6_HINT(tp) \ 243 do { \ 244 if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \ 245 && tp->t_in6pcb->in6p_route.ro_rt) { \ 246 nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL, 0); \ 247 } \ 248 } while (0) 249 #else 250 #define ND6_HINT(tp) 251 #endif 252 253 /* 254 * Macro to compute ACK transmission behavior. Delay the ACK unless 255 * we have already delayed an ACK (must send an ACK every two segments). 256 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 257 * option is enabled. 258 */ 259 #define TCP_SETUP_ACK(tp, th) \ 260 do { \ 261 if ((tp)->t_flags & TF_DELACK || \ 262 (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \ 263 tp->t_flags |= TF_ACKNOW; \ 264 else \ 265 TCP_SET_DELACK(tp); \ 266 } while (0) 267 268 /* 269 * Convert TCP protocol fields to host order for easier processing. 270 */ 271 #define TCP_FIELDS_TO_HOST(th) \ 272 do { \ 273 NTOHL((th)->th_seq); \ 274 NTOHL((th)->th_ack); \ 275 NTOHS((th)->th_win); \ 276 NTOHS((th)->th_urp); \ 277 } while (0) 278 279 #ifdef TCP_CSUM_COUNTERS 280 #include <sys/device.h> 281 282 extern struct evcnt tcp_hwcsum_ok; 283 extern struct evcnt tcp_hwcsum_bad; 284 extern struct evcnt tcp_hwcsum_data; 285 extern struct evcnt tcp_swcsum; 286 287 #define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 288 289 #else 290 291 #define TCP_CSUM_COUNTER_INCR(ev) /* nothing */ 292 293 #endif /* TCP_CSUM_COUNTERS */ 294 295 int 296 tcp_reass(tp, th, m, tlen) 297 struct tcpcb *tp; 298 struct tcphdr *th; 299 struct mbuf *m; 300 int *tlen; 301 { 302 struct ipqent *p, *q, *nq, *tiqe = NULL; 303 struct socket *so = NULL; 304 int pkt_flags; 305 tcp_seq pkt_seq; 306 unsigned pkt_len; 307 u_long rcvpartdupbyte = 0; 308 u_long rcvoobyte; 309 310 if (tp->t_inpcb) 311 so = tp->t_inpcb->inp_socket; 312 #ifdef INET6 313 else if (tp->t_in6pcb) 314 so = tp->t_in6pcb->in6p_socket; 315 #endif 316 317 TCP_REASS_LOCK_CHECK(tp); 318 319 /* 320 * Call with th==0 after become established to 321 * force pre-ESTABLISHED data up to user socket. 322 */ 323 if (th == 0) 324 goto present; 325 326 rcvoobyte = *tlen; 327 /* 328 * Copy these to local variables because the tcpiphdr 329 * gets munged while we are collapsing mbufs. 330 */ 331 pkt_seq = th->th_seq; 332 pkt_len = *tlen; 333 pkt_flags = th->th_flags; 334 /* 335 * Find a segment which begins after this one does. 336 */ 337 for (p = NULL, q = LIST_FIRST(&tp->segq); q != NULL; q = nq) { 338 nq = LIST_NEXT(q, ipqe_q); 339 /* 340 * If the received segment is just right after this 341 * fragment, merge the two together and then check 342 * for further overlaps. 343 */ 344 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 345 #ifdef TCPREASS_DEBUG 346 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 347 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 348 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 349 #endif 350 pkt_len += q->ipqe_len; 351 pkt_flags |= q->ipqe_flags; 352 pkt_seq = q->ipqe_seq; 353 m_cat(q->ipqe_m, m); 354 m = q->ipqe_m; 355 goto free_ipqe; 356 } 357 /* 358 * If the received segment is completely past this 359 * fragment, we need to go the next fragment. 360 */ 361 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 362 p = q; 363 continue; 364 } 365 /* 366 * If the fragment is past the received segment, 367 * it (or any following) can't be concatenated. 368 */ 369 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) 370 break; 371 /* 372 * We've received all the data in this segment before. 373 * mark it as a duplicate and return. 374 */ 375 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 376 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 377 tcpstat.tcps_rcvduppack++; 378 tcpstat.tcps_rcvdupbyte += pkt_len; 379 m_freem(m); 380 if (tiqe != NULL) 381 pool_put(&ipqent_pool, tiqe); 382 return (0); 383 } 384 /* 385 * Received segment completely overlaps this fragment 386 * so we drop the fragment (this keeps the temporal 387 * ordering of segments correct). 388 */ 389 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 390 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 391 rcvpartdupbyte += q->ipqe_len; 392 m_freem(q->ipqe_m); 393 goto free_ipqe; 394 } 395 /* 396 * RX'ed segment extends past the end of the 397 * fragment. Drop the overlapping bytes. Then 398 * merge the fragment and segment then treat as 399 * a longer received packet. 400 */ 401 if (SEQ_LT(q->ipqe_seq, pkt_seq) 402 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 403 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 404 #ifdef TCPREASS_DEBUG 405 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 406 tp, overlap, 407 pkt_seq, pkt_seq + pkt_len, pkt_len); 408 #endif 409 m_adj(m, overlap); 410 rcvpartdupbyte += overlap; 411 m_cat(q->ipqe_m, m); 412 m = q->ipqe_m; 413 pkt_seq = q->ipqe_seq; 414 pkt_len += q->ipqe_len - overlap; 415 rcvoobyte -= overlap; 416 goto free_ipqe; 417 } 418 /* 419 * RX'ed segment extends past the front of the 420 * fragment. Drop the overlapping bytes on the 421 * received packet. The packet will then be 422 * contatentated with this fragment a bit later. 423 */ 424 if (SEQ_GT(q->ipqe_seq, pkt_seq) 425 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 426 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 427 #ifdef TCPREASS_DEBUG 428 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 429 tp, overlap, 430 pkt_seq, pkt_seq + pkt_len, pkt_len); 431 #endif 432 m_adj(m, -overlap); 433 pkt_len -= overlap; 434 rcvpartdupbyte += overlap; 435 rcvoobyte -= overlap; 436 } 437 /* 438 * If the received segment immediates precedes this 439 * fragment then tack the fragment onto this segment 440 * and reinsert the data. 441 */ 442 if (q->ipqe_seq == pkt_seq + pkt_len) { 443 #ifdef TCPREASS_DEBUG 444 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 445 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 446 pkt_seq, pkt_seq + pkt_len, pkt_len); 447 #endif 448 pkt_len += q->ipqe_len; 449 pkt_flags |= q->ipqe_flags; 450 m_cat(m, q->ipqe_m); 451 LIST_REMOVE(q, ipqe_q); 452 LIST_REMOVE(q, ipqe_timeq); 453 if (tiqe == NULL) { 454 tiqe = q; 455 } else { 456 pool_put(&ipqent_pool, q); 457 } 458 break; 459 } 460 /* 461 * If the fragment is before the segment, remember it. 462 * When this loop is terminated, p will contain the 463 * pointer to fragment that is right before the received 464 * segment. 465 */ 466 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 467 p = q; 468 469 continue; 470 471 /* 472 * This is a common operation. It also will allow 473 * to save doing a malloc/free in most instances. 474 */ 475 free_ipqe: 476 LIST_REMOVE(q, ipqe_q); 477 LIST_REMOVE(q, ipqe_timeq); 478 if (tiqe == NULL) { 479 tiqe = q; 480 } else { 481 pool_put(&ipqent_pool, q); 482 } 483 } 484 485 /* 486 * Allocate a new queue entry since the received segment did not 487 * collapse onto any other out-of-order block; thus we are allocating 488 * a new block. If it had collapsed, tiqe would not be NULL and 489 * we would be reusing it. 490 * XXX If we can't, just drop the packet. XXX 491 */ 492 if (tiqe == NULL) { 493 tiqe = pool_get(&ipqent_pool, PR_NOWAIT); 494 if (tiqe == NULL) { 495 tcpstat.tcps_rcvmemdrop++; 496 m_freem(m); 497 return (0); 498 } 499 } 500 501 /* 502 * Update the counters. 503 */ 504 tcpstat.tcps_rcvoopack++; 505 tcpstat.tcps_rcvoobyte += rcvoobyte; 506 if (rcvpartdupbyte) { 507 tcpstat.tcps_rcvpartduppack++; 508 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte; 509 } 510 511 /* 512 * Insert the new fragment queue entry into both queues. 513 */ 514 tiqe->ipqe_m = m; 515 tiqe->ipqe_seq = pkt_seq; 516 tiqe->ipqe_len = pkt_len; 517 tiqe->ipqe_flags = pkt_flags; 518 if (p == NULL) { 519 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 520 #ifdef TCPREASS_DEBUG 521 if (tiqe->ipqe_seq != tp->rcv_nxt) 522 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 523 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 524 #endif 525 } else { 526 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 527 #ifdef TCPREASS_DEBUG 528 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 529 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 530 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 531 #endif 532 } 533 534 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 535 536 present: 537 /* 538 * Present data to user, advancing rcv_nxt through 539 * completed sequence space. 540 */ 541 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 542 return (0); 543 q = LIST_FIRST(&tp->segq); 544 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 545 return (0); 546 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 547 return (0); 548 549 tp->rcv_nxt += q->ipqe_len; 550 pkt_flags = q->ipqe_flags & TH_FIN; 551 ND6_HINT(tp); 552 553 LIST_REMOVE(q, ipqe_q); 554 LIST_REMOVE(q, ipqe_timeq); 555 if (so->so_state & SS_CANTRCVMORE) 556 m_freem(q->ipqe_m); 557 else 558 sbappend(&so->so_rcv, q->ipqe_m); 559 pool_put(&ipqent_pool, q); 560 sorwakeup(so); 561 return (pkt_flags); 562 } 563 564 #ifdef INET6 565 int 566 tcp6_input(mp, offp, proto) 567 struct mbuf **mp; 568 int *offp, proto; 569 { 570 struct mbuf *m = *mp; 571 572 /* 573 * draft-itojun-ipv6-tcp-to-anycast 574 * better place to put this in? 575 */ 576 if (m->m_flags & M_ANYCAST6) { 577 struct ip6_hdr *ip6; 578 if (m->m_len < sizeof(struct ip6_hdr)) { 579 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 580 tcpstat.tcps_rcvshort++; 581 return IPPROTO_DONE; 582 } 583 } 584 ip6 = mtod(m, struct ip6_hdr *); 585 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 586 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 587 return IPPROTO_DONE; 588 } 589 590 tcp_input(m, *offp, proto); 591 return IPPROTO_DONE; 592 } 593 #endif 594 595 /* 596 * TCP input routine, follows pages 65-76 of the 597 * protocol specification dated September, 1981 very closely. 598 */ 599 void 600 #if __STDC__ 601 tcp_input(struct mbuf *m, ...) 602 #else 603 tcp_input(m, va_alist) 604 struct mbuf *m; 605 #endif 606 { 607 int proto; 608 struct tcphdr *th; 609 struct ip *ip; 610 struct inpcb *inp; 611 #ifdef INET6 612 struct ip6_hdr *ip6; 613 struct in6pcb *in6p; 614 #endif 615 caddr_t optp = NULL; 616 int optlen = 0; 617 int len, tlen, toff, hdroptlen = 0; 618 struct tcpcb *tp = 0; 619 int tiflags; 620 struct socket *so = NULL; 621 int todrop, acked, ourfinisacked, needoutput = 0; 622 short ostate = 0; 623 int iss = 0; 624 u_long tiwin; 625 struct tcp_opt_info opti; 626 int off, iphlen; 627 va_list ap; 628 int af; /* af on the wire */ 629 struct mbuf *tcp_saveti = NULL; 630 631 va_start(ap, m); 632 toff = va_arg(ap, int); 633 proto = va_arg(ap, int); 634 va_end(ap); 635 636 tcpstat.tcps_rcvtotal++; 637 638 bzero(&opti, sizeof(opti)); 639 opti.ts_present = 0; 640 opti.maxseg = 0; 641 642 /* 643 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 644 * 645 * TCP is, by definition, unicast, so we reject all 646 * multicast outright. 647 * 648 * Note, there are additional src/dst address checks in 649 * the AF-specific code below. 650 */ 651 if (m->m_flags & (M_BCAST|M_MCAST)) { 652 /* XXX stat */ 653 goto drop; 654 } 655 #ifdef INET6 656 if (m->m_flags & M_ANYCAST6) { 657 /* XXX stat */ 658 goto drop; 659 } 660 #endif 661 662 /* 663 * Get IP and TCP header together in first mbuf. 664 * Note: IP leaves IP header in first mbuf. 665 */ 666 ip = mtod(m, struct ip *); 667 #ifdef INET6 668 ip6 = NULL; 669 #endif 670 switch (ip->ip_v) { 671 #ifdef INET 672 case 4: 673 af = AF_INET; 674 iphlen = sizeof(struct ip); 675 #ifndef PULLDOWN_TEST 676 /* would like to get rid of this... */ 677 if (toff > sizeof (struct ip)) { 678 ip_stripoptions(m, (struct mbuf *)0); 679 toff = sizeof(struct ip); 680 } 681 if (m->m_len < toff + sizeof (struct tcphdr)) { 682 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) { 683 tcpstat.tcps_rcvshort++; 684 return; 685 } 686 } 687 ip = mtod(m, struct ip *); 688 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 689 #else 690 ip = mtod(m, struct ip *); 691 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 692 sizeof(struct tcphdr)); 693 if (th == NULL) { 694 tcpstat.tcps_rcvshort++; 695 return; 696 } 697 #endif 698 699 /* 700 * Make sure destination address is not multicast. 701 * Source address checked in ip_input(). 702 */ 703 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 704 /* XXX stat */ 705 goto drop; 706 } 707 708 /* We do the checksum after PCB lookup... */ 709 len = ip->ip_len; 710 tlen = len - toff; 711 break; 712 #endif 713 #ifdef INET6 714 case 6: 715 ip = NULL; 716 iphlen = sizeof(struct ip6_hdr); 717 af = AF_INET6; 718 #ifndef PULLDOWN_TEST 719 if (m->m_len < toff + sizeof(struct tcphdr)) { 720 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/ 721 if (m == NULL) { 722 tcpstat.tcps_rcvshort++; 723 return; 724 } 725 } 726 ip6 = mtod(m, struct ip6_hdr *); 727 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 728 #else 729 ip6 = mtod(m, struct ip6_hdr *); 730 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 731 sizeof(struct tcphdr)); 732 if (th == NULL) { 733 tcpstat.tcps_rcvshort++; 734 return; 735 } 736 #endif 737 738 /* Be proactive about malicious use of IPv4 mapped address */ 739 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 740 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 741 /* XXX stat */ 742 goto drop; 743 } 744 745 /* 746 * Be proactive about unspecified IPv6 address in source. 747 * As we use all-zero to indicate unbounded/unconnected pcb, 748 * unspecified IPv6 address can be used to confuse us. 749 * 750 * Note that packets with unspecified IPv6 destination is 751 * already dropped in ip6_input. 752 */ 753 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 754 /* XXX stat */ 755 goto drop; 756 } 757 758 /* 759 * Make sure destination address is not multicast. 760 * Source address checked in ip6_input(). 761 */ 762 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 763 /* XXX stat */ 764 goto drop; 765 } 766 767 /* We do the checksum after PCB lookup... */ 768 len = m->m_pkthdr.len; 769 tlen = len - toff; 770 break; 771 #endif 772 default: 773 m_freem(m); 774 return; 775 } 776 777 /* 778 * Check that TCP offset makes sense, 779 * pull out TCP options and adjust length. XXX 780 */ 781 off = th->th_off << 2; 782 if (off < sizeof (struct tcphdr) || off > tlen) { 783 tcpstat.tcps_rcvbadoff++; 784 goto drop; 785 } 786 tlen -= off; 787 788 /* 789 * tcp_input() has been modified to use tlen to mean the TCP data 790 * length throughout the function. Other functions can use 791 * m->m_pkthdr.len as the basis for calculating the TCP data length. 792 * rja 793 */ 794 795 if (off > sizeof (struct tcphdr)) { 796 #ifndef PULLDOWN_TEST 797 if (m->m_len < toff + off) { 798 if ((m = m_pullup(m, toff + off)) == 0) { 799 tcpstat.tcps_rcvshort++; 800 return; 801 } 802 switch (af) { 803 #ifdef INET 804 case AF_INET: 805 ip = mtod(m, struct ip *); 806 break; 807 #endif 808 #ifdef INET6 809 case AF_INET6: 810 ip6 = mtod(m, struct ip6_hdr *); 811 break; 812 #endif 813 } 814 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 815 } 816 #else 817 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 818 if (th == NULL) { 819 tcpstat.tcps_rcvshort++; 820 return; 821 } 822 /* 823 * NOTE: ip/ip6 will not be affected by m_pulldown() 824 * (as they're before toff) and we don't need to update those. 825 */ 826 #endif 827 optlen = off - sizeof (struct tcphdr); 828 optp = ((caddr_t)th) + sizeof(struct tcphdr); 829 /* 830 * Do quick retrieval of timestamp options ("options 831 * prediction?"). If timestamp is the only option and it's 832 * formatted as recommended in RFC 1323 appendix A, we 833 * quickly get the values now and not bother calling 834 * tcp_dooptions(), etc. 835 */ 836 if ((optlen == TCPOLEN_TSTAMP_APPA || 837 (optlen > TCPOLEN_TSTAMP_APPA && 838 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 839 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 840 (th->th_flags & TH_SYN) == 0) { 841 opti.ts_present = 1; 842 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 843 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 844 optp = NULL; /* we've parsed the options */ 845 } 846 } 847 tiflags = th->th_flags; 848 849 /* 850 * Locate pcb for segment. 851 */ 852 findpcb: 853 inp = NULL; 854 #ifdef INET6 855 in6p = NULL; 856 #endif 857 switch (af) { 858 #ifdef INET 859 case AF_INET: 860 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 861 ip->ip_dst, th->th_dport); 862 if (inp == 0) { 863 ++tcpstat.tcps_pcbhashmiss; 864 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 865 } 866 #ifdef INET6 867 if (inp == 0) { 868 struct in6_addr s, d; 869 870 /* mapped addr case */ 871 bzero(&s, sizeof(s)); 872 s.s6_addr16[5] = htons(0xffff); 873 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src)); 874 bzero(&d, sizeof(d)); 875 d.s6_addr16[5] = htons(0xffff); 876 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst)); 877 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport, 878 &d, th->th_dport, 0); 879 if (in6p == 0) { 880 ++tcpstat.tcps_pcbhashmiss; 881 in6p = in6_pcblookup_bind(&tcb6, &d, 882 th->th_dport, 0); 883 } 884 } 885 #endif 886 #ifndef INET6 887 if (inp == 0) 888 #else 889 if (inp == 0 && in6p == 0) 890 #endif 891 { 892 ++tcpstat.tcps_noport; 893 if (tcp_log_refused && (tiflags & TH_SYN)) { 894 char src[4*sizeof "123"]; 895 char dst[4*sizeof "123"]; 896 897 if (ip) { 898 strcpy(src, inet_ntoa(ip->ip_src)); 899 strcpy(dst, inet_ntoa(ip->ip_dst)); 900 } 901 else { 902 strcpy(src, "(unknown)"); 903 strcpy(dst, "(unknown)"); 904 } 905 log(LOG_INFO, 906 "Connection attempt to TCP %s:%d from %s:%d\n", 907 dst, ntohs(th->th_dport), 908 src, ntohs(th->th_sport)); 909 } 910 TCP_FIELDS_TO_HOST(th); 911 goto dropwithreset_ratelim; 912 } 913 #ifdef IPSEC 914 if (inp && ipsec4_in_reject(m, inp)) { 915 ipsecstat.in_polvio++; 916 goto drop; 917 } 918 #ifdef INET6 919 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) { 920 ipsecstat.in_polvio++; 921 goto drop; 922 } 923 #endif 924 #endif /*IPSEC*/ 925 break; 926 #endif /*INET*/ 927 #ifdef INET6 928 case AF_INET6: 929 { 930 int faith; 931 932 #if defined(NFAITH) && NFAITH > 0 933 faith = faithprefix(&ip6->ip6_dst); 934 #else 935 faith = 0; 936 #endif 937 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport, 938 &ip6->ip6_dst, th->th_dport, faith); 939 if (in6p == NULL) { 940 ++tcpstat.tcps_pcbhashmiss; 941 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst, 942 th->th_dport, faith); 943 } 944 if (in6p == NULL) { 945 ++tcpstat.tcps_noport; 946 if (tcp_log_refused && (tiflags & TH_SYN)) { 947 char src[INET6_ADDRSTRLEN]; 948 char dst[INET6_ADDRSTRLEN]; 949 950 if (ip6) { 951 strcpy(src, ip6_sprintf(&ip6->ip6_src)); 952 strcpy(dst, ip6_sprintf(&ip6->ip6_dst)); 953 } 954 else { 955 strcpy(src, "(unknown v6)"); 956 strcpy(dst, "(unknown v6)"); 957 } 958 log(LOG_INFO, 959 "Connection attempt to TCP [%s]:%d from [%s]:%d\n", 960 dst, ntohs(th->th_dport), 961 src, ntohs(th->th_sport)); 962 } 963 TCP_FIELDS_TO_HOST(th); 964 goto dropwithreset_ratelim; 965 } 966 #ifdef IPSEC 967 if (ipsec6_in_reject(m, in6p)) { 968 ipsec6stat.in_polvio++; 969 goto drop; 970 } 971 #endif /*IPSEC*/ 972 break; 973 } 974 #endif 975 } 976 977 /* 978 * If the state is CLOSED (i.e., TCB does not exist) then 979 * all data in the incoming segment is discarded. 980 * If the TCB exists but is in CLOSED state, it is embryonic, 981 * but should either do a listen or a connect soon. 982 */ 983 tp = NULL; 984 so = NULL; 985 if (inp) { 986 tp = intotcpcb(inp); 987 so = inp->inp_socket; 988 } 989 #ifdef INET6 990 else if (in6p) { 991 tp = in6totcpcb(in6p); 992 so = in6p->in6p_socket; 993 } 994 #endif 995 if (tp == 0) { 996 TCP_FIELDS_TO_HOST(th); 997 goto dropwithreset_ratelim; 998 } 999 if (tp->t_state == TCPS_CLOSED) 1000 goto drop; 1001 1002 /* 1003 * Checksum extended TCP header and data. 1004 */ 1005 switch (af) { 1006 #ifdef INET 1007 case AF_INET: 1008 switch (m->m_pkthdr.csum_flags & 1009 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) | 1010 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { 1011 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD: 1012 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad); 1013 goto badcsum; 1014 1015 case M_CSUM_TCPv4|M_CSUM_DATA: 1016 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data); 1017 if ((m->m_pkthdr.csum_data ^ 0xffff) != 0) 1018 goto badcsum; 1019 break; 1020 1021 case M_CSUM_TCPv4: 1022 /* Checksum was okay. */ 1023 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok); 1024 break; 1025 1026 default: 1027 /* Must compute it ourselves. */ 1028 TCP_CSUM_COUNTER_INCR(&tcp_swcsum); 1029 #ifndef PULLDOWN_TEST 1030 { 1031 struct ipovly *ipov; 1032 ipov = (struct ipovly *)ip; 1033 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 1034 ipov->ih_len = htons(tlen + off); 1035 1036 if (in_cksum(m, len) != 0) 1037 goto badcsum; 1038 } 1039 #else 1040 if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) 1041 goto badcsum; 1042 #endif /* ! PULLDOWN_TEST */ 1043 break; 1044 } 1045 break; 1046 #endif /* INET4 */ 1047 1048 #ifdef INET6 1049 case AF_INET6: 1050 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) 1051 goto badcsum; 1052 break; 1053 #endif /* INET6 */ 1054 } 1055 1056 TCP_FIELDS_TO_HOST(th); 1057 1058 /* Unscale the window into a 32-bit value. */ 1059 if ((tiflags & TH_SYN) == 0) 1060 tiwin = th->th_win << tp->snd_scale; 1061 else 1062 tiwin = th->th_win; 1063 1064 #ifdef INET6 1065 /* save packet options if user wanted */ 1066 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 1067 if (in6p->in6p_options) { 1068 m_freem(in6p->in6p_options); 1069 in6p->in6p_options = 0; 1070 } 1071 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 1072 } 1073 #endif 1074 1075 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 1076 union syn_cache_sa src; 1077 union syn_cache_sa dst; 1078 1079 bzero(&src, sizeof(src)); 1080 bzero(&dst, sizeof(dst)); 1081 switch (af) { 1082 #ifdef INET 1083 case AF_INET: 1084 src.sin.sin_len = sizeof(struct sockaddr_in); 1085 src.sin.sin_family = AF_INET; 1086 src.sin.sin_addr = ip->ip_src; 1087 src.sin.sin_port = th->th_sport; 1088 1089 dst.sin.sin_len = sizeof(struct sockaddr_in); 1090 dst.sin.sin_family = AF_INET; 1091 dst.sin.sin_addr = ip->ip_dst; 1092 dst.sin.sin_port = th->th_dport; 1093 break; 1094 #endif 1095 #ifdef INET6 1096 case AF_INET6: 1097 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1098 src.sin6.sin6_family = AF_INET6; 1099 src.sin6.sin6_addr = ip6->ip6_src; 1100 src.sin6.sin6_port = th->th_sport; 1101 1102 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1103 dst.sin6.sin6_family = AF_INET6; 1104 dst.sin6.sin6_addr = ip6->ip6_dst; 1105 dst.sin6.sin6_port = th->th_dport; 1106 break; 1107 #endif /* INET6 */ 1108 default: 1109 goto badsyn; /*sanity*/ 1110 } 1111 1112 if (so->so_options & SO_DEBUG) { 1113 ostate = tp->t_state; 1114 1115 tcp_saveti = NULL; 1116 if (iphlen + sizeof(struct tcphdr) > MHLEN) 1117 goto nosave; 1118 1119 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { 1120 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1121 if (!tcp_saveti) 1122 goto nosave; 1123 } else { 1124 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); 1125 if (!tcp_saveti) 1126 goto nosave; 1127 tcp_saveti->m_len = iphlen; 1128 m_copydata(m, 0, iphlen, 1129 mtod(tcp_saveti, caddr_t)); 1130 } 1131 1132 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1133 m_freem(tcp_saveti); 1134 tcp_saveti = NULL; 1135 } else { 1136 tcp_saveti->m_len += sizeof(struct tcphdr); 1137 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen, 1138 sizeof(struct tcphdr)); 1139 } 1140 if (tcp_saveti) { 1141 /* 1142 * need to recover version # field, which was 1143 * overwritten on ip_cksum computation. 1144 */ 1145 struct ip *sip; 1146 sip = mtod(tcp_saveti, struct ip *); 1147 switch (af) { 1148 #ifdef INET 1149 case AF_INET: 1150 sip->ip_v = 4; 1151 break; 1152 #endif 1153 #ifdef INET6 1154 case AF_INET6: 1155 sip->ip_v = 6; 1156 break; 1157 #endif 1158 } 1159 } 1160 nosave:; 1161 } 1162 if (so->so_options & SO_ACCEPTCONN) { 1163 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1164 if (tiflags & TH_RST) { 1165 syn_cache_reset(&src.sa, &dst.sa, th); 1166 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1167 (TH_ACK|TH_SYN)) { 1168 /* 1169 * Received a SYN,ACK. This should 1170 * never happen while we are in 1171 * LISTEN. Send an RST. 1172 */ 1173 goto badsyn; 1174 } else if (tiflags & TH_ACK) { 1175 so = syn_cache_get(&src.sa, &dst.sa, 1176 th, toff, tlen, so, m); 1177 if (so == NULL) { 1178 /* 1179 * We don't have a SYN for 1180 * this ACK; send an RST. 1181 */ 1182 goto badsyn; 1183 } else if (so == 1184 (struct socket *)(-1)) { 1185 /* 1186 * We were unable to create 1187 * the connection. If the 1188 * 3-way handshake was 1189 * completed, and RST has 1190 * been sent to the peer. 1191 * Since the mbuf might be 1192 * in use for the reply, 1193 * do not free it. 1194 */ 1195 m = NULL; 1196 } else { 1197 /* 1198 * We have created a 1199 * full-blown connection. 1200 */ 1201 tp = NULL; 1202 inp = NULL; 1203 #ifdef INET6 1204 in6p = NULL; 1205 #endif 1206 switch (so->so_proto->pr_domain->dom_family) { 1207 #ifdef INET 1208 case AF_INET: 1209 inp = sotoinpcb(so); 1210 tp = intotcpcb(inp); 1211 break; 1212 #endif 1213 #ifdef INET6 1214 case AF_INET6: 1215 in6p = sotoin6pcb(so); 1216 tp = in6totcpcb(in6p); 1217 break; 1218 #endif 1219 } 1220 if (tp == NULL) 1221 goto badsyn; /*XXX*/ 1222 tiwin <<= tp->snd_scale; 1223 goto after_listen; 1224 } 1225 } else { 1226 /* 1227 * None of RST, SYN or ACK was set. 1228 * This is an invalid packet for a 1229 * TCB in LISTEN state. Send a RST. 1230 */ 1231 goto badsyn; 1232 } 1233 } else { 1234 /* 1235 * Received a SYN. 1236 */ 1237 1238 /* 1239 * LISTEN socket received a SYN 1240 * from itself? This can't possibly 1241 * be valid; drop the packet. 1242 */ 1243 if (th->th_sport == th->th_dport) { 1244 int i; 1245 1246 switch (af) { 1247 #ifdef INET 1248 case AF_INET: 1249 i = in_hosteq(ip->ip_src, ip->ip_dst); 1250 break; 1251 #endif 1252 #ifdef INET6 1253 case AF_INET6: 1254 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1255 break; 1256 #endif 1257 default: 1258 i = 1; 1259 } 1260 if (i) { 1261 tcpstat.tcps_badsyn++; 1262 goto drop; 1263 } 1264 } 1265 1266 /* 1267 * SYN looks ok; create compressed TCP 1268 * state for it. 1269 */ 1270 if (so->so_qlen <= so->so_qlimit && 1271 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1272 so, m, optp, optlen, &opti)) 1273 m = NULL; 1274 } 1275 goto drop; 1276 } 1277 } 1278 1279 after_listen: 1280 #ifdef DIAGNOSTIC 1281 /* 1282 * Should not happen now that all embryonic connections 1283 * are handled with compressed state. 1284 */ 1285 if (tp->t_state == TCPS_LISTEN) 1286 panic("tcp_input: TCPS_LISTEN"); 1287 #endif 1288 1289 /* 1290 * Segment received on connection. 1291 * Reset idle time and keep-alive timer. 1292 */ 1293 tp->t_rcvtime = tcp_now; 1294 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1295 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1296 1297 /* 1298 * Process options. 1299 */ 1300 if (optp) 1301 tcp_dooptions(tp, optp, optlen, th, &opti); 1302 1303 /* 1304 * Header prediction: check for the two common cases 1305 * of a uni-directional data xfer. If the packet has 1306 * no control flags, is in-sequence, the window didn't 1307 * change and we're not retransmitting, it's a 1308 * candidate. If the length is zero and the ack moved 1309 * forward, we're the sender side of the xfer. Just 1310 * free the data acked & wake any higher level process 1311 * that was blocked waiting for space. If the length 1312 * is non-zero and the ack didn't move, we're the 1313 * receiver side. If we're getting packets in-order 1314 * (the reassembly queue is empty), add the data to 1315 * the socket buffer and note that we need a delayed ack. 1316 */ 1317 if (tp->t_state == TCPS_ESTABLISHED && 1318 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1319 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1320 th->th_seq == tp->rcv_nxt && 1321 tiwin && tiwin == tp->snd_wnd && 1322 tp->snd_nxt == tp->snd_max) { 1323 1324 /* 1325 * If last ACK falls within this segment's sequence numbers, 1326 * record the timestamp. 1327 */ 1328 if (opti.ts_present && 1329 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1330 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) { 1331 tp->ts_recent_age = TCP_TIMESTAMP(tp); 1332 tp->ts_recent = opti.ts_val; 1333 } 1334 1335 if (tlen == 0) { 1336 if (SEQ_GT(th->th_ack, tp->snd_una) && 1337 SEQ_LEQ(th->th_ack, tp->snd_max) && 1338 tp->snd_cwnd >= tp->snd_wnd && 1339 tp->t_dupacks < tcprexmtthresh) { 1340 /* 1341 * this is a pure ack for outstanding data. 1342 */ 1343 ++tcpstat.tcps_predack; 1344 if (opti.ts_present && opti.ts_ecr) 1345 tcp_xmit_timer(tp, 1346 TCP_TIMESTAMP(tp) - opti.ts_ecr + 1); 1347 else if (tp->t_rtttime && 1348 SEQ_GT(th->th_ack, tp->t_rtseq)) 1349 tcp_xmit_timer(tp, 1350 tcp_now - tp->t_rtttime); 1351 acked = th->th_ack - tp->snd_una; 1352 tcpstat.tcps_rcvackpack++; 1353 tcpstat.tcps_rcvackbyte += acked; 1354 ND6_HINT(tp); 1355 sbdrop(&so->so_snd, acked); 1356 /* 1357 * We want snd_recover to track snd_una to 1358 * avoid sequence wraparound problems for 1359 * very large transfers. 1360 */ 1361 tp->snd_una = tp->snd_recover = th->th_ack; 1362 m_freem(m); 1363 1364 /* 1365 * If all outstanding data are acked, stop 1366 * retransmit timer, otherwise restart timer 1367 * using current (possibly backed-off) value. 1368 * If process is waiting for space, 1369 * wakeup/selwakeup/signal. If data 1370 * are ready to send, let tcp_output 1371 * decide between more output or persist. 1372 */ 1373 if (tp->snd_una == tp->snd_max) 1374 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1375 else if (TCP_TIMER_ISARMED(tp, 1376 TCPT_PERSIST) == 0) 1377 TCP_TIMER_ARM(tp, TCPT_REXMT, 1378 tp->t_rxtcur); 1379 1380 sowwakeup(so); 1381 if (so->so_snd.sb_cc) 1382 (void) tcp_output(tp); 1383 if (tcp_saveti) 1384 m_freem(tcp_saveti); 1385 return; 1386 } 1387 } else if (th->th_ack == tp->snd_una && 1388 LIST_FIRST(&tp->segq) == NULL && 1389 tlen <= sbspace(&so->so_rcv)) { 1390 /* 1391 * this is a pure, in-sequence data packet 1392 * with nothing on the reassembly queue and 1393 * we have enough buffer space to take it. 1394 */ 1395 ++tcpstat.tcps_preddat; 1396 tp->rcv_nxt += tlen; 1397 tcpstat.tcps_rcvpack++; 1398 tcpstat.tcps_rcvbyte += tlen; 1399 ND6_HINT(tp); 1400 /* 1401 * Drop TCP, IP headers and TCP options then add data 1402 * to socket buffer. 1403 */ 1404 m_adj(m, toff + off); 1405 sbappend(&so->so_rcv, m); 1406 sorwakeup(so); 1407 TCP_SETUP_ACK(tp, th); 1408 if (tp->t_flags & TF_ACKNOW) 1409 (void) tcp_output(tp); 1410 if (tcp_saveti) 1411 m_freem(tcp_saveti); 1412 return; 1413 } 1414 } 1415 1416 /* 1417 * Compute mbuf offset to TCP data segment. 1418 */ 1419 hdroptlen = toff + off; 1420 1421 /* 1422 * Calculate amount of space in receive window, 1423 * and then do TCP input processing. 1424 * Receive window is amount of space in rcv queue, 1425 * but not less than advertised window. 1426 */ 1427 { int win; 1428 1429 win = sbspace(&so->so_rcv); 1430 if (win < 0) 1431 win = 0; 1432 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1433 } 1434 1435 switch (tp->t_state) { 1436 1437 /* 1438 * If the state is SYN_SENT: 1439 * if seg contains an ACK, but not for our SYN, drop the input. 1440 * if seg contains a RST, then drop the connection. 1441 * if seg does not contain SYN, then drop it. 1442 * Otherwise this is an acceptable SYN segment 1443 * initialize tp->rcv_nxt and tp->irs 1444 * if seg contains ack then advance tp->snd_una 1445 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1446 * arrange for segment to be acked (eventually) 1447 * continue processing rest of data/controls, beginning with URG 1448 */ 1449 case TCPS_SYN_SENT: 1450 if ((tiflags & TH_ACK) && 1451 (SEQ_LEQ(th->th_ack, tp->iss) || 1452 SEQ_GT(th->th_ack, tp->snd_max))) 1453 goto dropwithreset; 1454 if (tiflags & TH_RST) { 1455 if (tiflags & TH_ACK) 1456 tp = tcp_drop(tp, ECONNREFUSED); 1457 goto drop; 1458 } 1459 if ((tiflags & TH_SYN) == 0) 1460 goto drop; 1461 if (tiflags & TH_ACK) { 1462 tp->snd_una = tp->snd_recover = th->th_ack; 1463 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1464 tp->snd_nxt = tp->snd_una; 1465 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1466 } 1467 tp->irs = th->th_seq; 1468 tcp_rcvseqinit(tp); 1469 tp->t_flags |= TF_ACKNOW; 1470 tcp_mss_from_peer(tp, opti.maxseg); 1471 1472 /* 1473 * Initialize the initial congestion window. If we 1474 * had to retransmit the SYN, we must initialize cwnd 1475 * to 1 segment (i.e. the Loss Window). 1476 */ 1477 if (tp->t_flags & TF_SYN_REXMT) 1478 tp->snd_cwnd = tp->t_peermss; 1479 else 1480 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, 1481 tp->t_peermss); 1482 1483 tcp_rmx_rtt(tp); 1484 if (tiflags & TH_ACK) { 1485 tcpstat.tcps_connects++; 1486 soisconnected(so); 1487 tcp_established(tp); 1488 /* Do window scaling on this connection? */ 1489 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1490 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1491 tp->snd_scale = tp->requested_s_scale; 1492 tp->rcv_scale = tp->request_r_scale; 1493 } 1494 TCP_REASS_LOCK(tp); 1495 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1496 TCP_REASS_UNLOCK(tp); 1497 /* 1498 * if we didn't have to retransmit the SYN, 1499 * use its rtt as our initial srtt & rtt var. 1500 */ 1501 if (tp->t_rtttime) 1502 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1503 } else 1504 tp->t_state = TCPS_SYN_RECEIVED; 1505 1506 /* 1507 * Advance th->th_seq to correspond to first data byte. 1508 * If data, trim to stay within window, 1509 * dropping FIN if necessary. 1510 */ 1511 th->th_seq++; 1512 if (tlen > tp->rcv_wnd) { 1513 todrop = tlen - tp->rcv_wnd; 1514 m_adj(m, -todrop); 1515 tlen = tp->rcv_wnd; 1516 tiflags &= ~TH_FIN; 1517 tcpstat.tcps_rcvpackafterwin++; 1518 tcpstat.tcps_rcvbyteafterwin += todrop; 1519 } 1520 tp->snd_wl1 = th->th_seq - 1; 1521 tp->rcv_up = th->th_seq; 1522 goto step6; 1523 1524 /* 1525 * If the state is SYN_RECEIVED: 1526 * If seg contains an ACK, but not for our SYN, drop the input 1527 * and generate an RST. See page 36, rfc793 1528 */ 1529 case TCPS_SYN_RECEIVED: 1530 if ((tiflags & TH_ACK) && 1531 (SEQ_LEQ(th->th_ack, tp->iss) || 1532 SEQ_GT(th->th_ack, tp->snd_max))) 1533 goto dropwithreset; 1534 break; 1535 } 1536 1537 /* 1538 * States other than LISTEN or SYN_SENT. 1539 * First check timestamp, if present. 1540 * Then check that at least some bytes of segment are within 1541 * receive window. If segment begins before rcv_nxt, 1542 * drop leading data (and SYN); if nothing left, just ack. 1543 * 1544 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1545 * and it's less than ts_recent, drop it. 1546 */ 1547 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1548 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1549 1550 /* Check to see if ts_recent is over 24 days old. */ 1551 if ((int)(TCP_TIMESTAMP(tp) - tp->ts_recent_age) > 1552 TCP_PAWS_IDLE) { 1553 /* 1554 * Invalidate ts_recent. If this segment updates 1555 * ts_recent, the age will be reset later and ts_recent 1556 * will get a valid value. If it does not, setting 1557 * ts_recent to zero will at least satisfy the 1558 * requirement that zero be placed in the timestamp 1559 * echo reply when ts_recent isn't valid. The 1560 * age isn't reset until we get a valid ts_recent 1561 * because we don't want out-of-order segments to be 1562 * dropped when ts_recent is old. 1563 */ 1564 tp->ts_recent = 0; 1565 } else { 1566 tcpstat.tcps_rcvduppack++; 1567 tcpstat.tcps_rcvdupbyte += tlen; 1568 tcpstat.tcps_pawsdrop++; 1569 goto dropafterack; 1570 } 1571 } 1572 1573 todrop = tp->rcv_nxt - th->th_seq; 1574 if (todrop > 0) { 1575 if (tiflags & TH_SYN) { 1576 tiflags &= ~TH_SYN; 1577 th->th_seq++; 1578 if (th->th_urp > 1) 1579 th->th_urp--; 1580 else { 1581 tiflags &= ~TH_URG; 1582 th->th_urp = 0; 1583 } 1584 todrop--; 1585 } 1586 if (todrop > tlen || 1587 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1588 /* 1589 * Any valid FIN must be to the left of the window. 1590 * At this point the FIN must be a duplicate or 1591 * out of sequence; drop it. 1592 */ 1593 tiflags &= ~TH_FIN; 1594 /* 1595 * Send an ACK to resynchronize and drop any data. 1596 * But keep on processing for RST or ACK. 1597 */ 1598 tp->t_flags |= TF_ACKNOW; 1599 todrop = tlen; 1600 tcpstat.tcps_rcvdupbyte += todrop; 1601 tcpstat.tcps_rcvduppack++; 1602 } else { 1603 tcpstat.tcps_rcvpartduppack++; 1604 tcpstat.tcps_rcvpartdupbyte += todrop; 1605 } 1606 hdroptlen += todrop; /*drop from head afterwards*/ 1607 th->th_seq += todrop; 1608 tlen -= todrop; 1609 if (th->th_urp > todrop) 1610 th->th_urp -= todrop; 1611 else { 1612 tiflags &= ~TH_URG; 1613 th->th_urp = 0; 1614 } 1615 } 1616 1617 /* 1618 * If new data are received on a connection after the 1619 * user processes are gone, then RST the other end. 1620 */ 1621 if ((so->so_state & SS_NOFDREF) && 1622 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1623 tp = tcp_close(tp); 1624 tcpstat.tcps_rcvafterclose++; 1625 goto dropwithreset; 1626 } 1627 1628 /* 1629 * If segment ends after window, drop trailing data 1630 * (and PUSH and FIN); if nothing left, just ACK. 1631 */ 1632 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1633 if (todrop > 0) { 1634 tcpstat.tcps_rcvpackafterwin++; 1635 if (todrop >= tlen) { 1636 tcpstat.tcps_rcvbyteafterwin += tlen; 1637 /* 1638 * If a new connection request is received 1639 * while in TIME_WAIT, drop the old connection 1640 * and start over if the sequence numbers 1641 * are above the previous ones. 1642 */ 1643 if (tiflags & TH_SYN && 1644 tp->t_state == TCPS_TIME_WAIT && 1645 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1646 iss = tcp_new_iss(tp, tp->snd_nxt); 1647 tp = tcp_close(tp); 1648 goto findpcb; 1649 } 1650 /* 1651 * If window is closed can only take segments at 1652 * window edge, and have to drop data and PUSH from 1653 * incoming segments. Continue processing, but 1654 * remember to ack. Otherwise, drop segment 1655 * and ack. 1656 */ 1657 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1658 tp->t_flags |= TF_ACKNOW; 1659 tcpstat.tcps_rcvwinprobe++; 1660 } else 1661 goto dropafterack; 1662 } else 1663 tcpstat.tcps_rcvbyteafterwin += todrop; 1664 m_adj(m, -todrop); 1665 tlen -= todrop; 1666 tiflags &= ~(TH_PUSH|TH_FIN); 1667 } 1668 1669 /* 1670 * If last ACK falls within this segment's sequence numbers, 1671 * and the timestamp is newer, record it. 1672 */ 1673 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1674 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1675 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen + 1676 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 1677 tp->ts_recent_age = TCP_TIMESTAMP(tp); 1678 tp->ts_recent = opti.ts_val; 1679 } 1680 1681 /* 1682 * If the RST bit is set examine the state: 1683 * SYN_RECEIVED STATE: 1684 * If passive open, return to LISTEN state. 1685 * If active open, inform user that connection was refused. 1686 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1687 * Inform user that connection was reset, and close tcb. 1688 * CLOSING, LAST_ACK, TIME_WAIT STATES 1689 * Close the tcb. 1690 */ 1691 if (tiflags&TH_RST) switch (tp->t_state) { 1692 1693 case TCPS_SYN_RECEIVED: 1694 so->so_error = ECONNREFUSED; 1695 goto close; 1696 1697 case TCPS_ESTABLISHED: 1698 case TCPS_FIN_WAIT_1: 1699 case TCPS_FIN_WAIT_2: 1700 case TCPS_CLOSE_WAIT: 1701 so->so_error = ECONNRESET; 1702 close: 1703 tp->t_state = TCPS_CLOSED; 1704 tcpstat.tcps_drops++; 1705 tp = tcp_close(tp); 1706 goto drop; 1707 1708 case TCPS_CLOSING: 1709 case TCPS_LAST_ACK: 1710 case TCPS_TIME_WAIT: 1711 tp = tcp_close(tp); 1712 goto drop; 1713 } 1714 1715 /* 1716 * If a SYN is in the window, then this is an 1717 * error and we send an RST and drop the connection. 1718 */ 1719 if (tiflags & TH_SYN) { 1720 tp = tcp_drop(tp, ECONNRESET); 1721 goto dropwithreset; 1722 } 1723 1724 /* 1725 * If the ACK bit is off we drop the segment and return. 1726 */ 1727 if ((tiflags & TH_ACK) == 0) { 1728 if (tp->t_flags & TF_ACKNOW) 1729 goto dropafterack; 1730 else 1731 goto drop; 1732 } 1733 1734 /* 1735 * Ack processing. 1736 */ 1737 switch (tp->t_state) { 1738 1739 /* 1740 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1741 * ESTABLISHED state and continue processing, otherwise 1742 * send an RST. 1743 */ 1744 case TCPS_SYN_RECEIVED: 1745 if (SEQ_GT(tp->snd_una, th->th_ack) || 1746 SEQ_GT(th->th_ack, tp->snd_max)) 1747 goto dropwithreset; 1748 tcpstat.tcps_connects++; 1749 soisconnected(so); 1750 tcp_established(tp); 1751 /* Do window scaling? */ 1752 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1753 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1754 tp->snd_scale = tp->requested_s_scale; 1755 tp->rcv_scale = tp->request_r_scale; 1756 } 1757 TCP_REASS_LOCK(tp); 1758 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1759 TCP_REASS_UNLOCK(tp); 1760 tp->snd_wl1 = th->th_seq - 1; 1761 /* fall into ... */ 1762 1763 /* 1764 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1765 * ACKs. If the ack is in the range 1766 * tp->snd_una < th->th_ack <= tp->snd_max 1767 * then advance tp->snd_una to th->th_ack and drop 1768 * data from the retransmission queue. If this ACK reflects 1769 * more up to date window information we update our window information. 1770 */ 1771 case TCPS_ESTABLISHED: 1772 case TCPS_FIN_WAIT_1: 1773 case TCPS_FIN_WAIT_2: 1774 case TCPS_CLOSE_WAIT: 1775 case TCPS_CLOSING: 1776 case TCPS_LAST_ACK: 1777 case TCPS_TIME_WAIT: 1778 1779 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1780 if (tlen == 0 && tiwin == tp->snd_wnd) { 1781 tcpstat.tcps_rcvdupack++; 1782 /* 1783 * If we have outstanding data (other than 1784 * a window probe), this is a completely 1785 * duplicate ack (ie, window info didn't 1786 * change), the ack is the biggest we've 1787 * seen and we've seen exactly our rexmt 1788 * threshhold of them, assume a packet 1789 * has been dropped and retransmit it. 1790 * Kludge snd_nxt & the congestion 1791 * window so we send only this one 1792 * packet. 1793 * 1794 * We know we're losing at the current 1795 * window size so do congestion avoidance 1796 * (set ssthresh to half the current window 1797 * and pull our congestion window back to 1798 * the new ssthresh). 1799 * 1800 * Dup acks mean that packets have left the 1801 * network (they're now cached at the receiver) 1802 * so bump cwnd by the amount in the receiver 1803 * to keep a constant cwnd packets in the 1804 * network. 1805 */ 1806 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 1807 th->th_ack != tp->snd_una) 1808 tp->t_dupacks = 0; 1809 else if (++tp->t_dupacks == tcprexmtthresh) { 1810 tcp_seq onxt = tp->snd_nxt; 1811 u_int win = 1812 min(tp->snd_wnd, tp->snd_cwnd) / 1813 2 / tp->t_segsz; 1814 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1815 tp->snd_recover)) { 1816 /* 1817 * False fast retransmit after 1818 * timeout. Do not cut window. 1819 */ 1820 tp->snd_cwnd += tp->t_segsz; 1821 tp->t_dupacks = 0; 1822 (void) tcp_output(tp); 1823 goto drop; 1824 } 1825 1826 if (win < 2) 1827 win = 2; 1828 tp->snd_ssthresh = win * tp->t_segsz; 1829 tp->snd_recover = tp->snd_max; 1830 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1831 tp->t_rtttime = 0; 1832 tp->snd_nxt = th->th_ack; 1833 tp->snd_cwnd = tp->t_segsz; 1834 (void) tcp_output(tp); 1835 tp->snd_cwnd = tp->snd_ssthresh + 1836 tp->t_segsz * tp->t_dupacks; 1837 if (SEQ_GT(onxt, tp->snd_nxt)) 1838 tp->snd_nxt = onxt; 1839 goto drop; 1840 } else if (tp->t_dupacks > tcprexmtthresh) { 1841 tp->snd_cwnd += tp->t_segsz; 1842 (void) tcp_output(tp); 1843 goto drop; 1844 } 1845 } else 1846 tp->t_dupacks = 0; 1847 break; 1848 } 1849 /* 1850 * If the congestion window was inflated to account 1851 * for the other side's cached packets, retract it. 1852 */ 1853 if (tcp_do_newreno == 0) { 1854 if (tp->t_dupacks >= tcprexmtthresh && 1855 tp->snd_cwnd > tp->snd_ssthresh) 1856 tp->snd_cwnd = tp->snd_ssthresh; 1857 tp->t_dupacks = 0; 1858 } else if (tp->t_dupacks >= tcprexmtthresh && 1859 tcp_newreno(tp, th) == 0) { 1860 tp->snd_cwnd = tp->snd_ssthresh; 1861 /* 1862 * Window inflation should have left us with approx. 1863 * snd_ssthresh outstanding data. But in case we 1864 * would be inclined to send a burst, better to do 1865 * it via the slow start mechanism. 1866 */ 1867 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 1868 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 1869 + tp->t_segsz; 1870 tp->t_dupacks = 0; 1871 } 1872 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1873 tcpstat.tcps_rcvacktoomuch++; 1874 goto dropafterack; 1875 } 1876 acked = th->th_ack - tp->snd_una; 1877 tcpstat.tcps_rcvackpack++; 1878 tcpstat.tcps_rcvackbyte += acked; 1879 1880 /* 1881 * If we have a timestamp reply, update smoothed 1882 * round trip time. If no timestamp is present but 1883 * transmit timer is running and timed sequence 1884 * number was acked, update smoothed round trip time. 1885 * Since we now have an rtt measurement, cancel the 1886 * timer backoff (cf., Phil Karn's retransmit alg.). 1887 * Recompute the initial retransmit timer. 1888 */ 1889 if (opti.ts_present && opti.ts_ecr) 1890 tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1); 1891 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1892 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 1893 1894 /* 1895 * If all outstanding data is acked, stop retransmit 1896 * timer and remember to restart (more output or persist). 1897 * If there is more data to be acked, restart retransmit 1898 * timer, using current (possibly backed-off) value. 1899 */ 1900 if (th->th_ack == tp->snd_max) { 1901 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1902 needoutput = 1; 1903 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1904 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1905 /* 1906 * When new data is acked, open the congestion window. 1907 * If the window gives us less than ssthresh packets 1908 * in flight, open exponentially (segsz per packet). 1909 * Otherwise open linearly: segsz per window 1910 * (segsz^2 / cwnd per packet), plus a constant 1911 * fraction of a packet (segsz/8) to help larger windows 1912 * open quickly enough. 1913 */ 1914 { 1915 u_int cw = tp->snd_cwnd; 1916 u_int incr = tp->t_segsz; 1917 1918 if (cw > tp->snd_ssthresh) 1919 incr = incr * incr / cw; 1920 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover)) 1921 tp->snd_cwnd = min(cw + incr, 1922 TCP_MAXWIN << tp->snd_scale); 1923 } 1924 ND6_HINT(tp); 1925 if (acked > so->so_snd.sb_cc) { 1926 tp->snd_wnd -= so->so_snd.sb_cc; 1927 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1928 ourfinisacked = 1; 1929 } else { 1930 sbdrop(&so->so_snd, acked); 1931 tp->snd_wnd -= acked; 1932 ourfinisacked = 0; 1933 } 1934 sowwakeup(so); 1935 /* 1936 * We want snd_recover to track snd_una to 1937 * avoid sequence wraparound problems for 1938 * very large transfers. 1939 */ 1940 tp->snd_una = tp->snd_recover = th->th_ack; 1941 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1942 tp->snd_nxt = tp->snd_una; 1943 1944 switch (tp->t_state) { 1945 1946 /* 1947 * In FIN_WAIT_1 STATE in addition to the processing 1948 * for the ESTABLISHED state if our FIN is now acknowledged 1949 * then enter FIN_WAIT_2. 1950 */ 1951 case TCPS_FIN_WAIT_1: 1952 if (ourfinisacked) { 1953 /* 1954 * If we can't receive any more 1955 * data, then closing user can proceed. 1956 * Starting the timer is contrary to the 1957 * specification, but if we don't get a FIN 1958 * we'll hang forever. 1959 */ 1960 if (so->so_state & SS_CANTRCVMORE) { 1961 soisdisconnected(so); 1962 if (tcp_maxidle > 0) 1963 TCP_TIMER_ARM(tp, TCPT_2MSL, 1964 tcp_maxidle); 1965 } 1966 tp->t_state = TCPS_FIN_WAIT_2; 1967 } 1968 break; 1969 1970 /* 1971 * In CLOSING STATE in addition to the processing for 1972 * the ESTABLISHED state if the ACK acknowledges our FIN 1973 * then enter the TIME-WAIT state, otherwise ignore 1974 * the segment. 1975 */ 1976 case TCPS_CLOSING: 1977 if (ourfinisacked) { 1978 tp->t_state = TCPS_TIME_WAIT; 1979 tcp_canceltimers(tp); 1980 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1981 soisdisconnected(so); 1982 } 1983 break; 1984 1985 /* 1986 * In LAST_ACK, we may still be waiting for data to drain 1987 * and/or to be acked, as well as for the ack of our FIN. 1988 * If our FIN is now acknowledged, delete the TCB, 1989 * enter the closed state and return. 1990 */ 1991 case TCPS_LAST_ACK: 1992 if (ourfinisacked) { 1993 tp = tcp_close(tp); 1994 goto drop; 1995 } 1996 break; 1997 1998 /* 1999 * In TIME_WAIT state the only thing that should arrive 2000 * is a retransmission of the remote FIN. Acknowledge 2001 * it and restart the finack timer. 2002 */ 2003 case TCPS_TIME_WAIT: 2004 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2005 goto dropafterack; 2006 } 2007 } 2008 2009 step6: 2010 /* 2011 * Update window information. 2012 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2013 */ 2014 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 2015 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 2016 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 2017 /* keep track of pure window updates */ 2018 if (tlen == 0 && 2019 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2020 tcpstat.tcps_rcvwinupd++; 2021 tp->snd_wnd = tiwin; 2022 tp->snd_wl1 = th->th_seq; 2023 tp->snd_wl2 = th->th_ack; 2024 if (tp->snd_wnd > tp->max_sndwnd) 2025 tp->max_sndwnd = tp->snd_wnd; 2026 needoutput = 1; 2027 } 2028 2029 /* 2030 * Process segments with URG. 2031 */ 2032 if ((tiflags & TH_URG) && th->th_urp && 2033 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2034 /* 2035 * This is a kludge, but if we receive and accept 2036 * random urgent pointers, we'll crash in 2037 * soreceive. It's hard to imagine someone 2038 * actually wanting to send this much urgent data. 2039 */ 2040 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2041 th->th_urp = 0; /* XXX */ 2042 tiflags &= ~TH_URG; /* XXX */ 2043 goto dodata; /* XXX */ 2044 } 2045 /* 2046 * If this segment advances the known urgent pointer, 2047 * then mark the data stream. This should not happen 2048 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2049 * a FIN has been received from the remote side. 2050 * In these states we ignore the URG. 2051 * 2052 * According to RFC961 (Assigned Protocols), 2053 * the urgent pointer points to the last octet 2054 * of urgent data. We continue, however, 2055 * to consider it to indicate the first octet 2056 * of data past the urgent section as the original 2057 * spec states (in one of two places). 2058 */ 2059 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2060 tp->rcv_up = th->th_seq + th->th_urp; 2061 so->so_oobmark = so->so_rcv.sb_cc + 2062 (tp->rcv_up - tp->rcv_nxt) - 1; 2063 if (so->so_oobmark == 0) 2064 so->so_state |= SS_RCVATMARK; 2065 sohasoutofband(so); 2066 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2067 } 2068 /* 2069 * Remove out of band data so doesn't get presented to user. 2070 * This can happen independent of advancing the URG pointer, 2071 * but if two URG's are pending at once, some out-of-band 2072 * data may creep in... ick. 2073 */ 2074 if (th->th_urp <= (u_int16_t) tlen 2075 #ifdef SO_OOBINLINE 2076 && (so->so_options & SO_OOBINLINE) == 0 2077 #endif 2078 ) 2079 tcp_pulloutofband(so, th, m, hdroptlen); 2080 } else 2081 /* 2082 * If no out of band data is expected, 2083 * pull receive urgent pointer along 2084 * with the receive window. 2085 */ 2086 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2087 tp->rcv_up = tp->rcv_nxt; 2088 dodata: /* XXX */ 2089 2090 /* 2091 * Process the segment text, merging it into the TCP sequencing queue, 2092 * and arranging for acknowledgement of receipt if necessary. 2093 * This process logically involves adjusting tp->rcv_wnd as data 2094 * is presented to the user (this happens in tcp_usrreq.c, 2095 * case PRU_RCVD). If a FIN has already been received on this 2096 * connection then we just ignore the text. 2097 */ 2098 if ((tlen || (tiflags & TH_FIN)) && 2099 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2100 /* 2101 * Insert segment ti into reassembly queue of tcp with 2102 * control block tp. Return TH_FIN if reassembly now includes 2103 * a segment with FIN. The macro form does the common case 2104 * inline (segment is the next to be received on an 2105 * established connection, and the queue is empty), 2106 * avoiding linkage into and removal from the queue and 2107 * repetition of various conversions. 2108 * Set DELACK for segments received in order, but ack 2109 * immediately when segments are out of order 2110 * (so fast retransmit can work). 2111 */ 2112 /* NOTE: this was TCP_REASS() macro, but used only once */ 2113 TCP_REASS_LOCK(tp); 2114 if (th->th_seq == tp->rcv_nxt && 2115 LIST_FIRST(&tp->segq) == NULL && 2116 tp->t_state == TCPS_ESTABLISHED) { 2117 TCP_SETUP_ACK(tp, th); 2118 tp->rcv_nxt += tlen; 2119 tiflags = th->th_flags & TH_FIN; 2120 tcpstat.tcps_rcvpack++; 2121 tcpstat.tcps_rcvbyte += tlen; 2122 ND6_HINT(tp); 2123 m_adj(m, hdroptlen); 2124 sbappend(&(so)->so_rcv, m); 2125 sorwakeup(so); 2126 } else { 2127 m_adj(m, hdroptlen); 2128 tiflags = tcp_reass(tp, th, m, &tlen); 2129 tp->t_flags |= TF_ACKNOW; 2130 } 2131 TCP_REASS_UNLOCK(tp); 2132 2133 /* 2134 * Note the amount of data that peer has sent into 2135 * our window, in order to estimate the sender's 2136 * buffer size. 2137 */ 2138 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2139 } else { 2140 m_freem(m); 2141 m = NULL; 2142 tiflags &= ~TH_FIN; 2143 } 2144 2145 /* 2146 * If FIN is received ACK the FIN and let the user know 2147 * that the connection is closing. Ignore a FIN received before 2148 * the connection is fully established. 2149 */ 2150 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2151 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2152 socantrcvmore(so); 2153 tp->t_flags |= TF_ACKNOW; 2154 tp->rcv_nxt++; 2155 } 2156 switch (tp->t_state) { 2157 2158 /* 2159 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2160 */ 2161 case TCPS_ESTABLISHED: 2162 tp->t_state = TCPS_CLOSE_WAIT; 2163 break; 2164 2165 /* 2166 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2167 * enter the CLOSING state. 2168 */ 2169 case TCPS_FIN_WAIT_1: 2170 tp->t_state = TCPS_CLOSING; 2171 break; 2172 2173 /* 2174 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2175 * starting the time-wait timer, turning off the other 2176 * standard timers. 2177 */ 2178 case TCPS_FIN_WAIT_2: 2179 tp->t_state = TCPS_TIME_WAIT; 2180 tcp_canceltimers(tp); 2181 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2182 soisdisconnected(so); 2183 break; 2184 2185 /* 2186 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2187 */ 2188 case TCPS_TIME_WAIT: 2189 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2190 break; 2191 } 2192 } 2193 #ifdef TCP_DEBUG 2194 if (so->so_options & SO_DEBUG) 2195 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2196 #endif 2197 2198 /* 2199 * Return any desired output. 2200 */ 2201 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2202 (void) tcp_output(tp); 2203 if (tcp_saveti) 2204 m_freem(tcp_saveti); 2205 return; 2206 2207 badsyn: 2208 /* 2209 * Received a bad SYN. Increment counters and dropwithreset. 2210 */ 2211 tcpstat.tcps_badsyn++; 2212 tp = NULL; 2213 goto dropwithreset; 2214 2215 dropafterack: 2216 /* 2217 * Generate an ACK dropping incoming segment if it occupies 2218 * sequence space, where the ACK reflects our state. 2219 */ 2220 if (tiflags & TH_RST) 2221 goto drop; 2222 m_freem(m); 2223 tp->t_flags |= TF_ACKNOW; 2224 (void) tcp_output(tp); 2225 if (tcp_saveti) 2226 m_freem(tcp_saveti); 2227 return; 2228 2229 dropwithreset_ratelim: 2230 /* 2231 * We may want to rate-limit RSTs in certain situations, 2232 * particularly if we are sending an RST in response to 2233 * an attempt to connect to or otherwise communicate with 2234 * a port for which we have no socket. 2235 */ 2236 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2237 tcp_rst_ppslim) == 0) { 2238 /* XXX stat */ 2239 goto drop; 2240 } 2241 /* ...fall into dropwithreset... */ 2242 2243 dropwithreset: 2244 /* 2245 * Generate a RST, dropping incoming segment. 2246 * Make ACK acceptable to originator of segment. 2247 */ 2248 if (tiflags & TH_RST) 2249 goto drop; 2250 { 2251 /* 2252 * need to recover version # field, which was overwritten on 2253 * ip_cksum computation. 2254 */ 2255 struct ip *sip; 2256 sip = mtod(m, struct ip *); 2257 switch (af) { 2258 #ifdef INET 2259 case AF_INET: 2260 sip->ip_v = 4; 2261 break; 2262 #endif 2263 #ifdef INET6 2264 case AF_INET6: 2265 sip->ip_v = 6; 2266 break; 2267 #endif 2268 } 2269 } 2270 if (tiflags & TH_ACK) 2271 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2272 else { 2273 if (tiflags & TH_SYN) 2274 tlen++; 2275 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2276 TH_RST|TH_ACK); 2277 } 2278 if (tcp_saveti) 2279 m_freem(tcp_saveti); 2280 return; 2281 2282 badcsum: 2283 tcpstat.tcps_rcvbadsum++; 2284 drop: 2285 /* 2286 * Drop space held by incoming segment and return. 2287 */ 2288 if (tp) { 2289 if (tp->t_inpcb) 2290 so = tp->t_inpcb->inp_socket; 2291 #ifdef INET6 2292 else if (tp->t_in6pcb) 2293 so = tp->t_in6pcb->in6p_socket; 2294 #endif 2295 else 2296 so = NULL; 2297 #ifdef TCP_DEBUG 2298 if (so && (so->so_options & SO_DEBUG) != 0) 2299 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2300 #endif 2301 } 2302 if (tcp_saveti) 2303 m_freem(tcp_saveti); 2304 m_freem(m); 2305 return; 2306 } 2307 2308 void 2309 tcp_dooptions(tp, cp, cnt, th, oi) 2310 struct tcpcb *tp; 2311 u_char *cp; 2312 int cnt; 2313 struct tcphdr *th; 2314 struct tcp_opt_info *oi; 2315 { 2316 u_int16_t mss; 2317 int opt, optlen; 2318 2319 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2320 opt = cp[0]; 2321 if (opt == TCPOPT_EOL) 2322 break; 2323 if (opt == TCPOPT_NOP) 2324 optlen = 1; 2325 else { 2326 if (cnt < 2) 2327 break; 2328 optlen = cp[1]; 2329 if (optlen < 2 || optlen > cnt) 2330 break; 2331 } 2332 switch (opt) { 2333 2334 default: 2335 continue; 2336 2337 case TCPOPT_MAXSEG: 2338 if (optlen != TCPOLEN_MAXSEG) 2339 continue; 2340 if (!(th->th_flags & TH_SYN)) 2341 continue; 2342 bcopy(cp + 2, &mss, sizeof(mss)); 2343 oi->maxseg = ntohs(mss); 2344 break; 2345 2346 case TCPOPT_WINDOW: 2347 if (optlen != TCPOLEN_WINDOW) 2348 continue; 2349 if (!(th->th_flags & TH_SYN)) 2350 continue; 2351 tp->t_flags |= TF_RCVD_SCALE; 2352 tp->requested_s_scale = cp[2]; 2353 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2354 #if 0 /*XXX*/ 2355 char *p; 2356 2357 if (ip) 2358 p = ntohl(ip->ip_src); 2359 #ifdef INET6 2360 else if (ip6) 2361 p = ip6_sprintf(&ip6->ip6_src); 2362 #endif 2363 else 2364 p = "(unknown)"; 2365 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2366 "assuming %d\n", 2367 tp->requested_s_scale, p, 2368 TCP_MAX_WINSHIFT); 2369 #else 2370 log(LOG_ERR, "TCP: invalid wscale %d, " 2371 "assuming %d\n", 2372 tp->requested_s_scale, 2373 TCP_MAX_WINSHIFT); 2374 #endif 2375 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2376 } 2377 break; 2378 2379 case TCPOPT_TIMESTAMP: 2380 if (optlen != TCPOLEN_TIMESTAMP) 2381 continue; 2382 oi->ts_present = 1; 2383 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2384 NTOHL(oi->ts_val); 2385 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2386 NTOHL(oi->ts_ecr); 2387 2388 /* 2389 * A timestamp received in a SYN makes 2390 * it ok to send timestamp requests and replies. 2391 */ 2392 if (th->th_flags & TH_SYN) { 2393 tp->t_flags |= TF_RCVD_TSTMP; 2394 tp->ts_recent = oi->ts_val; 2395 tp->ts_recent_age = TCP_TIMESTAMP(tp); 2396 } 2397 break; 2398 case TCPOPT_SACK_PERMITTED: 2399 if (optlen != TCPOLEN_SACK_PERMITTED) 2400 continue; 2401 if (!(th->th_flags & TH_SYN)) 2402 continue; 2403 tp->t_flags &= ~TF_CANT_TXSACK; 2404 break; 2405 2406 case TCPOPT_SACK: 2407 if (tp->t_flags & TF_IGNR_RXSACK) 2408 continue; 2409 if (optlen % 8 != 2 || optlen < 10) 2410 continue; 2411 cp += 2; 2412 optlen -= 2; 2413 for (; optlen > 0; cp -= 8, optlen -= 8) { 2414 tcp_seq lwe, rwe; 2415 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2416 NTOHL(lwe); 2417 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2418 NTOHL(rwe); 2419 /* tcp_mark_sacked(tp, lwe, rwe); */ 2420 } 2421 break; 2422 } 2423 } 2424 } 2425 2426 /* 2427 * Pull out of band byte out of a segment so 2428 * it doesn't appear in the user's data queue. 2429 * It is still reflected in the segment length for 2430 * sequencing purposes. 2431 */ 2432 void 2433 tcp_pulloutofband(so, th, m, off) 2434 struct socket *so; 2435 struct tcphdr *th; 2436 struct mbuf *m; 2437 int off; 2438 { 2439 int cnt = off + th->th_urp - 1; 2440 2441 while (cnt >= 0) { 2442 if (m->m_len > cnt) { 2443 char *cp = mtod(m, caddr_t) + cnt; 2444 struct tcpcb *tp = sototcpcb(so); 2445 2446 tp->t_iobc = *cp; 2447 tp->t_oobflags |= TCPOOB_HAVEDATA; 2448 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2449 m->m_len--; 2450 return; 2451 } 2452 cnt -= m->m_len; 2453 m = m->m_next; 2454 if (m == 0) 2455 break; 2456 } 2457 panic("tcp_pulloutofband"); 2458 } 2459 2460 /* 2461 * Collect new round-trip time estimate 2462 * and update averages and current timeout. 2463 */ 2464 void 2465 tcp_xmit_timer(tp, rtt) 2466 struct tcpcb *tp; 2467 uint32_t rtt; 2468 { 2469 int32_t delta; 2470 2471 tcpstat.tcps_rttupdated++; 2472 if (tp->t_srtt != 0) { 2473 /* 2474 * srtt is stored as fixed point with 3 bits after the 2475 * binary point (i.e., scaled by 8). The following magic 2476 * is equivalent to the smoothing algorithm in rfc793 with 2477 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2478 * point). Adjust rtt to origin 0. 2479 */ 2480 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2481 if ((tp->t_srtt += delta) <= 0) 2482 tp->t_srtt = 1 << 2; 2483 /* 2484 * We accumulate a smoothed rtt variance (actually, a 2485 * smoothed mean difference), then set the retransmit 2486 * timer to smoothed rtt + 4 times the smoothed variance. 2487 * rttvar is stored as fixed point with 2 bits after the 2488 * binary point (scaled by 4). The following is 2489 * equivalent to rfc793 smoothing with an alpha of .75 2490 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2491 * rfc793's wired-in beta. 2492 */ 2493 if (delta < 0) 2494 delta = -delta; 2495 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2496 if ((tp->t_rttvar += delta) <= 0) 2497 tp->t_rttvar = 1 << 2; 2498 } else { 2499 /* 2500 * No rtt measurement yet - use the unsmoothed rtt. 2501 * Set the variance to half the rtt (so our first 2502 * retransmit happens at 3*rtt). 2503 */ 2504 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2505 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2506 } 2507 tp->t_rtttime = 0; 2508 tp->t_rxtshift = 0; 2509 2510 /* 2511 * the retransmit should happen at rtt + 4 * rttvar. 2512 * Because of the way we do the smoothing, srtt and rttvar 2513 * will each average +1/2 tick of bias. When we compute 2514 * the retransmit timer, we want 1/2 tick of rounding and 2515 * 1 extra tick because of +-1/2 tick uncertainty in the 2516 * firing of the timer. The bias will give us exactly the 2517 * 1.5 tick we need. But, because the bias is 2518 * statistical, we have to test that we don't drop below 2519 * the minimum feasible timer (which is 2 ticks). 2520 */ 2521 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2522 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2523 2524 /* 2525 * We received an ack for a packet that wasn't retransmitted; 2526 * it is probably safe to discard any error indications we've 2527 * received recently. This isn't quite right, but close enough 2528 * for now (a route might have failed after we sent a segment, 2529 * and the return path might not be symmetrical). 2530 */ 2531 tp->t_softerror = 0; 2532 } 2533 2534 /* 2535 * Checks for partial ack. If partial ack arrives, force the retransmission 2536 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2537 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2538 * be started again. If the ack advances at least to tp->snd_recover, return 0. 2539 */ 2540 int 2541 tcp_newreno(tp, th) 2542 struct tcpcb *tp; 2543 struct tcphdr *th; 2544 { 2545 tcp_seq onxt = tp->snd_nxt; 2546 u_long ocwnd = tp->snd_cwnd; 2547 2548 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2549 /* 2550 * snd_una has not yet been updated and the socket's send 2551 * buffer has not yet drained off the ACK'd data, so we 2552 * have to leave snd_una as it was to get the correct data 2553 * offset in tcp_output(). 2554 */ 2555 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2556 tp->t_rtttime = 0; 2557 tp->snd_nxt = th->th_ack; 2558 /* 2559 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2560 * is not yet updated when we're called. 2561 */ 2562 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2563 (void) tcp_output(tp); 2564 tp->snd_cwnd = ocwnd; 2565 if (SEQ_GT(onxt, tp->snd_nxt)) 2566 tp->snd_nxt = onxt; 2567 /* 2568 * Partial window deflation. Relies on fact that tp->snd_una 2569 * not updated yet. 2570 */ 2571 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2572 return 1; 2573 } 2574 return 0; 2575 } 2576 2577 2578 /* 2579 * TCP compressed state engine. Currently used to hold compressed 2580 * state for SYN_RECEIVED. 2581 */ 2582 2583 u_long syn_cache_count; 2584 u_int32_t syn_hash1, syn_hash2; 2585 2586 #define SYN_HASH(sa, sp, dp) \ 2587 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 2588 ((u_int32_t)(sp)))^syn_hash2))) 2589 #ifndef INET6 2590 #define SYN_HASHALL(hash, src, dst) \ 2591 do { \ 2592 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2593 ((struct sockaddr_in *)(src))->sin_port, \ 2594 ((struct sockaddr_in *)(dst))->sin_port); \ 2595 } while (0) 2596 #else 2597 #define SYN_HASH6(sa, sp, dp) \ 2598 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 2599 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 2600 & 0x7fffffff) 2601 2602 #define SYN_HASHALL(hash, src, dst) \ 2603 do { \ 2604 switch ((src)->sa_family) { \ 2605 case AF_INET: \ 2606 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \ 2607 ((struct sockaddr_in *)(src))->sin_port, \ 2608 ((struct sockaddr_in *)(dst))->sin_port); \ 2609 break; \ 2610 case AF_INET6: \ 2611 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \ 2612 ((struct sockaddr_in6 *)(src))->sin6_port, \ 2613 ((struct sockaddr_in6 *)(dst))->sin6_port); \ 2614 break; \ 2615 default: \ 2616 hash = 0; \ 2617 } \ 2618 } while (/*CONSTCOND*/0) 2619 #endif /* INET6 */ 2620 2621 #define SYN_CACHE_RM(sc) \ 2622 do { \ 2623 TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \ 2624 (sc), sc_bucketq); \ 2625 (sc)->sc_tp = NULL; \ 2626 LIST_REMOVE((sc), sc_tpq); \ 2627 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \ 2628 callout_stop(&(sc)->sc_timer); \ 2629 syn_cache_count--; \ 2630 } while (/*CONSTCOND*/0) 2631 2632 #define SYN_CACHE_PUT(sc) \ 2633 do { \ 2634 if ((sc)->sc_ipopts) \ 2635 (void) m_free((sc)->sc_ipopts); \ 2636 if ((sc)->sc_route4.ro_rt != NULL) \ 2637 RTFREE((sc)->sc_route4.ro_rt); \ 2638 pool_put(&syn_cache_pool, (sc)); \ 2639 } while (/*CONSTCOND*/0) 2640 2641 struct pool syn_cache_pool; 2642 2643 /* 2644 * We don't estimate RTT with SYNs, so each packet starts with the default 2645 * RTT and each timer step has a fixed timeout value. 2646 */ 2647 #define SYN_CACHE_TIMER_ARM(sc) \ 2648 do { \ 2649 TCPT_RANGESET((sc)->sc_rxtcur, \ 2650 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2651 TCPTV_REXMTMAX); \ 2652 callout_reset(&(sc)->sc_timer, \ 2653 (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \ 2654 } while (/*CONSTCOND*/0) 2655 2656 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 2657 2658 void 2659 syn_cache_init() 2660 { 2661 int i; 2662 2663 /* Initialize the hash buckets. */ 2664 for (i = 0; i < tcp_syn_cache_size; i++) 2665 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 2666 2667 /* Initialize the syn cache pool. */ 2668 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2669 "synpl", NULL); 2670 } 2671 2672 void 2673 syn_cache_insert(sc, tp) 2674 struct syn_cache *sc; 2675 struct tcpcb *tp; 2676 { 2677 struct syn_cache_head *scp; 2678 struct syn_cache *sc2; 2679 int s; 2680 2681 /* 2682 * If there are no entries in the hash table, reinitialize 2683 * the hash secrets. 2684 */ 2685 if (syn_cache_count == 0) { 2686 struct timeval tv; 2687 microtime(&tv); 2688 syn_hash1 = random() ^ (u_long)≻ 2689 syn_hash2 = random() ^ tv.tv_usec; 2690 } 2691 2692 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2693 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2694 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2695 2696 /* 2697 * Make sure that we don't overflow the per-bucket 2698 * limit or the total cache size limit. 2699 */ 2700 s = splsoftnet(); 2701 if (scp->sch_length >= tcp_syn_bucket_limit) { 2702 tcpstat.tcps_sc_bucketoverflow++; 2703 /* 2704 * The bucket is full. Toss the oldest element in the 2705 * bucket. This will be the first entry in the bucket. 2706 */ 2707 sc2 = TAILQ_FIRST(&scp->sch_bucket); 2708 #ifdef DIAGNOSTIC 2709 /* 2710 * This should never happen; we should always find an 2711 * entry in our bucket. 2712 */ 2713 if (sc2 == NULL) 2714 panic("syn_cache_insert: bucketoverflow: impossible"); 2715 #endif 2716 SYN_CACHE_RM(sc2); 2717 SYN_CACHE_PUT(sc2); 2718 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2719 struct syn_cache_head *scp2, *sce; 2720 2721 tcpstat.tcps_sc_overflowed++; 2722 /* 2723 * The cache is full. Toss the oldest entry in the 2724 * first non-empty bucket we can find. 2725 * 2726 * XXX We would really like to toss the oldest 2727 * entry in the cache, but we hope that this 2728 * condition doesn't happen very often. 2729 */ 2730 scp2 = scp; 2731 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 2732 sce = &tcp_syn_cache[tcp_syn_cache_size]; 2733 for (++scp2; scp2 != scp; scp2++) { 2734 if (scp2 >= sce) 2735 scp2 = &tcp_syn_cache[0]; 2736 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 2737 break; 2738 } 2739 #ifdef DIAGNOSTIC 2740 /* 2741 * This should never happen; we should always find a 2742 * non-empty bucket. 2743 */ 2744 if (scp2 == scp) 2745 panic("syn_cache_insert: cacheoverflow: " 2746 "impossible"); 2747 #endif 2748 } 2749 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 2750 SYN_CACHE_RM(sc2); 2751 SYN_CACHE_PUT(sc2); 2752 } 2753 2754 /* 2755 * Initialize the entry's timer. 2756 */ 2757 sc->sc_rxttot = 0; 2758 sc->sc_rxtshift = 0; 2759 SYN_CACHE_TIMER_ARM(sc); 2760 2761 /* Link it from tcpcb entry */ 2762 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2763 2764 /* Put it into the bucket. */ 2765 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 2766 scp->sch_length++; 2767 syn_cache_count++; 2768 2769 tcpstat.tcps_sc_added++; 2770 splx(s); 2771 } 2772 2773 /* 2774 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2775 * If we have retransmitted an entry the maximum number of times, expire 2776 * that entry. 2777 */ 2778 void 2779 syn_cache_timer(void *arg) 2780 { 2781 struct syn_cache *sc = arg; 2782 int s; 2783 2784 s = splsoftnet(); 2785 2786 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 2787 /* Drop it -- too many retransmissions. */ 2788 goto dropit; 2789 } 2790 2791 /* 2792 * Compute the total amount of time this entry has 2793 * been on a queue. If this entry has been on longer 2794 * than the keep alive timer would allow, expire it. 2795 */ 2796 sc->sc_rxttot += sc->sc_rxtcur; 2797 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) 2798 goto dropit; 2799 2800 tcpstat.tcps_sc_retransmitted++; 2801 (void) syn_cache_respond(sc, NULL); 2802 2803 /* Advance the timer back-off. */ 2804 sc->sc_rxtshift++; 2805 SYN_CACHE_TIMER_ARM(sc); 2806 2807 splx(s); 2808 return; 2809 2810 dropit: 2811 tcpstat.tcps_sc_timed_out++; 2812 SYN_CACHE_RM(sc); 2813 SYN_CACHE_PUT(sc); 2814 splx(s); 2815 } 2816 2817 /* 2818 * Remove syn cache created by the specified tcb entry, 2819 * because this does not make sense to keep them 2820 * (if there's no tcb entry, syn cache entry will never be used) 2821 */ 2822 void 2823 syn_cache_cleanup(tp) 2824 struct tcpcb *tp; 2825 { 2826 struct syn_cache *sc, *nsc; 2827 int s; 2828 2829 s = splsoftnet(); 2830 2831 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2832 nsc = LIST_NEXT(sc, sc_tpq); 2833 2834 #ifdef DIAGNOSTIC 2835 if (sc->sc_tp != tp) 2836 panic("invalid sc_tp in syn_cache_cleanup"); 2837 #endif 2838 SYN_CACHE_RM(sc); 2839 SYN_CACHE_PUT(sc); 2840 } 2841 /* just for safety */ 2842 LIST_INIT(&tp->t_sc); 2843 2844 splx(s); 2845 } 2846 2847 /* 2848 * Find an entry in the syn cache. 2849 */ 2850 struct syn_cache * 2851 syn_cache_lookup(src, dst, headp) 2852 struct sockaddr *src; 2853 struct sockaddr *dst; 2854 struct syn_cache_head **headp; 2855 { 2856 struct syn_cache *sc; 2857 struct syn_cache_head *scp; 2858 u_int32_t hash; 2859 int s; 2860 2861 SYN_HASHALL(hash, src, dst); 2862 2863 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2864 *headp = scp; 2865 s = splsoftnet(); 2866 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 2867 sc = TAILQ_NEXT(sc, sc_bucketq)) { 2868 if (sc->sc_hash != hash) 2869 continue; 2870 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2871 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2872 splx(s); 2873 return (sc); 2874 } 2875 } 2876 splx(s); 2877 return (NULL); 2878 } 2879 2880 /* 2881 * This function gets called when we receive an ACK for a 2882 * socket in the LISTEN state. We look up the connection 2883 * in the syn cache, and if its there, we pull it out of 2884 * the cache and turn it into a full-blown connection in 2885 * the SYN-RECEIVED state. 2886 * 2887 * The return values may not be immediately obvious, and their effects 2888 * can be subtle, so here they are: 2889 * 2890 * NULL SYN was not found in cache; caller should drop the 2891 * packet and send an RST. 2892 * 2893 * -1 We were unable to create the new connection, and are 2894 * aborting it. An ACK,RST is being sent to the peer 2895 * (unless we got screwey sequence numbners; see below), 2896 * because the 3-way handshake has been completed. Caller 2897 * should not free the mbuf, since we may be using it. If 2898 * we are not, we will free it. 2899 * 2900 * Otherwise, the return value is a pointer to the new socket 2901 * associated with the connection. 2902 */ 2903 struct socket * 2904 syn_cache_get(src, dst, th, hlen, tlen, so, m) 2905 struct sockaddr *src; 2906 struct sockaddr *dst; 2907 struct tcphdr *th; 2908 unsigned int hlen, tlen; 2909 struct socket *so; 2910 struct mbuf *m; 2911 { 2912 struct syn_cache *sc; 2913 struct syn_cache_head *scp; 2914 struct inpcb *inp = NULL; 2915 #ifdef INET6 2916 struct in6pcb *in6p = NULL; 2917 #endif 2918 struct tcpcb *tp = 0; 2919 struct mbuf *am; 2920 int s; 2921 struct socket *oso; 2922 2923 s = splsoftnet(); 2924 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 2925 splx(s); 2926 return (NULL); 2927 } 2928 2929 /* 2930 * Verify the sequence and ack numbers. Try getting the correct 2931 * response again. 2932 */ 2933 if ((th->th_ack != sc->sc_iss + 1) || 2934 SEQ_LEQ(th->th_seq, sc->sc_irs) || 2935 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 2936 (void) syn_cache_respond(sc, m); 2937 splx(s); 2938 return ((struct socket *)(-1)); 2939 } 2940 2941 /* Remove this cache entry */ 2942 SYN_CACHE_RM(sc); 2943 splx(s); 2944 2945 /* 2946 * Ok, create the full blown connection, and set things up 2947 * as they would have been set up if we had created the 2948 * connection when the SYN arrived. If we can't create 2949 * the connection, abort it. 2950 */ 2951 /* 2952 * inp still has the OLD in_pcb stuff, set the 2953 * v6-related flags on the new guy, too. This is 2954 * done particularly for the case where an AF_INET6 2955 * socket is bound only to a port, and a v4 connection 2956 * comes in on that port. 2957 * we also copy the flowinfo from the original pcb 2958 * to the new one. 2959 */ 2960 { 2961 struct inpcb *parentinpcb; 2962 2963 parentinpcb = (struct inpcb *)so->so_pcb; 2964 2965 oso = so; 2966 so = sonewconn(so, SS_ISCONNECTED); 2967 if (so == NULL) 2968 goto resetandabort; 2969 2970 switch (so->so_proto->pr_domain->dom_family) { 2971 #ifdef INET 2972 case AF_INET: 2973 inp = sotoinpcb(so); 2974 break; 2975 #endif 2976 #ifdef INET6 2977 case AF_INET6: 2978 in6p = sotoin6pcb(so); 2979 break; 2980 #endif 2981 } 2982 } 2983 switch (src->sa_family) { 2984 #ifdef INET 2985 case AF_INET: 2986 if (inp) { 2987 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 2988 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 2989 inp->inp_options = ip_srcroute(); 2990 in_pcbstate(inp, INP_BOUND); 2991 if (inp->inp_options == NULL) { 2992 inp->inp_options = sc->sc_ipopts; 2993 sc->sc_ipopts = NULL; 2994 } 2995 } 2996 #ifdef INET6 2997 else if (in6p) { 2998 /* IPv4 packet to AF_INET6 socket */ 2999 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr)); 3000 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 3001 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 3002 &in6p->in6p_laddr.s6_addr32[3], 3003 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 3004 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 3005 in6totcpcb(in6p)->t_family = AF_INET; 3006 } 3007 #endif 3008 break; 3009 #endif 3010 #ifdef INET6 3011 case AF_INET6: 3012 if (in6p) { 3013 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 3014 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 3015 #if 0 3016 in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK; 3017 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 3018 #endif 3019 } 3020 break; 3021 #endif 3022 } 3023 #ifdef INET6 3024 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 3025 struct in6pcb *oin6p = sotoin6pcb(oso); 3026 /* inherit socket options from the listening socket */ 3027 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 3028 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 3029 m_freem(in6p->in6p_options); 3030 in6p->in6p_options = 0; 3031 } 3032 ip6_savecontrol(in6p, &in6p->in6p_options, 3033 mtod(m, struct ip6_hdr *), m); 3034 } 3035 #endif 3036 3037 #ifdef IPSEC 3038 /* 3039 * we make a copy of policy, instead of sharing the policy, 3040 * for better behavior in terms of SA lookup and dead SA removal. 3041 */ 3042 if (inp) { 3043 /* copy old policy into new socket's */ 3044 if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 3045 printf("tcp_input: could not copy policy\n"); 3046 } 3047 #ifdef INET6 3048 else if (in6p) { 3049 /* copy old policy into new socket's */ 3050 if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp)) 3051 printf("tcp_input: could not copy policy\n"); 3052 } 3053 #endif 3054 #endif 3055 3056 /* 3057 * Give the new socket our cached route reference. 3058 */ 3059 if (inp) 3060 inp->inp_route = sc->sc_route4; /* struct assignment */ 3061 #ifdef INET6 3062 else 3063 in6p->in6p_route = sc->sc_route6; 3064 #endif 3065 sc->sc_route4.ro_rt = NULL; 3066 3067 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3068 if (am == NULL) 3069 goto resetandabort; 3070 am->m_len = src->sa_len; 3071 bcopy(src, mtod(am, caddr_t), src->sa_len); 3072 if (inp) { 3073 if (in_pcbconnect(inp, am)) { 3074 (void) m_free(am); 3075 goto resetandabort; 3076 } 3077 } 3078 #ifdef INET6 3079 else if (in6p) { 3080 if (src->sa_family == AF_INET) { 3081 /* IPv4 packet to AF_INET6 socket */ 3082 struct sockaddr_in6 *sin6; 3083 sin6 = mtod(am, struct sockaddr_in6 *); 3084 am->m_len = sizeof(*sin6); 3085 bzero(sin6, sizeof(*sin6)); 3086 sin6->sin6_family = AF_INET6; 3087 sin6->sin6_len = sizeof(*sin6); 3088 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port; 3089 sin6->sin6_addr.s6_addr16[5] = htons(0xffff); 3090 bcopy(&((struct sockaddr_in *)src)->sin_addr, 3091 &sin6->sin6_addr.s6_addr32[3], 3092 sizeof(sin6->sin6_addr.s6_addr32[3])); 3093 } 3094 if (in6_pcbconnect(in6p, am)) { 3095 (void) m_free(am); 3096 goto resetandabort; 3097 } 3098 } 3099 #endif 3100 else { 3101 (void) m_free(am); 3102 goto resetandabort; 3103 } 3104 (void) m_free(am); 3105 3106 if (inp) 3107 tp = intotcpcb(inp); 3108 #ifdef INET6 3109 else if (in6p) 3110 tp = in6totcpcb(in6p); 3111 #endif 3112 else 3113 tp = NULL; 3114 if (sc->sc_request_r_scale != 15) { 3115 tp->requested_s_scale = sc->sc_requested_s_scale; 3116 tp->request_r_scale = sc->sc_request_r_scale; 3117 tp->snd_scale = sc->sc_requested_s_scale; 3118 tp->rcv_scale = sc->sc_request_r_scale; 3119 tp->t_flags |= TF_RCVD_SCALE; 3120 } 3121 if (sc->sc_flags & SCF_TIMESTAMP) 3122 tp->t_flags |= TF_RCVD_TSTMP; 3123 tp->ts_timebase = sc->sc_timebase; 3124 3125 tp->t_template = tcp_template(tp); 3126 if (tp->t_template == 0) { 3127 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3128 so = NULL; 3129 m_freem(m); 3130 goto abort; 3131 } 3132 3133 tp->iss = sc->sc_iss; 3134 tp->irs = sc->sc_irs; 3135 tcp_sendseqinit(tp); 3136 tcp_rcvseqinit(tp); 3137 tp->t_state = TCPS_SYN_RECEIVED; 3138 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 3139 tcpstat.tcps_accepts++; 3140 3141 /* Initialize tp->t_ourmss before we deal with the peer's! */ 3142 tp->t_ourmss = sc->sc_ourmaxseg; 3143 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 3144 3145 /* 3146 * Initialize the initial congestion window. If we 3147 * had to retransmit the SYN,ACK, we must initialize cwnd 3148 * to 1 segment (i.e. the Loss Window). 3149 */ 3150 if (sc->sc_rxtshift) 3151 tp->snd_cwnd = tp->t_peermss; 3152 else 3153 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss); 3154 3155 tcp_rmx_rtt(tp); 3156 tp->snd_wl1 = sc->sc_irs; 3157 tp->rcv_up = sc->sc_irs + 1; 3158 3159 /* 3160 * This is what whould have happened in tcp_ouput() when 3161 * the SYN,ACK was sent. 3162 */ 3163 tp->snd_up = tp->snd_una; 3164 tp->snd_max = tp->snd_nxt = tp->iss+1; 3165 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3166 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3167 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3168 tp->last_ack_sent = tp->rcv_nxt; 3169 3170 tcpstat.tcps_sc_completed++; 3171 SYN_CACHE_PUT(sc); 3172 return (so); 3173 3174 resetandabort: 3175 (void) tcp_respond(NULL, m, m, th, 3176 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); 3177 abort: 3178 if (so != NULL) 3179 (void) soabort(so); 3180 SYN_CACHE_PUT(sc); 3181 tcpstat.tcps_sc_aborted++; 3182 return ((struct socket *)(-1)); 3183 } 3184 3185 /* 3186 * This function is called when we get a RST for a 3187 * non-existent connection, so that we can see if the 3188 * connection is in the syn cache. If it is, zap it. 3189 */ 3190 3191 void 3192 syn_cache_reset(src, dst, th) 3193 struct sockaddr *src; 3194 struct sockaddr *dst; 3195 struct tcphdr *th; 3196 { 3197 struct syn_cache *sc; 3198 struct syn_cache_head *scp; 3199 int s = splsoftnet(); 3200 3201 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3202 splx(s); 3203 return; 3204 } 3205 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3206 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3207 splx(s); 3208 return; 3209 } 3210 SYN_CACHE_RM(sc); 3211 splx(s); 3212 tcpstat.tcps_sc_reset++; 3213 SYN_CACHE_PUT(sc); 3214 } 3215 3216 void 3217 syn_cache_unreach(src, dst, th) 3218 struct sockaddr *src; 3219 struct sockaddr *dst; 3220 struct tcphdr *th; 3221 { 3222 struct syn_cache *sc; 3223 struct syn_cache_head *scp; 3224 int s; 3225 3226 s = splsoftnet(); 3227 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3228 splx(s); 3229 return; 3230 } 3231 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3232 if (ntohl (th->th_seq) != sc->sc_iss) { 3233 splx(s); 3234 return; 3235 } 3236 3237 /* 3238 * If we've rertransmitted 3 times and this is our second error, 3239 * we remove the entry. Otherwise, we allow it to continue on. 3240 * This prevents us from incorrectly nuking an entry during a 3241 * spurious network outage. 3242 * 3243 * See tcp_notify(). 3244 */ 3245 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3246 sc->sc_flags |= SCF_UNREACH; 3247 splx(s); 3248 return; 3249 } 3250 3251 SYN_CACHE_RM(sc); 3252 splx(s); 3253 tcpstat.tcps_sc_unreach++; 3254 SYN_CACHE_PUT(sc); 3255 } 3256 3257 /* 3258 * Given a LISTEN socket and an inbound SYN request, add 3259 * this to the syn cache, and send back a segment: 3260 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3261 * to the source. 3262 * 3263 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3264 * Doing so would require that we hold onto the data and deliver it 3265 * to the application. However, if we are the target of a SYN-flood 3266 * DoS attack, an attacker could send data which would eventually 3267 * consume all available buffer space if it were ACKed. By not ACKing 3268 * the data, we avoid this DoS scenario. 3269 */ 3270 3271 int 3272 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3273 struct sockaddr *src; 3274 struct sockaddr *dst; 3275 struct tcphdr *th; 3276 unsigned int hlen; 3277 struct socket *so; 3278 struct mbuf *m; 3279 u_char *optp; 3280 int optlen; 3281 struct tcp_opt_info *oi; 3282 { 3283 struct tcpcb tb, *tp; 3284 long win; 3285 struct syn_cache *sc; 3286 struct syn_cache_head *scp; 3287 struct mbuf *ipopts; 3288 3289 tp = sototcpcb(so); 3290 3291 /* 3292 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3293 * 3294 * Note this check is performed in tcp_input() very early on. 3295 */ 3296 3297 /* 3298 * Initialize some local state. 3299 */ 3300 win = sbspace(&so->so_rcv); 3301 if (win > TCP_MAXWIN) 3302 win = TCP_MAXWIN; 3303 3304 switch (src->sa_family) { 3305 #ifdef INET 3306 case AF_INET: 3307 /* 3308 * Remember the IP options, if any. 3309 */ 3310 ipopts = ip_srcroute(); 3311 break; 3312 #endif 3313 default: 3314 ipopts = NULL; 3315 } 3316 3317 if (optp) { 3318 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3319 tcp_dooptions(&tb, optp, optlen, th, oi); 3320 } else 3321 tb.t_flags = 0; 3322 3323 /* 3324 * See if we already have an entry for this connection. 3325 * If we do, resend the SYN,ACK. We do not count this 3326 * as a retransmission (XXX though maybe we should). 3327 */ 3328 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3329 tcpstat.tcps_sc_dupesyn++; 3330 if (ipopts) { 3331 /* 3332 * If we were remembering a previous source route, 3333 * forget it and use the new one we've been given. 3334 */ 3335 if (sc->sc_ipopts) 3336 (void) m_free(sc->sc_ipopts); 3337 sc->sc_ipopts = ipopts; 3338 } 3339 sc->sc_timestamp = tb.ts_recent; 3340 if (syn_cache_respond(sc, m) == 0) { 3341 tcpstat.tcps_sndacks++; 3342 tcpstat.tcps_sndtotal++; 3343 } 3344 return (1); 3345 } 3346 3347 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3348 if (sc == NULL) { 3349 if (ipopts) 3350 (void) m_free(ipopts); 3351 return (0); 3352 } 3353 3354 /* 3355 * Fill in the cache, and put the necessary IP and TCP 3356 * options into the reply. 3357 */ 3358 callout_init(&sc->sc_timer); 3359 bzero(sc, sizeof(struct syn_cache)); 3360 bcopy(src, &sc->sc_src, src->sa_len); 3361 bcopy(dst, &sc->sc_dst, dst->sa_len); 3362 sc->sc_flags = 0; 3363 sc->sc_ipopts = ipopts; 3364 sc->sc_irs = th->th_seq; 3365 switch (src->sa_family) { 3366 #ifdef INET 3367 case AF_INET: 3368 { 3369 struct sockaddr_in *srcin = (void *) src; 3370 struct sockaddr_in *dstin = (void *) dst; 3371 3372 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, 3373 &srcin->sin_addr, dstin->sin_port, 3374 srcin->sin_port, sizeof(dstin->sin_addr), 0); 3375 break; 3376 } 3377 #endif /* INET */ 3378 #ifdef INET6 3379 case AF_INET6: 3380 { 3381 struct sockaddr_in6 *srcin6 = (void *) src; 3382 struct sockaddr_in6 *dstin6 = (void *) dst; 3383 3384 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, 3385 &srcin6->sin6_addr, dstin6->sin6_port, 3386 srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0); 3387 break; 3388 } 3389 #endif /* INET6 */ 3390 } 3391 sc->sc_peermaxseg = oi->maxseg; 3392 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 3393 m->m_pkthdr.rcvif : NULL, 3394 sc->sc_src.sa.sa_family); 3395 sc->sc_win = win; 3396 sc->sc_timebase = tcp_now; /* see tcp_newtcpcb() */ 3397 sc->sc_timestamp = tb.ts_recent; 3398 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3399 sc->sc_flags |= SCF_TIMESTAMP; 3400 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3401 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3402 sc->sc_requested_s_scale = tb.requested_s_scale; 3403 sc->sc_request_r_scale = 0; 3404 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3405 TCP_MAXWIN << sc->sc_request_r_scale < 3406 so->so_rcv.sb_hiwat) 3407 sc->sc_request_r_scale++; 3408 } else { 3409 sc->sc_requested_s_scale = 15; 3410 sc->sc_request_r_scale = 15; 3411 } 3412 sc->sc_tp = tp; 3413 if (syn_cache_respond(sc, m) == 0) { 3414 syn_cache_insert(sc, tp); 3415 tcpstat.tcps_sndacks++; 3416 tcpstat.tcps_sndtotal++; 3417 } else { 3418 SYN_CACHE_PUT(sc); 3419 tcpstat.tcps_sc_dropped++; 3420 } 3421 return (1); 3422 } 3423 3424 int 3425 syn_cache_respond(sc, m) 3426 struct syn_cache *sc; 3427 struct mbuf *m; 3428 { 3429 struct route *ro; 3430 u_int8_t *optp; 3431 int optlen, error; 3432 u_int16_t tlen; 3433 struct ip *ip = NULL; 3434 #ifdef INET6 3435 struct ip6_hdr *ip6 = NULL; 3436 #endif 3437 struct tcphdr *th; 3438 u_int hlen; 3439 3440 switch (sc->sc_src.sa.sa_family) { 3441 case AF_INET: 3442 hlen = sizeof(struct ip); 3443 ro = &sc->sc_route4; 3444 break; 3445 #ifdef INET6 3446 case AF_INET6: 3447 hlen = sizeof(struct ip6_hdr); 3448 ro = (struct route *)&sc->sc_route6; 3449 break; 3450 #endif 3451 default: 3452 if (m) 3453 m_freem(m); 3454 return EAFNOSUPPORT; 3455 } 3456 3457 /* Compute the size of the TCP options. */ 3458 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3459 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3460 3461 tlen = hlen + sizeof(struct tcphdr) + optlen; 3462 3463 /* 3464 * Create the IP+TCP header from scratch. 3465 */ 3466 if (m) 3467 m_freem(m); 3468 #ifdef DIAGNOSTIC 3469 if (max_linkhdr + tlen > MCLBYTES) 3470 return (ENOBUFS); 3471 #endif 3472 MGETHDR(m, M_DONTWAIT, MT_DATA); 3473 if (m && tlen > MHLEN) { 3474 MCLGET(m, M_DONTWAIT); 3475 if ((m->m_flags & M_EXT) == 0) { 3476 m_freem(m); 3477 m = NULL; 3478 } 3479 } 3480 if (m == NULL) 3481 return (ENOBUFS); 3482 3483 /* Fixup the mbuf. */ 3484 m->m_data += max_linkhdr; 3485 m->m_len = m->m_pkthdr.len = tlen; 3486 #ifdef IPSEC 3487 if (sc->sc_tp) { 3488 struct tcpcb *tp; 3489 struct socket *so; 3490 3491 tp = sc->sc_tp; 3492 if (tp->t_inpcb) 3493 so = tp->t_inpcb->inp_socket; 3494 #ifdef INET6 3495 else if (tp->t_in6pcb) 3496 so = tp->t_in6pcb->in6p_socket; 3497 #endif 3498 else 3499 so = NULL; 3500 /* use IPsec policy on listening socket, on SYN ACK */ 3501 if (ipsec_setsocket(m, so) != 0) { 3502 m_freem(m); 3503 return ENOBUFS; 3504 } 3505 } 3506 #endif 3507 m->m_pkthdr.rcvif = NULL; 3508 memset(mtod(m, u_char *), 0, tlen); 3509 3510 switch (sc->sc_src.sa.sa_family) { 3511 case AF_INET: 3512 ip = mtod(m, struct ip *); 3513 ip->ip_dst = sc->sc_src.sin.sin_addr; 3514 ip->ip_src = sc->sc_dst.sin.sin_addr; 3515 ip->ip_p = IPPROTO_TCP; 3516 th = (struct tcphdr *)(ip + 1); 3517 th->th_dport = sc->sc_src.sin.sin_port; 3518 th->th_sport = sc->sc_dst.sin.sin_port; 3519 break; 3520 #ifdef INET6 3521 case AF_INET6: 3522 ip6 = mtod(m, struct ip6_hdr *); 3523 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3524 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3525 ip6->ip6_nxt = IPPROTO_TCP; 3526 /* ip6_plen will be updated in ip6_output() */ 3527 th = (struct tcphdr *)(ip6 + 1); 3528 th->th_dport = sc->sc_src.sin6.sin6_port; 3529 th->th_sport = sc->sc_dst.sin6.sin6_port; 3530 break; 3531 #endif 3532 default: 3533 th = NULL; 3534 } 3535 3536 th->th_seq = htonl(sc->sc_iss); 3537 th->th_ack = htonl(sc->sc_irs + 1); 3538 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3539 th->th_flags = TH_SYN|TH_ACK; 3540 th->th_win = htons(sc->sc_win); 3541 /* th_sum already 0 */ 3542 /* th_urp already 0 */ 3543 3544 /* Tack on the TCP options. */ 3545 optp = (u_int8_t *)(th + 1); 3546 *optp++ = TCPOPT_MAXSEG; 3547 *optp++ = 4; 3548 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3549 *optp++ = sc->sc_ourmaxseg & 0xff; 3550 3551 if (sc->sc_request_r_scale != 15) { 3552 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3553 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3554 sc->sc_request_r_scale); 3555 optp += 4; 3556 } 3557 3558 if (sc->sc_flags & SCF_TIMESTAMP) { 3559 u_int32_t *lp = (u_int32_t *)(optp); 3560 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3561 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3562 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 3563 *lp = htonl(sc->sc_timestamp); 3564 optp += TCPOLEN_TSTAMP_APPA; 3565 } 3566 3567 /* Compute the packet's checksum. */ 3568 switch (sc->sc_src.sa.sa_family) { 3569 case AF_INET: 3570 ip->ip_len = htons(tlen - hlen); 3571 th->th_sum = 0; 3572 th->th_sum = in_cksum(m, tlen); 3573 break; 3574 #ifdef INET6 3575 case AF_INET6: 3576 ip6->ip6_plen = htons(tlen - hlen); 3577 th->th_sum = 0; 3578 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3579 break; 3580 #endif 3581 } 3582 3583 /* 3584 * Fill in some straggling IP bits. Note the stack expects 3585 * ip_len to be in host order, for convenience. 3586 */ 3587 switch (sc->sc_src.sa.sa_family) { 3588 #ifdef INET 3589 case AF_INET: 3590 ip->ip_len = tlen; 3591 ip->ip_ttl = ip_defttl; 3592 /* XXX tos? */ 3593 break; 3594 #endif 3595 #ifdef INET6 3596 case AF_INET6: 3597 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3598 ip6->ip6_vfc |= IPV6_VERSION; 3599 ip6->ip6_plen = htons(tlen - hlen); 3600 /* ip6_hlim will be initialized afterwards */ 3601 /* XXX flowlabel? */ 3602 break; 3603 #endif 3604 } 3605 3606 switch (sc->sc_src.sa.sa_family) { 3607 #ifdef INET 3608 case AF_INET: 3609 error = ip_output(m, sc->sc_ipopts, ro, 3610 (ip_mtudisc ? IP_MTUDISC : 0), 3611 NULL); 3612 break; 3613 #endif 3614 #ifdef INET6 3615 case AF_INET6: 3616 ip6->ip6_hlim = in6_selecthlim(NULL, 3617 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3618 3619 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3620 0, NULL, NULL); 3621 break; 3622 #endif 3623 default: 3624 error = EAFNOSUPPORT; 3625 break; 3626 } 3627 return (error); 3628 } 3629