1 /* $NetBSD: tcp_input.c,v 1.101 1999/12/22 04:03:02 itojun Exp $ */ 2 3 /* 4 %%% portions-copyright-nrl-95 5 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 7 Reserved. All rights under this copyright have been assigned to the US 8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 10 software. 11 You should have received a copy of the license with this software. If you 12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 13 14 */ 15 16 /* 17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 18 * All rights reserved. 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 1. Redistributions of source code must retain the above copyright 24 * notice, this list of conditions and the following disclaimer. 25 * 2. Redistributions in binary form must reproduce the above copyright 26 * notice, this list of conditions and the following disclaimer in the 27 * documentation and/or other materials provided with the distribution. 28 * 3. Neither the name of the project nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 45 /*- 46 * Copyright (c) 1997, 1998, 1999 The NetBSD Foundation, Inc. 47 * All rights reserved. 48 * 49 * This code is derived from software contributed to The NetBSD Foundation 50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 51 * Facility, NASA Ames Research Center. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 3. All advertising materials mentioning features or use of this software 62 * must display the following acknowledgement: 63 * This product includes software developed by the NetBSD 64 * Foundation, Inc. and its contributors. 65 * 4. Neither the name of The NetBSD Foundation nor the names of its 66 * contributors may be used to endorse or promote products derived 67 * from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS 70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 * POSSIBILITY OF SUCH DAMAGE. 80 */ 81 82 /* 83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 84 * The Regents of the University of California. All rights reserved. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. All advertising materials mentioning features or use of this software 95 * must display the following acknowledgement: 96 * This product includes software developed by the University of 97 * California, Berkeley and its contributors. 98 * 4. Neither the name of the University nor the names of its contributors 99 * may be used to endorse or promote products derived from this software 100 * without specific prior written permission. 
101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 115 */ 116 117 /* 118 * TODO list for SYN cache stuff: 119 * 120 * Find room for a "state" field, which is needed to keep a 121 * compressed state for TIME_WAIT TCBs. It's been noted already 122 * that this is fairly important for very high-volume web and 123 * mail servers, which use a large number of short-lived 124 * connections. 
125 */ 126 127 #include "opt_inet.h" 128 #include "opt_ipsec.h" 129 130 #include <sys/param.h> 131 #include <sys/systm.h> 132 #include <sys/malloc.h> 133 #include <sys/mbuf.h> 134 #include <sys/protosw.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/errno.h> 138 #include <sys/syslog.h> 139 #include <sys/pool.h> 140 #include <sys/domain.h> 141 142 #include <net/if.h> 143 #include <net/route.h> 144 #include <net/if_types.h> 145 146 #include <netinet/in.h> 147 #include <netinet/in_systm.h> 148 #include <netinet/ip.h> 149 #include <netinet/in_pcb.h> 150 #include <netinet/ip_var.h> 151 152 #ifdef INET6 153 #ifndef INET 154 #include <netinet/in.h> 155 #endif 156 #include <netinet/ip6.h> 157 #include <netinet6/in6_pcb.h> 158 #include <netinet6/ip6_var.h> 159 #include <netinet6/in6_var.h> 160 #include <netinet/icmp6.h> 161 #include <netinet6/nd6.h> 162 #endif 163 164 #ifdef PULLDOWN_TEST 165 #ifndef INET6 166 /* always need ip6.h for IP6_EXTHDR_GET */ 167 #include <netinet/ip6.h> 168 #endif 169 #endif 170 171 #include <netinet/tcp.h> 172 #include <netinet/tcp_fsm.h> 173 #include <netinet/tcp_seq.h> 174 #include <netinet/tcp_timer.h> 175 #include <netinet/tcp_var.h> 176 #include <netinet/tcpip.h> 177 #include <netinet/tcp_debug.h> 178 179 #include <machine/stdarg.h> 180 181 #ifdef IPSEC 182 #include <netinet6/ipsec.h> 183 #include <netkey/key.h> 184 #include <netkey/key_debug.h> 185 #endif /*IPSEC*/ 186 #ifdef INET6 187 #include "faith.h" 188 #endif 189 190 int tcprexmtthresh = 3; 191 int tcp_log_refused; 192 193 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 194 195 /* for modulo comparisons of timestamps */ 196 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 197 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 198 199 /* 200 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 
201 */ 202 #ifdef INET6 203 #define ND6_HINT(tp) \ 204 do { \ 205 if (tp && tp->t_in6pcb && tp->t_family == AF_INET6 \ 206 && tp->t_in6pcb->in6p_route.ro_rt) { \ 207 nd6_nud_hint(tp->t_in6pcb->in6p_route.ro_rt, NULL); \ 208 } \ 209 } while (0) 210 #else 211 #define ND6_HINT(tp) 212 #endif 213 214 /* 215 * Macro to compute ACK transmission behavior. Delay the ACK unless 216 * we have already delayed an ACK (must send an ACK every two segments). 217 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 218 * option is enabled. 219 */ 220 #define TCP_SETUP_ACK(tp, th) \ 221 do { \ 222 if ((tp)->t_flags & TF_DELACK || \ 223 (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \ 224 tp->t_flags |= TF_ACKNOW; \ 225 else \ 226 TCP_SET_DELACK(tp); \ 227 } while (0) 228 229 int 230 tcp_reass(tp, th, m, tlen) 231 register struct tcpcb *tp; 232 register struct tcphdr *th; 233 struct mbuf *m; 234 int *tlen; 235 { 236 register struct ipqent *p, *q, *nq, *tiqe = NULL; 237 struct socket *so = NULL; 238 int pkt_flags; 239 tcp_seq pkt_seq; 240 unsigned pkt_len; 241 u_long rcvpartdupbyte = 0; 242 u_long rcvoobyte; 243 244 if (tp->t_inpcb) 245 so = tp->t_inpcb->inp_socket; 246 #ifdef INET6 247 else if (tp->t_in6pcb) 248 so = tp->t_in6pcb->in6p_socket; 249 #endif 250 251 TCP_REASS_LOCK_CHECK(tp); 252 253 /* 254 * Call with th==0 after become established to 255 * force pre-ESTABLISHED data up to user socket. 256 */ 257 if (th == 0) 258 goto present; 259 260 rcvoobyte = *tlen; 261 /* 262 * Copy these to local variables because the tcpiphdr 263 * gets munged while we are collapsing mbufs. 264 */ 265 pkt_seq = th->th_seq; 266 pkt_len = *tlen; 267 pkt_flags = th->th_flags; 268 /* 269 * Find a segment which begins after this one does. 270 */ 271 for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) { 272 nq = q->ipqe_q.le_next; 273 /* 274 * If the received segment is just right after this 275 * fragment, merge the two together and then check 276 * for further overlaps. 
277 */ 278 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 279 #ifdef TCPREASS_DEBUG 280 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 281 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 282 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 283 #endif 284 pkt_len += q->ipqe_len; 285 pkt_flags |= q->ipqe_flags; 286 pkt_seq = q->ipqe_seq; 287 m_cat(q->ipqe_m, m); 288 m = q->ipqe_m; 289 goto free_ipqe; 290 } 291 /* 292 * If the received segment is completely past this 293 * fragment, we need to go the next fragment. 294 */ 295 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 296 p = q; 297 continue; 298 } 299 /* 300 * If the fragment is past the received segment, 301 * it (or any following) can't be concatenated. 302 */ 303 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) 304 break; 305 /* 306 * We've received all the data in this segment before. 307 * mark it as a duplicate and return. 308 */ 309 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 310 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 311 tcpstat.tcps_rcvduppack++; 312 tcpstat.tcps_rcvdupbyte += pkt_len; 313 m_freem(m); 314 if (tiqe != NULL) 315 pool_put(&ipqent_pool, tiqe); 316 return (0); 317 } 318 /* 319 * Received segment completely overlaps this fragment 320 * so we drop the fragment (this keeps the temporal 321 * ordering of segments correct). 322 */ 323 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 324 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 325 rcvpartdupbyte += q->ipqe_len; 326 m_freem(q->ipqe_m); 327 goto free_ipqe; 328 } 329 /* 330 * RX'ed segment extends past the end of the 331 * fragment. Drop the overlapping bytes. Then 332 * merge the fragment and segment then treat as 333 * a longer received packet. 
334 */ 335 if (SEQ_LT(q->ipqe_seq, pkt_seq) 336 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 337 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 338 #ifdef TCPREASS_DEBUG 339 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 340 tp, overlap, 341 pkt_seq, pkt_seq + pkt_len, pkt_len); 342 #endif 343 m_adj(m, overlap); 344 rcvpartdupbyte += overlap; 345 m_cat(q->ipqe_m, m); 346 m = q->ipqe_m; 347 pkt_seq = q->ipqe_seq; 348 pkt_len += q->ipqe_len - overlap; 349 rcvoobyte -= overlap; 350 goto free_ipqe; 351 } 352 /* 353 * RX'ed segment extends past the front of the 354 * fragment. Drop the overlapping bytes on the 355 * received packet. The packet will then be 356 * contatentated with this fragment a bit later. 357 */ 358 if (SEQ_GT(q->ipqe_seq, pkt_seq) 359 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 360 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 361 #ifdef TCPREASS_DEBUG 362 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 363 tp, overlap, 364 pkt_seq, pkt_seq + pkt_len, pkt_len); 365 #endif 366 m_adj(m, -overlap); 367 pkt_len -= overlap; 368 rcvpartdupbyte += overlap; 369 rcvoobyte -= overlap; 370 } 371 /* 372 * If the received segment immediates precedes this 373 * fragment then tack the fragment onto this segment 374 * and reinsert the data. 375 */ 376 if (q->ipqe_seq == pkt_seq + pkt_len) { 377 #ifdef TCPREASS_DEBUG 378 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 379 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 380 pkt_seq, pkt_seq + pkt_len, pkt_len); 381 #endif 382 pkt_len += q->ipqe_len; 383 pkt_flags |= q->ipqe_flags; 384 m_cat(m, q->ipqe_m); 385 LIST_REMOVE(q, ipqe_q); 386 LIST_REMOVE(q, ipqe_timeq); 387 if (tiqe == NULL) { 388 tiqe = q; 389 } else { 390 pool_put(&ipqent_pool, q); 391 } 392 break; 393 } 394 /* 395 * If the fragment is before the segment, remember it. 396 * When this loop is terminated, p will contain the 397 * pointer to fragment that is right before the received 398 * segment. 
399 */ 400 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 401 p = q; 402 403 continue; 404 405 /* 406 * This is a common operation. It also will allow 407 * to save doing a malloc/free in most instances. 408 */ 409 free_ipqe: 410 LIST_REMOVE(q, ipqe_q); 411 LIST_REMOVE(q, ipqe_timeq); 412 if (tiqe == NULL) { 413 tiqe = q; 414 } else { 415 pool_put(&ipqent_pool, q); 416 } 417 } 418 419 /* 420 * Allocate a new queue entry since the received segment did not 421 * collapse onto any other out-of-order block; thus we are allocating 422 * a new block. If it had collapsed, tiqe would not be NULL and 423 * we would be reusing it. 424 * XXX If we can't, just drop the packet. XXX 425 */ 426 if (tiqe == NULL) { 427 tiqe = pool_get(&ipqent_pool, PR_NOWAIT); 428 if (tiqe == NULL) { 429 tcpstat.tcps_rcvmemdrop++; 430 m_freem(m); 431 return (0); 432 } 433 } 434 435 /* 436 * Update the counters. 437 */ 438 tcpstat.tcps_rcvoopack++; 439 tcpstat.tcps_rcvoobyte += rcvoobyte; 440 if (rcvpartdupbyte) { 441 tcpstat.tcps_rcvpartduppack++; 442 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte; 443 } 444 445 /* 446 * Insert the new fragment queue entry into both queues. 447 */ 448 tiqe->ipqe_m = m; 449 tiqe->ipqe_seq = pkt_seq; 450 tiqe->ipqe_len = pkt_len; 451 tiqe->ipqe_flags = pkt_flags; 452 if (p == NULL) { 453 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 454 #ifdef TCPREASS_DEBUG 455 if (tiqe->ipqe_seq != tp->rcv_nxt) 456 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 457 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 458 #endif 459 } else { 460 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 461 #ifdef TCPREASS_DEBUG 462 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 463 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 464 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 465 #endif 466 } 467 468 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 469 470 present: 471 /* 472 * Present data to user, advancing rcv_nxt through 473 * completed sequence space. 
474 */ 475 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 476 return (0); 477 q = tp->segq.lh_first; 478 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 479 return (0); 480 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 481 return (0); 482 483 tp->rcv_nxt += q->ipqe_len; 484 pkt_flags = q->ipqe_flags & TH_FIN; 485 ND6_HINT(tp); 486 487 LIST_REMOVE(q, ipqe_q); 488 LIST_REMOVE(q, ipqe_timeq); 489 if (so->so_state & SS_CANTRCVMORE) 490 m_freem(q->ipqe_m); 491 else 492 sbappend(&so->so_rcv, q->ipqe_m); 493 pool_put(&ipqent_pool, q); 494 sorwakeup(so); 495 return (pkt_flags); 496 } 497 498 #if defined(INET6) && !defined(TCP6) 499 int 500 tcp6_input(mp, offp, proto) 501 struct mbuf **mp; 502 int *offp, proto; 503 { 504 struct mbuf *m = *mp; 505 506 #if defined(NFAITH) && 0 < NFAITH 507 if (m->m_pkthdr.rcvif) { 508 if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) { 509 /* XXX send icmp6 host/port unreach? */ 510 m_freem(m); 511 return IPPROTO_DONE; 512 } 513 } 514 #endif 515 516 /* 517 * draft-itojun-ipv6-tcp-to-anycast 518 * better place to put this in? 519 */ 520 if (m->m_flags & M_ANYCAST6) { 521 struct ip6_hdr *ip6; 522 if (m->m_len < sizeof(struct ip6_hdr)) { 523 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 524 tcpstat.tcps_rcvshort++; 525 return IPPROTO_DONE; 526 } 527 } 528 ip6 = mtod(m, struct ip6_hdr *); 529 icmp6_error(m, ICMP6_DST_UNREACH, 530 ICMP6_DST_UNREACH_ADDR, 531 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 532 return IPPROTO_DONE; 533 } 534 535 tcp_input(m, *offp, proto); 536 return IPPROTO_DONE; 537 } 538 #endif 539 540 /* 541 * TCP input routine, follows pages 65-76 of the 542 * protocol specification dated September, 1981 very closely. 543 */ 544 void 545 #if __STDC__ 546 tcp_input(struct mbuf *m, ...) 
547 #else 548 tcp_input(m, va_alist) 549 register struct mbuf *m; 550 #endif 551 { 552 int proto; 553 register struct tcphdr *th; 554 struct ip *ip; 555 register struct inpcb *inp; 556 #ifdef INET6 557 struct ip6_hdr *ip6; 558 register struct in6pcb *in6p; 559 #endif 560 caddr_t optp = NULL; 561 int optlen = 0; 562 int len, tlen, toff, hdroptlen = 0; 563 register struct tcpcb *tp = 0; 564 register int tiflags; 565 struct socket *so = NULL; 566 int todrop, acked, ourfinisacked, needoutput = 0; 567 short ostate = 0; 568 int iss = 0; 569 u_long tiwin; 570 struct tcp_opt_info opti; 571 int off, iphlen; 572 va_list ap; 573 int af; /* af on the wire */ 574 struct mbuf *tcp_saveti = NULL; 575 576 va_start(ap, m); 577 toff = va_arg(ap, int); 578 proto = va_arg(ap, int); 579 va_end(ap); 580 581 tcpstat.tcps_rcvtotal++; 582 583 bzero(&opti, sizeof(opti)); 584 opti.ts_present = 0; 585 opti.maxseg = 0; 586 587 /* 588 * Get IP and TCP header together in first mbuf. 589 * Note: IP leaves IP header in first mbuf. 590 */ 591 ip = mtod(m, struct ip *); 592 #ifdef INET6 593 ip6 = NULL; 594 #endif 595 switch (ip->ip_v) { 596 case 4: 597 af = AF_INET; 598 iphlen = sizeof(struct ip); 599 #ifndef PULLDOWN_TEST 600 /* would like to get rid of this... */ 601 if (toff > sizeof (struct ip)) { 602 ip_stripoptions(m, (struct mbuf *)0); 603 toff = sizeof(struct ip); 604 } 605 if (m->m_len < toff + sizeof (struct tcphdr)) { 606 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) { 607 tcpstat.tcps_rcvshort++; 608 return; 609 } 610 } 611 ip = mtod(m, struct ip *); 612 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 613 #else 614 ip = mtod(m, struct ip *); 615 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 616 sizeof(struct tcphdr)); 617 if (th == NULL) { 618 tcpstat.tcps_rcvshort++; 619 return; 620 } 621 #endif 622 623 /* 624 * Checksum extended TCP header and data. 
625 */ 626 len = ip->ip_len; 627 tlen = len - toff; 628 #ifndef PULLDOWN_TEST 629 { 630 struct ipovly *ipov; 631 ipov = (struct ipovly *)ip; 632 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 633 ipov->ih_len = htons(tlen); 634 } 635 if (in_cksum(m, len) != 0) { 636 tcpstat.tcps_rcvbadsum++; 637 goto drop; 638 } 639 #else 640 if (in4_cksum(m, IPPROTO_TCP, toff, tlen) != 0) { 641 tcpstat.tcps_rcvbadsum++; 642 goto drop; 643 } 644 #endif 645 break; 646 #ifdef INET6 647 case 6: 648 ip = NULL; 649 iphlen = sizeof(struct ip6_hdr); 650 af = AF_INET6; 651 #ifndef PULLDOWN_TEST 652 if (m->m_len < toff + sizeof(struct tcphdr)) { 653 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/ 654 if (m == NULL) { 655 tcpstat.tcps_rcvshort++; 656 return; 657 } 658 } 659 ip6 = mtod(m, struct ip6_hdr *); 660 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 661 #else 662 ip6 = mtod(m, struct ip6_hdr *); 663 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 664 sizeof(struct tcphdr)); 665 if (th == NULL) { 666 tcpstat.tcps_rcvshort++; 667 return; 668 } 669 #endif 670 671 /* Be proactive about malicious use of IPv4 mapped address */ 672 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 673 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 674 /* XXX stat */ 675 goto drop; 676 } 677 678 /* 679 * Checksum extended TCP header and data. 680 */ 681 len = m->m_pkthdr.len; 682 tlen = len - toff; 683 if (in6_cksum(m, IPPROTO_TCP, toff, tlen)) { 684 tcpstat.tcps_rcvbadsum++; 685 goto drop; 686 } 687 break; 688 #endif 689 default: 690 m_freem(m); 691 return; 692 } 693 694 /* 695 * Check that TCP offset makes sense, 696 * pull out TCP options and adjust length. XXX 697 */ 698 off = th->th_off << 2; 699 if (off < sizeof (struct tcphdr) || off > tlen) { 700 tcpstat.tcps_rcvbadoff++; 701 goto drop; 702 } 703 tlen -= off; 704 705 /* 706 * tcp_input() has been modified to use tlen to mean the TCP data 707 * length throughout the function. 
Other functions can use 708 * m->m_pkthdr.len as the basis for calculating the TCP data length. 709 * rja 710 */ 711 712 if (off > sizeof (struct tcphdr)) { 713 #ifndef PULLDOWN_TEST 714 if (m->m_len < toff + off) { 715 if ((m = m_pullup(m, toff + off)) == 0) { 716 tcpstat.tcps_rcvshort++; 717 return; 718 } 719 switch (af) { 720 case AF_INET: 721 ip = mtod(m, struct ip *); 722 break; 723 #ifdef INET6 724 case AF_INET6: 725 ip6 = mtod(m, struct ip6_hdr *); 726 break; 727 #endif 728 } 729 th = (struct tcphdr *)(mtod(m, caddr_t) + toff); 730 } 731 #else 732 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 733 if (th == NULL) { 734 tcpstat.tcps_rcvshort++; 735 return; 736 } 737 /* 738 * NOTE: ip/ip6 will not be affected by m_pulldown() 739 * (as they're before toff) and we don't need to update those. 740 */ 741 #endif 742 optlen = off - sizeof (struct tcphdr); 743 optp = ((caddr_t)th) + sizeof(struct tcphdr); 744 /* 745 * Do quick retrieval of timestamp options ("options 746 * prediction?"). If timestamp is the only option and it's 747 * formatted as recommended in RFC 1323 appendix A, we 748 * quickly get the values now and not bother calling 749 * tcp_dooptions(), etc. 750 */ 751 if ((optlen == TCPOLEN_TSTAMP_APPA || 752 (optlen > TCPOLEN_TSTAMP_APPA && 753 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 754 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 755 (th->th_flags & TH_SYN) == 0) { 756 opti.ts_present = 1; 757 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 758 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 759 optp = NULL; /* we've parsed the options */ 760 } 761 } 762 tiflags = th->th_flags; 763 764 /* 765 * Convert TCP protocol specific fields to host format. 766 */ 767 NTOHL(th->th_seq); 768 NTOHL(th->th_ack); 769 NTOHS(th->th_win); 770 NTOHS(th->th_urp); 771 772 /* 773 * Locate pcb for segment. 
774 */ 775 findpcb: 776 inp = NULL; 777 #ifdef INET6 778 in6p = NULL; 779 #endif 780 switch (af) { 781 case AF_INET: 782 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 783 ip->ip_dst, th->th_dport); 784 if (inp == 0) { 785 ++tcpstat.tcps_pcbhashmiss; 786 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 787 } 788 #if defined(INET6) && !defined(TCP6) 789 if (inp == 0) { 790 struct in6_addr s, d; 791 792 /* mapped addr case */ 793 bzero(&s, sizeof(s)); 794 s.s6_addr16[5] = htons(0xffff); 795 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src)); 796 bzero(&d, sizeof(d)); 797 d.s6_addr16[5] = htons(0xffff); 798 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst)); 799 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport, 800 &d, th->th_dport, 0); 801 if (in6p == 0) { 802 ++tcpstat.tcps_pcbhashmiss; 803 in6p = in6_pcblookup_bind(&tcb6, &d, 804 th->th_dport, 0); 805 } 806 } 807 #endif 808 #ifndef INET6 809 if (inp == 0) 810 #else 811 if (inp == 0 && in6p == 0) 812 #endif 813 { 814 ++tcpstat.tcps_noport; 815 if (tcp_log_refused && (tiflags & TH_SYN)) { 816 #ifndef INET6 817 char src[4*sizeof "123"]; 818 char dst[4*sizeof "123"]; 819 #else 820 char src[INET6_ADDRSTRLEN]; 821 char dst[INET6_ADDRSTRLEN]; 822 #endif 823 if (ip) { 824 strcpy(src, inet_ntoa(ip->ip_src)); 825 strcpy(dst, inet_ntoa(ip->ip_dst)); 826 } 827 #ifdef INET6 828 else if (ip6) { 829 strcpy(src, ip6_sprintf(&ip6->ip6_src)); 830 strcpy(dst, ip6_sprintf(&ip6->ip6_dst)); 831 } 832 #endif 833 else { 834 strcpy(src, "(unknown)"); 835 strcpy(dst, "(unknown)"); 836 } 837 log(LOG_INFO, 838 "Connection attempt to TCP %s:%d from %s:%d\n", 839 dst, ntohs(th->th_dport), 840 src, ntohs(th->th_sport)); 841 } 842 goto dropwithreset; 843 } 844 #ifdef IPSEC 845 if (inp && ipsec4_in_reject(m, inp)) { 846 ipsecstat.in_polvio++; 847 goto drop; 848 } 849 #ifdef INET6 850 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) { 851 ipsecstat.in_polvio++; 852 goto drop; 853 } 854 #endif 
855 #endif /*IPSEC*/ 856 break; 857 #if defined(INET6) && !defined(TCP6) 858 case AF_INET6: 859 { 860 int faith; 861 862 #if defined(NFAITH) && NFAITH > 0 863 if (m->m_pkthdr.rcvif 864 && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { 865 faith = 1; 866 } else 867 faith = 0; 868 #else 869 faith = 0; 870 #endif 871 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport, 872 &ip6->ip6_dst, th->th_dport, faith); 873 if (in6p == NULL) { 874 ++tcpstat.tcps_pcbhashmiss; 875 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst, 876 th->th_dport, faith); 877 } 878 if (in6p == NULL) { 879 ++tcpstat.tcps_noport; 880 goto dropwithreset; 881 } 882 #ifdef IPSEC 883 if (ipsec6_in_reject(m, in6p)) { 884 ipsec6stat.in_polvio++; 885 goto drop; 886 } 887 #endif /*IPSEC*/ 888 break; 889 } 890 #endif 891 } 892 893 /* 894 * If the state is CLOSED (i.e., TCB does not exist) then 895 * all data in the incoming segment is discarded. 896 * If the TCB exists but is in CLOSED state, it is embryonic, 897 * but should either do a listen or a connect soon. 898 */ 899 tp = NULL; 900 so = NULL; 901 if (inp) { 902 tp = intotcpcb(inp); 903 so = inp->inp_socket; 904 } 905 #ifdef INET6 906 else if (in6p) { 907 tp = in6totcpcb(in6p); 908 so = in6p->in6p_socket; 909 } 910 #endif 911 if (tp == 0) { 912 goto dropwithreset; 913 } 914 if (tp->t_state == TCPS_CLOSED) 915 goto drop; 916 917 /* Unscale the window into a 32-bit value. 
*/ 918 if ((tiflags & TH_SYN) == 0) 919 tiwin = th->th_win << tp->snd_scale; 920 else 921 tiwin = th->th_win; 922 923 #ifdef INET6 924 /* save packet options if user wanted */ 925 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 926 if (in6p->in6p_options) { 927 m_freem(in6p->in6p_options); 928 in6p->in6p_options = 0; 929 } 930 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 931 } 932 #endif 933 934 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 935 union syn_cache_sa src; 936 union syn_cache_sa dst; 937 938 bzero(&src, sizeof(src)); 939 bzero(&dst, sizeof(dst)); 940 switch (af) { 941 case AF_INET: 942 src.sin.sin_len = sizeof(struct sockaddr_in); 943 src.sin.sin_family = AF_INET; 944 src.sin.sin_addr = ip->ip_src; 945 src.sin.sin_port = th->th_sport; 946 947 dst.sin.sin_len = sizeof(struct sockaddr_in); 948 dst.sin.sin_family = AF_INET; 949 dst.sin.sin_addr = ip->ip_dst; 950 dst.sin.sin_port = th->th_dport; 951 break; 952 #ifdef INET6 953 case AF_INET6: 954 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 955 src.sin6.sin6_family = AF_INET6; 956 src.sin6.sin6_addr = ip6->ip6_src; 957 src.sin6.sin6_port = th->th_sport; 958 959 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 960 dst.sin6.sin6_family = AF_INET6; 961 dst.sin6.sin6_addr = ip6->ip6_dst; 962 dst.sin6.sin6_port = th->th_dport; 963 break; 964 #endif /* INET6 */ 965 default: 966 goto badsyn; /*sanity*/ 967 } 968 969 if (so->so_options & SO_DEBUG) { 970 ostate = tp->t_state; 971 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 972 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 973 m_freem(tcp_saveti); 974 tcp_saveti = NULL; 975 } else { 976 tcp_saveti->m_len += sizeof(struct tcphdr); 977 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen, 978 sizeof(struct tcphdr)); 979 } 980 if (tcp_saveti) { 981 /* 982 * need to recover version # field, which was 983 * overwritten on ip_cksum computation. 
984 */ 985 struct ip *sip; 986 sip = mtod(tcp_saveti, struct ip *); 987 switch (af) { 988 case AF_INET: 989 sip->ip_v = 4; 990 break; 991 #ifdef INET6 992 case AF_INET6: 993 sip->ip_v = 6; 994 break; 995 #endif 996 } 997 } 998 } 999 if (so->so_options & SO_ACCEPTCONN) { 1000 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1001 if (tiflags & TH_RST) { 1002 syn_cache_reset(&src.sa, &dst.sa, th); 1003 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1004 (TH_ACK|TH_SYN)) { 1005 /* 1006 * Received a SYN,ACK. This should 1007 * never happen while we are in 1008 * LISTEN. Send an RST. 1009 */ 1010 goto badsyn; 1011 } else if (tiflags & TH_ACK) { 1012 so = syn_cache_get(&src.sa, &dst.sa, 1013 th, toff, tlen, so, m); 1014 if (so == NULL) { 1015 /* 1016 * We don't have a SYN for 1017 * this ACK; send an RST. 1018 */ 1019 goto badsyn; 1020 } else if (so == 1021 (struct socket *)(-1)) { 1022 /* 1023 * We were unable to create 1024 * the connection. If the 1025 * 3-way handshake was 1026 * completed, and RST has 1027 * been sent to the peer. 1028 * Since the mbuf might be 1029 * in use for the reply, 1030 * do not free it. 1031 */ 1032 m = NULL; 1033 } else { 1034 /* 1035 * We have created a 1036 * full-blown connection. 1037 */ 1038 tp = NULL; 1039 inp = NULL; 1040 #ifdef INET6 1041 in6p = NULL; 1042 #endif 1043 switch (so->so_proto->pr_domain->dom_family) { 1044 case AF_INET: 1045 inp = sotoinpcb(so); 1046 tp = intotcpcb(inp); 1047 break; 1048 #ifdef INET6 1049 case AF_INET6: 1050 in6p = sotoin6pcb(so); 1051 tp = in6totcpcb(in6p); 1052 break; 1053 #endif 1054 } 1055 if (tp == NULL) 1056 goto badsyn; /*XXX*/ 1057 tiwin <<= tp->snd_scale; 1058 goto after_listen; 1059 } 1060 } else { 1061 /* 1062 * None of RST, SYN or ACK was set. 1063 * This is an invalid packet for a 1064 * TCB in LISTEN state. Send a RST. 1065 */ 1066 goto badsyn; 1067 } 1068 } else { 1069 /* 1070 * Received a SYN. 1071 */ 1072 1073 /* 1074 * LISTEN socket received a SYN 1075 * from itself? 
This can't possibly 1076 * be valid; drop the packet. 1077 */ 1078 if (th->th_sport == th->th_dport) { 1079 int i; 1080 1081 switch (af) { 1082 case AF_INET: 1083 i = in_hosteq(ip->ip_src, ip->ip_dst); 1084 break; 1085 #ifdef INET6 1086 case AF_INET6: 1087 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1088 break; 1089 #endif 1090 default: 1091 i = 1; 1092 } 1093 if (i) { 1094 tcpstat.tcps_badsyn++; 1095 goto drop; 1096 } 1097 } 1098 1099 /* 1100 * SYN looks ok; create compressed TCP 1101 * state for it. 1102 */ 1103 if (so->so_qlen <= so->so_qlimit && 1104 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1105 so, m, optp, optlen, &opti)) 1106 m = NULL; 1107 } 1108 goto drop; 1109 } 1110 } 1111 1112 after_listen: 1113 #ifdef DIAGNOSTIC 1114 /* 1115 * Should not happen now that all embryonic connections 1116 * are handled with compressed state. 1117 */ 1118 if (tp->t_state == TCPS_LISTEN) 1119 panic("tcp_input: TCPS_LISTEN"); 1120 #endif 1121 1122 /* 1123 * Segment received on connection. 1124 * Reset idle time and keep-alive timer. 1125 */ 1126 tp->t_idle = 0; 1127 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1128 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1129 1130 /* 1131 * Process options. 1132 */ 1133 if (optp) 1134 tcp_dooptions(tp, optp, optlen, th, &opti); 1135 1136 /* 1137 * Header prediction: check for the two common cases 1138 * of a uni-directional data xfer. If the packet has 1139 * no control flags, is in-sequence, the window didn't 1140 * change and we're not retransmitting, it's a 1141 * candidate. If the length is zero and the ack moved 1142 * forward, we're the sender side of the xfer. Just 1143 * free the data acked & wake any higher level process 1144 * that was blocked waiting for space. If the length 1145 * is non-zero and the ack didn't move, we're the 1146 * receiver side. If we're getting packets in-order 1147 * (the reassembly queue is empty), add the data to 1148 * the socket buffer and note that we need a delayed ack. 
1149 */ 1150 if (tp->t_state == TCPS_ESTABLISHED && 1151 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1152 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1153 th->th_seq == tp->rcv_nxt && 1154 tiwin && tiwin == tp->snd_wnd && 1155 tp->snd_nxt == tp->snd_max) { 1156 1157 /* 1158 * If last ACK falls within this segment's sequence numbers, 1159 * record the timestamp. 1160 */ 1161 if (opti.ts_present && 1162 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1163 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) { 1164 tp->ts_recent_age = tcp_now; 1165 tp->ts_recent = opti.ts_val; 1166 } 1167 1168 if (tlen == 0) { 1169 if (SEQ_GT(th->th_ack, tp->snd_una) && 1170 SEQ_LEQ(th->th_ack, tp->snd_max) && 1171 tp->snd_cwnd >= tp->snd_wnd && 1172 tp->t_dupacks < tcprexmtthresh) { 1173 /* 1174 * this is a pure ack for outstanding data. 1175 */ 1176 ++tcpstat.tcps_predack; 1177 if (opti.ts_present && opti.ts_ecr) 1178 tcp_xmit_timer(tp, 1179 tcp_now - opti.ts_ecr + 1); 1180 else if (tp->t_rtt && 1181 SEQ_GT(th->th_ack, tp->t_rtseq)) 1182 tcp_xmit_timer(tp, tp->t_rtt); 1183 acked = th->th_ack - tp->snd_una; 1184 tcpstat.tcps_rcvackpack++; 1185 tcpstat.tcps_rcvackbyte += acked; 1186 ND6_HINT(tp); 1187 sbdrop(&so->so_snd, acked); 1188 /* 1189 * We want snd_recover to track snd_una to 1190 * avoid sequence wraparound problems for 1191 * very large transfers. 1192 */ 1193 tp->snd_una = tp->snd_recover = th->th_ack; 1194 m_freem(m); 1195 1196 /* 1197 * If all outstanding data are acked, stop 1198 * retransmit timer, otherwise restart timer 1199 * using current (possibly backed-off) value. 1200 * If process is waiting for space, 1201 * wakeup/selwakeup/signal. If data 1202 * are ready to send, let tcp_output 1203 * decide between more output or persist. 
1204 */ 1205 if (tp->snd_una == tp->snd_max) 1206 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1207 else if (TCP_TIMER_ISARMED(tp, 1208 TCPT_PERSIST) == 0) 1209 TCP_TIMER_ARM(tp, TCPT_REXMT, 1210 tp->t_rxtcur); 1211 1212 sowwakeup(so); 1213 if (so->so_snd.sb_cc) 1214 (void) tcp_output(tp); 1215 if (tcp_saveti) 1216 m_freem(tcp_saveti); 1217 return; 1218 } 1219 } else if (th->th_ack == tp->snd_una && 1220 tp->segq.lh_first == NULL && 1221 tlen <= sbspace(&so->so_rcv)) { 1222 /* 1223 * this is a pure, in-sequence data packet 1224 * with nothing on the reassembly queue and 1225 * we have enough buffer space to take it. 1226 */ 1227 ++tcpstat.tcps_preddat; 1228 tp->rcv_nxt += tlen; 1229 tcpstat.tcps_rcvpack++; 1230 tcpstat.tcps_rcvbyte += tlen; 1231 ND6_HINT(tp); 1232 /* 1233 * Drop TCP, IP headers and TCP options then add data 1234 * to socket buffer. 1235 */ 1236 m_adj(m, toff + off); 1237 sbappend(&so->so_rcv, m); 1238 sorwakeup(so); 1239 TCP_SETUP_ACK(tp, th); 1240 if (tp->t_flags & TF_ACKNOW) 1241 (void) tcp_output(tp); 1242 if (tcp_saveti) 1243 m_freem(tcp_saveti); 1244 return; 1245 } 1246 } 1247 1248 /* 1249 * Compute mbuf offset to TCP data segment. 1250 */ 1251 hdroptlen = toff + off; 1252 1253 /* 1254 * Calculate amount of space in receive window, 1255 * and then do TCP input processing. 1256 * Receive window is amount of space in rcv queue, 1257 * but not less than advertised window. 1258 */ 1259 { int win; 1260 1261 win = sbspace(&so->so_rcv); 1262 if (win < 0) 1263 win = 0; 1264 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1265 } 1266 1267 switch (tp->t_state) { 1268 1269 /* 1270 * If the state is SYN_SENT: 1271 * if seg contains an ACK, but not for our SYN, drop the input. 1272 * if seg contains a RST, then drop the connection. 1273 * if seg does not contain SYN, then drop it. 
1274 * Otherwise this is an acceptable SYN segment 1275 * initialize tp->rcv_nxt and tp->irs 1276 * if seg contains ack then advance tp->snd_una 1277 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1278 * arrange for segment to be acked (eventually) 1279 * continue processing rest of data/controls, beginning with URG 1280 */ 1281 case TCPS_SYN_SENT: 1282 if ((tiflags & TH_ACK) && 1283 (SEQ_LEQ(th->th_ack, tp->iss) || 1284 SEQ_GT(th->th_ack, tp->snd_max))) 1285 goto dropwithreset; 1286 if (tiflags & TH_RST) { 1287 if (tiflags & TH_ACK) 1288 tp = tcp_drop(tp, ECONNREFUSED); 1289 goto drop; 1290 } 1291 if ((tiflags & TH_SYN) == 0) 1292 goto drop; 1293 if (tiflags & TH_ACK) { 1294 tp->snd_una = tp->snd_recover = th->th_ack; 1295 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1296 tp->snd_nxt = tp->snd_una; 1297 } 1298 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1299 tp->irs = th->th_seq; 1300 tcp_rcvseqinit(tp); 1301 tp->t_flags |= TF_ACKNOW; 1302 tcp_mss_from_peer(tp, opti.maxseg); 1303 1304 /* 1305 * Initialize the initial congestion window. If we 1306 * had to retransmit the SYN, we must initialize cwnd 1307 * to 1 segment (i.e. the Loss Window). 1308 */ 1309 if (tp->t_flags & TF_SYN_REXMT) 1310 tp->snd_cwnd = tp->t_peermss; 1311 else 1312 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, 1313 tp->t_peermss); 1314 1315 tcp_rmx_rtt(tp); 1316 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1317 tcpstat.tcps_connects++; 1318 soisconnected(so); 1319 tcp_established(tp); 1320 /* Do window scaling on this connection? */ 1321 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1322 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1323 tp->snd_scale = tp->requested_s_scale; 1324 tp->rcv_scale = tp->request_r_scale; 1325 } 1326 TCP_REASS_LOCK(tp); 1327 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1328 TCP_REASS_UNLOCK(tp); 1329 /* 1330 * if we didn't have to retransmit the SYN, 1331 * use its rtt as our initial srtt & rtt var. 
1332 */ 1333 if (tp->t_rtt) 1334 tcp_xmit_timer(tp, tp->t_rtt); 1335 } else 1336 tp->t_state = TCPS_SYN_RECEIVED; 1337 1338 /* 1339 * Advance th->th_seq to correspond to first data byte. 1340 * If data, trim to stay within window, 1341 * dropping FIN if necessary. 1342 */ 1343 th->th_seq++; 1344 if (tlen > tp->rcv_wnd) { 1345 todrop = tlen - tp->rcv_wnd; 1346 m_adj(m, -todrop); 1347 tlen = tp->rcv_wnd; 1348 tiflags &= ~TH_FIN; 1349 tcpstat.tcps_rcvpackafterwin++; 1350 tcpstat.tcps_rcvbyteafterwin += todrop; 1351 } 1352 tp->snd_wl1 = th->th_seq - 1; 1353 tp->rcv_up = th->th_seq; 1354 goto step6; 1355 1356 /* 1357 * If the state is SYN_RECEIVED: 1358 * If seg contains an ACK, but not for our SYN, drop the input 1359 * and generate an RST. See page 36, rfc793 1360 */ 1361 case TCPS_SYN_RECEIVED: 1362 if ((tiflags & TH_ACK) && 1363 (SEQ_LEQ(th->th_ack, tp->iss) || 1364 SEQ_GT(th->th_ack, tp->snd_max))) 1365 goto dropwithreset; 1366 break; 1367 } 1368 1369 /* 1370 * States other than LISTEN or SYN_SENT. 1371 * First check timestamp, if present. 1372 * Then check that at least some bytes of segment are within 1373 * receive window. If segment begins before rcv_nxt, 1374 * drop leading data (and SYN); if nothing left, just ack. 1375 * 1376 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1377 * and it's less than ts_recent, drop it. 1378 */ 1379 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1380 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1381 1382 /* Check to see if ts_recent is over 24 days old. */ 1383 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1384 /* 1385 * Invalidate ts_recent. If this segment updates 1386 * ts_recent, the age will be reset later and ts_recent 1387 * will get a valid value. If it does not, setting 1388 * ts_recent to zero will at least satisfy the 1389 * requirement that zero be placed in the timestamp 1390 * echo reply when ts_recent isn't valid. 
The 1391 * age isn't reset until we get a valid ts_recent 1392 * because we don't want out-of-order segments to be 1393 * dropped when ts_recent is old. 1394 */ 1395 tp->ts_recent = 0; 1396 } else { 1397 tcpstat.tcps_rcvduppack++; 1398 tcpstat.tcps_rcvdupbyte += tlen; 1399 tcpstat.tcps_pawsdrop++; 1400 goto dropafterack; 1401 } 1402 } 1403 1404 todrop = tp->rcv_nxt - th->th_seq; 1405 if (todrop > 0) { 1406 if (tiflags & TH_SYN) { 1407 tiflags &= ~TH_SYN; 1408 th->th_seq++; 1409 if (th->th_urp > 1) 1410 th->th_urp--; 1411 else { 1412 tiflags &= ~TH_URG; 1413 th->th_urp = 0; 1414 } 1415 todrop--; 1416 } 1417 if (todrop > tlen || 1418 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1419 /* 1420 * Any valid FIN must be to the left of the window. 1421 * At this point the FIN must be a duplicate or 1422 * out of sequence; drop it. 1423 */ 1424 tiflags &= ~TH_FIN; 1425 /* 1426 * Send an ACK to resynchronize and drop any data. 1427 * But keep on processing for RST or ACK. 1428 */ 1429 tp->t_flags |= TF_ACKNOW; 1430 todrop = tlen; 1431 tcpstat.tcps_rcvdupbyte += todrop; 1432 tcpstat.tcps_rcvduppack++; 1433 } else { 1434 tcpstat.tcps_rcvpartduppack++; 1435 tcpstat.tcps_rcvpartdupbyte += todrop; 1436 } 1437 hdroptlen += todrop; /*drop from head afterwards*/ 1438 th->th_seq += todrop; 1439 tlen -= todrop; 1440 if (th->th_urp > todrop) 1441 th->th_urp -= todrop; 1442 else { 1443 tiflags &= ~TH_URG; 1444 th->th_urp = 0; 1445 } 1446 } 1447 1448 /* 1449 * If new data are received on a connection after the 1450 * user processes are gone, then RST the other end. 1451 */ 1452 if ((so->so_state & SS_NOFDREF) && 1453 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1454 tp = tcp_close(tp); 1455 tcpstat.tcps_rcvafterclose++; 1456 goto dropwithreset; 1457 } 1458 1459 /* 1460 * If segment ends after window, drop trailing data 1461 * (and PUSH and FIN); if nothing left, just ACK. 
1462 */ 1463 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1464 if (todrop > 0) { 1465 tcpstat.tcps_rcvpackafterwin++; 1466 if (todrop >= tlen) { 1467 tcpstat.tcps_rcvbyteafterwin += tlen; 1468 /* 1469 * If a new connection request is received 1470 * while in TIME_WAIT, drop the old connection 1471 * and start over if the sequence numbers 1472 * are above the previous ones. 1473 */ 1474 if (tiflags & TH_SYN && 1475 tp->t_state == TCPS_TIME_WAIT && 1476 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1477 iss = tcp_new_iss(tp, sizeof(struct tcpcb), 1478 tp->snd_nxt); 1479 tp = tcp_close(tp); 1480 goto findpcb; 1481 } 1482 /* 1483 * If window is closed can only take segments at 1484 * window edge, and have to drop data and PUSH from 1485 * incoming segments. Continue processing, but 1486 * remember to ack. Otherwise, drop segment 1487 * and ack. 1488 */ 1489 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1490 tp->t_flags |= TF_ACKNOW; 1491 tcpstat.tcps_rcvwinprobe++; 1492 } else 1493 goto dropafterack; 1494 } else 1495 tcpstat.tcps_rcvbyteafterwin += todrop; 1496 m_adj(m, -todrop); 1497 tlen -= todrop; 1498 tiflags &= ~(TH_PUSH|TH_FIN); 1499 } 1500 1501 /* 1502 * If last ACK falls within this segment's sequence numbers, 1503 * and the timestamp is newer, record it. 1504 */ 1505 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1506 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1507 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen + 1508 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 1509 tp->ts_recent_age = tcp_now; 1510 tp->ts_recent = opti.ts_val; 1511 } 1512 1513 /* 1514 * If the RST bit is set examine the state: 1515 * SYN_RECEIVED STATE: 1516 * If passive open, return to LISTEN state. 1517 * If active open, inform user that connection was refused. 1518 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1519 * Inform user that connection was reset, and close tcb. 1520 * CLOSING, LAST_ACK, TIME_WAIT STATES 1521 * Close the tcb. 
1522 */ 1523 if (tiflags&TH_RST) switch (tp->t_state) { 1524 1525 case TCPS_SYN_RECEIVED: 1526 so->so_error = ECONNREFUSED; 1527 goto close; 1528 1529 case TCPS_ESTABLISHED: 1530 case TCPS_FIN_WAIT_1: 1531 case TCPS_FIN_WAIT_2: 1532 case TCPS_CLOSE_WAIT: 1533 so->so_error = ECONNRESET; 1534 close: 1535 tp->t_state = TCPS_CLOSED; 1536 tcpstat.tcps_drops++; 1537 tp = tcp_close(tp); 1538 goto drop; 1539 1540 case TCPS_CLOSING: 1541 case TCPS_LAST_ACK: 1542 case TCPS_TIME_WAIT: 1543 tp = tcp_close(tp); 1544 goto drop; 1545 } 1546 1547 /* 1548 * If a SYN is in the window, then this is an 1549 * error and we send an RST and drop the connection. 1550 */ 1551 if (tiflags & TH_SYN) { 1552 tp = tcp_drop(tp, ECONNRESET); 1553 goto dropwithreset; 1554 } 1555 1556 /* 1557 * If the ACK bit is off we drop the segment and return. 1558 */ 1559 if ((tiflags & TH_ACK) == 0) { 1560 if (tp->t_flags & TF_ACKNOW) 1561 goto dropafterack; 1562 else 1563 goto drop; 1564 } 1565 1566 /* 1567 * Ack processing. 1568 */ 1569 switch (tp->t_state) { 1570 1571 /* 1572 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1573 * ESTABLISHED state and continue processing, otherwise 1574 * send an RST. 1575 */ 1576 case TCPS_SYN_RECEIVED: 1577 if (SEQ_GT(tp->snd_una, th->th_ack) || 1578 SEQ_GT(th->th_ack, tp->snd_max)) 1579 goto dropwithreset; 1580 tcpstat.tcps_connects++; 1581 soisconnected(so); 1582 tcp_established(tp); 1583 /* Do window scaling? */ 1584 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1585 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1586 tp->snd_scale = tp->requested_s_scale; 1587 tp->rcv_scale = tp->request_r_scale; 1588 } 1589 TCP_REASS_LOCK(tp); 1590 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen); 1591 TCP_REASS_UNLOCK(tp); 1592 tp->snd_wl1 = th->th_seq - 1; 1593 /* fall into ... */ 1594 1595 /* 1596 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1597 * ACKs. 
If the ack is in the range 1598 * tp->snd_una < th->th_ack <= tp->snd_max 1599 * then advance tp->snd_una to th->th_ack and drop 1600 * data from the retransmission queue. If this ACK reflects 1601 * more up to date window information we update our window information. 1602 */ 1603 case TCPS_ESTABLISHED: 1604 case TCPS_FIN_WAIT_1: 1605 case TCPS_FIN_WAIT_2: 1606 case TCPS_CLOSE_WAIT: 1607 case TCPS_CLOSING: 1608 case TCPS_LAST_ACK: 1609 case TCPS_TIME_WAIT: 1610 1611 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1612 if (tlen == 0 && tiwin == tp->snd_wnd) { 1613 tcpstat.tcps_rcvdupack++; 1614 /* 1615 * If we have outstanding data (other than 1616 * a window probe), this is a completely 1617 * duplicate ack (ie, window info didn't 1618 * change), the ack is the biggest we've 1619 * seen and we've seen exactly our rexmt 1620 * threshhold of them, assume a packet 1621 * has been dropped and retransmit it. 1622 * Kludge snd_nxt & the congestion 1623 * window so we send only this one 1624 * packet. 1625 * 1626 * We know we're losing at the current 1627 * window size so do congestion avoidance 1628 * (set ssthresh to half the current window 1629 * and pull our congestion window back to 1630 * the new ssthresh). 1631 * 1632 * Dup acks mean that packets have left the 1633 * network (they're now cached at the receiver) 1634 * so bump cwnd by the amount in the receiver 1635 * to keep a constant cwnd packets in the 1636 * network. 1637 */ 1638 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 1639 th->th_ack != tp->snd_una) 1640 tp->t_dupacks = 0; 1641 else if (++tp->t_dupacks == tcprexmtthresh) { 1642 tcp_seq onxt = tp->snd_nxt; 1643 u_int win = 1644 min(tp->snd_wnd, tp->snd_cwnd) / 1645 2 / tp->t_segsz; 1646 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1647 tp->snd_recover)) { 1648 /* 1649 * False fast retransmit after 1650 * timeout. Do not cut window. 
1651 */ 1652 tp->snd_cwnd += tp->t_segsz; 1653 tp->t_dupacks = 0; 1654 (void) tcp_output(tp); 1655 goto drop; 1656 } 1657 1658 if (win < 2) 1659 win = 2; 1660 tp->snd_ssthresh = win * tp->t_segsz; 1661 tp->snd_recover = tp->snd_max; 1662 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1663 tp->t_rtt = 0; 1664 tp->snd_nxt = th->th_ack; 1665 tp->snd_cwnd = tp->t_segsz; 1666 (void) tcp_output(tp); 1667 tp->snd_cwnd = tp->snd_ssthresh + 1668 tp->t_segsz * tp->t_dupacks; 1669 if (SEQ_GT(onxt, tp->snd_nxt)) 1670 tp->snd_nxt = onxt; 1671 goto drop; 1672 } else if (tp->t_dupacks > tcprexmtthresh) { 1673 tp->snd_cwnd += tp->t_segsz; 1674 (void) tcp_output(tp); 1675 goto drop; 1676 } 1677 } else 1678 tp->t_dupacks = 0; 1679 break; 1680 } 1681 /* 1682 * If the congestion window was inflated to account 1683 * for the other side's cached packets, retract it. 1684 */ 1685 if (tcp_do_newreno == 0) { 1686 if (tp->t_dupacks >= tcprexmtthresh && 1687 tp->snd_cwnd > tp->snd_ssthresh) 1688 tp->snd_cwnd = tp->snd_ssthresh; 1689 tp->t_dupacks = 0; 1690 } else if (tp->t_dupacks >= tcprexmtthresh && 1691 tcp_newreno(tp, th) == 0) { 1692 tp->snd_cwnd = tp->snd_ssthresh; 1693 /* 1694 * Window inflation should have left us with approx. 1695 * snd_ssthresh outstanding data. But in case we 1696 * would be inclined to send a burst, better to do 1697 * it via the slow start mechanism. 1698 */ 1699 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 1700 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 1701 + tp->t_segsz; 1702 tp->t_dupacks = 0; 1703 } 1704 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1705 tcpstat.tcps_rcvacktoomuch++; 1706 goto dropafterack; 1707 } 1708 acked = th->th_ack - tp->snd_una; 1709 tcpstat.tcps_rcvackpack++; 1710 tcpstat.tcps_rcvackbyte += acked; 1711 1712 /* 1713 * If we have a timestamp reply, update smoothed 1714 * round trip time. If no timestamp is present but 1715 * transmit timer is running and timed sequence 1716 * number was acked, update smoothed round trip time. 
1717 * Since we now have an rtt measurement, cancel the 1718 * timer backoff (cf., Phil Karn's retransmit alg.). 1719 * Recompute the initial retransmit timer. 1720 */ 1721 if (opti.ts_present && opti.ts_ecr) 1722 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1); 1723 else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq)) 1724 tcp_xmit_timer(tp,tp->t_rtt); 1725 1726 /* 1727 * If all outstanding data is acked, stop retransmit 1728 * timer and remember to restart (more output or persist). 1729 * If there is more data to be acked, restart retransmit 1730 * timer, using current (possibly backed-off) value. 1731 */ 1732 if (th->th_ack == tp->snd_max) { 1733 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1734 needoutput = 1; 1735 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1736 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1737 /* 1738 * When new data is acked, open the congestion window. 1739 * If the window gives us less than ssthresh packets 1740 * in flight, open exponentially (segsz per packet). 1741 * Otherwise open linearly: segsz per window 1742 * (segsz^2 / cwnd per packet), plus a constant 1743 * fraction of a packet (segsz/8) to help larger windows 1744 * open quickly enough. 1745 */ 1746 { 1747 register u_int cw = tp->snd_cwnd; 1748 register u_int incr = tp->t_segsz; 1749 1750 if (cw > tp->snd_ssthresh) 1751 incr = incr * incr / cw; 1752 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover)) 1753 tp->snd_cwnd = min(cw + incr, 1754 TCP_MAXWIN << tp->snd_scale); 1755 } 1756 ND6_HINT(tp); 1757 if (acked > so->so_snd.sb_cc) { 1758 tp->snd_wnd -= so->so_snd.sb_cc; 1759 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1760 ourfinisacked = 1; 1761 } else { 1762 sbdrop(&so->so_snd, acked); 1763 tp->snd_wnd -= acked; 1764 ourfinisacked = 0; 1765 } 1766 sowwakeup(so); 1767 /* 1768 * We want snd_recover to track snd_una to 1769 * avoid sequence wraparound problems for 1770 * very large transfers. 
1771 */ 1772 tp->snd_una = tp->snd_recover = th->th_ack; 1773 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1774 tp->snd_nxt = tp->snd_una; 1775 1776 switch (tp->t_state) { 1777 1778 /* 1779 * In FIN_WAIT_1 STATE in addition to the processing 1780 * for the ESTABLISHED state if our FIN is now acknowledged 1781 * then enter FIN_WAIT_2. 1782 */ 1783 case TCPS_FIN_WAIT_1: 1784 if (ourfinisacked) { 1785 /* 1786 * If we can't receive any more 1787 * data, then closing user can proceed. 1788 * Starting the timer is contrary to the 1789 * specification, but if we don't get a FIN 1790 * we'll hang forever. 1791 */ 1792 if (so->so_state & SS_CANTRCVMORE) { 1793 soisdisconnected(so); 1794 if (tcp_maxidle > 0) 1795 TCP_TIMER_ARM(tp, TCPT_2MSL, 1796 tcp_maxidle); 1797 } 1798 tp->t_state = TCPS_FIN_WAIT_2; 1799 } 1800 break; 1801 1802 /* 1803 * In CLOSING STATE in addition to the processing for 1804 * the ESTABLISHED state if the ACK acknowledges our FIN 1805 * then enter the TIME-WAIT state, otherwise ignore 1806 * the segment. 1807 */ 1808 case TCPS_CLOSING: 1809 if (ourfinisacked) { 1810 tp->t_state = TCPS_TIME_WAIT; 1811 tcp_canceltimers(tp); 1812 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1813 soisdisconnected(so); 1814 } 1815 break; 1816 1817 /* 1818 * In LAST_ACK, we may still be waiting for data to drain 1819 * and/or to be acked, as well as for the ack of our FIN. 1820 * If our FIN is now acknowledged, delete the TCB, 1821 * enter the closed state and return. 1822 */ 1823 case TCPS_LAST_ACK: 1824 if (ourfinisacked) { 1825 tp = tcp_close(tp); 1826 goto drop; 1827 } 1828 break; 1829 1830 /* 1831 * In TIME_WAIT state the only thing that should arrive 1832 * is a retransmission of the remote FIN. Acknowledge 1833 * it and restart the finack timer. 1834 */ 1835 case TCPS_TIME_WAIT: 1836 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1837 goto dropafterack; 1838 } 1839 } 1840 1841 step6: 1842 /* 1843 * Update window information. 
1844 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1845 */ 1846 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1847 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1848 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1849 /* keep track of pure window updates */ 1850 if (tlen == 0 && 1851 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1852 tcpstat.tcps_rcvwinupd++; 1853 tp->snd_wnd = tiwin; 1854 tp->snd_wl1 = th->th_seq; 1855 tp->snd_wl2 = th->th_ack; 1856 if (tp->snd_wnd > tp->max_sndwnd) 1857 tp->max_sndwnd = tp->snd_wnd; 1858 needoutput = 1; 1859 } 1860 1861 /* 1862 * Process segments with URG. 1863 */ 1864 if ((tiflags & TH_URG) && th->th_urp && 1865 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1866 /* 1867 * This is a kludge, but if we receive and accept 1868 * random urgent pointers, we'll crash in 1869 * soreceive. It's hard to imagine someone 1870 * actually wanting to send this much urgent data. 1871 */ 1872 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1873 th->th_urp = 0; /* XXX */ 1874 tiflags &= ~TH_URG; /* XXX */ 1875 goto dodata; /* XXX */ 1876 } 1877 /* 1878 * If this segment advances the known urgent pointer, 1879 * then mark the data stream. This should not happen 1880 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1881 * a FIN has been received from the remote side. 1882 * In these states we ignore the URG. 1883 * 1884 * According to RFC961 (Assigned Protocols), 1885 * the urgent pointer points to the last octet 1886 * of urgent data. We continue, however, 1887 * to consider it to indicate the first octet 1888 * of data past the urgent section as the original 1889 * spec states (in one of two places). 
1890 */ 1891 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1892 tp->rcv_up = th->th_seq + th->th_urp; 1893 so->so_oobmark = so->so_rcv.sb_cc + 1894 (tp->rcv_up - tp->rcv_nxt) - 1; 1895 if (so->so_oobmark == 0) 1896 so->so_state |= SS_RCVATMARK; 1897 sohasoutofband(so); 1898 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1899 } 1900 /* 1901 * Remove out of band data so doesn't get presented to user. 1902 * This can happen independent of advancing the URG pointer, 1903 * but if two URG's are pending at once, some out-of-band 1904 * data may creep in... ick. 1905 */ 1906 if (th->th_urp <= (u_int16_t) tlen 1907 #ifdef SO_OOBINLINE 1908 && (so->so_options & SO_OOBINLINE) == 0 1909 #endif 1910 ) 1911 tcp_pulloutofband(so, th, m, hdroptlen); 1912 } else 1913 /* 1914 * If no out of band data is expected, 1915 * pull receive urgent pointer along 1916 * with the receive window. 1917 */ 1918 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1919 tp->rcv_up = tp->rcv_nxt; 1920 dodata: /* XXX */ 1921 1922 /* 1923 * Process the segment text, merging it into the TCP sequencing queue, 1924 * and arranging for acknowledgement of receipt if necessary. 1925 * This process logically involves adjusting tp->rcv_wnd as data 1926 * is presented to the user (this happens in tcp_usrreq.c, 1927 * case PRU_RCVD). If a FIN has already been received on this 1928 * connection then we just ignore the text. 1929 */ 1930 if ((tlen || (tiflags & TH_FIN)) && 1931 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1932 /* 1933 * Insert segment ti into reassembly queue of tcp with 1934 * control block tp. Return TH_FIN if reassembly now includes 1935 * a segment with FIN. The macro form does the common case 1936 * inline (segment is the next to be received on an 1937 * established connection, and the queue is empty), 1938 * avoiding linkage into and removal from the queue and 1939 * repetition of various conversions. 
1940 * Set DELACK for segments received in order, but ack 1941 * immediately when segments are out of order 1942 * (so fast retransmit can work). 1943 */ 1944 /* NOTE: this was TCP_REASS() macro, but used only once */ 1945 TCP_REASS_LOCK(tp); 1946 if (th->th_seq == tp->rcv_nxt && 1947 tp->segq.lh_first == NULL && 1948 tp->t_state == TCPS_ESTABLISHED) { 1949 TCP_SETUP_ACK(tp, th); 1950 tp->rcv_nxt += tlen; 1951 tiflags = th->th_flags & TH_FIN; 1952 tcpstat.tcps_rcvpack++; 1953 tcpstat.tcps_rcvbyte += tlen; 1954 ND6_HINT(tp); 1955 m_adj(m, hdroptlen); 1956 sbappend(&(so)->so_rcv, m); 1957 sorwakeup(so); 1958 } else { 1959 m_adj(m, hdroptlen); 1960 tiflags = tcp_reass(tp, th, m, &tlen); 1961 tp->t_flags |= TF_ACKNOW; 1962 } 1963 TCP_REASS_UNLOCK(tp); 1964 1965 /* 1966 * Note the amount of data that peer has sent into 1967 * our window, in order to estimate the sender's 1968 * buffer size. 1969 */ 1970 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1971 } else { 1972 m_freem(m); 1973 m = NULL; 1974 tiflags &= ~TH_FIN; 1975 } 1976 1977 /* 1978 * If FIN is received ACK the FIN and let the user know 1979 * that the connection is closing. Ignore a FIN received before 1980 * the connection is fully established. 1981 */ 1982 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1983 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1984 socantrcvmore(so); 1985 tp->t_flags |= TF_ACKNOW; 1986 tp->rcv_nxt++; 1987 } 1988 switch (tp->t_state) { 1989 1990 /* 1991 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1992 */ 1993 case TCPS_ESTABLISHED: 1994 tp->t_state = TCPS_CLOSE_WAIT; 1995 break; 1996 1997 /* 1998 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1999 * enter the CLOSING state. 2000 */ 2001 case TCPS_FIN_WAIT_1: 2002 tp->t_state = TCPS_CLOSING; 2003 break; 2004 2005 /* 2006 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2007 * starting the time-wait timer, turning off the other 2008 * standard timers. 
2009 */ 2010 case TCPS_FIN_WAIT_2: 2011 tp->t_state = TCPS_TIME_WAIT; 2012 tcp_canceltimers(tp); 2013 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2014 soisdisconnected(so); 2015 break; 2016 2017 /* 2018 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2019 */ 2020 case TCPS_TIME_WAIT: 2021 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2022 break; 2023 } 2024 } 2025 if (so->so_options & SO_DEBUG) { 2026 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2027 } 2028 2029 /* 2030 * Return any desired output. 2031 */ 2032 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2033 (void) tcp_output(tp); 2034 if (tcp_saveti) 2035 m_freem(tcp_saveti); 2036 return; 2037 2038 badsyn: 2039 /* 2040 * Received a bad SYN. Increment counters and dropwithreset. 2041 */ 2042 tcpstat.tcps_badsyn++; 2043 tp = NULL; 2044 goto dropwithreset; 2045 2046 dropafterack: 2047 /* 2048 * Generate an ACK dropping incoming segment if it occupies 2049 * sequence space, where the ACK reflects our state. 2050 */ 2051 if (tiflags & TH_RST) 2052 goto drop; 2053 m_freem(m); 2054 tp->t_flags |= TF_ACKNOW; 2055 (void) tcp_output(tp); 2056 if (tcp_saveti) 2057 m_freem(tcp_saveti); 2058 return; 2059 2060 dropwithreset: 2061 /* 2062 * Generate a RST, dropping incoming segment. 2063 * Make ACK acceptable to originator of segment. 2064 * Don't bother to respond if destination was broadcast/multicast. 2065 */ 2066 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2067 goto drop; 2068 if (ip && IN_MULTICAST(ip->ip_dst.s_addr)) 2069 goto drop; 2070 #ifdef INET6 2071 if (m->m_flags & M_ANYCAST6) 2072 goto drop; 2073 else if (ip6 && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 2074 goto drop; 2075 #endif 2076 { 2077 /* 2078 * need to recover version # field, which was overwritten on 2079 * ip_cksum computation. 
2080 */ 2081 struct ip *sip; 2082 sip = mtod(m, struct ip *); 2083 switch (af) { 2084 case AF_INET: 2085 sip->ip_v = 4; 2086 break; 2087 #ifdef INET6 2088 case AF_INET6: 2089 sip->ip_v = 6; 2090 break; 2091 #endif 2092 } 2093 } 2094 if (tiflags & TH_ACK) 2095 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2096 else { 2097 if (tiflags & TH_SYN) 2098 tlen++; 2099 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2100 TH_RST|TH_ACK); 2101 } 2102 if (tcp_saveti) 2103 m_freem(tcp_saveti); 2104 return; 2105 2106 drop: 2107 /* 2108 * Drop space held by incoming segment and return. 2109 */ 2110 if (tp) { 2111 if (tp->t_inpcb) 2112 so = tp->t_inpcb->inp_socket; 2113 #ifdef INET6 2114 else if (tp->t_in6pcb) 2115 so = tp->t_in6pcb->in6p_socket; 2116 #endif 2117 else 2118 so = NULL; 2119 if (so && (so->so_options & SO_DEBUG) != 0) 2120 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2121 } 2122 if (tcp_saveti) 2123 m_freem(tcp_saveti); 2124 m_freem(m); 2125 return; 2126 } 2127 2128 void 2129 tcp_dooptions(tp, cp, cnt, th, oi) 2130 struct tcpcb *tp; 2131 u_char *cp; 2132 int cnt; 2133 struct tcphdr *th; 2134 struct tcp_opt_info *oi; 2135 { 2136 u_int16_t mss; 2137 int opt, optlen; 2138 2139 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2140 opt = cp[0]; 2141 if (opt == TCPOPT_EOL) 2142 break; 2143 if (opt == TCPOPT_NOP) 2144 optlen = 1; 2145 else { 2146 optlen = cp[1]; 2147 if (optlen <= 0) 2148 break; 2149 } 2150 switch (opt) { 2151 2152 default: 2153 continue; 2154 2155 case TCPOPT_MAXSEG: 2156 if (optlen != TCPOLEN_MAXSEG) 2157 continue; 2158 if (!(th->th_flags & TH_SYN)) 2159 continue; 2160 bcopy(cp + 2, &mss, sizeof(mss)); 2161 oi->maxseg = ntohs(mss); 2162 break; 2163 2164 case TCPOPT_WINDOW: 2165 if (optlen != TCPOLEN_WINDOW) 2166 continue; 2167 if (!(th->th_flags & TH_SYN)) 2168 continue; 2169 tp->t_flags |= TF_RCVD_SCALE; 2170 tp->requested_s_scale = cp[2]; 2171 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 2172 #if 0 /*XXX*/ 2173 char 
*p; 2174 2175 if (ip) 2176 p = ntohl(ip->ip_src); 2177 #ifdef INET6 2178 else if (ip6) 2179 p = ip6_sprintf(&ip6->ip6_src); 2180 #endif 2181 else 2182 p = "(unknown)"; 2183 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 2184 "assuming %d\n", 2185 tp->requested_s_scale, p, 2186 TCP_MAX_WINSHIFT); 2187 #else 2188 log(LOG_ERR, "TCP: invalid wscale %d, " 2189 "assuming %d\n", 2190 tp->requested_s_scale, 2191 TCP_MAX_WINSHIFT); 2192 #endif 2193 tp->requested_s_scale = TCP_MAX_WINSHIFT; 2194 } 2195 break; 2196 2197 case TCPOPT_TIMESTAMP: 2198 if (optlen != TCPOLEN_TIMESTAMP) 2199 continue; 2200 oi->ts_present = 1; 2201 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 2202 NTOHL(oi->ts_val); 2203 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 2204 NTOHL(oi->ts_ecr); 2205 2206 /* 2207 * A timestamp received in a SYN makes 2208 * it ok to send timestamp requests and replies. 2209 */ 2210 if (th->th_flags & TH_SYN) { 2211 tp->t_flags |= TF_RCVD_TSTMP; 2212 tp->ts_recent = oi->ts_val; 2213 tp->ts_recent_age = tcp_now; 2214 } 2215 break; 2216 case TCPOPT_SACK_PERMITTED: 2217 if (optlen != TCPOLEN_SACK_PERMITTED) 2218 continue; 2219 if (!(th->th_flags & TH_SYN)) 2220 continue; 2221 tp->t_flags &= ~TF_CANT_TXSACK; 2222 break; 2223 2224 case TCPOPT_SACK: 2225 if (tp->t_flags & TF_IGNR_RXSACK) 2226 continue; 2227 if (optlen % 8 != 2 || optlen < 10) 2228 continue; 2229 cp += 2; 2230 optlen -= 2; 2231 for (; optlen > 0; cp -= 8, optlen -= 8) { 2232 tcp_seq lwe, rwe; 2233 bcopy((char *)cp, (char *) &lwe, sizeof(lwe)); 2234 NTOHL(lwe); 2235 bcopy((char *)cp, (char *) &rwe, sizeof(rwe)); 2236 NTOHL(rwe); 2237 /* tcp_mark_sacked(tp, lwe, rwe); */ 2238 } 2239 break; 2240 } 2241 } 2242 } 2243 2244 /* 2245 * Pull out of band byte out of a segment so 2246 * it doesn't appear in the user's data queue. 2247 * It is still reflected in the segment length for 2248 * sequencing purposes. 
2249 */ 2250 void 2251 tcp_pulloutofband(so, th, m, off) 2252 struct socket *so; 2253 struct tcphdr *th; 2254 register struct mbuf *m; 2255 int off; 2256 { 2257 int cnt = off + th->th_urp - 1; 2258 2259 while (cnt >= 0) { 2260 if (m->m_len > cnt) { 2261 char *cp = mtod(m, caddr_t) + cnt; 2262 struct tcpcb *tp = sototcpcb(so); 2263 2264 tp->t_iobc = *cp; 2265 tp->t_oobflags |= TCPOOB_HAVEDATA; 2266 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2267 m->m_len--; 2268 return; 2269 } 2270 cnt -= m->m_len; 2271 m = m->m_next; 2272 if (m == 0) 2273 break; 2274 } 2275 panic("tcp_pulloutofband"); 2276 } 2277 2278 /* 2279 * Collect new round-trip time estimate 2280 * and update averages and current timeout. 2281 */ 2282 void 2283 tcp_xmit_timer(tp, rtt) 2284 register struct tcpcb *tp; 2285 short rtt; 2286 { 2287 register short delta; 2288 short rttmin; 2289 2290 tcpstat.tcps_rttupdated++; 2291 --rtt; 2292 if (tp->t_srtt != 0) { 2293 /* 2294 * srtt is stored as fixed point with 3 bits after the 2295 * binary point (i.e., scaled by 8). The following magic 2296 * is equivalent to the smoothing algorithm in rfc793 with 2297 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2298 * point). Adjust rtt to origin 0. 2299 */ 2300 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 2301 if ((tp->t_srtt += delta) <= 0) 2302 tp->t_srtt = 1 << 2; 2303 /* 2304 * We accumulate a smoothed rtt variance (actually, a 2305 * smoothed mean difference), then set the retransmit 2306 * timer to smoothed rtt + 4 times the smoothed variance. 2307 * rttvar is stored as fixed point with 2 bits after the 2308 * binary point (scaled by 4). The following is 2309 * equivalent to rfc793 smoothing with an alpha of .75 2310 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2311 * rfc793's wired-in beta. 
2312 */ 2313 if (delta < 0) 2314 delta = -delta; 2315 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2316 if ((tp->t_rttvar += delta) <= 0) 2317 tp->t_rttvar = 1 << 2; 2318 } else { 2319 /* 2320 * No rtt measurement yet - use the unsmoothed rtt. 2321 * Set the variance to half the rtt (so our first 2322 * retransmit happens at 3*rtt). 2323 */ 2324 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 2325 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 2326 } 2327 tp->t_rtt = 0; 2328 tp->t_rxtshift = 0; 2329 2330 /* 2331 * the retransmit should happen at rtt + 4 * rttvar. 2332 * Because of the way we do the smoothing, srtt and rttvar 2333 * will each average +1/2 tick of bias. When we compute 2334 * the retransmit timer, we want 1/2 tick of rounding and 2335 * 1 extra tick because of +-1/2 tick uncertainty in the 2336 * firing of the timer. The bias will give us exactly the 2337 * 1.5 tick we need. But, because the bias is 2338 * statistical, we have to test that we don't drop below 2339 * the minimum feasible timer (which is 2 ticks). 2340 */ 2341 if (tp->t_rttmin > rtt + 2) 2342 rttmin = tp->t_rttmin; 2343 else 2344 rttmin = rtt + 2; 2345 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2346 2347 /* 2348 * We received an ack for a packet that wasn't retransmitted; 2349 * it is probably safe to discard any error indications we've 2350 * received recently. This isn't quite right, but close enough 2351 * for now (a route might have failed after we sent a segment, 2352 * and the return path might not be symmetrical). 2353 */ 2354 tp->t_softerror = 0; 2355 } 2356 2357 /* 2358 * Checks for partial ack. If partial ack arrives, force the retransmission 2359 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2360 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to 2361 * be started again. If the ack advances at least to tp->snd_recover, return 0. 
2362 */ 2363 int 2364 tcp_newreno(tp, th) 2365 struct tcpcb *tp; 2366 struct tcphdr *th; 2367 { 2368 tcp_seq onxt = tp->snd_nxt; 2369 u_long ocwnd = tp->snd_cwnd; 2370 2371 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2372 /* 2373 * snd_una has not yet been updated and the socket's send 2374 * buffer has not yet drained off the ACK'd data, so we 2375 * have to leave snd_una as it was to get the correct data 2376 * offset in tcp_output(). 2377 */ 2378 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2379 tp->t_rtt = 0; 2380 tp->snd_nxt = th->th_ack; 2381 /* 2382 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una 2383 * is not yet updated when we're called. 2384 */ 2385 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 2386 (void) tcp_output(tp); 2387 tp->snd_cwnd = ocwnd; 2388 if (SEQ_GT(onxt, tp->snd_nxt)) 2389 tp->snd_nxt = onxt; 2390 /* 2391 * Partial window deflation. Relies on fact that tp->snd_una 2392 * not updated yet. 2393 */ 2394 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz); 2395 return 1; 2396 } 2397 return 0; 2398 } 2399 2400 2401 /* 2402 * TCP compressed state engine. Currently used to hold compressed 2403 * state for SYN_RECEIVED. 
*/

/* Number of entries currently held in the syn cache. */
u_long	syn_cache_count;
/* Secrets mixed into the bucket hash; see syn_cache_insert(). */
u_int32_t syn_hash1, syn_hash2;

/*
 * IPv4 hash: the address XORed with syn_hash1, multiplied by the two
 * ports packed into one word ((dp << 16) + sp) XORed with syn_hash2.
 */
#define	SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
/* IPv4-only build: src and dst are always treated as sockaddr_in. */
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr,	\
		((struct sockaddr_in *)(src))->sin_port,		\
		((struct sockaddr_in *)(dst))->sin_port);		\
} while (0)
#else
/*
 * IPv6 hash: same shape as SYN_HASH, but only the first and last 32-bit
 * words of the address are mixed in and the result is masked to 31 bits.
 */
#define	SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/*
 * Dispatch on the address family of src; an unknown family hashes to 0.
 * Note that src is evaluated more than once.
 */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
			((struct sockaddr_in *)(src))->sin_port,	\
			((struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
			((struct sockaddr_in6 *)(src))->sin6_port,	\
			((struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (0)
#endif /* INET6 */

/*
 * Unlink an entry from its hash bucket, from its tcpcb's list and from
 * the timer queue selected by sc_rxtshift, clearing sc_tp and adjusting
 * the bucket length and the global count.  The entry itself is not
 * freed; pair with SYN_CACHE_PUT() for that.
 */
#define	SYN_CACHE_RM(sc)						\
do {									\
	LIST_REMOVE((sc), sc_bucketq);					\
	(sc)->sc_tp = NULL;						\
	LIST_REMOVE((sc), sc_tpq);					\
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
	TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \
	syn_cache_count--;						\
} while (0)

/*
 * Release an entry's resources: free the saved IP options mbuf and the
 * cached route if present, then return the entry to syn_cache_pool.
 */
#define	SYN_CACHE_PUT(sc)						\
do {									\
	if ((sc)->sc_ipopts)						\
		(void) m_free((sc)->sc_ipopts);				\
	if ((sc)->sc_route4.ro_rt != NULL)				\
		RTFREE((sc)->sc_route4.ro_rt);				\
	pool_put(&syn_cache_pool, (sc));				\
} while (0)

/* Pool from which struct syn_cache entries are allocated. */
struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer queue has a
fixed timeout value. This allows us to 2468 * optimize the timer queues somewhat. 2469 */ 2470 #define SYN_CACHE_TIMER_ARM(sc) \ 2471 do { \ 2472 TCPT_RANGESET((sc)->sc_rxtcur, \ 2473 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 2474 TCPTV_REXMTMAX); \ 2475 PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \ 2476 } while (0) 2477 2478 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1]; 2479 2480 void 2481 syn_cache_init() 2482 { 2483 int i; 2484 2485 /* Initialize the hash buckets. */ 2486 for (i = 0; i < tcp_syn_cache_size; i++) 2487 LIST_INIT(&tcp_syn_cache[i].sch_bucket); 2488 2489 /* Initialize the timer queues. */ 2490 for (i = 0; i <= TCP_MAXRXTSHIFT; i++) 2491 TAILQ_INIT(&tcp_syn_cache_timeq[i]); 2492 2493 /* Initialize the syn cache pool. */ 2494 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 2495 "synpl", 0, NULL, NULL, M_PCB); 2496 } 2497 2498 void 2499 syn_cache_insert(sc, tp) 2500 struct syn_cache *sc; 2501 struct tcpcb *tp; 2502 { 2503 struct syn_cache_head *scp; 2504 struct syn_cache *sc2; 2505 int s, i; 2506 2507 /* 2508 * If there are no entries in the hash table, reinitialize 2509 * the hash secrets. 2510 */ 2511 if (syn_cache_count == 0) { 2512 struct timeval tv; 2513 microtime(&tv); 2514 syn_hash1 = random() ^ (u_long)≻ 2515 syn_hash2 = random() ^ tv.tv_usec; 2516 } 2517 2518 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 2519 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 2520 scp = &tcp_syn_cache[sc->sc_bucketidx]; 2521 2522 /* 2523 * Make sure that we don't overflow the per-bucket 2524 * limit or the total cache size limit. 2525 */ 2526 s = splsoftnet(); 2527 if (scp->sch_length >= tcp_syn_bucket_limit) { 2528 tcpstat.tcps_sc_bucketoverflow++; 2529 /* 2530 * The bucket is full. Toss the oldest element in the 2531 * bucket. This will be the entry with our bucket 2532 * index closest to the front of the timer queue with 2533 * the largest timeout value. 
2534 * 2535 * Note: This timer queue traversal may be expensive, so 2536 * we hope that this doesn't happen very often. It is 2537 * much more likely that we'll overflow the entire 2538 * cache, which is much easier to handle; see below. 2539 */ 2540 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2541 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2542 sc2 != NULL; 2543 sc2 = TAILQ_NEXT(sc2, sc_timeq)) { 2544 if (sc2->sc_bucketidx == sc->sc_bucketidx) { 2545 SYN_CACHE_RM(sc2); 2546 SYN_CACHE_PUT(sc2); 2547 goto insert; /* 2 level break */ 2548 } 2549 } 2550 } 2551 #ifdef DIAGNOSTIC 2552 /* 2553 * This should never happen; we should always find an 2554 * entry in our bucket. 2555 */ 2556 panic("syn_cache_insert: bucketoverflow: impossible"); 2557 #endif 2558 } else if (syn_cache_count >= tcp_syn_cache_limit) { 2559 tcpstat.tcps_sc_overflowed++; 2560 /* 2561 * The cache is full. Toss the oldest entry in the 2562 * entire cache. This is the front entry in the 2563 * first non-empty timer queue with the largest 2564 * timeout value. 2565 */ 2566 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) { 2567 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2568 if (sc2 == NULL) 2569 continue; 2570 SYN_CACHE_RM(sc2); 2571 SYN_CACHE_PUT(sc2); 2572 goto insert; /* symmetry with above */ 2573 } 2574 #ifdef DIAGNOSTIC 2575 /* 2576 * This should never happen; we should always find an 2577 * entry in the cache. 2578 */ 2579 panic("syn_cache_insert: cache overflow: impossible"); 2580 #endif 2581 } 2582 2583 insert: 2584 /* 2585 * Initialize the entry's timer. 2586 */ 2587 sc->sc_rxttot = 0; 2588 sc->sc_rxtshift = 0; 2589 SYN_CACHE_TIMER_ARM(sc); 2590 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq); 2591 2592 /* Link it from tcpcb entry */ 2593 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 2594 2595 /* Put it into the bucket. 
*/ 2596 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq); 2597 scp->sch_length++; 2598 syn_cache_count++; 2599 2600 tcpstat.tcps_sc_added++; 2601 splx(s); 2602 } 2603 2604 /* 2605 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 2606 * If we have retransmitted an entry the maximum number of times, expire 2607 * that entry. 2608 */ 2609 void 2610 syn_cache_timer() 2611 { 2612 struct syn_cache *sc, *nsc; 2613 int i, s; 2614 2615 s = splsoftnet(); 2616 2617 /* 2618 * First, get all the entries that need to be retransmitted, or 2619 * must be expired due to exceeding the initial keepalive time. 2620 */ 2621 for (i = 0; i < TCP_MAXRXTSHIFT; i++) { 2622 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]); 2623 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2624 sc = nsc) { 2625 nsc = TAILQ_NEXT(sc, sc_timeq); 2626 2627 /* 2628 * Compute the total amount of time this entry has 2629 * been on a queue. If this entry has been on longer 2630 * than the keep alive timer would allow, expire it. 2631 */ 2632 sc->sc_rxttot += sc->sc_rxtcur; 2633 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) { 2634 tcpstat.tcps_sc_timed_out++; 2635 SYN_CACHE_RM(sc); 2636 SYN_CACHE_PUT(sc); 2637 continue; 2638 } 2639 2640 tcpstat.tcps_sc_retransmitted++; 2641 (void) syn_cache_respond(sc, NULL); 2642 2643 /* Advance this entry onto the next timer queue. */ 2644 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq); 2645 sc->sc_rxtshift = i + 1; 2646 SYN_CACHE_TIMER_ARM(sc); 2647 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], 2648 sc, sc_timeq); 2649 } 2650 } 2651 2652 /* 2653 * Now get all the entries that are expired due to too many 2654 * retransmissions. 
2655 */ 2656 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]); 2657 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt); 2658 sc = nsc) { 2659 nsc = TAILQ_NEXT(sc, sc_timeq); 2660 tcpstat.tcps_sc_timed_out++; 2661 SYN_CACHE_RM(sc); 2662 SYN_CACHE_PUT(sc); 2663 } 2664 splx(s); 2665 } 2666 2667 /* 2668 * Remove syn cache created by the specified tcb entry, 2669 * because this does not make sense to keep them 2670 * (if there's no tcb entry, syn cache entry will never be used) 2671 */ 2672 void 2673 syn_cache_cleanup(tp) 2674 struct tcpcb *tp; 2675 { 2676 struct syn_cache *sc, *nsc; 2677 int s; 2678 2679 s = splsoftnet(); 2680 2681 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 2682 nsc = LIST_NEXT(sc, sc_tpq); 2683 2684 #ifdef DIAGNOSTIC 2685 if (sc->sc_tp != tp) 2686 panic("invalid sc_tp in syn_cache_cleanup"); 2687 #endif 2688 SYN_CACHE_RM(sc); 2689 SYN_CACHE_PUT(sc); 2690 } 2691 /* just for safety */ 2692 LIST_INIT(&tp->t_sc); 2693 2694 splx(s); 2695 } 2696 2697 /* 2698 * Find an entry in the syn cache. 2699 */ 2700 struct syn_cache * 2701 syn_cache_lookup(src, dst, headp) 2702 struct sockaddr *src; 2703 struct sockaddr *dst; 2704 struct syn_cache_head **headp; 2705 { 2706 struct syn_cache *sc; 2707 struct syn_cache_head *scp; 2708 u_int32_t hash; 2709 int s; 2710 2711 SYN_HASHALL(hash, src, dst); 2712 2713 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 2714 *headp = scp; 2715 s = splsoftnet(); 2716 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL; 2717 sc = LIST_NEXT(sc, sc_bucketq)) { 2718 if (sc->sc_hash != hash) 2719 continue; 2720 if (!bcmp(&sc->sc_src, src, src->sa_len) && 2721 !bcmp(&sc->sc_dst, dst, dst->sa_len)) { 2722 splx(s); 2723 return (sc); 2724 } 2725 } 2726 splx(s); 2727 return (NULL); 2728 } 2729 2730 /* 2731 * This function gets called when we receive an ACK for a 2732 * socket in the LISTEN state. 
We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 *
 * src and dst are the peer and local addresses of the segment, th its
 * TCP header, tlen its data length as used for the RST sequence number,
 * so the listening socket and m the mbuf chain holding the segment.
 * hlen is not referenced in this function.
 */
struct socket *
syn_cache_get(src, dst, th, hlen, tlen, so, m)
	struct sockaddr *src;
	struct sockaddr *dst;
	struct tcphdr *th;
	unsigned int hlen, tlen;
	struct socket *so;
	struct mbuf *m;
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	register struct inpcb *inp = NULL;
#ifdef INET6
	register struct in6pcb *in6p = NULL;
#endif
	register struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.  The entry stays in the cache in this case.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		(void) syn_cache_respond(sc, m);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	SYN_CACHE_RM(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	{
		struct inpcb *parentinpcb;

		parentinpcb = (struct inpcb *)so->so_pcb;

		/* Keep the listening socket; so becomes the new socket. */
		oso = so;
		so = sonewconn(so, SS_ISCONNECTED);
		if (so == NULL)
			goto resetandabort;

		/* Exactly one of inp / in6p is set, by socket domain. */
		switch (so->so_proto->pr_domain->dom_family) {
		case AF_INET:
			inp = sotoinpcb(so);
			break;
#ifdef INET6
		case AF_INET6:
			in6p = sotoin6pcb(so);
#if 0 /*def INET6*/
			inp->inp_flags |= (parentinpcb->inp_flags &
			    (INP_IPV6 | INP_IPV6_UNDEC | INP_IPV6_MAPPED));
			if ((inp->inp_flags & INP_IPV6) &&
			    !(inp->inp_flags & INP_IPV6_MAPPED)) {
				inp->inp_ipv6.ip6_hlim = parentinpcb->inp_ipv6.ip6_hlim;
				inp->inp_ipv6.ip6_vfc = parentinpcb->inp_ipv6.ip6_vfc;
			}
#endif
			break;
#endif
		}
	}
	/* Fill in the local address and port from the segment's dst. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp) {
			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute();
			in_pcbstate(inp, INP_BOUND);
			/*
			 * No source route on this segment: fall back to
			 * the options saved in the cache entry, taking
			 * ownership of them away from sc.
			 */
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (in6p) {
			/* IPv4 packet to AF_INET6 socket */
			bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr));
			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
				&in6p->in6p_laddr.s6_addr32[3],
				sizeof(((struct sockaddr_in *)dst)->sin_addr));
			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
			in6totcpcb(in6p)->t_family = AF_INET;
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (in6p) {
			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
#if 0
			in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK;
			/*inp->inp_options = ip6_srcroute();*/ /* soon. */
#endif
		}
		break;
#endif
	}
#ifdef INET6
	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct in6pcb *oin6p = sotoin6pcb(oso);
		/* inherit socket options from the listening socket */
		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = 0;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options,
			mtod(m, struct ip6_hdr *), m);
	}
#endif

#ifdef IPSEC
    {
	struct secpolicy *sp;
	/* Copy the listening socket's policy onto the new pcb. */
	if (inp) {
		sp = ipsec_copy_policy(sotoinpcb(oso)->inp_sp);
		if (sp) {
			key_freesp(inp->inp_sp);
			inp->inp_sp = sp;
		} else
			printf("tcp_input: could not copy policy\n");
	}
#ifdef INET6
	else if (in6p) {
		sp = ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp);
		if (sp) {
			key_freesp(in6p->in6p_sp);
			in6p->in6p_sp = sp;
		} else
			printf("tcp_input: could not copy policy\n");
	}
#endif
    }
#endif

	/*
	 * Give the new socket our cached route reference.  Clearing
	 * ro_rt afterwards keeps SYN_CACHE_PUT() from freeing the
	 * route that the pcb now holds.
	 */
	if (inp)
		inp->inp_route = sc->sc_route4;		/* struct assignment */
#ifdef INET6
	else
		in6p->in6p_route = sc->sc_route6;
#endif
	sc->sc_route4.ro_rt = NULL;

	/* Temporary mbuf carrying the peer address for pcbconnect. */
	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL)
		goto resetandabort;
	am->m_len = src->sa_len;
	bcopy(src, mtod(am, caddr_t), src->sa_len);
	if (inp) {
		if (in_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (in6p) {
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			struct sockaddr_in6 *sin6;
			sin6 = mtod(am, struct sockaddr_in6 *);
			am->m_len = sizeof(*sin6);
			bzero(sin6, sizeof(*sin6));
			sin6->sin6_family = AF_INET6;
			sin6->sin6_len = sizeof(*sin6);
			sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
			sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)src)->sin_addr,
				&sin6->sin6_addr.s6_addr32[3],
				sizeof(sin6->sin6_addr.s6_addr32[3]));
		}
		if (in6_pcbconnect(in6p, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
	}
#endif
	else {
		/* Neither pcb kind was set up above; cannot proceed. */
		(void) m_free(am);
		goto resetandabort;
	}
	(void) m_free(am);

	/*
	 * One of inp / in6p is non-NULL here (the else arm above bailed
	 * out otherwise), so tp is dereferenced below without a check.
	 */
	if (inp)
		tp = intotcpcb(inp);
#ifdef INET6
	else if (in6p)
		tp = in6totcpcb(in6p);
#endif
	else
		tp = NULL;
	/* 15 is the marker syn_cache_add() stores for "no window scaling". */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_RCVD_TSTMP;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		/* so is gone already; skip soabort() and the RST. */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
	tcpstat.tcps_accepts++;

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else
		tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss);

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;

	tcpstat.tcps_sc_completed++;
	SYN_CACHE_PUT(sc);
	return (so);

resetandabort:
	/* m is handed to tcp_respond() here; we do not free it ourselves. */
	(void) tcp_respond(NULL, m, m, th,
			   th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
abort:
	if (so != NULL)
		(void) soabort(so);
	SYN_CACHE_PUT(sc);
	tcpstat.tcps_sc_aborted++;
	return ((struct socket *)(-1));
}

/*
 * This function is called when we get a RST for a
 * nonexistent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
3048 */ 3049 3050 void 3051 syn_cache_reset(src, dst, th) 3052 struct sockaddr *src; 3053 struct sockaddr *dst; 3054 struct tcphdr *th; 3055 { 3056 struct syn_cache *sc; 3057 struct syn_cache_head *scp; 3058 int s = splsoftnet(); 3059 3060 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3061 splx(s); 3062 return; 3063 } 3064 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3065 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 3066 splx(s); 3067 return; 3068 } 3069 SYN_CACHE_RM(sc); 3070 splx(s); 3071 tcpstat.tcps_sc_reset++; 3072 SYN_CACHE_PUT(sc); 3073 } 3074 3075 void 3076 syn_cache_unreach(src, dst, th) 3077 struct sockaddr *src; 3078 struct sockaddr *dst; 3079 struct tcphdr *th; 3080 { 3081 struct syn_cache *sc; 3082 struct syn_cache_head *scp; 3083 int s; 3084 3085 s = splsoftnet(); 3086 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3087 splx(s); 3088 return; 3089 } 3090 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3091 if (ntohl (th->th_seq) != sc->sc_iss) { 3092 splx(s); 3093 return; 3094 } 3095 3096 /* 3097 * If we've rertransmitted 3 times and this is our second error, 3098 * we remove the entry. Otherwise, we allow it to continue on. 3099 * This prevents us from incorrectly nuking an entry during a 3100 * spurious network outage. 3101 * 3102 * See tcp_notify(). 3103 */ 3104 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 3105 sc->sc_flags |= SCF_UNREACH; 3106 splx(s); 3107 return; 3108 } 3109 3110 SYN_CACHE_RM(sc); 3111 splx(s); 3112 tcpstat.tcps_sc_unreach++; 3113 SYN_CACHE_PUT(sc); 3114 } 3115 3116 /* 3117 * Given a LISTEN socket and an inbound SYN request, add 3118 * this to the syn cache, and send back a segment: 3119 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3120 * to the source. 3121 * 3122 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3123 * Doing so would require that we hold onto the data and deliver it 3124 * to the application. 
However, if we are the target of a SYN-flood 3125 * DoS attack, an attacker could send data which would eventually 3126 * consume all available buffer space if it were ACKed. By not ACKing 3127 * the data, we avoid this DoS scenario. 3128 */ 3129 3130 int 3131 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi) 3132 struct sockaddr *src; 3133 struct sockaddr *dst; 3134 struct tcphdr *th; 3135 unsigned int hlen; 3136 struct socket *so; 3137 struct mbuf *m; 3138 u_char *optp; 3139 int optlen; 3140 struct tcp_opt_info *oi; 3141 { 3142 struct tcpcb tb, *tp; 3143 long win; 3144 struct syn_cache *sc; 3145 struct syn_cache_head *scp; 3146 struct mbuf *ipopts; 3147 3148 tp = sototcpcb(so); 3149 3150 /* 3151 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3152 * in_broadcast() should never return true on a received 3153 * packet with M_BCAST not set. 3154 */ 3155 if (m->m_flags & (M_BCAST|M_MCAST)) 3156 return 0; 3157 #ifdef INET6 3158 if (m->m_flags & M_ANYCAST6) 3159 return 0; 3160 #endif 3161 3162 switch (src->sa_family) { 3163 case AF_INET: 3164 if (IN_MULTICAST(((struct sockaddr_in *)src)->sin_addr.s_addr) 3165 || IN_MULTICAST(((struct sockaddr_in *)dst)->sin_addr.s_addr)) 3166 return 0; 3167 break; 3168 #ifdef INET6 3169 case AF_INET6: 3170 if (IN6_IS_ADDR_MULTICAST(&((struct sockaddr_in6 *)src)->sin6_addr) 3171 || IN6_IS_ADDR_MULTICAST(&((struct sockaddr_in6 *)dst)->sin6_addr)) 3172 return 0; 3173 break; 3174 #endif 3175 default: 3176 return 0; 3177 } 3178 3179 /* 3180 * Initialize some local state. 3181 */ 3182 win = sbspace(&so->so_rcv); 3183 if (win > TCP_MAXWIN) 3184 win = TCP_MAXWIN; 3185 3186 if (src->sa_family == AF_INET) { 3187 /* 3188 * Remember the IP options, if any. 3189 */ 3190 ipopts = ip_srcroute(); 3191 } else 3192 ipopts = NULL; 3193 3194 if (optp) { 3195 tb.t_flags = tcp_do_rfc1323 ? 
(TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3196 tcp_dooptions(&tb, optp, optlen, th, oi); 3197 } else 3198 tb.t_flags = 0; 3199 3200 /* 3201 * See if we already have an entry for this connection. 3202 * If we do, resend the SYN,ACK. We do not count this 3203 * as a retransmission (XXX though maybe we should). 3204 */ 3205 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 3206 tcpstat.tcps_sc_dupesyn++; 3207 if (ipopts) { 3208 /* 3209 * If we were remembering a previous source route, 3210 * forget it and use the new one we've been given. 3211 */ 3212 if (sc->sc_ipopts) 3213 (void) m_free(sc->sc_ipopts); 3214 sc->sc_ipopts = ipopts; 3215 } 3216 sc->sc_timestamp = tb.ts_recent; 3217 if (syn_cache_respond(sc, m) == 0) { 3218 tcpstat.tcps_sndacks++; 3219 tcpstat.tcps_sndtotal++; 3220 } 3221 return (1); 3222 } 3223 3224 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 3225 if (sc == NULL) { 3226 if (ipopts) 3227 (void) m_free(ipopts); 3228 return (0); 3229 } 3230 3231 /* 3232 * Fill in the cache, and put the necessary IP and TCP 3233 * options into the reply. 3234 */ 3235 bzero(sc, sizeof(struct syn_cache)); 3236 bcopy(src, &sc->sc_src, src->sa_len); 3237 bcopy(dst, &sc->sc_dst, dst->sa_len); 3238 sc->sc_flags = 0; 3239 sc->sc_ipopts = ipopts; 3240 sc->sc_irs = th->th_seq; 3241 sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0); 3242 sc->sc_peermaxseg = oi->maxseg; 3243 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 
3244 m->m_pkthdr.rcvif : NULL, 3245 sc->sc_src.sa.sa_family); 3246 sc->sc_win = win; 3247 sc->sc_timestamp = tb.ts_recent; 3248 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) 3249 sc->sc_flags |= SCF_TIMESTAMP; 3250 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3251 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3252 sc->sc_requested_s_scale = tb.requested_s_scale; 3253 sc->sc_request_r_scale = 0; 3254 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3255 TCP_MAXWIN << sc->sc_request_r_scale < 3256 so->so_rcv.sb_hiwat) 3257 sc->sc_request_r_scale++; 3258 } else { 3259 sc->sc_requested_s_scale = 15; 3260 sc->sc_request_r_scale = 15; 3261 } 3262 sc->sc_tp = tp; 3263 if (syn_cache_respond(sc, m) == 0) { 3264 syn_cache_insert(sc, tp); 3265 tcpstat.tcps_sndacks++; 3266 tcpstat.tcps_sndtotal++; 3267 } else { 3268 SYN_CACHE_PUT(sc); 3269 tcpstat.tcps_sc_dropped++; 3270 } 3271 return (1); 3272 } 3273 3274 int 3275 syn_cache_respond(sc, m) 3276 struct syn_cache *sc; 3277 struct mbuf *m; 3278 { 3279 struct route *ro; 3280 struct rtentry *rt; 3281 u_int8_t *optp; 3282 int optlen, error; 3283 u_int16_t tlen; 3284 struct ip *ip = NULL; 3285 #ifdef INET6 3286 struct ip6_hdr *ip6 = NULL; 3287 #endif 3288 struct tcphdr *th; 3289 u_int hlen; 3290 3291 switch (sc->sc_src.sa.sa_family) { 3292 case AF_INET: 3293 hlen = sizeof(struct ip); 3294 ro = &sc->sc_route4; 3295 break; 3296 #ifdef INET6 3297 case AF_INET6: 3298 hlen = sizeof(struct ip6_hdr); 3299 ro = (struct route *)&sc->sc_route6; 3300 break; 3301 #endif 3302 default: 3303 if (m) 3304 m_freem(m); 3305 return EAFNOSUPPORT; 3306 } 3307 3308 /* Compute the size of the TCP options. */ 3309 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3310 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3311 3312 tlen = hlen + sizeof(struct tcphdr) + optlen; 3313 3314 /* 3315 * Create the IP+TCP header from scratch. Reuse the received mbuf 3316 * if possible. 
3317 */ 3318 if (m != NULL) { 3319 m_freem(m->m_next); 3320 m->m_next = NULL; 3321 MRESETDATA(m); 3322 } else { 3323 MGETHDR(m, M_DONTWAIT, MT_DATA); 3324 if (m == NULL) 3325 return (ENOBUFS); 3326 } 3327 3328 /* Fixup the mbuf. */ 3329 m->m_data += max_linkhdr; 3330 m->m_len = m->m_pkthdr.len = tlen; 3331 #ifdef IPSEC 3332 if (sc->sc_tp) { 3333 struct tcpcb *tp; 3334 struct socket *so; 3335 3336 tp = sc->sc_tp; 3337 if (tp->t_inpcb) 3338 so = tp->t_inpcb->inp_socket; 3339 #ifdef INET6 3340 else if (tp->t_in6pcb) 3341 so = tp->t_in6pcb->in6p_socket; 3342 #endif 3343 else 3344 so = NULL; 3345 /* use IPsec policy on listening socket, on SYN ACK */ 3346 m->m_pkthdr.rcvif = (struct ifnet *)so; 3347 } 3348 #else 3349 m->m_pkthdr.rcvif = NULL; 3350 #endif 3351 memset(mtod(m, u_char *), 0, tlen); 3352 3353 switch (sc->sc_src.sa.sa_family) { 3354 case AF_INET: 3355 ip = mtod(m, struct ip *); 3356 ip->ip_dst = sc->sc_src.sin.sin_addr; 3357 ip->ip_src = sc->sc_dst.sin.sin_addr; 3358 ip->ip_p = IPPROTO_TCP; 3359 th = (struct tcphdr *)(ip + 1); 3360 th->th_dport = sc->sc_src.sin.sin_port; 3361 th->th_sport = sc->sc_dst.sin.sin_port; 3362 break; 3363 #ifdef INET6 3364 case AF_INET6: 3365 ip6 = mtod(m, struct ip6_hdr *); 3366 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 3367 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 3368 ip6->ip6_nxt = IPPROTO_TCP; 3369 /* ip6_plen will be updated in ip6_output() */ 3370 th = (struct tcphdr *)(ip6 + 1); 3371 th->th_dport = sc->sc_src.sin6.sin6_port; 3372 th->th_sport = sc->sc_dst.sin6.sin6_port; 3373 break; 3374 #endif 3375 default: 3376 th = NULL; 3377 } 3378 3379 th->th_seq = htonl(sc->sc_iss); 3380 th->th_ack = htonl(sc->sc_irs + 1); 3381 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 3382 th->th_flags = TH_SYN|TH_ACK; 3383 th->th_win = htons(sc->sc_win); 3384 /* th_sum already 0 */ 3385 /* th_urp already 0 */ 3386 3387 /* Tack on the TCP options. 
*/ 3388 optp = (u_int8_t *)(th + 1); 3389 *optp++ = TCPOPT_MAXSEG; 3390 *optp++ = 4; 3391 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 3392 *optp++ = sc->sc_ourmaxseg & 0xff; 3393 3394 if (sc->sc_request_r_scale != 15) { 3395 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 3396 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 3397 sc->sc_request_r_scale); 3398 optp += 4; 3399 } 3400 3401 if (sc->sc_flags & SCF_TIMESTAMP) { 3402 u_int32_t *lp = (u_int32_t *)(optp); 3403 /* Form timestamp option as shown in appendix A of RFC 1323. */ 3404 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 3405 *lp++ = htonl(tcp_now); 3406 *lp = htonl(sc->sc_timestamp); 3407 optp += TCPOLEN_TSTAMP_APPA; 3408 } 3409 3410 /* Compute the packet's checksum. */ 3411 switch (sc->sc_src.sa.sa_family) { 3412 case AF_INET: 3413 ip->ip_len = htons(tlen - hlen); 3414 th->th_sum = 0; 3415 th->th_sum = in_cksum(m, tlen); 3416 break; 3417 #ifdef INET6 3418 case AF_INET6: 3419 ip6->ip6_plen = htons(tlen - hlen); 3420 th->th_sum = 0; 3421 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 3422 break; 3423 #endif 3424 } 3425 3426 /* 3427 * Fill in some straggling IP bits. Note the stack expects 3428 * ip_len to be in host order, for convenience. 3429 */ 3430 switch (sc->sc_src.sa.sa_family) { 3431 case AF_INET: 3432 ip->ip_len = tlen; 3433 ip->ip_ttl = ip_defttl; 3434 /* XXX tos? */ 3435 break; 3436 #ifdef INET6 3437 case AF_INET6: 3438 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 3439 ip6->ip6_vfc |= IPV6_VERSION; 3440 ip6->ip6_plen = htons(tlen - hlen); 3441 /* ip6_hlim will be initialized afterwards */ 3442 /* XXX flowlabel? */ 3443 break; 3444 #endif 3445 } 3446 3447 /* 3448 * If we're doing Path MTU discovery, we need to set DF unless 3449 * the route's MTU is locked. If we don't yet know the route, 3450 * look it up now. We will copy this reference to the inpcb 3451 * when we finish creating the connection. 
3452 */ 3453 if ((rt = ro->ro_rt) == NULL || (rt->rt_flags & RTF_UP) == 0) { 3454 if (ro->ro_rt != NULL) { 3455 RTFREE(ro->ro_rt); 3456 ro->ro_rt = NULL; 3457 } 3458 bcopy(&sc->sc_src, &ro->ro_dst, sc->sc_src.sa.sa_len); 3459 rtalloc(ro); 3460 if ((rt = ro->ro_rt) == NULL) { 3461 m_freem(m); 3462 switch (sc->sc_src.sa.sa_family) { 3463 case AF_INET: 3464 ipstat.ips_noroute++; 3465 break; 3466 #ifdef INET6 3467 case AF_INET6: 3468 ip6stat.ip6s_noroute++; 3469 break; 3470 #endif 3471 } 3472 return (EHOSTUNREACH); 3473 } 3474 } 3475 3476 switch (sc->sc_src.sa.sa_family) { 3477 case AF_INET: 3478 if (ip_mtudisc != 0 && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0) 3479 ip->ip_off |= IP_DF; 3480 3481 /* ...and send it off! */ 3482 error = ip_output(m, sc->sc_ipopts, ro, 0, NULL); 3483 break; 3484 #ifdef INET6 3485 case AF_INET6: 3486 ip6->ip6_hlim = in6_selecthlim(NULL, 3487 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 3488 3489 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 3490 0, NULL, NULL); 3491 break; 3492 #endif 3493 default: 3494 error = EAFNOSUPPORT; 3495 break; 3496 } 3497 return (error); 3498 } 3499