1 /* $NetBSD: tcp_input.c,v 1.350 2016/12/08 05:16:33 ozaki-r Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 34 * 35 * NRL grants permission for redistribution and use in source and binary 36 * forms, with or without modification, of the software and documentation 37 * created at NRL provided that the following conditions are met: 38 * 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgements: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * This product includes software developed at the Information 49 * Technology Division, US Naval Research Laboratory. 50 * 4. Neither the name of the NRL nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 65 * 66 * The views and conclusions contained in the software and documentation 67 * are those of the authors and should not be interpreted as representing 68 * official policies, either expressed or implied, of the US Naval 69 * Research Laboratory (NRL). 70 */ 71 72 /*- 73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, 74 * 2011 The NetBSD Foundation, Inc. 75 * All rights reserved. 76 * 77 * This code is derived from software contributed to The NetBSD Foundation 78 * by Coyote Point Systems, Inc. 79 * This code is derived from software contributed to The NetBSD Foundation 80 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 81 * Facility, NASA Ames Research Center. 82 * This code is derived from software contributed to The NetBSD Foundation 83 * by Charles M. Hannum. 84 * This code is derived from software contributed to The NetBSD Foundation 85 * by Rui Paulo. 86 * 87 * Redistribution and use in source and binary forms, with or without 88 * modification, are permitted provided that the following conditions 89 * are met: 90 * 1. Redistributions of source code must retain the above copyright 91 * notice, this list of conditions and the following disclaimer. 92 * 2. Redistributions in binary form must reproduce the above copyright 93 * notice, this list of conditions and the following disclaimer in the 94 * documentation and/or other materials provided with the distribution. 95 * 96 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 97 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 98 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 99 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 100 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 101 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 102 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 103 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 104 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 105 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 106 * POSSIBILITY OF SUCH DAMAGE. 107 */ 108 109 /* 110 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 111 * The Regents of the University of California. All rights reserved. 112 * 113 * Redistribution and use in source and binary forms, with or without 114 * modification, are permitted provided that the following conditions 115 * are met: 116 * 1. Redistributions of source code must retain the above copyright 117 * notice, this list of conditions and the following disclaimer. 118 * 2. Redistributions in binary form must reproduce the above copyright 119 * notice, this list of conditions and the following disclaimer in the 120 * documentation and/or other materials provided with the distribution. 121 * 3. Neither the name of the University nor the names of its contributors 122 * may be used to endorse or promote products derived from this software 123 * without specific prior written permission. 124 * 125 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 126 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 127 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 128 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 129 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 130 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 131 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 132 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 133 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 134 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 135 * SUCH DAMAGE. 136 * 137 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 138 */ 139 140 /* 141 * TODO list for SYN cache stuff: 142 * 143 * Find room for a "state" field, which is needed to keep a 144 * compressed state for TIME_WAIT TCBs. It's been noted already 145 * that this is fairly important for very high-volume web and 146 * mail servers, which use a large number of short-lived 147 * connections. 148 */ 149 150 #include <sys/cdefs.h> 151 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.350 2016/12/08 05:16:33 ozaki-r Exp $"); 152 153 #ifdef _KERNEL_OPT 154 #include "opt_inet.h" 155 #include "opt_ipsec.h" 156 #include "opt_inet_csum.h" 157 #include "opt_tcp_debug.h" 158 #endif 159 160 #include <sys/param.h> 161 #include <sys/systm.h> 162 #include <sys/malloc.h> 163 #include <sys/mbuf.h> 164 #include <sys/protosw.h> 165 #include <sys/socket.h> 166 #include <sys/socketvar.h> 167 #include <sys/errno.h> 168 #include <sys/syslog.h> 169 #include <sys/pool.h> 170 #include <sys/domain.h> 171 #include <sys/kernel.h> 172 #ifdef TCP_SIGNATURE 173 #include <sys/md5.h> 174 #endif 175 #include <sys/lwp.h> /* for lwp0 */ 176 #include <sys/cprng.h> 177 178 #include <net/if.h> 179 #include <net/if_types.h> 180 181 #include <netinet/in.h> 182 #include <netinet/in_systm.h> 183 #include <netinet/ip.h> 184 #include <netinet/in_pcb.h> 185 #include <netinet/in_var.h> 186 #include <netinet/ip_var.h> 187 #include <netinet/in_offload.h> 188 189 #ifdef INET6 190 #ifndef INET 191 #include <netinet/in.h> 192 #endif 193 #include <netinet/ip6.h> 194 #include <netinet6/ip6_var.h> 195 #include <netinet6/in6_pcb.h> 196 #include <netinet6/ip6_var.h> 197 #include <netinet6/in6_var.h> 198 #include <netinet/icmp6.h> 199 #include <netinet6/nd6.h> 200 #ifdef TCP_SIGNATURE 201 #include <netinet6/scope6_var.h> 202 #endif 203 #endif 204 205 #ifndef INET6 206 /* always need ip6.h for IP6_EXTHDR_GET */ 207 #include <netinet/ip6.h> 208 #endif 209 210 #include <netinet/tcp.h> 211 #include <netinet/tcp_fsm.h> 212 #include <netinet/tcp_seq.h> 213 #include <netinet/tcp_timer.h> 214 #include <netinet/tcp_var.h> 215 #include <netinet/tcp_private.h> 216 #include <netinet/tcpip.h> 217 #include <netinet/tcp_congctl.h> 218 #include <netinet/tcp_debug.h> 219 220 #ifdef INET6 221 #include "faith.h" 222 #if defined(NFAITH) && NFAITH > 0 223 #include <net/if_faith.h> 224 #endif 225 #endif /* INET6 */ 226 227 #ifdef IPSEC 228 #include <netipsec/ipsec.h> 229 #include <netipsec/ipsec_var.h> 230 #include <netipsec/ipsec_private.h> 231 #include <netipsec/key.h> 232 #ifdef INET6 233 #include <netipsec/ipsec6.h> 234 #endif 235 #endif /* IPSEC*/ 236 237 #include <netinet/tcp_vtw.h> 238 239 int tcprexmtthresh = 3; 240 int tcp_log_refused; 241 242 int tcp_do_autorcvbuf = 1; 243 int tcp_autorcvbuf_inc = 16 * 1024; 244 int tcp_autorcvbuf_max = 256 * 1024; 245 int tcp_msl = (TCPTV_MSL / PR_SLOWHZ); 246 247 static int tcp_rst_ppslim_count = 0; 248 static struct timeval tcp_rst_ppslim_last; 249 static int tcp_ackdrop_ppslim_count = 0; 250 static struct timeval tcp_ackdrop_ppslim_last; 251 252 #define TCP_PAWS_IDLE (24U * 24 * 60 * 60 * PR_SLOWHZ) 253 254 /* for modulo comparisons of timestamps */ 255 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 256 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 257 258 /* 259 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 260 */ 261 #ifdef INET6 262 static inline void 263 nd6_hint(struct tcpcb *tp) 264 { 265 struct rtentry *rt = NULL; 266 267 if (tp != NULL && tp->t_in6pcb != NULL && tp->t_family == AF_INET6 && 268 (rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL) 269 nd6_nud_hint(rt); 270 rtcache_unref(rt, &tp->t_in6pcb->in6p_route); 271 } 272 #else 273 static inline void 274 nd6_hint(struct tcpcb *tp) 275 { 276 } 277 #endif 278 279 /* 280 * Compute ACK transmission behavior. Delay the ACK unless 281 * we have already delayed an ACK (must send an ACK every two segments). 282 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 283 * option is enabled. 284 */ 285 static void 286 tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th) 287 { 288 289 if (tp->t_flags & TF_DELACK || 290 (tcp_ack_on_push && th->th_flags & TH_PUSH)) 291 tp->t_flags |= TF_ACKNOW; 292 else 293 TCP_SET_DELACK(tp); 294 } 295 296 static void 297 icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked) 298 { 299 300 /* 301 * If we had a pending ICMP message that refers to data that have 302 * just been acknowledged, disregard the recorded ICMP message. 303 */ 304 if ((tp->t_flags & TF_PMTUD_PEND) && 305 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 306 tp->t_flags &= ~TF_PMTUD_PEND; 307 308 /* 309 * Keep track of the largest chunk of data 310 * acknowledged since last PMTU update 311 */ 312 if (tp->t_pmtud_mss_acked < acked) 313 tp->t_pmtud_mss_acked = acked; 314 } 315 316 /* 317 * Convert TCP protocol fields to host order for easier processing. 318 */ 319 static void 320 tcp_fields_to_host(struct tcphdr *th) 321 { 322 323 NTOHL(th->th_seq); 324 NTOHL(th->th_ack); 325 NTOHS(th->th_win); 326 NTOHS(th->th_urp); 327 } 328 329 /* 330 * ... and reverse the above. 331 */ 332 static void 333 tcp_fields_to_net(struct tcphdr *th) 334 { 335 336 HTONL(th->th_seq); 337 HTONL(th->th_ack); 338 HTONS(th->th_win); 339 HTONS(th->th_urp); 340 } 341 342 #ifdef TCP_CSUM_COUNTERS 343 #include <sys/device.h> 344 345 #if defined(INET) 346 extern struct evcnt tcp_hwcsum_ok; 347 extern struct evcnt tcp_hwcsum_bad; 348 extern struct evcnt tcp_hwcsum_data; 349 extern struct evcnt tcp_swcsum; 350 #endif /* defined(INET) */ 351 #if defined(INET6) 352 extern struct evcnt tcp6_hwcsum_ok; 353 extern struct evcnt tcp6_hwcsum_bad; 354 extern struct evcnt tcp6_hwcsum_data; 355 extern struct evcnt tcp6_swcsum; 356 #endif /* defined(INET6) */ 357 358 #define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 359 360 #else 361 362 #define TCP_CSUM_COUNTER_INCR(ev) /* nothing */ 363 364 #endif /* TCP_CSUM_COUNTERS */ 365 366 #ifdef TCP_REASS_COUNTERS 367 #include <sys/device.h> 368 369 extern struct evcnt tcp_reass_; 370 extern struct evcnt tcp_reass_empty; 371 extern struct evcnt tcp_reass_iteration[8]; 372 extern struct evcnt tcp_reass_prependfirst; 373 extern struct evcnt tcp_reass_prepend; 374 extern struct evcnt tcp_reass_insert; 375 extern struct evcnt tcp_reass_inserttail; 376 extern struct evcnt tcp_reass_append; 377 extern struct evcnt tcp_reass_appendtail; 378 extern struct evcnt tcp_reass_overlaptail; 379 extern struct evcnt tcp_reass_overlapfront; 380 extern struct evcnt tcp_reass_segdup; 381 extern struct evcnt tcp_reass_fragdup; 382 383 #define TCP_REASS_COUNTER_INCR(ev) (ev)->ev_count++ 384 385 #else 386 387 #define TCP_REASS_COUNTER_INCR(ev) /* nothing */ 388 389 #endif /* TCP_REASS_COUNTERS */ 390 391 static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *, 392 int *); 393 static int tcp_dooptions(struct tcpcb *, const u_char *, int, 394 struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *); 395 396 #ifdef INET 397 static void tcp4_log_refused(const struct ip *, const struct tcphdr *); 398 #endif 399 #ifdef INET6 400 static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *); 401 #endif 402 403 #define TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next 404 405 #if defined(MBUFTRACE) 406 struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass"); 407 #endif /* defined(MBUFTRACE) */ 408 409 static struct pool tcpipqent_pool; 410 411 void 412 tcpipqent_init(void) 413 { 414 415 pool_init(&tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl", 416 NULL, IPL_VM); 417 } 418 419 struct ipqent * 420 tcpipqent_alloc(void) 421 { 422 struct ipqent *ipqe; 423 int s; 424 425 s = splvm(); 426 ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT); 427 splx(s); 428 429 return ipqe; 430 } 431 432 void 433 tcpipqent_free(struct ipqent *ipqe) 434 { 435 int s; 436 437 s = splvm(); 438 pool_put(&tcpipqent_pool, ipqe); 439 splx(s); 440 } 441 442 static int 443 tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int *tlen) 444 { 445 struct ipqent *p, *q, *nq, *tiqe = NULL; 446 struct socket *so = NULL; 447 int pkt_flags; 448 tcp_seq pkt_seq; 449 unsigned pkt_len; 450 u_long rcvpartdupbyte = 0; 451 u_long rcvoobyte; 452 #ifdef TCP_REASS_COUNTERS 453 u_int count = 0; 454 #endif 455 uint64_t *tcps; 456 457 if (tp->t_inpcb) 458 so = tp->t_inpcb->inp_socket; 459 #ifdef INET6 460 else if (tp->t_in6pcb) 461 so = tp->t_in6pcb->in6p_socket; 462 #endif 463 464 TCP_REASS_LOCK_CHECK(tp); 465 466 /* 467 * Call with th==0 after become established to 468 * force pre-ESTABLISHED data up to user socket. 469 */ 470 if (th == 0) 471 goto present; 472 473 m_claimm(m, &tcp_reass_mowner); 474 475 rcvoobyte = *tlen; 476 /* 477 * Copy these to local variables because the tcpiphdr 478 * gets munged while we are collapsing mbufs. 479 */ 480 pkt_seq = th->th_seq; 481 pkt_len = *tlen; 482 pkt_flags = th->th_flags; 483 484 TCP_REASS_COUNTER_INCR(&tcp_reass_); 485 486 if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) { 487 /* 488 * When we miss a packet, the vast majority of time we get 489 * packets that follow it in order. So optimize for that. 490 */ 491 if (pkt_seq == p->ipqe_seq + p->ipqe_len) { 492 p->ipqe_len += pkt_len; 493 p->ipqe_flags |= pkt_flags; 494 m_cat(p->ipre_mlast, m); 495 TRAVERSE(p->ipre_mlast); 496 m = NULL; 497 tiqe = p; 498 TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq); 499 TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail); 500 goto skip_replacement; 501 } 502 /* 503 * While we're here, if the pkt is completely beyond 504 * anything we have, just insert it at the tail. 505 */ 506 if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) { 507 TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail); 508 goto insert_it; 509 } 510 } 511 512 q = TAILQ_FIRST(&tp->segq); 513 514 if (q != NULL) { 515 /* 516 * If this segment immediately precedes the first out-of-order 517 * block, simply slap the segment in front of it and (mostly) 518 * skip the complicated logic. 519 */ 520 if (pkt_seq + pkt_len == q->ipqe_seq) { 521 q->ipqe_seq = pkt_seq; 522 q->ipqe_len += pkt_len; 523 q->ipqe_flags |= pkt_flags; 524 m_cat(m, q->ipqe_m); 525 q->ipqe_m = m; 526 q->ipre_mlast = m; /* last mbuf may have changed */ 527 TRAVERSE(q->ipre_mlast); 528 tiqe = q; 529 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 530 TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst); 531 goto skip_replacement; 532 } 533 } else { 534 TCP_REASS_COUNTER_INCR(&tcp_reass_empty); 535 } 536 537 /* 538 * Find a segment which begins after this one does. 539 */ 540 for (p = NULL; q != NULL; q = nq) { 541 nq = TAILQ_NEXT(q, ipqe_q); 542 #ifdef TCP_REASS_COUNTERS 543 count++; 544 #endif 545 /* 546 * If the received segment is just right after this 547 * fragment, merge the two together and then check 548 * for further overlaps. 549 */ 550 if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 551 #ifdef TCPREASS_DEBUG 552 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n", 553 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 554 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len); 555 #endif 556 pkt_len += q->ipqe_len; 557 pkt_flags |= q->ipqe_flags; 558 pkt_seq = q->ipqe_seq; 559 m_cat(q->ipre_mlast, m); 560 TRAVERSE(q->ipre_mlast); 561 m = q->ipqe_m; 562 TCP_REASS_COUNTER_INCR(&tcp_reass_append); 563 goto free_ipqe; 564 } 565 /* 566 * If the received segment is completely past this 567 * fragment, we need to go the next fragment. 568 */ 569 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 570 p = q; 571 continue; 572 } 573 /* 574 * If the fragment is past the received segment, 575 * it (or any following) can't be concatenated. 576 */ 577 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) { 578 TCP_REASS_COUNTER_INCR(&tcp_reass_insert); 579 break; 580 } 581 582 /* 583 * We've received all the data in this segment before. 584 * mark it as a duplicate and return. 585 */ 586 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 587 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 588 tcps = TCP_STAT_GETREF(); 589 tcps[TCP_STAT_RCVDUPPACK]++; 590 tcps[TCP_STAT_RCVDUPBYTE] += pkt_len; 591 TCP_STAT_PUTREF(); 592 tcp_new_dsack(tp, pkt_seq, pkt_len); 593 m_freem(m); 594 if (tiqe != NULL) { 595 tcpipqent_free(tiqe); 596 } 597 TCP_REASS_COUNTER_INCR(&tcp_reass_segdup); 598 goto out; 599 } 600 /* 601 * Received segment completely overlaps this fragment 602 * so we drop the fragment (this keeps the temporal 603 * ordering of segments correct). 604 */ 605 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 606 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 607 rcvpartdupbyte += q->ipqe_len; 608 m_freem(q->ipqe_m); 609 TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup); 610 goto free_ipqe; 611 } 612 /* 613 * RX'ed segment extends past the end of the 614 * fragment. Drop the overlapping bytes. Then 615 * merge the fragment and segment then treat as 616 * a longer received packet. 617 */ 618 if (SEQ_LT(q->ipqe_seq, pkt_seq) && 619 SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 620 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 621 #ifdef TCPREASS_DEBUG 622 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n", 623 tp, overlap, 624 pkt_seq, pkt_seq + pkt_len, pkt_len); 625 #endif 626 m_adj(m, overlap); 627 rcvpartdupbyte += overlap; 628 m_cat(q->ipre_mlast, m); 629 TRAVERSE(q->ipre_mlast); 630 m = q->ipqe_m; 631 pkt_seq = q->ipqe_seq; 632 pkt_len += q->ipqe_len - overlap; 633 rcvoobyte -= overlap; 634 TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail); 635 goto free_ipqe; 636 } 637 /* 638 * RX'ed segment extends past the front of the 639 * fragment. Drop the overlapping bytes on the 640 * received packet. The packet will then be 641 * contatentated with this fragment a bit later. 642 */ 643 if (SEQ_GT(q->ipqe_seq, pkt_seq) && 644 SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 645 int overlap = pkt_seq + pkt_len - q->ipqe_seq; 646 #ifdef TCPREASS_DEBUG 647 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n", 648 tp, overlap, 649 pkt_seq, pkt_seq + pkt_len, pkt_len); 650 #endif 651 m_adj(m, -overlap); 652 pkt_len -= overlap; 653 rcvpartdupbyte += overlap; 654 TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront); 655 rcvoobyte -= overlap; 656 } 657 /* 658 * If the received segment immediates precedes this 659 * fragment then tack the fragment onto this segment 660 * and reinsert the data. 661 */ 662 if (q->ipqe_seq == pkt_seq + pkt_len) { 663 #ifdef TCPREASS_DEBUG 664 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n", 665 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len, 666 pkt_seq, pkt_seq + pkt_len, pkt_len); 667 #endif 668 pkt_len += q->ipqe_len; 669 pkt_flags |= q->ipqe_flags; 670 m_cat(m, q->ipqe_m); 671 TAILQ_REMOVE(&tp->segq, q, ipqe_q); 672 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 673 tp->t_segqlen--; 674 KASSERT(tp->t_segqlen >= 0); 675 KASSERT(tp->t_segqlen != 0 || 676 (TAILQ_EMPTY(&tp->segq) && 677 TAILQ_EMPTY(&tp->timeq))); 678 if (tiqe == NULL) { 679 tiqe = q; 680 } else { 681 tcpipqent_free(q); 682 } 683 TCP_REASS_COUNTER_INCR(&tcp_reass_prepend); 684 break; 685 } 686 /* 687 * If the fragment is before the segment, remember it. 688 * When this loop is terminated, p will contain the 689 * pointer to fragment that is right before the received 690 * segment. 691 */ 692 if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 693 p = q; 694 695 continue; 696 697 /* 698 * This is a common operation. It also will allow 699 * to save doing a malloc/free in most instances. 700 */ 701 free_ipqe: 702 TAILQ_REMOVE(&tp->segq, q, ipqe_q); 703 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 704 tp->t_segqlen--; 705 KASSERT(tp->t_segqlen >= 0); 706 KASSERT(tp->t_segqlen != 0 || 707 (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq))); 708 if (tiqe == NULL) { 709 tiqe = q; 710 } else { 711 tcpipqent_free(q); 712 } 713 } 714 715 #ifdef TCP_REASS_COUNTERS 716 if (count > 7) 717 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]); 718 else if (count > 0) 719 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]); 720 #endif 721 722 insert_it: 723 724 /* 725 * Allocate a new queue entry since the received segment did not 726 * collapse onto any other out-of-order block; thus we are allocating 727 * a new block. If it had collapsed, tiqe would not be NULL and 728 * we would be reusing it. 729 * XXX If we can't, just drop the packet. XXX 730 */ 731 if (tiqe == NULL) { 732 tiqe = tcpipqent_alloc(); 733 if (tiqe == NULL) { 734 TCP_STATINC(TCP_STAT_RCVMEMDROP); 735 m_freem(m); 736 goto out; 737 } 738 } 739 740 /* 741 * Update the counters. 742 */ 743 tp->t_rcvoopack++; 744 tcps = TCP_STAT_GETREF(); 745 tcps[TCP_STAT_RCVOOPACK]++; 746 tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte; 747 if (rcvpartdupbyte) { 748 tcps[TCP_STAT_RCVPARTDUPPACK]++; 749 tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte; 750 } 751 TCP_STAT_PUTREF(); 752 753 /* 754 * Insert the new fragment queue entry into both queues. 755 */ 756 tiqe->ipqe_m = m; 757 tiqe->ipre_mlast = m; 758 tiqe->ipqe_seq = pkt_seq; 759 tiqe->ipqe_len = pkt_len; 760 tiqe->ipqe_flags = pkt_flags; 761 if (p == NULL) { 762 TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 763 #ifdef TCPREASS_DEBUG 764 if (tiqe->ipqe_seq != tp->rcv_nxt) 765 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n", 766 tp, pkt_seq, pkt_seq + pkt_len, pkt_len); 767 #endif 768 } else { 769 TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q); 770 #ifdef TCPREASS_DEBUG 771 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n", 772 tp, pkt_seq, pkt_seq + pkt_len, pkt_len, 773 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len); 774 #endif 775 } 776 tp->t_segqlen++; 777 778 skip_replacement: 779 780 TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 781 782 present: 783 /* 784 * Present data to user, advancing rcv_nxt through 785 * completed sequence space. 786 */ 787 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 788 goto out; 789 q = TAILQ_FIRST(&tp->segq); 790 if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 791 goto out; 792 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 793 goto out; 794 795 tp->rcv_nxt += q->ipqe_len; 796 pkt_flags = q->ipqe_flags & TH_FIN; 797 nd6_hint(tp); 798 799 TAILQ_REMOVE(&tp->segq, q, ipqe_q); 800 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 801 tp->t_segqlen--; 802 KASSERT(tp->t_segqlen >= 0); 803 KASSERT(tp->t_segqlen != 0 || 804 (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq))); 805 if (so->so_state & SS_CANTRCVMORE) 806 m_freem(q->ipqe_m); 807 else 808 sbappendstream(&so->so_rcv, q->ipqe_m); 809 tcpipqent_free(q); 810 TCP_REASS_UNLOCK(tp); 811 sorwakeup(so); 812 return (pkt_flags); 813 out: 814 TCP_REASS_UNLOCK(tp); 815 return (0); 816 } 817 818 #ifdef INET6 819 int 820 tcp6_input(struct mbuf **mp, int *offp, int proto) 821 { 822 struct mbuf *m = *mp; 823 824 /* 825 * draft-itojun-ipv6-tcp-to-anycast 826 * better place to put this in? 827 */ 828 if (m->m_flags & M_ANYCAST6) { 829 struct ip6_hdr *ip6; 830 if (m->m_len < sizeof(struct ip6_hdr)) { 831 if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { 832 TCP_STATINC(TCP_STAT_RCVSHORT); 833 return IPPROTO_DONE; 834 } 835 } 836 ip6 = mtod(m, struct ip6_hdr *); 837 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 838 (char *)&ip6->ip6_dst - (char *)ip6); 839 return IPPROTO_DONE; 840 } 841 842 tcp_input(m, *offp, proto); 843 return IPPROTO_DONE; 844 } 845 #endif 846 847 #ifdef INET 848 static void 849 tcp4_log_refused(const struct ip *ip, const struct tcphdr *th) 850 { 851 char src[INET_ADDRSTRLEN]; 852 char dst[INET_ADDRSTRLEN]; 853 854 if (ip) { 855 in_print(src, sizeof(src), &ip->ip_src); 856 in_print(dst, sizeof(dst), &ip->ip_dst); 857 } 858 else { 859 strlcpy(src, "(unknown)", sizeof(src)); 860 strlcpy(dst, "(unknown)", sizeof(dst)); 861 } 862 log(LOG_INFO, 863 "Connection attempt to TCP %s:%d from %s:%d\n", 864 dst, ntohs(th->th_dport), 865 src, ntohs(th->th_sport)); 866 } 867 #endif 868 869 #ifdef INET6 870 static void 871 tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th) 872 { 873 char src[INET6_ADDRSTRLEN]; 874 char dst[INET6_ADDRSTRLEN]; 875 876 if (ip6) { 877 in6_print(src, sizeof(src), &ip6->ip6_src); 878 in6_print(dst, sizeof(dst), &ip6->ip6_dst); 879 } 880 else { 881 strlcpy(src, "(unknown v6)", sizeof(src)); 882 strlcpy(dst, "(unknown v6)", sizeof(dst)); 883 } 884 log(LOG_INFO, 885 "Connection attempt to TCP [%s]:%d from [%s]:%d\n", 886 dst, ntohs(th->th_dport), 887 src, ntohs(th->th_sport)); 888 } 889 #endif 890 891 /* 892 * Checksum extended TCP header and data. 893 */ 894 int 895 tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th, 896 int toff, int off, int tlen) 897 { 898 struct ifnet *rcvif; 899 int s; 900 901 /* 902 * XXX it's better to record and check if this mbuf is 903 * already checked. 904 */ 905 906 rcvif = m_get_rcvif(m, &s); 907 908 switch (af) { 909 #ifdef INET 910 case AF_INET: 911 switch (m->m_pkthdr.csum_flags & 912 ((rcvif->if_csum_flags_rx & M_CSUM_TCPv4) | 913 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { 914 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD: 915 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad); 916 goto badcsum; 917 918 case M_CSUM_TCPv4|M_CSUM_DATA: { 919 u_int32_t hw_csum = m->m_pkthdr.csum_data; 920 921 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data); 922 if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) { 923 const struct ip *ip = 924 mtod(m, const struct ip *); 925 926 hw_csum = in_cksum_phdr(ip->ip_src.s_addr, 927 ip->ip_dst.s_addr, 928 htons(hw_csum + tlen + off + IPPROTO_TCP)); 929 } 930 if ((hw_csum ^ 0xffff) != 0) 931 goto badcsum; 932 break; 933 } 934 935 case M_CSUM_TCPv4: 936 /* Checksum was okay. */ 937 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok); 938 break; 939 940 default: 941 /* 942 * Must compute it ourselves. Maybe skip checksum 943 * on loopback interfaces. 944 */ 945 if (__predict_true(!(rcvif->if_flags & IFF_LOOPBACK) || 946 tcp_do_loopback_cksum)) { 947 TCP_CSUM_COUNTER_INCR(&tcp_swcsum); 948 if (in4_cksum(m, IPPROTO_TCP, toff, 949 tlen + off) != 0) 950 goto badcsum; 951 } 952 break; 953 } 954 break; 955 #endif /* INET4 */ 956 957 #ifdef INET6 958 case AF_INET6: 959 switch (m->m_pkthdr.csum_flags & 960 ((rcvif->if_csum_flags_rx & M_CSUM_TCPv6) | 961 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { 962 case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD: 963 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad); 964 goto badcsum; 965 966 #if 0 /* notyet */ 967 case M_CSUM_TCPv6|M_CSUM_DATA: 968 #endif 969 970 case M_CSUM_TCPv6: 971 /* Checksum was okay. */ 972 TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok); 973 break; 974 975 default: 976 /* 977 * Must compute it ourselves. Maybe skip checksum 978 * on loopback interfaces. 979 */ 980 if (__predict_true((m->m_flags & M_LOOP) == 0 || 981 tcp_do_loopback_cksum)) { 982 TCP_CSUM_COUNTER_INCR(&tcp6_swcsum); 983 if (in6_cksum(m, IPPROTO_TCP, toff, 984 tlen + off) != 0) 985 goto badcsum; 986 } 987 } 988 break; 989 #endif /* INET6 */ 990 } 991 m_put_rcvif(rcvif, &s); 992 993 return 0; 994 995 badcsum: 996 m_put_rcvif(rcvif, &s); 997 TCP_STATINC(TCP_STAT_RCVBADSUM); 998 return -1; 999 } 1000 1001 /* When a packet arrives addressed to a vestigial tcpbp, we 1002 * nevertheless have to respond to it per the spec. 1003 */ 1004 static void tcp_vtw_input(struct tcphdr *th, vestigial_inpcb_t *vp, 1005 struct mbuf *m, int tlen, int multicast) 1006 { 1007 int tiflags; 1008 int todrop; 1009 uint32_t t_flags = 0; 1010 uint64_t *tcps; 1011 1012 tiflags = th->th_flags; 1013 todrop = vp->rcv_nxt - th->th_seq; 1014 1015 if (todrop > 0) { 1016 if (tiflags & TH_SYN) { 1017 tiflags &= ~TH_SYN; 1018 ++th->th_seq; 1019 if (th->th_urp > 1) 1020 --th->th_urp; 1021 else { 1022 tiflags &= ~TH_URG; 1023 th->th_urp = 0; 1024 } 1025 --todrop; 1026 } 1027 if (todrop > tlen || 1028 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1029 /* 1030 * Any valid FIN or RST must be to the left of the 1031 * window. At this point the FIN or RST must be a 1032 * duplicate or out of sequence; drop it. 1033 */ 1034 if (tiflags & TH_RST) 1035 goto drop; 1036 tiflags &= ~(TH_FIN|TH_RST); 1037 /* 1038 * Send an ACK to resynchronize and drop any data. 1039 * But keep on processing for RST or ACK. 1040 */ 1041 t_flags |= TF_ACKNOW; 1042 todrop = tlen; 1043 tcps = TCP_STAT_GETREF(); 1044 tcps[TCP_STAT_RCVDUPPACK] += 1; 1045 tcps[TCP_STAT_RCVDUPBYTE] += todrop; 1046 TCP_STAT_PUTREF(); 1047 } else if ((tiflags & TH_RST) 1048 && th->th_seq != vp->rcv_nxt) { 1049 /* 1050 * Test for reset before adjusting the sequence 1051 * number for overlapping data. 1052 */ 1053 goto dropafterack_ratelim; 1054 } else { 1055 tcps = TCP_STAT_GETREF(); 1056 tcps[TCP_STAT_RCVPARTDUPPACK] += 1; 1057 tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop; 1058 TCP_STAT_PUTREF(); 1059 } 1060 1061 // tcp_new_dsack(tp, th->th_seq, todrop); 1062 // hdroptlen += todrop; /*drop from head afterwards*/ 1063 1064 th->th_seq += todrop; 1065 tlen -= todrop; 1066 1067 if (th->th_urp > todrop) 1068 th->th_urp -= todrop; 1069 else { 1070 tiflags &= ~TH_URG; 1071 th->th_urp = 0; 1072 } 1073 } 1074 1075 /* 1076 * If new data are received on a connection after the 1077 * user processes are gone, then RST the other end. 1078 */ 1079 if (tlen) { 1080 TCP_STATINC(TCP_STAT_RCVAFTERCLOSE); 1081 goto dropwithreset; 1082 } 1083 1084 /* 1085 * If segment ends after window, drop trailing data 1086 * (and PUSH and FIN); if nothing left, just ACK. 1087 */ 1088 todrop = (th->th_seq + tlen) - (vp->rcv_nxt+vp->rcv_wnd); 1089 1090 if (todrop > 0) { 1091 TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN); 1092 if (todrop >= tlen) { 1093 /* 1094 * The segment actually starts after the window. 1095 * th->th_seq + tlen - vp->rcv_nxt - vp->rcv_wnd >= tlen 1096 * th->th_seq - vp->rcv_nxt - vp->rcv_wnd >= 0 1097 * th->th_seq >= vp->rcv_nxt + vp->rcv_wnd 1098 */ 1099 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen); 1100 /* 1101 * If a new connection request is received 1102 * while in TIME_WAIT, drop the old connection 1103 * and start over if the sequence numbers 1104 * are above the previous ones. 1105 */ 1106 if ((tiflags & TH_SYN) 1107 && SEQ_GT(th->th_seq, vp->rcv_nxt)) { 1108 /* We only support this in the !NOFDREF case, which 1109 * is to say: not here. 1110 */ 1111 goto dropwithreset; 1112 } 1113 /* 1114 * If window is closed can only take segments at 1115 * window edge, and have to drop data and PUSH from 1116 * incoming segments. Continue processing, but 1117 * remember to ack. Otherwise, drop segment 1118 * and (if not RST) ack. 1119 */ 1120 if (vp->rcv_wnd == 0 && th->th_seq == vp->rcv_nxt) { 1121 t_flags |= TF_ACKNOW; 1122 TCP_STATINC(TCP_STAT_RCVWINPROBE); 1123 } else 1124 goto dropafterack; 1125 } else 1126 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop); 1127 m_adj(m, -todrop); 1128 tlen -= todrop; 1129 tiflags &= ~(TH_PUSH|TH_FIN); 1130 } 1131 1132 if (tiflags & TH_RST) { 1133 if (th->th_seq != vp->rcv_nxt) 1134 goto dropafterack_ratelim; 1135 1136 vtw_del(vp->ctl, vp->vtw); 1137 goto drop; 1138 } 1139 1140 /* 1141 * If the ACK bit is off we drop the segment and return. 1142 */ 1143 if ((tiflags & TH_ACK) == 0) { 1144 if (t_flags & TF_ACKNOW) 1145 goto dropafterack; 1146 else 1147 goto drop; 1148 } 1149 1150 /* 1151 * In TIME_WAIT state the only thing that should arrive 1152 * is a retransmission of the remote FIN. Acknowledge 1153 * it and restart the finack timer. 1154 */ 1155 vtw_restart(vp); 1156 goto dropafterack; 1157 1158 dropafterack: 1159 /* 1160 * Generate an ACK dropping incoming segment if it occupies 1161 * sequence space, where the ACK reflects our state. 1162 */ 1163 if (tiflags & TH_RST) 1164 goto drop; 1165 goto dropafterack2; 1166 1167 dropafterack_ratelim: 1168 /* 1169 * We may want to rate-limit ACKs against SYN/RST attack. 1170 */ 1171 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 1172 tcp_ackdrop_ppslim) == 0) { 1173 /* XXX stat */ 1174 goto drop; 1175 } 1176 /* ...fall into dropafterack2... */ 1177 1178 dropafterack2: 1179 (void)tcp_respond(0, m, m, th, th->th_seq + tlen, th->th_ack, 1180 TH_ACK); 1181 return; 1182 1183 dropwithreset: 1184 /* 1185 * Generate a RST, dropping incoming segment. 1186 * Make ACK acceptable to originator of segment. 1187 */ 1188 if (tiflags & TH_RST) 1189 goto drop; 1190 1191 if (tiflags & TH_ACK) 1192 tcp_respond(0, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 1193 else { 1194 if (tiflags & TH_SYN) 1195 ++tlen; 1196 (void)tcp_respond(0, m, m, th, th->th_seq + tlen, (tcp_seq)0, 1197 TH_RST|TH_ACK); 1198 } 1199 return; 1200 drop: 1201 m_freem(m); 1202 } 1203 1204 /* 1205 * TCP input routine, follows pages 65-76 of RFC 793 very closely. 1206 */ 1207 void 1208 tcp_input(struct mbuf *m, ...) 1209 { 1210 struct tcphdr *th; 1211 struct ip *ip; 1212 struct inpcb *inp; 1213 #ifdef INET6 1214 struct ip6_hdr *ip6; 1215 struct in6pcb *in6p; 1216 #endif 1217 u_int8_t *optp = NULL; 1218 int optlen = 0; 1219 int len, tlen, toff, hdroptlen = 0; 1220 struct tcpcb *tp = 0; 1221 int tiflags; 1222 struct socket *so = NULL; 1223 int todrop, acked, ourfinisacked, needoutput = 0; 1224 bool dupseg; 1225 #ifdef TCP_DEBUG 1226 short ostate = 0; 1227 #endif 1228 u_long tiwin; 1229 struct tcp_opt_info opti; 1230 int off, iphlen; 1231 va_list ap; 1232 int af; /* af on the wire */ 1233 struct mbuf *tcp_saveti = NULL; 1234 uint32_t ts_rtt; 1235 uint8_t iptos; 1236 uint64_t *tcps; 1237 vestigial_inpcb_t vestige; 1238 1239 vestige.valid = 0; 1240 1241 MCLAIM(m, &tcp_rx_mowner); 1242 va_start(ap, m); 1243 toff = va_arg(ap, int); 1244 (void)va_arg(ap, int); /* ignore value, advance ap */ 1245 va_end(ap); 1246 1247 TCP_STATINC(TCP_STAT_RCVTOTAL); 1248 1249 memset(&opti, 0, sizeof(opti)); 1250 opti.ts_present = 0; 1251 opti.maxseg = 0; 1252 1253 /* 1254 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 1255 * 1256 * TCP is, by definition, unicast, so we reject all 1257 * multicast outright. 1258 * 1259 * Note, there are additional src/dst address checks in 1260 * the AF-specific code below. 1261 */ 1262 if (m->m_flags & (M_BCAST|M_MCAST)) { 1263 /* XXX stat */ 1264 goto drop; 1265 } 1266 #ifdef INET6 1267 if (m->m_flags & M_ANYCAST6) { 1268 /* XXX stat */ 1269 goto drop; 1270 } 1271 #endif 1272 1273 /* 1274 * Get IP and TCP header. 1275 * Note: IP leaves IP header in first mbuf. 1276 */ 1277 ip = mtod(m, struct ip *); 1278 switch (ip->ip_v) { 1279 #ifdef INET 1280 case 4: 1281 #ifdef INET6 1282 ip6 = NULL; 1283 #endif 1284 af = AF_INET; 1285 iphlen = sizeof(struct ip); 1286 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 1287 sizeof(struct tcphdr)); 1288 if (th == NULL) { 1289 TCP_STATINC(TCP_STAT_RCVSHORT); 1290 return; 1291 } 1292 /* We do the checksum after PCB lookup... */ 1293 len = ntohs(ip->ip_len); 1294 tlen = len - toff; 1295 iptos = ip->ip_tos; 1296 break; 1297 #endif 1298 #ifdef INET6 1299 case 6: 1300 ip = NULL; 1301 iphlen = sizeof(struct ip6_hdr); 1302 af = AF_INET6; 1303 ip6 = mtod(m, struct ip6_hdr *); 1304 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, 1305 sizeof(struct tcphdr)); 1306 if (th == NULL) { 1307 TCP_STATINC(TCP_STAT_RCVSHORT); 1308 return; 1309 } 1310 1311 /* Be proactive about malicious use of IPv4 mapped address */ 1312 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || 1313 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { 1314 /* XXX stat */ 1315 goto drop; 1316 } 1317 1318 /* 1319 * Be proactive about unspecified IPv6 address in source. 1320 * As we use all-zero to indicate unbounded/unconnected pcb, 1321 * unspecified IPv6 address can be used to confuse us. 1322 * 1323 * Note that packets with unspecified IPv6 destination is 1324 * already dropped in ip6_input. 1325 */ 1326 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 1327 /* XXX stat */ 1328 goto drop; 1329 } 1330 1331 /* 1332 * Make sure destination address is not multicast. 1333 * Source address checked in ip6_input(). 1334 */ 1335 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 1336 /* XXX stat */ 1337 goto drop; 1338 } 1339 1340 /* We do the checksum after PCB lookup... */ 1341 len = m->m_pkthdr.len; 1342 tlen = len - toff; 1343 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1344 break; 1345 #endif 1346 default: 1347 m_freem(m); 1348 return; 1349 } 1350 /* 1351 * Enforce alignment requirements that are violated in 1352 * some cases, see kern/50766 for details. 1353 */ 1354 if (TCP_HDR_ALIGNED_P(th) == 0) { 1355 m = m_copyup(m, toff + sizeof(struct tcphdr), 0); 1356 if (m == NULL) { 1357 TCP_STATINC(TCP_STAT_RCVSHORT); 1358 return; 1359 } 1360 ip = mtod(m, struct ip *); 1361 #ifdef INET6 1362 ip6 = mtod(m, struct ip6_hdr *); 1363 #endif 1364 th = (struct tcphdr *)(mtod(m, char *) + toff); 1365 } 1366 KASSERT(TCP_HDR_ALIGNED_P(th)); 1367 1368 /* 1369 * Check that TCP offset makes sense, 1370 * pull out TCP options and adjust length. XXX 1371 */ 1372 off = th->th_off << 2; 1373 if (off < sizeof (struct tcphdr) || off > tlen) { 1374 TCP_STATINC(TCP_STAT_RCVBADOFF); 1375 goto drop; 1376 } 1377 tlen -= off; 1378 1379 /* 1380 * tcp_input() has been modified to use tlen to mean the TCP data 1381 * length throughout the function. Other functions can use 1382 * m->m_pkthdr.len as the basis for calculating the TCP data length. 1383 * rja 1384 */ 1385 1386 if (off > sizeof (struct tcphdr)) { 1387 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); 1388 if (th == NULL) { 1389 TCP_STATINC(TCP_STAT_RCVSHORT); 1390 return; 1391 } 1392 /* 1393 * NOTE: ip/ip6 will not be affected by m_pulldown() 1394 * (as they're before toff) and we don't need to update those. 1395 */ 1396 KASSERT(TCP_HDR_ALIGNED_P(th)); 1397 optlen = off - sizeof (struct tcphdr); 1398 optp = ((u_int8_t *)th) + sizeof(struct tcphdr); 1399 /* 1400 * Do quick retrieval of timestamp options ("options 1401 * prediction?"). If timestamp is the only option and it's 1402 * formatted as recommended in RFC 1323 appendix A, we 1403 * quickly get the values now and not bother calling 1404 * tcp_dooptions(), etc. 1405 */ 1406 if ((optlen == TCPOLEN_TSTAMP_APPA || 1407 (optlen > TCPOLEN_TSTAMP_APPA && 1408 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 1409 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 1410 (th->th_flags & TH_SYN) == 0) { 1411 opti.ts_present = 1; 1412 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 1413 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 1414 optp = NULL; /* we've parsed the options */ 1415 } 1416 } 1417 tiflags = th->th_flags; 1418 1419 /* 1420 * Checksum extended TCP header and data 1421 */ 1422 if (tcp_input_checksum(af, m, th, toff, off, tlen)) 1423 goto badcsum; 1424 1425 /* 1426 * Locate pcb for segment. 1427 */ 1428 findpcb: 1429 inp = NULL; 1430 #ifdef INET6 1431 in6p = NULL; 1432 #endif 1433 switch (af) { 1434 #ifdef INET 1435 case AF_INET: 1436 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, 1437 ip->ip_dst, th->th_dport, 1438 &vestige); 1439 if (inp == 0 && !vestige.valid) { 1440 TCP_STATINC(TCP_STAT_PCBHASHMISS); 1441 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); 1442 } 1443 #ifdef INET6 1444 if (inp == 0 && !vestige.valid) { 1445 struct in6_addr s, d; 1446 1447 /* mapped addr case */ 1448 in6_in_2_v4mapin6(&ip->ip_src, &s); 1449 in6_in_2_v4mapin6(&ip->ip_dst, &d); 1450 in6p = in6_pcblookup_connect(&tcbtable, &s, 1451 th->th_sport, &d, th->th_dport, 1452 0, &vestige); 1453 if (in6p == 0 && !vestige.valid) { 1454 TCP_STATINC(TCP_STAT_PCBHASHMISS); 1455 in6p = in6_pcblookup_bind(&tcbtable, &d, 1456 th->th_dport, 0); 1457 } 1458 } 1459 #endif 1460 #ifndef INET6 1461 if (inp == 0 && !vestige.valid) 1462 #else 1463 if (inp == 0 && in6p == 0 && !vestige.valid) 1464 #endif 1465 { 1466 TCP_STATINC(TCP_STAT_NOPORT); 1467 if (tcp_log_refused && 1468 (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) { 1469 tcp4_log_refused(ip, th); 1470 } 1471 tcp_fields_to_host(th); 1472 goto dropwithreset_ratelim; 1473 } 1474 #if defined(IPSEC) 1475 if (ipsec_used) { 1476 if (inp && 1477 (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 1478 && ipsec4_in_reject(m, inp)) { 1479 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); 1480 goto drop; 1481 } 1482 #ifdef INET6 1483 else if (in6p && 1484 (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 1485 && ipsec6_in_reject_so(m, in6p->in6p_socket)) { 1486 IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); 1487 goto drop; 1488 } 1489 #endif 1490 } 1491 #endif /*IPSEC*/ 1492 break; 1493 #endif /*INET*/ 1494 #ifdef INET6 1495 case AF_INET6: 1496 { 1497 int faith; 1498 1499 #if defined(NFAITH) && NFAITH > 0 1500 faith = faithprefix(&ip6->ip6_dst); 1501 #else 1502 faith = 0; 1503 #endif 1504 in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src, 1505 th->th_sport, &ip6->ip6_dst, th->th_dport, faith, &vestige); 1506 if (!in6p && !vestige.valid) { 1507 TCP_STATINC(TCP_STAT_PCBHASHMISS); 1508 in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst, 1509 th->th_dport, faith); 1510 } 1511 if (!in6p && !vestige.valid) { 1512 TCP_STATINC(TCP_STAT_NOPORT); 1513 if (tcp_log_refused && 1514 (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) { 1515 tcp6_log_refused(ip6, th); 1516 } 1517 tcp_fields_to_host(th); 1518 goto dropwithreset_ratelim; 1519 } 1520 #if defined(IPSEC) 1521 if (ipsec_used && in6p 1522 && (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 1523 && ipsec6_in_reject(m, in6p)) { 1524 IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO); 1525 goto drop; 1526 } 1527 #endif /*IPSEC*/ 1528 break; 1529 } 1530 #endif 1531 } 1532 1533 /* 1534 * If the state is CLOSED (i.e., TCB does not exist) then 1535 * all data in the incoming segment is discarded. 1536 * If the TCB exists but is in CLOSED state, it is embryonic, 1537 * but should either do a listen or a connect soon. 1538 */ 1539 tp = NULL; 1540 so = NULL; 1541 if (inp) { 1542 /* Check the minimum TTL for socket. */ 1543 if (ip->ip_ttl < inp->inp_ip_minttl) 1544 goto drop; 1545 1546 tp = intotcpcb(inp); 1547 so = inp->inp_socket; 1548 } 1549 #ifdef INET6 1550 else if (in6p) { 1551 tp = in6totcpcb(in6p); 1552 so = in6p->in6p_socket; 1553 } 1554 #endif 1555 else if (vestige.valid) { 1556 int mc = 0; 1557 1558 /* We do not support the resurrection of vtw tcpcps. 1559 */ 1560 if (tcp_input_checksum(af, m, th, toff, off, tlen)) 1561 goto badcsum; 1562 1563 switch (af) { 1564 #ifdef INET6 1565 case AF_INET6: 1566 mc = IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst); 1567 break; 1568 #endif 1569 1570 case AF_INET: 1571 mc = (IN_MULTICAST(ip->ip_dst.s_addr) 1572 || in_broadcast(ip->ip_dst, 1573 m_get_rcvif_NOMPSAFE(m))); 1574 break; 1575 } 1576 1577 tcp_fields_to_host(th); 1578 tcp_vtw_input(th, &vestige, m, tlen, mc); 1579 m = 0; 1580 goto drop; 1581 } 1582 1583 if (tp == 0) { 1584 tcp_fields_to_host(th); 1585 goto dropwithreset_ratelim; 1586 } 1587 if (tp->t_state == TCPS_CLOSED) 1588 goto drop; 1589 1590 KASSERT(so->so_lock == softnet_lock); 1591 KASSERT(solocked(so)); 1592 1593 tcp_fields_to_host(th); 1594 1595 /* Unscale the window into a 32-bit value. */ 1596 if ((tiflags & TH_SYN) == 0) 1597 tiwin = th->th_win << tp->snd_scale; 1598 else 1599 tiwin = th->th_win; 1600 1601 #ifdef INET6 1602 /* save packet options if user wanted */ 1603 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { 1604 if (in6p->in6p_options) { 1605 m_freem(in6p->in6p_options); 1606 in6p->in6p_options = 0; 1607 } 1608 KASSERT(ip6 != NULL); 1609 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); 1610 } 1611 #endif 1612 1613 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 1614 union syn_cache_sa src; 1615 union syn_cache_sa dst; 1616 1617 memset(&src, 0, sizeof(src)); 1618 memset(&dst, 0, sizeof(dst)); 1619 switch (af) { 1620 #ifdef INET 1621 case AF_INET: 1622 src.sin.sin_len = sizeof(struct sockaddr_in); 1623 src.sin.sin_family = AF_INET; 1624 src.sin.sin_addr = ip->ip_src; 1625 src.sin.sin_port = th->th_sport; 1626 1627 dst.sin.sin_len = sizeof(struct sockaddr_in); 1628 dst.sin.sin_family = AF_INET; 1629 dst.sin.sin_addr = ip->ip_dst; 1630 dst.sin.sin_port = th->th_dport; 1631 break; 1632 #endif 1633 #ifdef INET6 1634 case AF_INET6: 1635 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1636 src.sin6.sin6_family = AF_INET6; 1637 src.sin6.sin6_addr = ip6->ip6_src; 1638 src.sin6.sin6_port = th->th_sport; 1639 1640 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1641 dst.sin6.sin6_family = AF_INET6; 1642 dst.sin6.sin6_addr = ip6->ip6_dst; 1643 dst.sin6.sin6_port = th->th_dport; 1644 break; 1645 #endif /* INET6 */ 1646 default: 1647 goto badsyn; /*sanity*/ 1648 } 1649 1650 if (so->so_options & SO_DEBUG) { 1651 #ifdef TCP_DEBUG 1652 ostate = tp->t_state; 1653 #endif 1654 1655 tcp_saveti = NULL; 1656 if (iphlen + sizeof(struct tcphdr) > MHLEN) 1657 goto nosave; 1658 1659 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { 1660 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1661 if (!tcp_saveti) 1662 goto nosave; 1663 } else { 1664 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); 1665 if (!tcp_saveti) 1666 goto nosave; 1667 MCLAIM(m, &tcp_mowner); 1668 tcp_saveti->m_len = iphlen; 1669 m_copydata(m, 0, iphlen, 1670 mtod(tcp_saveti, void *)); 1671 } 1672 1673 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1674 m_freem(tcp_saveti); 1675 tcp_saveti = NULL; 1676 } else { 1677 tcp_saveti->m_len += sizeof(struct tcphdr); 1678 memcpy(mtod(tcp_saveti, char *) + iphlen, th, 1679 sizeof(struct tcphdr)); 1680 } 1681 nosave:; 1682 } 1683 if (so->so_options & SO_ACCEPTCONN) { 1684 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1685 if (tiflags & TH_RST) { 1686 syn_cache_reset(&src.sa, &dst.sa, th); 1687 } else if ((tiflags & (TH_ACK|TH_SYN)) == 1688 (TH_ACK|TH_SYN)) { 1689 /* 1690 * Received a SYN,ACK. This should 1691 * never happen while we are in 1692 * LISTEN. Send an RST. 1693 */ 1694 goto badsyn; 1695 } else if (tiflags & TH_ACK) { 1696 so = syn_cache_get(&src.sa, &dst.sa, 1697 th, toff, tlen, so, m); 1698 if (so == NULL) { 1699 /* 1700 * We don't have a SYN for 1701 * this ACK; send an RST. 1702 */ 1703 goto badsyn; 1704 } else if (so == 1705 (struct socket *)(-1)) { 1706 /* 1707 * We were unable to create 1708 * the connection. If the 1709 * 3-way handshake was 1710 * completed, and RST has 1711 * been sent to the peer. 1712 * Since the mbuf might be 1713 * in use for the reply, 1714 * do not free it. 1715 */ 1716 m = NULL; 1717 } else { 1718 /* 1719 * We have created a 1720 * full-blown connection. 1721 */ 1722 tp = NULL; 1723 inp = NULL; 1724 #ifdef INET6 1725 in6p = NULL; 1726 #endif 1727 switch (so->so_proto->pr_domain->dom_family) { 1728 #ifdef INET 1729 case AF_INET: 1730 inp = sotoinpcb(so); 1731 tp = intotcpcb(inp); 1732 break; 1733 #endif 1734 #ifdef INET6 1735 case AF_INET6: 1736 in6p = sotoin6pcb(so); 1737 tp = in6totcpcb(in6p); 1738 break; 1739 #endif 1740 } 1741 if (tp == NULL) 1742 goto badsyn; /*XXX*/ 1743 tiwin <<= tp->snd_scale; 1744 goto after_listen; 1745 } 1746 } else { 1747 /* 1748 * None of RST, SYN or ACK was set. 1749 * This is an invalid packet for a 1750 * TCB in LISTEN state. Send a RST. 1751 */ 1752 goto badsyn; 1753 } 1754 } else { 1755 /* 1756 * Received a SYN. 1757 * 1758 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 1759 */ 1760 if (m->m_flags & (M_BCAST|M_MCAST)) 1761 goto drop; 1762 1763 switch (af) { 1764 #ifdef INET6 1765 case AF_INET6: 1766 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 1767 goto drop; 1768 break; 1769 #endif /* INET6 */ 1770 case AF_INET: 1771 if (IN_MULTICAST(ip->ip_dst.s_addr) || 1772 in_broadcast(ip->ip_dst, 1773 m_get_rcvif_NOMPSAFE(m))) 1774 goto drop; 1775 break; 1776 } 1777 1778 #ifdef INET6 1779 /* 1780 * If deprecated address is forbidden, we do 1781 * not accept SYN to deprecated interface 1782 * address to prevent any new inbound 1783 * connection from getting established. 1784 * When we do not accept SYN, we send a TCP 1785 * RST, with deprecated source address (instead 1786 * of dropping it). We compromise it as it is 1787 * much better for peer to send a RST, and 1788 * RST will be the final packet for the 1789 * exchange. 1790 * 1791 * If we do not forbid deprecated addresses, we 1792 * accept the SYN packet. RFC2462 does not 1793 * suggest dropping SYN in this case. 1794 * If we decipher RFC2462 5.5.4, it says like 1795 * this: 1796 * 1. use of deprecated addr with existing 1797 * communication is okay - "SHOULD continue 1798 * to be used" 1799 * 2. use of it with new communication: 1800 * (2a) "SHOULD NOT be used if alternate 1801 * address with sufficient scope is 1802 * available" 1803 * (2b) nothing mentioned otherwise. 1804 * Here we fall into (2b) case as we have no 1805 * choice in our source address selection - we 1806 * must obey the peer. 1807 * 1808 * The wording in RFC2462 is confusing, and 1809 * there are multiple description text for 1810 * deprecated address handling - worse, they 1811 * are not exactly the same. I believe 5.5.4 1812 * is the best one, so we follow 5.5.4. 1813 */ 1814 if (af == AF_INET6 && !ip6_use_deprecated) { 1815 struct in6_ifaddr *ia6; 1816 int s; 1817 struct ifnet *rcvif = m_get_rcvif(m, &s); 1818 if (rcvif == NULL) 1819 goto dropwithreset; /* XXX */ 1820 if ((ia6 = in6ifa_ifpwithaddr(rcvif, 1821 &ip6->ip6_dst)) && 1822 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 1823 tp = NULL; 1824 m_put_rcvif(rcvif, &s); 1825 goto dropwithreset; 1826 } 1827 m_put_rcvif(rcvif, &s); 1828 } 1829 #endif 1830 1831 #if defined(IPSEC) 1832 if (ipsec_used) { 1833 switch (af) { 1834 #ifdef INET 1835 case AF_INET: 1836 if (!ipsec4_in_reject_so(m, so)) 1837 break; 1838 IPSEC_STATINC( 1839 IPSEC_STAT_IN_POLVIO); 1840 tp = NULL; 1841 goto dropwithreset; 1842 #endif 1843 #ifdef INET6 1844 case AF_INET6: 1845 if (!ipsec6_in_reject_so(m, so)) 1846 break; 1847 IPSEC6_STATINC( 1848 IPSEC_STAT_IN_POLVIO); 1849 tp = NULL; 1850 goto dropwithreset; 1851 #endif /*INET6*/ 1852 } 1853 } 1854 #endif /*IPSEC*/ 1855 1856 /* 1857 * LISTEN socket received a SYN 1858 * from itself? This can't possibly 1859 * be valid; drop the packet. 1860 */ 1861 if (th->th_sport == th->th_dport) { 1862 int i; 1863 1864 switch (af) { 1865 #ifdef INET 1866 case AF_INET: 1867 i = in_hosteq(ip->ip_src, ip->ip_dst); 1868 break; 1869 #endif 1870 #ifdef INET6 1871 case AF_INET6: 1872 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); 1873 break; 1874 #endif 1875 default: 1876 i = 1; 1877 } 1878 if (i) { 1879 TCP_STATINC(TCP_STAT_BADSYN); 1880 goto drop; 1881 } 1882 } 1883 1884 /* 1885 * SYN looks ok; create compressed TCP 1886 * state for it. 1887 */ 1888 if (so->so_qlen <= so->so_qlimit && 1889 syn_cache_add(&src.sa, &dst.sa, th, tlen, 1890 so, m, optp, optlen, &opti)) 1891 m = NULL; 1892 } 1893 goto drop; 1894 } 1895 } 1896 1897 after_listen: 1898 #ifdef DIAGNOSTIC 1899 /* 1900 * Should not happen now that all embryonic connections 1901 * are handled with compressed state. 1902 */ 1903 if (tp->t_state == TCPS_LISTEN) 1904 panic("tcp_input: TCPS_LISTEN"); 1905 #endif 1906 1907 /* 1908 * Segment received on connection. 1909 * Reset idle time and keep-alive timer. 1910 */ 1911 tp->t_rcvtime = tcp_now; 1912 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1913 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle); 1914 1915 /* 1916 * Process options. 1917 */ 1918 #ifdef TCP_SIGNATURE 1919 if (optp || (tp->t_flags & TF_SIGNATURE)) 1920 #else 1921 if (optp) 1922 #endif 1923 if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0) 1924 goto drop; 1925 1926 if (TCP_SACK_ENABLED(tp)) { 1927 tcp_del_sackholes(tp, th); 1928 } 1929 1930 if (TCP_ECN_ALLOWED(tp)) { 1931 if (tiflags & TH_CWR) { 1932 tp->t_flags &= ~TF_ECN_SND_ECE; 1933 } 1934 switch (iptos & IPTOS_ECN_MASK) { 1935 case IPTOS_ECN_CE: 1936 tp->t_flags |= TF_ECN_SND_ECE; 1937 TCP_STATINC(TCP_STAT_ECN_CE); 1938 break; 1939 case IPTOS_ECN_ECT0: 1940 TCP_STATINC(TCP_STAT_ECN_ECT); 1941 break; 1942 case IPTOS_ECN_ECT1: 1943 /* XXX: ignore for now -- rpaulo */ 1944 break; 1945 } 1946 /* 1947 * Congestion experienced. 1948 * Ignore if we are already trying to recover. 1949 */ 1950 if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover)) 1951 tp->t_congctl->cong_exp(tp); 1952 } 1953 1954 if (opti.ts_present && opti.ts_ecr) { 1955 /* 1956 * Calculate the RTT from the returned time stamp and the 1957 * connection's time base. If the time stamp is later than 1958 * the current time, or is extremely old, fall back to non-1323 1959 * RTT calculation. Since ts_rtt is unsigned, we can test both 1960 * at the same time. 1961 * 1962 * Note that ts_rtt is in units of slow ticks (500 1963 * ms). Since most earthbound RTTs are < 500 ms, 1964 * observed values will have large quantization noise. 1965 * Our smoothed RTT is then the fraction of observed 1966 * samples that are 1 tick instead of 0 (times 500 1967 * ms). 1968 * 1969 * ts_rtt is increased by 1 to denote a valid sample, 1970 * with 0 indicating an invalid measurement. This 1971 * extra 1 must be removed when ts_rtt is used, or 1972 * else an an erroneous extra 500 ms will result. 1973 */ 1974 ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1; 1975 if (ts_rtt > TCP_PAWS_IDLE) 1976 ts_rtt = 0; 1977 } else { 1978 ts_rtt = 0; 1979 } 1980 1981 /* 1982 * Header prediction: check for the two common cases 1983 * of a uni-directional data xfer. If the packet has 1984 * no control flags, is in-sequence, the window didn't 1985 * change and we're not retransmitting, it's a 1986 * candidate. If the length is zero and the ack moved 1987 * forward, we're the sender side of the xfer. Just 1988 * free the data acked & wake any higher level process 1989 * that was blocked waiting for space. If the length 1990 * is non-zero and the ack didn't move, we're the 1991 * receiver side. If we're getting packets in-order 1992 * (the reassembly queue is empty), add the data to 1993 * the socket buffer and note that we need a delayed ack. 1994 */ 1995 if (tp->t_state == TCPS_ESTABLISHED && 1996 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) 1997 == TH_ACK && 1998 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1999 th->th_seq == tp->rcv_nxt && 2000 tiwin && tiwin == tp->snd_wnd && 2001 tp->snd_nxt == tp->snd_max) { 2002 2003 /* 2004 * If last ACK falls within this segment's sequence numbers, 2005 * record the timestamp. 2006 * NOTE that the test is modified according to the latest 2007 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2008 * 2009 * note that we already know 2010 * TSTMP_GEQ(opti.ts_val, tp->ts_recent) 2011 */ 2012 if (opti.ts_present && 2013 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 2014 tp->ts_recent_age = tcp_now; 2015 tp->ts_recent = opti.ts_val; 2016 } 2017 2018 if (tlen == 0) { 2019 /* Ack prediction. */ 2020 if (SEQ_GT(th->th_ack, tp->snd_una) && 2021 SEQ_LEQ(th->th_ack, tp->snd_max) && 2022 tp->snd_cwnd >= tp->snd_wnd && 2023 tp->t_partialacks < 0) { 2024 /* 2025 * this is a pure ack for outstanding data. 2026 */ 2027 if (ts_rtt) 2028 tcp_xmit_timer(tp, ts_rtt - 1); 2029 else if (tp->t_rtttime && 2030 SEQ_GT(th->th_ack, tp->t_rtseq)) 2031 tcp_xmit_timer(tp, 2032 tcp_now - tp->t_rtttime); 2033 acked = th->th_ack - tp->snd_una; 2034 tcps = TCP_STAT_GETREF(); 2035 tcps[TCP_STAT_PREDACK]++; 2036 tcps[TCP_STAT_RCVACKPACK]++; 2037 tcps[TCP_STAT_RCVACKBYTE] += acked; 2038 TCP_STAT_PUTREF(); 2039 nd6_hint(tp); 2040 2041 if (acked > (tp->t_lastoff - tp->t_inoff)) 2042 tp->t_lastm = NULL; 2043 sbdrop(&so->so_snd, acked); 2044 tp->t_lastoff -= acked; 2045 2046 icmp_check(tp, th, acked); 2047 2048 tp->snd_una = th->th_ack; 2049 tp->snd_fack = tp->snd_una; 2050 if (SEQ_LT(tp->snd_high, tp->snd_una)) 2051 tp->snd_high = tp->snd_una; 2052 m_freem(m); 2053 2054 /* 2055 * If all outstanding data are acked, stop 2056 * retransmit timer, otherwise restart timer 2057 * using current (possibly backed-off) value. 2058 * If process is waiting for space, 2059 * wakeup/selnotify/signal. If data 2060 * are ready to send, let tcp_output 2061 * decide between more output or persist. 2062 */ 2063 if (tp->snd_una == tp->snd_max) 2064 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2065 else if (TCP_TIMER_ISARMED(tp, 2066 TCPT_PERSIST) == 0) 2067 TCP_TIMER_ARM(tp, TCPT_REXMT, 2068 tp->t_rxtcur); 2069 2070 sowwakeup(so); 2071 if (so->so_snd.sb_cc) { 2072 KERNEL_LOCK(1, NULL); 2073 (void) tcp_output(tp); 2074 KERNEL_UNLOCK_ONE(NULL); 2075 } 2076 if (tcp_saveti) 2077 m_freem(tcp_saveti); 2078 return; 2079 } 2080 } else if (th->th_ack == tp->snd_una && 2081 TAILQ_FIRST(&tp->segq) == NULL && 2082 tlen <= sbspace(&so->so_rcv)) { 2083 int newsize = 0; /* automatic sockbuf scaling */ 2084 2085 /* 2086 * this is a pure, in-sequence data packet 2087 * with nothing on the reassembly queue and 2088 * we have enough buffer space to take it. 2089 */ 2090 tp->rcv_nxt += tlen; 2091 tcps = TCP_STAT_GETREF(); 2092 tcps[TCP_STAT_PREDDAT]++; 2093 tcps[TCP_STAT_RCVPACK]++; 2094 tcps[TCP_STAT_RCVBYTE] += tlen; 2095 TCP_STAT_PUTREF(); 2096 nd6_hint(tp); 2097 2098 /* 2099 * Automatic sizing enables the performance of large buffers 2100 * and most of the efficiency of small ones by only allocating 2101 * space when it is needed. 2102 * 2103 * On the receive side the socket buffer memory is only rarely 2104 * used to any significant extent. This allows us to be much 2105 * more aggressive in scaling the receive socket buffer. For 2106 * the case that the buffer space is actually used to a large 2107 * extent and we run out of kernel memory we can simply drop 2108 * the new segments; TCP on the sender will just retransmit it 2109 * later. Setting the buffer size too big may only consume too 2110 * much kernel memory if the application doesn't read() from 2111 * the socket or packet loss or reordering makes use of the 2112 * reassembly queue. 2113 * 2114 * The criteria to step up the receive buffer one notch are: 2115 * 1. the number of bytes received during the time it takes 2116 * one timestamp to be reflected back to us (the RTT); 2117 * 2. received bytes per RTT is within seven eighth of the 2118 * current socket buffer size; 2119 * 3. receive buffer size has not hit maximal automatic size; 2120 * 2121 * This algorithm does one step per RTT at most and only if 2122 * we receive a bulk stream w/o packet losses or reorderings. 2123 * Shrinking the buffer during idle times is not necessary as 2124 * it doesn't consume any memory when idle. 2125 * 2126 * TODO: Only step up if the application is actually serving 2127 * the buffer to better manage the socket buffer resources. 2128 */ 2129 if (tcp_do_autorcvbuf && 2130 opti.ts_ecr && 2131 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 2132 if (opti.ts_ecr > tp->rfbuf_ts && 2133 opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) { 2134 if (tp->rfbuf_cnt > 2135 (so->so_rcv.sb_hiwat / 8 * 7) && 2136 so->so_rcv.sb_hiwat < 2137 tcp_autorcvbuf_max) { 2138 newsize = 2139 min(so->so_rcv.sb_hiwat + 2140 tcp_autorcvbuf_inc, 2141 tcp_autorcvbuf_max); 2142 } 2143 /* Start over with next RTT. */ 2144 tp->rfbuf_ts = 0; 2145 tp->rfbuf_cnt = 0; 2146 } else 2147 tp->rfbuf_cnt += tlen; /* add up */ 2148 } 2149 2150 /* 2151 * Drop TCP, IP headers and TCP options then add data 2152 * to socket buffer. 2153 */ 2154 if (so->so_state & SS_CANTRCVMORE) 2155 m_freem(m); 2156 else { 2157 /* 2158 * Set new socket buffer size. 2159 * Give up when limit is reached. 2160 */ 2161 if (newsize) 2162 if (!sbreserve(&so->so_rcv, 2163 newsize, so)) 2164 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 2165 m_adj(m, toff + off); 2166 sbappendstream(&so->so_rcv, m); 2167 } 2168 sorwakeup(so); 2169 tcp_setup_ack(tp, th); 2170 if (tp->t_flags & TF_ACKNOW) { 2171 KERNEL_LOCK(1, NULL); 2172 (void) tcp_output(tp); 2173 KERNEL_UNLOCK_ONE(NULL); 2174 } 2175 if (tcp_saveti) 2176 m_freem(tcp_saveti); 2177 return; 2178 } 2179 } 2180 2181 /* 2182 * Compute mbuf offset to TCP data segment. 2183 */ 2184 hdroptlen = toff + off; 2185 2186 /* 2187 * Calculate amount of space in receive window, 2188 * and then do TCP input processing. 2189 * Receive window is amount of space in rcv queue, 2190 * but not less than advertised window. 2191 */ 2192 { int win; 2193 2194 win = sbspace(&so->so_rcv); 2195 if (win < 0) 2196 win = 0; 2197 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 2198 } 2199 2200 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 2201 tp->rfbuf_ts = 0; 2202 tp->rfbuf_cnt = 0; 2203 2204 switch (tp->t_state) { 2205 /* 2206 * If the state is SYN_SENT: 2207 * if seg contains an ACK, but not for our SYN, drop the input. 2208 * if seg contains a RST, then drop the connection. 2209 * if seg does not contain SYN, then drop it. 2210 * Otherwise this is an acceptable SYN segment 2211 * initialize tp->rcv_nxt and tp->irs 2212 * if seg contains ack then advance tp->snd_una 2213 * if seg contains a ECE and ECN support is enabled, the stream 2214 * is ECN capable. 2215 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 2216 * arrange for segment to be acked (eventually) 2217 * continue processing rest of data/controls, beginning with URG 2218 */ 2219 case TCPS_SYN_SENT: 2220 if ((tiflags & TH_ACK) && 2221 (SEQ_LEQ(th->th_ack, tp->iss) || 2222 SEQ_GT(th->th_ack, tp->snd_max))) 2223 goto dropwithreset; 2224 if (tiflags & TH_RST) { 2225 if (tiflags & TH_ACK) 2226 tp = tcp_drop(tp, ECONNREFUSED); 2227 goto drop; 2228 } 2229 if ((tiflags & TH_SYN) == 0) 2230 goto drop; 2231 if (tiflags & TH_ACK) { 2232 tp->snd_una = th->th_ack; 2233 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2234 tp->snd_nxt = tp->snd_una; 2235 if (SEQ_LT(tp->snd_high, tp->snd_una)) 2236 tp->snd_high = tp->snd_una; 2237 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2238 2239 if ((tiflags & TH_ECE) && tcp_do_ecn) { 2240 tp->t_flags |= TF_ECN_PERMIT; 2241 TCP_STATINC(TCP_STAT_ECN_SHS); 2242 } 2243 2244 } 2245 tp->irs = th->th_seq; 2246 tcp_rcvseqinit(tp); 2247 tp->t_flags |= TF_ACKNOW; 2248 tcp_mss_from_peer(tp, opti.maxseg); 2249 2250 /* 2251 * Initialize the initial congestion window. If we 2252 * had to retransmit the SYN, we must initialize cwnd 2253 * to 1 segment (i.e. the Loss Window). 2254 */ 2255 if (tp->t_flags & TF_SYN_REXMT) 2256 tp->snd_cwnd = tp->t_peermss; 2257 else { 2258 int ss = tcp_init_win; 2259 #ifdef INET 2260 if (inp != NULL && in_localaddr(inp->inp_faddr)) 2261 ss = tcp_init_win_local; 2262 #endif 2263 #ifdef INET6 2264 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr)) 2265 ss = tcp_init_win_local; 2266 #endif 2267 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); 2268 } 2269 2270 tcp_rmx_rtt(tp); 2271 if (tiflags & TH_ACK) { 2272 TCP_STATINC(TCP_STAT_CONNECTS); 2273 /* 2274 * move tcp_established before soisconnected 2275 * because upcall handler can drive tcp_output 2276 * functionality. 2277 * XXX we might call soisconnected at the end of 2278 * all processing 2279 */ 2280 tcp_established(tp); 2281 soisconnected(so); 2282 /* Do window scaling on this connection? */ 2283 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2284 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2285 tp->snd_scale = tp->requested_s_scale; 2286 tp->rcv_scale = tp->request_r_scale; 2287 } 2288 TCP_REASS_LOCK(tp); 2289 (void) tcp_reass(tp, NULL, NULL, &tlen); 2290 /* 2291 * if we didn't have to retransmit the SYN, 2292 * use its rtt as our initial srtt & rtt var. 2293 */ 2294 if (tp->t_rtttime) 2295 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 2296 } else 2297 tp->t_state = TCPS_SYN_RECEIVED; 2298 2299 /* 2300 * Advance th->th_seq to correspond to first data byte. 2301 * If data, trim to stay within window, 2302 * dropping FIN if necessary. 2303 */ 2304 th->th_seq++; 2305 if (tlen > tp->rcv_wnd) { 2306 todrop = tlen - tp->rcv_wnd; 2307 m_adj(m, -todrop); 2308 tlen = tp->rcv_wnd; 2309 tiflags &= ~TH_FIN; 2310 tcps = TCP_STAT_GETREF(); 2311 tcps[TCP_STAT_RCVPACKAFTERWIN]++; 2312 tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop; 2313 TCP_STAT_PUTREF(); 2314 } 2315 tp->snd_wl1 = th->th_seq - 1; 2316 tp->rcv_up = th->th_seq; 2317 goto step6; 2318 2319 /* 2320 * If the state is SYN_RECEIVED: 2321 * If seg contains an ACK, but not for our SYN, drop the input 2322 * and generate an RST. See page 36, rfc793 2323 */ 2324 case TCPS_SYN_RECEIVED: 2325 if ((tiflags & TH_ACK) && 2326 (SEQ_LEQ(th->th_ack, tp->iss) || 2327 SEQ_GT(th->th_ack, tp->snd_max))) 2328 goto dropwithreset; 2329 break; 2330 } 2331 2332 /* 2333 * States other than LISTEN or SYN_SENT. 2334 * First check timestamp, if present. 2335 * Then check that at least some bytes of segment are within 2336 * receive window. If segment begins before rcv_nxt, 2337 * drop leading data (and SYN); if nothing left, just ack. 2338 * 2339 * RFC 1323 PAWS: If we have a timestamp reply on this segment 2340 * and it's less than ts_recent, drop it. 2341 */ 2342 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 2343 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 2344 2345 /* Check to see if ts_recent is over 24 days old. */ 2346 if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) { 2347 /* 2348 * Invalidate ts_recent. If this segment updates 2349 * ts_recent, the age will be reset later and ts_recent 2350 * will get a valid value. If it does not, setting 2351 * ts_recent to zero will at least satisfy the 2352 * requirement that zero be placed in the timestamp 2353 * echo reply when ts_recent isn't valid. The 2354 * age isn't reset until we get a valid ts_recent 2355 * because we don't want out-of-order segments to be 2356 * dropped when ts_recent is old. 2357 */ 2358 tp->ts_recent = 0; 2359 } else { 2360 tcps = TCP_STAT_GETREF(); 2361 tcps[TCP_STAT_RCVDUPPACK]++; 2362 tcps[TCP_STAT_RCVDUPBYTE] += tlen; 2363 tcps[TCP_STAT_PAWSDROP]++; 2364 TCP_STAT_PUTREF(); 2365 tcp_new_dsack(tp, th->th_seq, tlen); 2366 goto dropafterack; 2367 } 2368 } 2369 2370 todrop = tp->rcv_nxt - th->th_seq; 2371 dupseg = false; 2372 if (todrop > 0) { 2373 if (tiflags & TH_SYN) { 2374 tiflags &= ~TH_SYN; 2375 th->th_seq++; 2376 if (th->th_urp > 1) 2377 th->th_urp--; 2378 else { 2379 tiflags &= ~TH_URG; 2380 th->th_urp = 0; 2381 } 2382 todrop--; 2383 } 2384 if (todrop > tlen || 2385 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 2386 /* 2387 * Any valid FIN or RST must be to the left of the 2388 * window. At this point the FIN or RST must be a 2389 * duplicate or out of sequence; drop it. 2390 */ 2391 if (tiflags & TH_RST) 2392 goto drop; 2393 tiflags &= ~(TH_FIN|TH_RST); 2394 /* 2395 * Send an ACK to resynchronize and drop any data. 2396 * But keep on processing for RST or ACK. 2397 */ 2398 tp->t_flags |= TF_ACKNOW; 2399 todrop = tlen; 2400 dupseg = true; 2401 tcps = TCP_STAT_GETREF(); 2402 tcps[TCP_STAT_RCVDUPPACK]++; 2403 tcps[TCP_STAT_RCVDUPBYTE] += todrop; 2404 TCP_STAT_PUTREF(); 2405 } else if ((tiflags & TH_RST) && 2406 th->th_seq != tp->rcv_nxt) { 2407 /* 2408 * Test for reset before adjusting the sequence 2409 * number for overlapping data. 2410 */ 2411 goto dropafterack_ratelim; 2412 } else { 2413 tcps = TCP_STAT_GETREF(); 2414 tcps[TCP_STAT_RCVPARTDUPPACK]++; 2415 tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop; 2416 TCP_STAT_PUTREF(); 2417 } 2418 tcp_new_dsack(tp, th->th_seq, todrop); 2419 hdroptlen += todrop; /*drop from head afterwards*/ 2420 th->th_seq += todrop; 2421 tlen -= todrop; 2422 if (th->th_urp > todrop) 2423 th->th_urp -= todrop; 2424 else { 2425 tiflags &= ~TH_URG; 2426 th->th_urp = 0; 2427 } 2428 } 2429 2430 /* 2431 * If new data are received on a connection after the 2432 * user processes are gone, then RST the other end. 2433 */ 2434 if ((so->so_state & SS_NOFDREF) && 2435 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 2436 tp = tcp_close(tp); 2437 TCP_STATINC(TCP_STAT_RCVAFTERCLOSE); 2438 goto dropwithreset; 2439 } 2440 2441 /* 2442 * If segment ends after window, drop trailing data 2443 * (and PUSH and FIN); if nothing left, just ACK. 2444 */ 2445 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 2446 if (todrop > 0) { 2447 TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN); 2448 if (todrop >= tlen) { 2449 /* 2450 * The segment actually starts after the window. 2451 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen 2452 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0 2453 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd 2454 */ 2455 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen); 2456 /* 2457 * If a new connection request is received 2458 * while in TIME_WAIT, drop the old connection 2459 * and start over if the sequence numbers 2460 * are above the previous ones. 2461 * 2462 * NOTE: We will checksum the packet again, and 2463 * so we need to put the header fields back into 2464 * network order! 2465 * XXX This kind of sucks, but we don't expect 2466 * XXX this to happen very often, so maybe it 2467 * XXX doesn't matter so much. 2468 */ 2469 if (tiflags & TH_SYN && 2470 tp->t_state == TCPS_TIME_WAIT && 2471 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 2472 tp = tcp_close(tp); 2473 tcp_fields_to_net(th); 2474 goto findpcb; 2475 } 2476 /* 2477 * If window is closed can only take segments at 2478 * window edge, and have to drop data and PUSH from 2479 * incoming segments. Continue processing, but 2480 * remember to ack. Otherwise, drop segment 2481 * and (if not RST) ack. 2482 */ 2483 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2484 tp->t_flags |= TF_ACKNOW; 2485 TCP_STATINC(TCP_STAT_RCVWINPROBE); 2486 } else 2487 goto dropafterack; 2488 } else 2489 TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop); 2490 m_adj(m, -todrop); 2491 tlen -= todrop; 2492 tiflags &= ~(TH_PUSH|TH_FIN); 2493 } 2494 2495 /* 2496 * If last ACK falls within this segment's sequence numbers, 2497 * record the timestamp. 2498 * NOTE: 2499 * 1) That the test incorporates suggestions from the latest 2500 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2501 * 2) That updating only on newer timestamps interferes with 2502 * our earlier PAWS tests, so this check should be solely 2503 * predicated on the sequence space of this segment. 2504 * 3) That we modify the segment boundary check to be 2505 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 2506 * instead of RFC1323's 2507 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2508 * This modified check allows us to overcome RFC1323's 2509 * limitations as described in Stevens TCP/IP Illustrated 2510 * Vol. 2 p.869. In such cases, we can still calculate the 2511 * RTT correctly when RCV.NXT == Last.ACK.Sent. 2512 */ 2513 if (opti.ts_present && 2514 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2515 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2516 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 2517 tp->ts_recent_age = tcp_now; 2518 tp->ts_recent = opti.ts_val; 2519 } 2520 2521 /* 2522 * If the RST bit is set examine the state: 2523 * SYN_RECEIVED STATE: 2524 * If passive open, return to LISTEN state. 2525 * If active open, inform user that connection was refused. 2526 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 2527 * Inform user that connection was reset, and close tcb. 2528 * CLOSING, LAST_ACK, TIME_WAIT STATES 2529 * Close the tcb. 2530 */ 2531 if (tiflags & TH_RST) { 2532 if (th->th_seq != tp->rcv_nxt) 2533 goto dropafterack_ratelim; 2534 2535 switch (tp->t_state) { 2536 case TCPS_SYN_RECEIVED: 2537 so->so_error = ECONNREFUSED; 2538 goto close; 2539 2540 case TCPS_ESTABLISHED: 2541 case TCPS_FIN_WAIT_1: 2542 case TCPS_FIN_WAIT_2: 2543 case TCPS_CLOSE_WAIT: 2544 so->so_error = ECONNRESET; 2545 close: 2546 tp->t_state = TCPS_CLOSED; 2547 TCP_STATINC(TCP_STAT_DROPS); 2548 tp = tcp_close(tp); 2549 goto drop; 2550 2551 case TCPS_CLOSING: 2552 case TCPS_LAST_ACK: 2553 case TCPS_TIME_WAIT: 2554 tp = tcp_close(tp); 2555 goto drop; 2556 } 2557 } 2558 2559 /* 2560 * Since we've covered the SYN-SENT and SYN-RECEIVED states above 2561 * we must be in a synchronized state. RFC791 states (under RST 2562 * generation) that any unacceptable segment (an out-of-order SYN 2563 * qualifies) received in a synchronized state must elicit only an 2564 * empty acknowledgment segment ... and the connection remains in 2565 * the same state. 2566 */ 2567 if (tiflags & TH_SYN) { 2568 if (tp->rcv_nxt == th->th_seq) { 2569 tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1, 2570 TH_ACK); 2571 if (tcp_saveti) 2572 m_freem(tcp_saveti); 2573 return; 2574 } 2575 2576 goto dropafterack_ratelim; 2577 } 2578 2579 /* 2580 * If the ACK bit is off we drop the segment and return. 2581 */ 2582 if ((tiflags & TH_ACK) == 0) { 2583 if (tp->t_flags & TF_ACKNOW) 2584 goto dropafterack; 2585 else 2586 goto drop; 2587 } 2588 2589 /* 2590 * Ack processing. 2591 */ 2592 switch (tp->t_state) { 2593 2594 /* 2595 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 2596 * ESTABLISHED state and continue processing, otherwise 2597 * send an RST. 2598 */ 2599 case TCPS_SYN_RECEIVED: 2600 if (SEQ_GT(tp->snd_una, th->th_ack) || 2601 SEQ_GT(th->th_ack, tp->snd_max)) 2602 goto dropwithreset; 2603 TCP_STATINC(TCP_STAT_CONNECTS); 2604 soisconnected(so); 2605 tcp_established(tp); 2606 /* Do window scaling? */ 2607 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2608 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2609 tp->snd_scale = tp->requested_s_scale; 2610 tp->rcv_scale = tp->request_r_scale; 2611 } 2612 TCP_REASS_LOCK(tp); 2613 (void) tcp_reass(tp, NULL, NULL, &tlen); 2614 tp->snd_wl1 = th->th_seq - 1; 2615 /* fall into ... */ 2616 2617 /* 2618 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2619 * ACKs. If the ack is in the range 2620 * tp->snd_una < th->th_ack <= tp->snd_max 2621 * then advance tp->snd_una to th->th_ack and drop 2622 * data from the retransmission queue. If this ACK reflects 2623 * more up to date window information we update our window information. 2624 */ 2625 case TCPS_ESTABLISHED: 2626 case TCPS_FIN_WAIT_1: 2627 case TCPS_FIN_WAIT_2: 2628 case TCPS_CLOSE_WAIT: 2629 case TCPS_CLOSING: 2630 case TCPS_LAST_ACK: 2631 case TCPS_TIME_WAIT: 2632 2633 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 2634 if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) { 2635 TCP_STATINC(TCP_STAT_RCVDUPACK); 2636 /* 2637 * If we have outstanding data (other than 2638 * a window probe), this is a completely 2639 * duplicate ack (ie, window info didn't 2640 * change), the ack is the biggest we've 2641 * seen and we've seen exactly our rexmt 2642 * threshhold of them, assume a packet 2643 * has been dropped and retransmit it. 2644 * Kludge snd_nxt & the congestion 2645 * window so we send only this one 2646 * packet. 2647 */ 2648 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 2649 th->th_ack != tp->snd_una) 2650 tp->t_dupacks = 0; 2651 else if (tp->t_partialacks < 0 && 2652 (++tp->t_dupacks == tcprexmtthresh || 2653 TCP_FACK_FASTRECOV(tp))) { 2654 /* 2655 * Do the fast retransmit, and adjust 2656 * congestion control paramenters. 2657 */ 2658 if (tp->t_congctl->fast_retransmit(tp, th)) { 2659 /* False fast retransmit */ 2660 break; 2661 } else 2662 goto drop; 2663 } else if (tp->t_dupacks > tcprexmtthresh) { 2664 tp->snd_cwnd += tp->t_segsz; 2665 KERNEL_LOCK(1, NULL); 2666 (void) tcp_output(tp); 2667 KERNEL_UNLOCK_ONE(NULL); 2668 goto drop; 2669 } 2670 } else { 2671 /* 2672 * If the ack appears to be very old, only 2673 * allow data that is in-sequence. This 2674 * makes it somewhat more difficult to insert 2675 * forged data by guessing sequence numbers. 2676 * Sent an ack to try to update the send 2677 * sequence number on the other side. 2678 */ 2679 if (tlen && th->th_seq != tp->rcv_nxt && 2680 SEQ_LT(th->th_ack, 2681 tp->snd_una - tp->max_sndwnd)) 2682 goto dropafterack; 2683 } 2684 break; 2685 } 2686 /* 2687 * If the congestion window was inflated to account 2688 * for the other side's cached packets, retract it. 2689 */ 2690 tp->t_congctl->fast_retransmit_newack(tp, th); 2691 2692 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2693 TCP_STATINC(TCP_STAT_RCVACKTOOMUCH); 2694 goto dropafterack; 2695 } 2696 acked = th->th_ack - tp->snd_una; 2697 tcps = TCP_STAT_GETREF(); 2698 tcps[TCP_STAT_RCVACKPACK]++; 2699 tcps[TCP_STAT_RCVACKBYTE] += acked; 2700 TCP_STAT_PUTREF(); 2701 2702 /* 2703 * If we have a timestamp reply, update smoothed 2704 * round trip time. If no timestamp is present but 2705 * transmit timer is running and timed sequence 2706 * number was acked, update smoothed round trip time. 2707 * Since we now have an rtt measurement, cancel the 2708 * timer backoff (cf., Phil Karn's retransmit alg.). 2709 * Recompute the initial retransmit timer. 2710 */ 2711 if (ts_rtt) 2712 tcp_xmit_timer(tp, ts_rtt - 1); 2713 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 2714 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 2715 2716 /* 2717 * If all outstanding data is acked, stop retransmit 2718 * timer and remember to restart (more output or persist). 2719 * If there is more data to be acked, restart retransmit 2720 * timer, using current (possibly backed-off) value. 2721 */ 2722 if (th->th_ack == tp->snd_max) { 2723 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2724 needoutput = 1; 2725 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 2726 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 2727 2728 /* 2729 * New data has been acked, adjust the congestion window. 2730 */ 2731 tp->t_congctl->newack(tp, th); 2732 2733 nd6_hint(tp); 2734 if (acked > so->so_snd.sb_cc) { 2735 tp->snd_wnd -= so->so_snd.sb_cc; 2736 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 2737 ourfinisacked = 1; 2738 } else { 2739 if (acked > (tp->t_lastoff - tp->t_inoff)) 2740 tp->t_lastm = NULL; 2741 sbdrop(&so->so_snd, acked); 2742 tp->t_lastoff -= acked; 2743 if (tp->snd_wnd > acked) 2744 tp->snd_wnd -= acked; 2745 else 2746 tp->snd_wnd = 0; 2747 ourfinisacked = 0; 2748 } 2749 sowwakeup(so); 2750 2751 icmp_check(tp, th, acked); 2752 2753 tp->snd_una = th->th_ack; 2754 if (SEQ_GT(tp->snd_una, tp->snd_fack)) 2755 tp->snd_fack = tp->snd_una; 2756 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2757 tp->snd_nxt = tp->snd_una; 2758 if (SEQ_LT(tp->snd_high, tp->snd_una)) 2759 tp->snd_high = tp->snd_una; 2760 2761 switch (tp->t_state) { 2762 2763 /* 2764 * In FIN_WAIT_1 STATE in addition to the processing 2765 * for the ESTABLISHED state if our FIN is now acknowledged 2766 * then enter FIN_WAIT_2. 2767 */ 2768 case TCPS_FIN_WAIT_1: 2769 if (ourfinisacked) { 2770 /* 2771 * If we can't receive any more 2772 * data, then closing user can proceed. 2773 * Starting the timer is contrary to the 2774 * specification, but if we don't get a FIN 2775 * we'll hang forever. 2776 */ 2777 if (so->so_state & SS_CANTRCVMORE) { 2778 soisdisconnected(so); 2779 if (tp->t_maxidle > 0) 2780 TCP_TIMER_ARM(tp, TCPT_2MSL, 2781 tp->t_maxidle); 2782 } 2783 tp->t_state = TCPS_FIN_WAIT_2; 2784 } 2785 break; 2786 2787 /* 2788 * In CLOSING STATE in addition to the processing for 2789 * the ESTABLISHED state if the ACK acknowledges our FIN 2790 * then enter the TIME-WAIT state, otherwise ignore 2791 * the segment. 2792 */ 2793 case TCPS_CLOSING: 2794 if (ourfinisacked) { 2795 tp->t_state = TCPS_TIME_WAIT; 2796 tcp_canceltimers(tp); 2797 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 2798 soisdisconnected(so); 2799 } 2800 break; 2801 2802 /* 2803 * In LAST_ACK, we may still be waiting for data to drain 2804 * and/or to be acked, as well as for the ack of our FIN. 2805 * If our FIN is now acknowledged, delete the TCB, 2806 * enter the closed state and return. 2807 */ 2808 case TCPS_LAST_ACK: 2809 if (ourfinisacked) { 2810 tp = tcp_close(tp); 2811 goto drop; 2812 } 2813 break; 2814 2815 /* 2816 * In TIME_WAIT state the only thing that should arrive 2817 * is a retransmission of the remote FIN. Acknowledge 2818 * it and restart the finack timer. 2819 */ 2820 case TCPS_TIME_WAIT: 2821 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 2822 goto dropafterack; 2823 } 2824 } 2825 2826 step6: 2827 /* 2828 * Update window information. 2829 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2830 */ 2831 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 2832 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2833 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2834 /* keep track of pure window updates */ 2835 if (tlen == 0 && 2836 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2837 TCP_STATINC(TCP_STAT_RCVWINUPD); 2838 tp->snd_wnd = tiwin; 2839 tp->snd_wl1 = th->th_seq; 2840 tp->snd_wl2 = th->th_ack; 2841 if (tp->snd_wnd > tp->max_sndwnd) 2842 tp->max_sndwnd = tp->snd_wnd; 2843 needoutput = 1; 2844 } 2845 2846 /* 2847 * Process segments with URG. 2848 */ 2849 if ((tiflags & TH_URG) && th->th_urp && 2850 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2851 /* 2852 * This is a kludge, but if we receive and accept 2853 * random urgent pointers, we'll crash in 2854 * soreceive. It's hard to imagine someone 2855 * actually wanting to send this much urgent data. 2856 */ 2857 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2858 th->th_urp = 0; /* XXX */ 2859 tiflags &= ~TH_URG; /* XXX */ 2860 goto dodata; /* XXX */ 2861 } 2862 /* 2863 * If this segment advances the known urgent pointer, 2864 * then mark the data stream. This should not happen 2865 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2866 * a FIN has been received from the remote side. 2867 * In these states we ignore the URG. 2868 * 2869 * According to RFC961 (Assigned Protocols), 2870 * the urgent pointer points to the last octet 2871 * of urgent data. We continue, however, 2872 * to consider it to indicate the first octet 2873 * of data past the urgent section as the original 2874 * spec states (in one of two places). 2875 */ 2876 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2877 tp->rcv_up = th->th_seq + th->th_urp; 2878 so->so_oobmark = so->so_rcv.sb_cc + 2879 (tp->rcv_up - tp->rcv_nxt) - 1; 2880 if (so->so_oobmark == 0) 2881 so->so_state |= SS_RCVATMARK; 2882 sohasoutofband(so); 2883 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2884 } 2885 /* 2886 * Remove out of band data so doesn't get presented to user. 2887 * This can happen independent of advancing the URG pointer, 2888 * but if two URG's are pending at once, some out-of-band 2889 * data may creep in... ick. 2890 */ 2891 if (th->th_urp <= (u_int16_t) tlen 2892 #ifdef SO_OOBINLINE 2893 && (so->so_options & SO_OOBINLINE) == 0 2894 #endif 2895 ) 2896 tcp_pulloutofband(so, th, m, hdroptlen); 2897 } else 2898 /* 2899 * If no out of band data is expected, 2900 * pull receive urgent pointer along 2901 * with the receive window. 2902 */ 2903 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2904 tp->rcv_up = tp->rcv_nxt; 2905 dodata: /* XXX */ 2906 2907 /* 2908 * Process the segment text, merging it into the TCP sequencing queue, 2909 * and arranging for acknowledgement of receipt if necessary. 2910 * This process logically involves adjusting tp->rcv_wnd as data 2911 * is presented to the user (this happens in tcp_usrreq.c, 2912 * tcp_rcvd()). If a FIN has already been received on this 2913 * connection then we just ignore the text. 2914 */ 2915 if ((tlen || (tiflags & TH_FIN)) && 2916 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2917 /* 2918 * Insert segment ti into reassembly queue of tcp with 2919 * control block tp. Return TH_FIN if reassembly now includes 2920 * a segment with FIN. The macro form does the common case 2921 * inline (segment is the next to be received on an 2922 * established connection, and the queue is empty), 2923 * avoiding linkage into and removal from the queue and 2924 * repetition of various conversions. 2925 * Set DELACK for segments received in order, but ack 2926 * immediately when segments are out of order 2927 * (so fast retransmit can work). 2928 */ 2929 /* NOTE: this was TCP_REASS() macro, but used only once */ 2930 TCP_REASS_LOCK(tp); 2931 if (th->th_seq == tp->rcv_nxt && 2932 TAILQ_FIRST(&tp->segq) == NULL && 2933 tp->t_state == TCPS_ESTABLISHED) { 2934 tcp_setup_ack(tp, th); 2935 tp->rcv_nxt += tlen; 2936 tiflags = th->th_flags & TH_FIN; 2937 tcps = TCP_STAT_GETREF(); 2938 tcps[TCP_STAT_RCVPACK]++; 2939 tcps[TCP_STAT_RCVBYTE] += tlen; 2940 TCP_STAT_PUTREF(); 2941 nd6_hint(tp); 2942 if (so->so_state & SS_CANTRCVMORE) 2943 m_freem(m); 2944 else { 2945 m_adj(m, hdroptlen); 2946 sbappendstream(&(so)->so_rcv, m); 2947 } 2948 TCP_REASS_UNLOCK(tp); 2949 sorwakeup(so); 2950 } else { 2951 m_adj(m, hdroptlen); 2952 tiflags = tcp_reass(tp, th, m, &tlen); 2953 tp->t_flags |= TF_ACKNOW; 2954 } 2955 2956 /* 2957 * Note the amount of data that peer has sent into 2958 * our window, in order to estimate the sender's 2959 * buffer size. 2960 */ 2961 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2962 } else { 2963 m_freem(m); 2964 m = NULL; 2965 tiflags &= ~TH_FIN; 2966 } 2967 2968 /* 2969 * If FIN is received ACK the FIN and let the user know 2970 * that the connection is closing. Ignore a FIN received before 2971 * the connection is fully established. 2972 */ 2973 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2974 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2975 socantrcvmore(so); 2976 tp->t_flags |= TF_ACKNOW; 2977 tp->rcv_nxt++; 2978 } 2979 switch (tp->t_state) { 2980 2981 /* 2982 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2983 */ 2984 case TCPS_ESTABLISHED: 2985 tp->t_state = TCPS_CLOSE_WAIT; 2986 break; 2987 2988 /* 2989 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2990 * enter the CLOSING state. 2991 */ 2992 case TCPS_FIN_WAIT_1: 2993 tp->t_state = TCPS_CLOSING; 2994 break; 2995 2996 /* 2997 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2998 * starting the time-wait timer, turning off the other 2999 * standard timers. 3000 */ 3001 case TCPS_FIN_WAIT_2: 3002 tp->t_state = TCPS_TIME_WAIT; 3003 tcp_canceltimers(tp); 3004 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 3005 soisdisconnected(so); 3006 break; 3007 3008 /* 3009 * In TIME_WAIT state restart the 2 MSL time_wait timer. 3010 */ 3011 case TCPS_TIME_WAIT: 3012 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 3013 break; 3014 } 3015 } 3016 #ifdef TCP_DEBUG 3017 if (so->so_options & SO_DEBUG) 3018 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 3019 #endif 3020 3021 /* 3022 * Return any desired output. 3023 */ 3024 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 3025 KERNEL_LOCK(1, NULL); 3026 (void) tcp_output(tp); 3027 KERNEL_UNLOCK_ONE(NULL); 3028 } 3029 if (tcp_saveti) 3030 m_freem(tcp_saveti); 3031 3032 if (tp->t_state == TCPS_TIME_WAIT 3033 && (so->so_state & SS_NOFDREF) 3034 && (tp->t_inpcb || af != AF_INET) 3035 && (tp->t_in6pcb || af != AF_INET6) 3036 && ((af == AF_INET ? tcp4_vtw_enable : tcp6_vtw_enable) & 1) != 0 3037 && TAILQ_EMPTY(&tp->segq) 3038 && vtw_add(af, tp)) { 3039 ; 3040 } 3041 return; 3042 3043 badsyn: 3044 /* 3045 * Received a bad SYN. Increment counters and dropwithreset. 3046 */ 3047 TCP_STATINC(TCP_STAT_BADSYN); 3048 tp = NULL; 3049 goto dropwithreset; 3050 3051 dropafterack: 3052 /* 3053 * Generate an ACK dropping incoming segment if it occupies 3054 * sequence space, where the ACK reflects our state. 3055 */ 3056 if (tiflags & TH_RST) 3057 goto drop; 3058 goto dropafterack2; 3059 3060 dropafterack_ratelim: 3061 /* 3062 * We may want to rate-limit ACKs against SYN/RST attack. 3063 */ 3064 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 3065 tcp_ackdrop_ppslim) == 0) { 3066 /* XXX stat */ 3067 goto drop; 3068 } 3069 /* ...fall into dropafterack2... */ 3070 3071 dropafterack2: 3072 m_freem(m); 3073 tp->t_flags |= TF_ACKNOW; 3074 KERNEL_LOCK(1, NULL); 3075 (void) tcp_output(tp); 3076 KERNEL_UNLOCK_ONE(NULL); 3077 if (tcp_saveti) 3078 m_freem(tcp_saveti); 3079 return; 3080 3081 dropwithreset_ratelim: 3082 /* 3083 * We may want to rate-limit RSTs in certain situations, 3084 * particularly if we are sending an RST in response to 3085 * an attempt to connect to or otherwise communicate with 3086 * a port for which we have no socket. 3087 */ 3088 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 3089 tcp_rst_ppslim) == 0) { 3090 /* XXX stat */ 3091 goto drop; 3092 } 3093 /* ...fall into dropwithreset... */ 3094 3095 dropwithreset: 3096 /* 3097 * Generate a RST, dropping incoming segment. 3098 * Make ACK acceptable to originator of segment. 3099 */ 3100 if (tiflags & TH_RST) 3101 goto drop; 3102 3103 switch (af) { 3104 #ifdef INET6 3105 case AF_INET6: 3106 /* For following calls to tcp_respond */ 3107 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) 3108 goto drop; 3109 break; 3110 #endif /* INET6 */ 3111 case AF_INET: 3112 if (IN_MULTICAST(ip->ip_dst.s_addr) || 3113 in_broadcast(ip->ip_dst, m_get_rcvif_NOMPSAFE(m))) 3114 goto drop; 3115 } 3116 3117 if (tiflags & TH_ACK) 3118 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 3119 else { 3120 if (tiflags & TH_SYN) 3121 tlen++; 3122 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 3123 TH_RST|TH_ACK); 3124 } 3125 if (tcp_saveti) 3126 m_freem(tcp_saveti); 3127 return; 3128 3129 badcsum: 3130 drop: 3131 /* 3132 * Drop space held by incoming segment and return. 3133 */ 3134 if (tp) { 3135 if (tp->t_inpcb) 3136 so = tp->t_inpcb->inp_socket; 3137 #ifdef INET6 3138 else if (tp->t_in6pcb) 3139 so = tp->t_in6pcb->in6p_socket; 3140 #endif 3141 else 3142 so = NULL; 3143 #ifdef TCP_DEBUG 3144 if (so && (so->so_options & SO_DEBUG) != 0) 3145 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 3146 #endif 3147 } 3148 if (tcp_saveti) 3149 m_freem(tcp_saveti); 3150 m_freem(m); 3151 return; 3152 } 3153 3154 #ifdef TCP_SIGNATURE 3155 int 3156 tcp_signature_apply(void *fstate, void *data, u_int len) 3157 { 3158 3159 MD5Update(fstate, (u_char *)data, len); 3160 return (0); 3161 } 3162 3163 struct secasvar * 3164 tcp_signature_getsav(struct mbuf *m, struct tcphdr *th) 3165 { 3166 struct ip *ip; 3167 struct ip6_hdr *ip6; 3168 3169 ip = mtod(m, struct ip *); 3170 switch (ip->ip_v) { 3171 case 4: 3172 ip = mtod(m, struct ip *); 3173 ip6 = NULL; 3174 break; 3175 case 6: 3176 ip = NULL; 3177 ip6 = mtod(m, struct ip6_hdr *); 3178 break; 3179 default: 3180 return (NULL); 3181 } 3182 3183 #ifdef IPSEC 3184 if (ipsec_used) { 3185 union sockaddr_union dst; 3186 /* Extract the destination from the IP header in the mbuf. */ 3187 memset(&dst, 0, sizeof(union sockaddr_union)); 3188 if (ip != NULL) { 3189 dst.sa.sa_len = sizeof(struct sockaddr_in); 3190 dst.sa.sa_family = AF_INET; 3191 dst.sin.sin_addr = ip->ip_dst; 3192 } else { 3193 dst.sa.sa_len = sizeof(struct sockaddr_in6); 3194 dst.sa.sa_family = AF_INET6; 3195 dst.sin6.sin6_addr = ip6->ip6_dst; 3196 } 3197 3198 /* 3199 * Look up an SADB entry which matches the address of the peer. 3200 */ 3201 return KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI), 0, 0); 3202 } 3203 return NULL; 3204 #else 3205 if (ip) 3206 return key_allocsa(AF_INET, (void *)&ip->ip_src, 3207 (void *)&ip->ip_dst, IPPROTO_TCP, 3208 htonl(TCP_SIG_SPI), 0, 0); 3209 else 3210 return key_allocsa(AF_INET6, (void *)&ip6->ip6_src, 3211 (void *)&ip6->ip6_dst, IPPROTO_TCP, 3212 htonl(TCP_SIG_SPI), 0, 0); 3213 #endif 3214 } 3215 3216 int 3217 tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff, 3218 struct secasvar *sav, char *sig) 3219 { 3220 MD5_CTX ctx; 3221 struct ip *ip; 3222 struct ipovly *ipovly; 3223 #ifdef INET6 3224 struct ip6_hdr *ip6; 3225 struct ip6_hdr_pseudo ip6pseudo; 3226 #endif /* INET6 */ 3227 struct ippseudo ippseudo; 3228 struct tcphdr th0; 3229 int l, tcphdrlen; 3230 3231 if (sav == NULL) 3232 return (-1); 3233 3234 tcphdrlen = th->th_off * 4; 3235 3236 switch (mtod(m, struct ip *)->ip_v) { 3237 case 4: 3238 MD5Init(&ctx); 3239 ip = mtod(m, struct ip *); 3240 memset(&ippseudo, 0, sizeof(ippseudo)); 3241 ipovly = (struct ipovly *)ip; 3242 ippseudo.ippseudo_src = ipovly->ih_src; 3243 ippseudo.ippseudo_dst = ipovly->ih_dst; 3244 ippseudo.ippseudo_pad = 0; 3245 ippseudo.ippseudo_p = IPPROTO_TCP; 3246 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff); 3247 MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo)); 3248 break; 3249 #if INET6 3250 case 6: 3251 MD5Init(&ctx); 3252 ip6 = mtod(m, struct ip6_hdr *); 3253 memset(&ip6pseudo, 0, sizeof(ip6pseudo)); 3254 ip6pseudo.ip6ph_src = ip6->ip6_src; 3255 in6_clearscope(&ip6pseudo.ip6ph_src); 3256 ip6pseudo.ip6ph_dst = ip6->ip6_dst; 3257 in6_clearscope(&ip6pseudo.ip6ph_dst); 3258 ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff); 3259 ip6pseudo.ip6ph_nxt = IPPROTO_TCP; 3260 MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo)); 3261 break; 3262 #endif /* INET6 */ 3263 default: 3264 return (-1); 3265 } 3266 3267 th0 = *th; 3268 th0.th_sum = 0; 3269 MD5Update(&ctx, (char *)&th0, sizeof(th0)); 3270 3271 l = m->m_pkthdr.len - thoff - tcphdrlen; 3272 if (l > 0) 3273 m_apply(m, thoff + tcphdrlen, 3274 m->m_pkthdr.len - thoff - tcphdrlen, 3275 tcp_signature_apply, &ctx); 3276 3277 MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); 3278 MD5Final(sig, &ctx); 3279 3280 return (0); 3281 } 3282 #endif 3283 3284 /* 3285 * tcp_dooptions: parse and process tcp options. 3286 * 3287 * returns -1 if this segment should be dropped. (eg. wrong signature) 3288 * otherwise returns 0. 3289 */ 3290 3291 static int 3292 tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt, 3293 struct tcphdr *th, 3294 struct mbuf *m, int toff, struct tcp_opt_info *oi) 3295 { 3296 u_int16_t mss; 3297 int opt, optlen = 0; 3298 #ifdef TCP_SIGNATURE 3299 void *sigp = NULL; 3300 char sigbuf[TCP_SIGLEN]; 3301 struct secasvar *sav = NULL; 3302 #endif 3303 3304 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 3305 opt = cp[0]; 3306 if (opt == TCPOPT_EOL) 3307 break; 3308 if (opt == TCPOPT_NOP) 3309 optlen = 1; 3310 else { 3311 if (cnt < 2) 3312 break; 3313 optlen = cp[1]; 3314 if (optlen < 2 || optlen > cnt) 3315 break; 3316 } 3317 switch (opt) { 3318 3319 default: 3320 continue; 3321 3322 case TCPOPT_MAXSEG: 3323 if (optlen != TCPOLEN_MAXSEG) 3324 continue; 3325 if (!(th->th_flags & TH_SYN)) 3326 continue; 3327 if (TCPS_HAVERCVDSYN(tp->t_state)) 3328 continue; 3329 bcopy(cp + 2, &mss, sizeof(mss)); 3330 oi->maxseg = ntohs(mss); 3331 break; 3332 3333 case TCPOPT_WINDOW: 3334 if (optlen != TCPOLEN_WINDOW) 3335 continue; 3336 if (!(th->th_flags & TH_SYN)) 3337 continue; 3338 if (TCPS_HAVERCVDSYN(tp->t_state)) 3339 continue; 3340 tp->t_flags |= TF_RCVD_SCALE; 3341 tp->requested_s_scale = cp[2]; 3342 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 3343 char buf[INET6_ADDRSTRLEN]; 3344 struct ip *ip = mtod(m, struct ip *); 3345 #ifdef INET6 3346 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); 3347 #endif 3348 if (ip) 3349 in_print(buf, sizeof(buf), 3350 &ip->ip_src); 3351 #ifdef INET6 3352 else if (ip6) 3353 in6_print(buf, sizeof(buf), 3354 &ip6->ip6_src); 3355 #endif 3356 else 3357 strlcpy(buf, "(unknown)", sizeof(buf)); 3358 log(LOG_ERR, "TCP: invalid wscale %d from %s, " 3359 "assuming %d\n", 3360 tp->requested_s_scale, buf, 3361 TCP_MAX_WINSHIFT); 3362 tp->requested_s_scale = TCP_MAX_WINSHIFT; 3363 } 3364 break; 3365 3366 case TCPOPT_TIMESTAMP: 3367 if (optlen != TCPOLEN_TIMESTAMP) 3368 continue; 3369 oi->ts_present = 1; 3370 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 3371 NTOHL(oi->ts_val); 3372 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 3373 NTOHL(oi->ts_ecr); 3374 3375 if (!(th->th_flags & TH_SYN)) 3376 continue; 3377 if (TCPS_HAVERCVDSYN(tp->t_state)) 3378 continue; 3379 /* 3380 * A timestamp received in a SYN makes 3381 * it ok to send timestamp requests and replies. 3382 */ 3383 tp->t_flags |= TF_RCVD_TSTMP; 3384 tp->ts_recent = oi->ts_val; 3385 tp->ts_recent_age = tcp_now; 3386 break; 3387 3388 case TCPOPT_SACK_PERMITTED: 3389 if (optlen != TCPOLEN_SACK_PERMITTED) 3390 continue; 3391 if (!(th->th_flags & TH_SYN)) 3392 continue; 3393 if (TCPS_HAVERCVDSYN(tp->t_state)) 3394 continue; 3395 if (tcp_do_sack) { 3396 tp->t_flags |= TF_SACK_PERMIT; 3397 tp->t_flags |= TF_WILL_SACK; 3398 } 3399 break; 3400 3401 case TCPOPT_SACK: 3402 tcp_sack_option(tp, th, cp, optlen); 3403 break; 3404 #ifdef TCP_SIGNATURE 3405 case TCPOPT_SIGNATURE: 3406 if (optlen != TCPOLEN_SIGNATURE) 3407 continue; 3408 if (sigp && memcmp(sigp, cp + 2, TCP_SIGLEN)) 3409 return (-1); 3410 3411 sigp = sigbuf; 3412 memcpy(sigbuf, cp + 2, TCP_SIGLEN); 3413 tp->t_flags |= TF_SIGNATURE; 3414 break; 3415 #endif 3416 } 3417 } 3418 3419 #ifndef TCP_SIGNATURE 3420 return 0; 3421 #else 3422 if (tp->t_flags & TF_SIGNATURE) { 3423 3424 sav = tcp_signature_getsav(m, th); 3425 3426 if (sav == NULL && tp->t_state == TCPS_LISTEN) 3427 return (-1); 3428 } 3429 3430 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) 3431 goto out; 3432 3433 if (sigp) { 3434 char sig[TCP_SIGLEN]; 3435 3436 tcp_fields_to_net(th); 3437 if (tcp_signature(m, th, toff, sav, sig) < 0) { 3438 tcp_fields_to_host(th); 3439 goto out; 3440 } 3441 tcp_fields_to_host(th); 3442 3443 if (memcmp(sig, sigp, TCP_SIGLEN)) { 3444 TCP_STATINC(TCP_STAT_BADSIG); 3445 goto out; 3446 } else 3447 TCP_STATINC(TCP_STAT_GOODSIG); 3448 3449 key_sa_recordxfer(sav, m); 3450 KEY_FREESAV(&sav); 3451 } 3452 return 0; 3453 out: 3454 if (sav != NULL) 3455 KEY_FREESAV(&sav); 3456 return -1; 3457 #endif 3458 } 3459 3460 /* 3461 * Pull out of band byte out of a segment so 3462 * it doesn't appear in the user's data queue. 3463 * It is still reflected in the segment length for 3464 * sequencing purposes. 3465 */ 3466 void 3467 tcp_pulloutofband(struct socket *so, struct tcphdr *th, 3468 struct mbuf *m, int off) 3469 { 3470 int cnt = off + th->th_urp - 1; 3471 3472 while (cnt >= 0) { 3473 if (m->m_len > cnt) { 3474 char *cp = mtod(m, char *) + cnt; 3475 struct tcpcb *tp = sototcpcb(so); 3476 3477 tp->t_iobc = *cp; 3478 tp->t_oobflags |= TCPOOB_HAVEDATA; 3479 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 3480 m->m_len--; 3481 return; 3482 } 3483 cnt -= m->m_len; 3484 m = m->m_next; 3485 if (m == 0) 3486 break; 3487 } 3488 panic("tcp_pulloutofband"); 3489 } 3490 3491 /* 3492 * Collect new round-trip time estimate 3493 * and update averages and current timeout. 3494 * 3495 * rtt is in units of slow ticks (typically 500 ms) -- essentially the 3496 * difference of two timestamps. 3497 */ 3498 void 3499 tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt) 3500 { 3501 int32_t delta; 3502 3503 TCP_STATINC(TCP_STAT_RTTUPDATED); 3504 if (tp->t_srtt != 0) { 3505 /* 3506 * Compute the amount to add to srtt for smoothing, 3507 * *alpha, or 2^(-TCP_RTT_SHIFT). Because 3508 * srtt is stored in 1/32 slow ticks, we conceptually 3509 * shift left 5 bits, subtract srtt to get the 3510 * diference, and then shift right by TCP_RTT_SHIFT 3511 * (3) to obtain 1/8 of the difference. 3512 */ 3513 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 3514 /* 3515 * This can never happen, because delta's lowest 3516 * possible value is 1/8 of t_srtt. But if it does, 3517 * set srtt to some reasonable value, here chosen 3518 * as 1/8 tick. 3519 */ 3520 if ((tp->t_srtt += delta) <= 0) 3521 tp->t_srtt = 1 << 2; 3522 /* 3523 * RFC2988 requires that rttvar be updated first. 3524 * This code is compliant because "delta" is the old 3525 * srtt minus the new observation (scaled). 3526 * 3527 * RFC2988 says: 3528 * rttvar = (1-beta) * rttvar + beta * |srtt-observed| 3529 * 3530 * delta is in units of 1/32 ticks, and has then been 3531 * divided by 8. This is equivalent to being in 1/16s 3532 * units and divided by 4. Subtract from it 1/4 of 3533 * the existing rttvar to form the (signed) amount to 3534 * adjust. 3535 */ 3536 if (delta < 0) 3537 delta = -delta; 3538 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 3539 /* 3540 * As with srtt, this should never happen. There is 3541 * no support in RFC2988 for this operation. But 1/4s 3542 * as rttvar when faced with something arguably wrong 3543 * is ok. 3544 */ 3545 if ((tp->t_rttvar += delta) <= 0) 3546 tp->t_rttvar = 1 << 2; 3547 3548 /* 3549 * If srtt exceeds .01 second, ensure we use the 'remote' MSL 3550 * Problem is: it doesn't work. Disabled by defaulting 3551 * tcp_rttlocal to 0; see corresponding code in 3552 * tcp_subr that selects local vs remote in a different way. 3553 * 3554 * The static branch prediction hint here should be removed 3555 * when the rtt estimator is fixed and the rtt_enable code 3556 * is turned back on. 3557 */ 3558 if (__predict_false(tcp_rttlocal) && tcp_msl_enable 3559 && tp->t_srtt > tcp_msl_remote_threshold 3560 && tp->t_msl < tcp_msl_remote) { 3561 tp->t_msl = tcp_msl_remote; 3562 } 3563 } else { 3564 /* 3565 * This is the first measurement. Per RFC2988, 2.2, 3566 * set rtt=R and srtt=R/2. 3567 * For srtt, storage representation is 1/32 ticks, 3568 * so shift left by 5. 3569 * For rttvar, storage representation is 1/16 ticks, 3570 * So shift left by 4, but then right by 1 to halve. 3571 */ 3572 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 3573 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 3574 } 3575 tp->t_rtttime = 0; 3576 tp->t_rxtshift = 0; 3577 3578 /* 3579 * the retransmit should happen at rtt + 4 * rttvar. 3580 * Because of the way we do the smoothing, srtt and rttvar 3581 * will each average +1/2 tick of bias. When we compute 3582 * the retransmit timer, we want 1/2 tick of rounding and 3583 * 1 extra tick because of +-1/2 tick uncertainty in the 3584 * firing of the timer. The bias will give us exactly the 3585 * 1.5 tick we need. But, because the bias is 3586 * statistical, we have to test that we don't drop below 3587 * the minimum feasible timer (which is 2 ticks). 3588 */ 3589 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3590 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 3591 3592 /* 3593 * We received an ack for a packet that wasn't retransmitted; 3594 * it is probably safe to discard any error indications we've 3595 * received recently. This isn't quite right, but close enough 3596 * for now (a route might have failed after we sent a segment, 3597 * and the return path might not be symmetrical). 3598 */ 3599 tp->t_softerror = 0; 3600 } 3601 3602 3603 /* 3604 * TCP compressed state engine. Currently used to hold compressed 3605 * state for SYN_RECEIVED. 3606 */ 3607 3608 u_long syn_cache_count; 3609 u_int32_t syn_hash1, syn_hash2; 3610 3611 #define SYN_HASH(sa, sp, dp) \ 3612 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 3613 ((u_int32_t)(sp)))^syn_hash2))) 3614 #ifndef INET6 3615 #define SYN_HASHALL(hash, src, dst) \ 3616 do { \ 3617 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 3618 ((const struct sockaddr_in *)(src))->sin_port, \ 3619 ((const struct sockaddr_in *)(dst))->sin_port); \ 3620 } while (/*CONSTCOND*/ 0) 3621 #else 3622 #define SYN_HASH6(sa, sp, dp) \ 3623 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 3624 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 3625 & 0x7fffffff) 3626 3627 #define SYN_HASHALL(hash, src, dst) \ 3628 do { \ 3629 switch ((src)->sa_family) { \ 3630 case AF_INET: \ 3631 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 3632 ((const struct sockaddr_in *)(src))->sin_port, \ 3633 ((const struct sockaddr_in *)(dst))->sin_port); \ 3634 break; \ 3635 case AF_INET6: \ 3636 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ 3637 ((const struct sockaddr_in6 *)(src))->sin6_port, \ 3638 ((const struct sockaddr_in6 *)(dst))->sin6_port); \ 3639 break; \ 3640 default: \ 3641 hash = 0; \ 3642 } \ 3643 } while (/*CONSTCOND*/0) 3644 #endif /* INET6 */ 3645 3646 static struct pool syn_cache_pool; 3647 3648 /* 3649 * We don't estimate RTT with SYNs, so each packet starts with the default 3650 * RTT and each timer step has a fixed timeout value. 3651 */ 3652 #define SYN_CACHE_TIMER_ARM(sc) \ 3653 do { \ 3654 TCPT_RANGESET((sc)->sc_rxtcur, \ 3655 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \ 3656 TCPTV_REXMTMAX); \ 3657 callout_reset(&(sc)->sc_timer, \ 3658 (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \ 3659 } while (/*CONSTCOND*/0) 3660 3661 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 3662 3663 static inline void 3664 syn_cache_rm(struct syn_cache *sc) 3665 { 3666 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 3667 sc, sc_bucketq); 3668 sc->sc_tp = NULL; 3669 LIST_REMOVE(sc, sc_tpq); 3670 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 3671 callout_stop(&sc->sc_timer); 3672 syn_cache_count--; 3673 } 3674 3675 static inline void 3676 syn_cache_put(struct syn_cache *sc) 3677 { 3678 if (sc->sc_ipopts) 3679 (void) m_free(sc->sc_ipopts); 3680 rtcache_free(&sc->sc_route); 3681 sc->sc_flags |= SCF_DEAD; 3682 if (!callout_invoking(&sc->sc_timer)) 3683 callout_schedule(&(sc)->sc_timer, 1); 3684 } 3685 3686 void 3687 syn_cache_init(void) 3688 { 3689 int i; 3690 3691 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 3692 "synpl", NULL, IPL_SOFTNET); 3693 3694 /* Initialize the hash buckets. */ 3695 for (i = 0; i < tcp_syn_cache_size; i++) 3696 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 3697 } 3698 3699 void 3700 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3701 { 3702 struct syn_cache_head *scp; 3703 struct syn_cache *sc2; 3704 int s; 3705 3706 /* 3707 * If there are no entries in the hash table, reinitialize 3708 * the hash secrets. 3709 */ 3710 if (syn_cache_count == 0) { 3711 syn_hash1 = cprng_fast32(); 3712 syn_hash2 = cprng_fast32(); 3713 } 3714 3715 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 3716 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 3717 scp = &tcp_syn_cache[sc->sc_bucketidx]; 3718 3719 /* 3720 * Make sure that we don't overflow the per-bucket 3721 * limit or the total cache size limit. 3722 */ 3723 s = splsoftnet(); 3724 if (scp->sch_length >= tcp_syn_bucket_limit) { 3725 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); 3726 /* 3727 * The bucket is full. Toss the oldest element in the 3728 * bucket. This will be the first entry in the bucket. 3729 */ 3730 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3731 #ifdef DIAGNOSTIC 3732 /* 3733 * This should never happen; we should always find an 3734 * entry in our bucket. 3735 */ 3736 if (sc2 == NULL) 3737 panic("syn_cache_insert: bucketoverflow: impossible"); 3738 #endif 3739 syn_cache_rm(sc2); 3740 syn_cache_put(sc2); /* calls pool_put but see spl above */ 3741 } else if (syn_cache_count >= tcp_syn_cache_limit) { 3742 struct syn_cache_head *scp2, *sce; 3743 3744 TCP_STATINC(TCP_STAT_SC_OVERFLOWED); 3745 /* 3746 * The cache is full. Toss the oldest entry in the 3747 * first non-empty bucket we can find. 3748 * 3749 * XXX We would really like to toss the oldest 3750 * entry in the cache, but we hope that this 3751 * condition doesn't happen very often. 3752 */ 3753 scp2 = scp; 3754 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3755 sce = &tcp_syn_cache[tcp_syn_cache_size]; 3756 for (++scp2; scp2 != scp; scp2++) { 3757 if (scp2 >= sce) 3758 scp2 = &tcp_syn_cache[0]; 3759 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3760 break; 3761 } 3762 #ifdef DIAGNOSTIC 3763 /* 3764 * This should never happen; we should always find a 3765 * non-empty bucket. 3766 */ 3767 if (scp2 == scp) 3768 panic("syn_cache_insert: cacheoverflow: " 3769 "impossible"); 3770 #endif 3771 } 3772 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3773 syn_cache_rm(sc2); 3774 syn_cache_put(sc2); /* calls pool_put but see spl above */ 3775 } 3776 3777 /* 3778 * Initialize the entry's timer. 3779 */ 3780 sc->sc_rxttot = 0; 3781 sc->sc_rxtshift = 0; 3782 SYN_CACHE_TIMER_ARM(sc); 3783 3784 /* Link it from tcpcb entry */ 3785 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3786 3787 /* Put it into the bucket. */ 3788 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3789 scp->sch_length++; 3790 syn_cache_count++; 3791 3792 TCP_STATINC(TCP_STAT_SC_ADDED); 3793 splx(s); 3794 } 3795 3796 /* 3797 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3798 * If we have retransmitted an entry the maximum number of times, expire 3799 * that entry. 3800 */ 3801 void 3802 syn_cache_timer(void *arg) 3803 { 3804 struct syn_cache *sc = arg; 3805 3806 mutex_enter(softnet_lock); 3807 KERNEL_LOCK(1, NULL); 3808 callout_ack(&sc->sc_timer); 3809 3810 if (__predict_false(sc->sc_flags & SCF_DEAD)) { 3811 TCP_STATINC(TCP_STAT_SC_DELAYED_FREE); 3812 callout_destroy(&sc->sc_timer); 3813 pool_put(&syn_cache_pool, sc); 3814 KERNEL_UNLOCK_ONE(NULL); 3815 mutex_exit(softnet_lock); 3816 return; 3817 } 3818 3819 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3820 /* Drop it -- too many retransmissions. */ 3821 goto dropit; 3822 } 3823 3824 /* 3825 * Compute the total amount of time this entry has 3826 * been on a queue. If this entry has been on longer 3827 * than the keep alive timer would allow, expire it. 3828 */ 3829 sc->sc_rxttot += sc->sc_rxtcur; 3830 if (sc->sc_rxttot >= tcp_keepinit) 3831 goto dropit; 3832 3833 TCP_STATINC(TCP_STAT_SC_RETRANSMITTED); 3834 (void) syn_cache_respond(sc, NULL); 3835 3836 /* Advance the timer back-off. */ 3837 sc->sc_rxtshift++; 3838 SYN_CACHE_TIMER_ARM(sc); 3839 3840 KERNEL_UNLOCK_ONE(NULL); 3841 mutex_exit(softnet_lock); 3842 return; 3843 3844 dropit: 3845 TCP_STATINC(TCP_STAT_SC_TIMED_OUT); 3846 syn_cache_rm(sc); 3847 if (sc->sc_ipopts) 3848 (void) m_free(sc->sc_ipopts); 3849 rtcache_free(&sc->sc_route); 3850 callout_destroy(&sc->sc_timer); 3851 pool_put(&syn_cache_pool, sc); 3852 KERNEL_UNLOCK_ONE(NULL); 3853 mutex_exit(softnet_lock); 3854 } 3855 3856 /* 3857 * Remove syn cache created by the specified tcb entry, 3858 * because this does not make sense to keep them 3859 * (if there's no tcb entry, syn cache entry will never be used) 3860 */ 3861 void 3862 syn_cache_cleanup(struct tcpcb *tp) 3863 { 3864 struct syn_cache *sc, *nsc; 3865 int s; 3866 3867 s = splsoftnet(); 3868 3869 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 3870 nsc = LIST_NEXT(sc, sc_tpq); 3871 3872 #ifdef DIAGNOSTIC 3873 if (sc->sc_tp != tp) 3874 panic("invalid sc_tp in syn_cache_cleanup"); 3875 #endif 3876 syn_cache_rm(sc); 3877 syn_cache_put(sc); /* calls pool_put but see spl above */ 3878 } 3879 /* just for safety */ 3880 LIST_INIT(&tp->t_sc); 3881 3882 splx(s); 3883 } 3884 3885 /* 3886 * Find an entry in the syn cache. 3887 */ 3888 struct syn_cache * 3889 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, 3890 struct syn_cache_head **headp) 3891 { 3892 struct syn_cache *sc; 3893 struct syn_cache_head *scp; 3894 u_int32_t hash; 3895 int s; 3896 3897 SYN_HASHALL(hash, src, dst); 3898 3899 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 3900 *headp = scp; 3901 s = splsoftnet(); 3902 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 3903 sc = TAILQ_NEXT(sc, sc_bucketq)) { 3904 if (sc->sc_hash != hash) 3905 continue; 3906 if (!memcmp(&sc->sc_src, src, src->sa_len) && 3907 !memcmp(&sc->sc_dst, dst, dst->sa_len)) { 3908 splx(s); 3909 return (sc); 3910 } 3911 } 3912 splx(s); 3913 return (NULL); 3914 } 3915 3916 /* 3917 * This function gets called when we receive an ACK for a 3918 * socket in the LISTEN state. We look up the connection 3919 * in the syn cache, and if its there, we pull it out of 3920 * the cache and turn it into a full-blown connection in 3921 * the SYN-RECEIVED state. 3922 * 3923 * The return values may not be immediately obvious, and their effects 3924 * can be subtle, so here they are: 3925 * 3926 * NULL SYN was not found in cache; caller should drop the 3927 * packet and send an RST. 3928 * 3929 * -1 We were unable to create the new connection, and are 3930 * aborting it. An ACK,RST is being sent to the peer 3931 * (unless we got screwey sequence numbners; see below), 3932 * because the 3-way handshake has been completed. Caller 3933 * should not free the mbuf, since we may be using it. If 3934 * we are not, we will free it. 3935 * 3936 * Otherwise, the return value is a pointer to the new socket 3937 * associated with the connection. 3938 */ 3939 struct socket * 3940 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, 3941 struct tcphdr *th, unsigned int hlen, unsigned int tlen, 3942 struct socket *so, struct mbuf *m) 3943 { 3944 struct syn_cache *sc; 3945 struct syn_cache_head *scp; 3946 struct inpcb *inp = NULL; 3947 #ifdef INET6 3948 struct in6pcb *in6p = NULL; 3949 #endif 3950 struct tcpcb *tp = 0; 3951 int s; 3952 struct socket *oso; 3953 3954 s = splsoftnet(); 3955 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 3956 splx(s); 3957 return (NULL); 3958 } 3959 3960 /* 3961 * Verify the sequence and ack numbers. Try getting the correct 3962 * response again. 3963 */ 3964 if ((th->th_ack != sc->sc_iss + 1) || 3965 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3966 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3967 (void) syn_cache_respond(sc, m); 3968 splx(s); 3969 return ((struct socket *)(-1)); 3970 } 3971 3972 /* Remove this cache entry */ 3973 syn_cache_rm(sc); 3974 splx(s); 3975 3976 /* 3977 * Ok, create the full blown connection, and set things up 3978 * as they would have been set up if we had created the 3979 * connection when the SYN arrived. If we can't create 3980 * the connection, abort it. 3981 */ 3982 /* 3983 * inp still has the OLD in_pcb stuff, set the 3984 * v6-related flags on the new guy, too. This is 3985 * done particularly for the case where an AF_INET6 3986 * socket is bound only to a port, and a v4 connection 3987 * comes in on that port. 3988 * we also copy the flowinfo from the original pcb 3989 * to the new one. 3990 */ 3991 oso = so; 3992 so = sonewconn(so, true); 3993 if (so == NULL) 3994 goto resetandabort; 3995 3996 switch (so->so_proto->pr_domain->dom_family) { 3997 #ifdef INET 3998 case AF_INET: 3999 inp = sotoinpcb(so); 4000 break; 4001 #endif 4002 #ifdef INET6 4003 case AF_INET6: 4004 in6p = sotoin6pcb(so); 4005 break; 4006 #endif 4007 } 4008 switch (src->sa_family) { 4009 #ifdef INET 4010 case AF_INET: 4011 if (inp) { 4012 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; 4013 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; 4014 inp->inp_options = ip_srcroute(); 4015 in_pcbstate(inp, INP_BOUND); 4016 if (inp->inp_options == NULL) { 4017 inp->inp_options = sc->sc_ipopts; 4018 sc->sc_ipopts = NULL; 4019 } 4020 } 4021 #ifdef INET6 4022 else if (in6p) { 4023 /* IPv4 packet to AF_INET6 socket */ 4024 memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr)); 4025 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); 4026 bcopy(&((struct sockaddr_in *)dst)->sin_addr, 4027 &in6p->in6p_laddr.s6_addr32[3], 4028 sizeof(((struct sockaddr_in *)dst)->sin_addr)); 4029 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; 4030 in6totcpcb(in6p)->t_family = AF_INET; 4031 if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY) 4032 in6p->in6p_flags |= IN6P_IPV6_V6ONLY; 4033 else 4034 in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY; 4035 in6_pcbstate(in6p, IN6P_BOUND); 4036 } 4037 #endif 4038 break; 4039 #endif 4040 #ifdef INET6 4041 case AF_INET6: 4042 if (in6p) { 4043 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; 4044 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; 4045 in6_pcbstate(in6p, IN6P_BOUND); 4046 } 4047 break; 4048 #endif 4049 } 4050 #ifdef INET6 4051 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { 4052 struct in6pcb *oin6p = sotoin6pcb(oso); 4053 /* inherit socket options from the listening socket */ 4054 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); 4055 if (in6p->in6p_flags & IN6P_CONTROLOPTS) { 4056 m_freem(in6p->in6p_options); 4057 in6p->in6p_options = 0; 4058 } 4059 ip6_savecontrol(in6p, &in6p->in6p_options, 4060 mtod(m, struct ip6_hdr *), m); 4061 } 4062 #endif 4063 4064 #if defined(IPSEC) 4065 if (ipsec_used) { 4066 /* 4067 * we make a copy of policy, instead of sharing the policy, for 4068 * better behavior in terms of SA lookup and dead SA removal. 4069 */ 4070 if (inp) { 4071 /* copy old policy into new socket's */ 4072 if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, 4073 inp->inp_sp)) 4074 printf("tcp_input: could not copy policy\n"); 4075 } 4076 #ifdef INET6 4077 else if (in6p) { 4078 /* copy old policy into new socket's */ 4079 if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp, 4080 in6p->in6p_sp)) 4081 printf("tcp_input: could not copy policy\n"); 4082 } 4083 #endif 4084 } 4085 #endif 4086 4087 /* 4088 * Give the new socket our cached route reference. 4089 */ 4090 if (inp) { 4091 rtcache_copy(&inp->inp_route, &sc->sc_route); 4092 rtcache_free(&sc->sc_route); 4093 } 4094 #ifdef INET6 4095 else { 4096 rtcache_copy(&in6p->in6p_route, &sc->sc_route); 4097 rtcache_free(&sc->sc_route); 4098 } 4099 #endif 4100 4101 if (inp) { 4102 struct sockaddr_in sin; 4103 memcpy(&sin, src, src->sa_len); 4104 if (in_pcbconnect(inp, &sin, &lwp0)) { 4105 goto resetandabort; 4106 } 4107 } 4108 #ifdef INET6 4109 else if (in6p) { 4110 struct sockaddr_in6 sin6; 4111 memcpy(&sin6, src, src->sa_len); 4112 if (src->sa_family == AF_INET) { 4113 /* IPv4 packet to AF_INET6 socket */ 4114 in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6); 4115 } 4116 if (in6_pcbconnect(in6p, &sin6, NULL)) { 4117 goto resetandabort; 4118 } 4119 } 4120 #endif 4121 else { 4122 goto resetandabort; 4123 } 4124 4125 if (inp) 4126 tp = intotcpcb(inp); 4127 #ifdef INET6 4128 else if (in6p) 4129 tp = in6totcpcb(in6p); 4130 #endif 4131 else 4132 tp = NULL; 4133 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; 4134 if (sc->sc_request_r_scale != 15) { 4135 tp->requested_s_scale = sc->sc_requested_s_scale; 4136 tp->request_r_scale = sc->sc_request_r_scale; 4137 tp->snd_scale = sc->sc_requested_s_scale; 4138 tp->rcv_scale = sc->sc_request_r_scale; 4139 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 4140 } 4141 if (sc->sc_flags & SCF_TIMESTAMP) 4142 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 4143 tp->ts_timebase = sc->sc_timebase; 4144 4145 tp->t_template = tcp_template(tp); 4146 if (tp->t_template == 0) { 4147 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 4148 so = NULL; 4149 m_freem(m); 4150 goto abort; 4151 } 4152 4153 tp->iss = sc->sc_iss; 4154 tp->irs = sc->sc_irs; 4155 tcp_sendseqinit(tp); 4156 tcp_rcvseqinit(tp); 4157 tp->t_state = TCPS_SYN_RECEIVED; 4158 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit); 4159 TCP_STATINC(TCP_STAT_ACCEPTS); 4160 4161 if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack) 4162 tp->t_flags |= TF_WILL_SACK; 4163 4164 if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn) 4165 tp->t_flags |= TF_ECN_PERMIT; 4166 4167 #ifdef TCP_SIGNATURE 4168 if (sc->sc_flags & SCF_SIGNATURE) 4169 tp->t_flags |= TF_SIGNATURE; 4170 #endif 4171 4172 /* Initialize tp->t_ourmss before we deal with the peer's! */ 4173 tp->t_ourmss = sc->sc_ourmaxseg; 4174 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 4175 4176 /* 4177 * Initialize the initial congestion window. If we 4178 * had to retransmit the SYN,ACK, we must initialize cwnd 4179 * to 1 segment (i.e. the Loss Window). 4180 */ 4181 if (sc->sc_rxtshift) 4182 tp->snd_cwnd = tp->t_peermss; 4183 else { 4184 int ss = tcp_init_win; 4185 #ifdef INET 4186 if (inp != NULL && in_localaddr(inp->inp_faddr)) 4187 ss = tcp_init_win_local; 4188 #endif 4189 #ifdef INET6 4190 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr)) 4191 ss = tcp_init_win_local; 4192 #endif 4193 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); 4194 } 4195 4196 tcp_rmx_rtt(tp); 4197 tp->snd_wl1 = sc->sc_irs; 4198 tp->rcv_up = sc->sc_irs + 1; 4199 4200 /* 4201 * This is what whould have happened in tcp_output() when 4202 * the SYN,ACK was sent. 4203 */ 4204 tp->snd_up = tp->snd_una; 4205 tp->snd_max = tp->snd_nxt = tp->iss+1; 4206 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 4207 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 4208 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 4209 tp->last_ack_sent = tp->rcv_nxt; 4210 tp->t_partialacks = -1; 4211 tp->t_dupacks = 0; 4212 4213 TCP_STATINC(TCP_STAT_SC_COMPLETED); 4214 s = splsoftnet(); 4215 syn_cache_put(sc); 4216 splx(s); 4217 return (so); 4218 4219 resetandabort: 4220 (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 4221 abort: 4222 if (so != NULL) { 4223 (void) soqremque(so, 1); 4224 (void) soabort(so); 4225 mutex_enter(softnet_lock); 4226 } 4227 s = splsoftnet(); 4228 syn_cache_put(sc); 4229 splx(s); 4230 TCP_STATINC(TCP_STAT_SC_ABORTED); 4231 return ((struct socket *)(-1)); 4232 } 4233 4234 /* 4235 * This function is called when we get a RST for a 4236 * non-existent connection, so that we can see if the 4237 * connection is in the syn cache. If it is, zap it. 4238 */ 4239 4240 void 4241 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) 4242 { 4243 struct syn_cache *sc; 4244 struct syn_cache_head *scp; 4245 int s = splsoftnet(); 4246 4247 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 4248 splx(s); 4249 return; 4250 } 4251 if (SEQ_LT(th->th_seq, sc->sc_irs) || 4252 SEQ_GT(th->th_seq, sc->sc_irs+1)) { 4253 splx(s); 4254 return; 4255 } 4256 syn_cache_rm(sc); 4257 TCP_STATINC(TCP_STAT_SC_RESET); 4258 syn_cache_put(sc); /* calls pool_put but see spl above */ 4259 splx(s); 4260 } 4261 4262 void 4263 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, 4264 struct tcphdr *th) 4265 { 4266 struct syn_cache *sc; 4267 struct syn_cache_head *scp; 4268 int s; 4269 4270 s = splsoftnet(); 4271 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { 4272 splx(s); 4273 return; 4274 } 4275 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 4276 if (ntohl (th->th_seq) != sc->sc_iss) { 4277 splx(s); 4278 return; 4279 } 4280 4281 /* 4282 * If we've retransmitted 3 times and this is our second error, 4283 * we remove the entry. Otherwise, we allow it to continue on. 4284 * This prevents us from incorrectly nuking an entry during a 4285 * spurious network outage. 4286 * 4287 * See tcp_notify(). 4288 */ 4289 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { 4290 sc->sc_flags |= SCF_UNREACH; 4291 splx(s); 4292 return; 4293 } 4294 4295 syn_cache_rm(sc); 4296 TCP_STATINC(TCP_STAT_SC_UNREACH); 4297 syn_cache_put(sc); /* calls pool_put but see spl above */ 4298 splx(s); 4299 } 4300 4301 /* 4302 * Given a LISTEN socket and an inbound SYN request, add 4303 * this to the syn cache, and send back a segment: 4304 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 4305 * to the source. 4306 * 4307 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 4308 * Doing so would require that we hold onto the data and deliver it 4309 * to the application. However, if we are the target of a SYN-flood 4310 * DoS attack, an attacker could send data which would eventually 4311 * consume all available buffer space if it were ACKed. By not ACKing 4312 * the data, we avoid this DoS scenario. 4313 */ 4314 4315 int 4316 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 4317 unsigned int hlen, struct socket *so, struct mbuf *m, u_char *optp, 4318 int optlen, struct tcp_opt_info *oi) 4319 { 4320 struct tcpcb tb, *tp; 4321 long win; 4322 struct syn_cache *sc; 4323 struct syn_cache_head *scp; 4324 struct mbuf *ipopts; 4325 struct tcp_opt_info opti; 4326 int s; 4327 4328 tp = sototcpcb(so); 4329 4330 memset(&opti, 0, sizeof(opti)); 4331 4332 /* 4333 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 4334 * 4335 * Note this check is performed in tcp_input() very early on. 4336 */ 4337 4338 /* 4339 * Initialize some local state. 4340 */ 4341 win = sbspace(&so->so_rcv); 4342 if (win > TCP_MAXWIN) 4343 win = TCP_MAXWIN; 4344 4345 switch (src->sa_family) { 4346 #ifdef INET 4347 case AF_INET: 4348 /* 4349 * Remember the IP options, if any. 4350 */ 4351 ipopts = ip_srcroute(); 4352 break; 4353 #endif 4354 default: 4355 ipopts = NULL; 4356 } 4357 4358 #ifdef TCP_SIGNATURE 4359 if (optp || (tp->t_flags & TF_SIGNATURE)) 4360 #else 4361 if (optp) 4362 #endif 4363 { 4364 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 4365 #ifdef TCP_SIGNATURE 4366 tb.t_flags |= (tp->t_flags & TF_SIGNATURE); 4367 #endif 4368 tb.t_state = TCPS_LISTEN; 4369 if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len - 4370 sizeof(struct tcphdr) - optlen - hlen, oi) < 0) 4371 return (0); 4372 } else 4373 tb.t_flags = 0; 4374 4375 /* 4376 * See if we already have an entry for this connection. 4377 * If we do, resend the SYN,ACK. We do not count this 4378 * as a retransmission (XXX though maybe we should). 4379 */ 4380 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { 4381 TCP_STATINC(TCP_STAT_SC_DUPESYN); 4382 if (ipopts) { 4383 /* 4384 * If we were remembering a previous source route, 4385 * forget it and use the new one we've been given. 4386 */ 4387 if (sc->sc_ipopts) 4388 (void) m_free(sc->sc_ipopts); 4389 sc->sc_ipopts = ipopts; 4390 } 4391 sc->sc_timestamp = tb.ts_recent; 4392 if (syn_cache_respond(sc, m) == 0) { 4393 uint64_t *tcps = TCP_STAT_GETREF(); 4394 tcps[TCP_STAT_SNDACKS]++; 4395 tcps[TCP_STAT_SNDTOTAL]++; 4396 TCP_STAT_PUTREF(); 4397 } 4398 return (1); 4399 } 4400 4401 s = splsoftnet(); 4402 sc = pool_get(&syn_cache_pool, PR_NOWAIT); 4403 splx(s); 4404 if (sc == NULL) { 4405 if (ipopts) 4406 (void) m_free(ipopts); 4407 return (0); 4408 } 4409 4410 /* 4411 * Fill in the cache, and put the necessary IP and TCP 4412 * options into the reply. 4413 */ 4414 memset(sc, 0, sizeof(struct syn_cache)); 4415 callout_init(&sc->sc_timer, CALLOUT_MPSAFE); 4416 bcopy(src, &sc->sc_src, src->sa_len); 4417 bcopy(dst, &sc->sc_dst, dst->sa_len); 4418 sc->sc_flags = 0; 4419 sc->sc_ipopts = ipopts; 4420 sc->sc_irs = th->th_seq; 4421 switch (src->sa_family) { 4422 #ifdef INET 4423 case AF_INET: 4424 { 4425 struct sockaddr_in *srcin = (void *) src; 4426 struct sockaddr_in *dstin = (void *) dst; 4427 4428 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, 4429 &srcin->sin_addr, dstin->sin_port, 4430 srcin->sin_port, sizeof(dstin->sin_addr), 0); 4431 break; 4432 } 4433 #endif /* INET */ 4434 #ifdef INET6 4435 case AF_INET6: 4436 { 4437 struct sockaddr_in6 *srcin6 = (void *) src; 4438 struct sockaddr_in6 *dstin6 = (void *) dst; 4439 4440 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, 4441 &srcin6->sin6_addr, dstin6->sin6_port, 4442 srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0); 4443 break; 4444 } 4445 #endif /* INET6 */ 4446 } 4447 sc->sc_peermaxseg = oi->maxseg; 4448 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? 4449 m_get_rcvif_NOMPSAFE(m) : NULL, 4450 sc->sc_src.sa.sa_family); 4451 sc->sc_win = win; 4452 sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */ 4453 sc->sc_timestamp = tb.ts_recent; 4454 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 4455 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) 4456 sc->sc_flags |= SCF_TIMESTAMP; 4457 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 4458 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 4459 sc->sc_requested_s_scale = tb.requested_s_scale; 4460 sc->sc_request_r_scale = 0; 4461 /* 4462 * Pick the smallest possible scaling factor that 4463 * will still allow us to scale up to sb_max. 4464 * 4465 * We do this because there are broken firewalls that 4466 * will corrupt the window scale option, leading to 4467 * the other endpoint believing that our advertised 4468 * window is unscaled. At scale factors larger than 4469 * 5 the unscaled window will drop below 1500 bytes, 4470 * leading to serious problems when traversing these 4471 * broken firewalls. 4472 * 4473 * With the default sbmax of 256K, a scale factor 4474 * of 3 will be chosen by this algorithm. Those who 4475 * choose a larger sbmax should watch out 4476 * for the compatiblity problems mentioned above. 4477 * 4478 * RFC1323: The Window field in a SYN (i.e., a <SYN> 4479 * or <SYN,ACK>) segment itself is never scaled. 4480 */ 4481 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 4482 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 4483 sc->sc_request_r_scale++; 4484 } else { 4485 sc->sc_requested_s_scale = 15; 4486 sc->sc_request_r_scale = 15; 4487 } 4488 if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack) 4489 sc->sc_flags |= SCF_SACK_PERMIT; 4490 4491 /* 4492 * ECN setup packet recieved. 4493 */ 4494 if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn) 4495 sc->sc_flags |= SCF_ECN_PERMIT; 4496 4497 #ifdef TCP_SIGNATURE 4498 if (tb.t_flags & TF_SIGNATURE) 4499 sc->sc_flags |= SCF_SIGNATURE; 4500 #endif 4501 sc->sc_tp = tp; 4502 if (syn_cache_respond(sc, m) == 0) { 4503 uint64_t *tcps = TCP_STAT_GETREF(); 4504 tcps[TCP_STAT_SNDACKS]++; 4505 tcps[TCP_STAT_SNDTOTAL]++; 4506 TCP_STAT_PUTREF(); 4507 syn_cache_insert(sc, tp); 4508 } else { 4509 s = splsoftnet(); 4510 /* 4511 * syn_cache_put() will try to schedule the timer, so 4512 * we need to initialize it 4513 */ 4514 SYN_CACHE_TIMER_ARM(sc); 4515 syn_cache_put(sc); 4516 splx(s); 4517 TCP_STATINC(TCP_STAT_SC_DROPPED); 4518 } 4519 return (1); 4520 } 4521 4522 /* 4523 * syn_cache_respond: (re)send SYN+ACK. 4524 * 4525 * returns 0 on success. otherwise returns an errno, typically ENOBUFS. 4526 */ 4527 4528 int 4529 syn_cache_respond(struct syn_cache *sc, struct mbuf *m) 4530 { 4531 #ifdef INET6 4532 struct rtentry *rt = NULL; 4533 #endif 4534 struct route *ro; 4535 u_int8_t *optp; 4536 int optlen, error; 4537 u_int16_t tlen; 4538 struct ip *ip = NULL; 4539 #ifdef INET6 4540 struct ip6_hdr *ip6 = NULL; 4541 #endif 4542 struct tcpcb *tp = NULL; 4543 struct tcphdr *th; 4544 u_int hlen; 4545 struct socket *so; 4546 4547 ro = &sc->sc_route; 4548 switch (sc->sc_src.sa.sa_family) { 4549 case AF_INET: 4550 hlen = sizeof(struct ip); 4551 break; 4552 #ifdef INET6 4553 case AF_INET6: 4554 hlen = sizeof(struct ip6_hdr); 4555 break; 4556 #endif 4557 default: 4558 if (m) 4559 m_freem(m); 4560 return (EAFNOSUPPORT); 4561 } 4562 4563 /* Compute the size of the TCP options. */ 4564 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 4565 ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) + 4566 #ifdef TCP_SIGNATURE 4567 ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) + 4568 #endif 4569 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 4570 4571 tlen = hlen + sizeof(struct tcphdr) + optlen; 4572 4573 /* 4574 * Create the IP+TCP header from scratch. 4575 */ 4576 if (m) 4577 m_freem(m); 4578 #ifdef DIAGNOSTIC 4579 if (max_linkhdr + tlen > MCLBYTES) 4580 return (ENOBUFS); 4581 #endif 4582 MGETHDR(m, M_DONTWAIT, MT_DATA); 4583 if (m && (max_linkhdr + tlen) > MHLEN) { 4584 MCLGET(m, M_DONTWAIT); 4585 if ((m->m_flags & M_EXT) == 0) { 4586 m_freem(m); 4587 m = NULL; 4588 } 4589 } 4590 if (m == NULL) 4591 return (ENOBUFS); 4592 MCLAIM(m, &tcp_tx_mowner); 4593 4594 /* Fixup the mbuf. */ 4595 m->m_data += max_linkhdr; 4596 m->m_len = m->m_pkthdr.len = tlen; 4597 if (sc->sc_tp) { 4598 tp = sc->sc_tp; 4599 if (tp->t_inpcb) 4600 so = tp->t_inpcb->inp_socket; 4601 #ifdef INET6 4602 else if (tp->t_in6pcb) 4603 so = tp->t_in6pcb->in6p_socket; 4604 #endif 4605 else 4606 so = NULL; 4607 } else 4608 so = NULL; 4609 m_reset_rcvif(m); 4610 memset(mtod(m, u_char *), 0, tlen); 4611 4612 switch (sc->sc_src.sa.sa_family) { 4613 case AF_INET: 4614 ip = mtod(m, struct ip *); 4615 ip->ip_v = 4; 4616 ip->ip_dst = sc->sc_src.sin.sin_addr; 4617 ip->ip_src = sc->sc_dst.sin.sin_addr; 4618 ip->ip_p = IPPROTO_TCP; 4619 th = (struct tcphdr *)(ip + 1); 4620 th->th_dport = sc->sc_src.sin.sin_port; 4621 th->th_sport = sc->sc_dst.sin.sin_port; 4622 break; 4623 #ifdef INET6 4624 case AF_INET6: 4625 ip6 = mtod(m, struct ip6_hdr *); 4626 ip6->ip6_vfc = IPV6_VERSION; 4627 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 4628 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 4629 ip6->ip6_nxt = IPPROTO_TCP; 4630 /* ip6_plen will be updated in ip6_output() */ 4631 th = (struct tcphdr *)(ip6 + 1); 4632 th->th_dport = sc->sc_src.sin6.sin6_port; 4633 th->th_sport = sc->sc_dst.sin6.sin6_port; 4634 break; 4635 #endif 4636 default: 4637 th = NULL; 4638 } 4639 4640 th->th_seq = htonl(sc->sc_iss); 4641 th->th_ack = htonl(sc->sc_irs + 1); 4642 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 4643 th->th_flags = TH_SYN|TH_ACK; 4644 th->th_win = htons(sc->sc_win); 4645 /* th_sum already 0 */ 4646 /* th_urp already 0 */ 4647 4648 /* Tack on the TCP options. */ 4649 optp = (u_int8_t *)(th + 1); 4650 *optp++ = TCPOPT_MAXSEG; 4651 *optp++ = 4; 4652 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 4653 *optp++ = sc->sc_ourmaxseg & 0xff; 4654 4655 if (sc->sc_request_r_scale != 15) { 4656 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 4657 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 4658 sc->sc_request_r_scale); 4659 optp += 4; 4660 } 4661 4662 if (sc->sc_flags & SCF_TIMESTAMP) { 4663 u_int32_t *lp = (u_int32_t *)(optp); 4664 /* Form timestamp option as shown in appendix A of RFC 1323. */ 4665 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 4666 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 4667 *lp = htonl(sc->sc_timestamp); 4668 optp += TCPOLEN_TSTAMP_APPA; 4669 } 4670 4671 if (sc->sc_flags & SCF_SACK_PERMIT) { 4672 u_int8_t *p = optp; 4673 4674 /* Let the peer know that we will SACK. */ 4675 p[0] = TCPOPT_SACK_PERMITTED; 4676 p[1] = 2; 4677 p[2] = TCPOPT_NOP; 4678 p[3] = TCPOPT_NOP; 4679 optp += 4; 4680 } 4681 4682 /* 4683 * Send ECN SYN-ACK setup packet. 4684 * Routes can be asymetric, so, even if we receive a packet 4685 * with ECE and CWR set, we must not assume no one will block 4686 * the ECE packet we are about to send. 4687 */ 4688 if ((sc->sc_flags & SCF_ECN_PERMIT) && tp && 4689 SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { 4690 th->th_flags |= TH_ECE; 4691 TCP_STATINC(TCP_STAT_ECN_SHS); 4692 4693 /* 4694 * draft-ietf-tcpm-ecnsyn-00.txt 4695 * 4696 * "[...] a TCP node MAY respond to an ECN-setup 4697 * SYN packet by setting ECT in the responding 4698 * ECN-setup SYN/ACK packet, indicating to routers 4699 * that the SYN/ACK packet is ECN-Capable. 4700 * This allows a congested router along the path 4701 * to mark the packet instead of dropping the 4702 * packet as an indication of congestion." 4703 * 4704 * "[...] There can be a great benefit in setting 4705 * an ECN-capable codepoint in SYN/ACK packets [...] 4706 * Congestion is most likely to occur in 4707 * the server-to-client direction. As a result, 4708 * setting an ECN-capable codepoint in SYN/ACK 4709 * packets can reduce the occurence of three-second 4710 * retransmit timeouts resulting from the drop 4711 * of SYN/ACK packets." 4712 * 4713 * Page 4 and 6, January 2006. 4714 */ 4715 4716 switch (sc->sc_src.sa.sa_family) { 4717 #ifdef INET 4718 case AF_INET: 4719 ip->ip_tos |= IPTOS_ECN_ECT0; 4720 break; 4721 #endif 4722 #ifdef INET6 4723 case AF_INET6: 4724 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 4725 break; 4726 #endif 4727 } 4728 TCP_STATINC(TCP_STAT_ECN_ECT); 4729 } 4730 4731 #ifdef TCP_SIGNATURE 4732 if (sc->sc_flags & SCF_SIGNATURE) { 4733 struct secasvar *sav; 4734 u_int8_t *sigp; 4735 4736 sav = tcp_signature_getsav(m, th); 4737 4738 if (sav == NULL) { 4739 if (m) 4740 m_freem(m); 4741 return (EPERM); 4742 } 4743 4744 *optp++ = TCPOPT_SIGNATURE; 4745 *optp++ = TCPOLEN_SIGNATURE; 4746 sigp = optp; 4747 memset(optp, 0, TCP_SIGLEN); 4748 optp += TCP_SIGLEN; 4749 *optp++ = TCPOPT_NOP; 4750 *optp++ = TCPOPT_EOL; 4751 4752 (void)tcp_signature(m, th, hlen, sav, sigp); 4753 4754 key_sa_recordxfer(sav, m); 4755 KEY_FREESAV(&sav); 4756 } 4757 #endif 4758 4759 /* Compute the packet's checksum. */ 4760 switch (sc->sc_src.sa.sa_family) { 4761 case AF_INET: 4762 ip->ip_len = htons(tlen - hlen); 4763 th->th_sum = 0; 4764 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 4765 break; 4766 #ifdef INET6 4767 case AF_INET6: 4768 ip6->ip6_plen = htons(tlen - hlen); 4769 th->th_sum = 0; 4770 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 4771 break; 4772 #endif 4773 } 4774 4775 /* 4776 * Fill in some straggling IP bits. Note the stack expects 4777 * ip_len to be in host order, for convenience. 4778 */ 4779 switch (sc->sc_src.sa.sa_family) { 4780 #ifdef INET 4781 case AF_INET: 4782 ip->ip_len = htons(tlen); 4783 ip->ip_ttl = ip_defttl; 4784 /* XXX tos? */ 4785 break; 4786 #endif 4787 #ifdef INET6 4788 case AF_INET6: 4789 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 4790 ip6->ip6_vfc |= IPV6_VERSION; 4791 ip6->ip6_plen = htons(tlen - hlen); 4792 /* ip6_hlim will be initialized afterwards */ 4793 /* XXX flowlabel? */ 4794 break; 4795 #endif 4796 } 4797 4798 /* XXX use IPsec policy on listening socket, on SYN ACK */ 4799 tp = sc->sc_tp; 4800 4801 switch (sc->sc_src.sa.sa_family) { 4802 #ifdef INET 4803 case AF_INET: 4804 error = ip_output(m, sc->sc_ipopts, ro, 4805 (ip_mtudisc ? IP_MTUDISC : 0), 4806 NULL, so); 4807 break; 4808 #endif 4809 #ifdef INET6 4810 case AF_INET6: 4811 ip6->ip6_hlim = in6_selecthlim(NULL, 4812 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL); 4813 rtcache_unref(rt, ro); 4814 4815 error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, so, NULL); 4816 break; 4817 #endif 4818 default: 4819 error = EAFNOSUPPORT; 4820 break; 4821 } 4822 return (error); 4823 } 4824