1 /* $NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 34 * 35 * NRL grants permission for redistribution and use in source and binary 36 * forms, with or without modification, of the software and documentation 37 * created at NRL provided that the following conditions are met: 38 * 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgements: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * This product includes software developed at the Information 49 * Technology Division, US Naval Research Laboratory. 50 * 4. Neither the name of the NRL nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 65 * 66 * The views and conclusions contained in the software and documentation 67 * are those of the authors and should not be interpreted as representing 68 * official policies, either expressed or implied, of the US Naval 69 * Research Laboratory (NRL). 
70 */ 71 72 /*- 73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, 74 * 2011 The NetBSD Foundation, Inc. 75 * All rights reserved. 76 * 77 * This code is derived from software contributed to The NetBSD Foundation 78 * by Coyote Point Systems, Inc. 79 * This code is derived from software contributed to The NetBSD Foundation 80 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 81 * Facility, NASA Ames Research Center. 82 * This code is derived from software contributed to The NetBSD Foundation 83 * by Charles M. Hannum. 84 * This code is derived from software contributed to The NetBSD Foundation 85 * by Rui Paulo. 86 * 87 * Redistribution and use in source and binary forms, with or without 88 * modification, are permitted provided that the following conditions 89 * are met: 90 * 1. Redistributions of source code must retain the above copyright 91 * notice, this list of conditions and the following disclaimer. 92 * 2. Redistributions in binary form must reproduce the above copyright 93 * notice, this list of conditions and the following disclaimer in the 94 * documentation and/or other materials provided with the distribution. 95 * 96 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 97 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 98 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 99 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 100 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 101 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 102 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 103 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 104 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 105 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 106 * POSSIBILITY OF SUCH DAMAGE. 
107 */ 108 109 /* 110 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 111 * The Regents of the University of California. All rights reserved. 112 * 113 * Redistribution and use in source and binary forms, with or without 114 * modification, are permitted provided that the following conditions 115 * are met: 116 * 1. Redistributions of source code must retain the above copyright 117 * notice, this list of conditions and the following disclaimer. 118 * 2. Redistributions in binary form must reproduce the above copyright 119 * notice, this list of conditions and the following disclaimer in the 120 * documentation and/or other materials provided with the distribution. 121 * 3. Neither the name of the University nor the names of its contributors 122 * may be used to endorse or promote products derived from this software 123 * without specific prior written permission. 124 * 125 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 126 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 127 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 128 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 129 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 130 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 131 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 132 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 133 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 134 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 135 * SUCH DAMAGE. 136 * 137 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 138 */ 139 140 /* 141 * TODO list for SYN cache stuff: 142 * 143 * Find room for a "state" field, which is needed to keep a 144 * compressed state for TIME_WAIT TCBs. 
It's been noted already 145 * that this is fairly important for very high-volume web and 146 * mail servers, which use a large number of short-lived 147 * connections. 148 */ 149 150 #include <sys/cdefs.h> 151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $"); 152 153 #ifdef _KERNEL_OPT 154 #include "opt_inet.h" 155 #include "opt_ipsec.h" 156 #endif 157 158 #include <sys/param.h> 159 #include <sys/systm.h> 160 #include <sys/mbuf.h> 161 #include <sys/protosw.h> 162 #include <sys/socket.h> 163 #include <sys/socketvar.h> 164 #include <sys/errno.h> 165 #include <sys/syslog.h> 166 #include <sys/pool.h> 167 #include <sys/domain.h> 168 #include <sys/kernel.h> 169 #include <sys/lwp.h> /* for lwp0 */ 170 #include <sys/cprng.h> 171 172 #include <netinet/in.h> 173 #include <netinet/ip.h> 174 #include <netinet/in_pcb.h> 175 #include <netinet/in_var.h> 176 #include <netinet/ip_var.h> 177 178 #include <netinet/ip6.h> 179 #ifdef INET6 180 #include <netinet6/ip6_var.h> 181 #include <netinet6/in6_pcb.h> 182 #include <netinet6/ip6_var.h> 183 #include <netinet6/in6_var.h> 184 #endif 185 186 #include <netinet/tcp.h> 187 #include <netinet/tcp_fsm.h> 188 #include <netinet/tcp_seq.h> 189 #include <netinet/tcp_timer.h> 190 #include <netinet/tcp_var.h> 191 #include <netinet/tcp_private.h> 192 #include <netinet/tcp_syncache.h> 193 194 #ifdef TCP_SIGNATURE 195 #ifdef IPSEC 196 #include <netipsec/ipsec.h> 197 #include <netipsec/key.h> 198 #ifdef INET6 199 #include <netipsec/ipsec6.h> 200 #endif 201 #endif /* IPSEC*/ 202 #endif 203 204 static void syn_cache_timer(void *); 205 static struct syn_cache * 206 syn_cache_lookup(const struct sockaddr *, const struct sockaddr *, 207 struct syn_cache_head **); 208 static int syn_cache_respond(struct syn_cache *); 209 210 /* syn hash parameters */ 211 #define TCP_SYN_HASH_SIZE 293 212 #define TCP_SYN_BUCKET_SIZE 35 213 static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; 214 int tcp_syn_cache_limit = 
TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 215 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 216 static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; 217 218 /* 219 * TCP compressed state engine. Currently used to hold compressed 220 * state for SYN_RECEIVED. 221 */ 222 223 u_long syn_cache_count; 224 static u_int32_t syn_hash1, syn_hash2; 225 226 #define SYN_HASH(sa, sp, dp) \ 227 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 228 ((u_int32_t)(sp)))^syn_hash2))) 229 #ifndef INET6 230 #define SYN_HASHALL(hash, src, dst) \ 231 do { \ 232 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 233 ((const struct sockaddr_in *)(src))->sin_port, \ 234 ((const struct sockaddr_in *)(dst))->sin_port); \ 235 } while (/*CONSTCOND*/ 0) 236 #else 237 #define SYN_HASH6(sa, sp, dp) \ 238 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 239 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 240 & 0x7fffffff) 241 242 #define SYN_HASHALL(hash, src, dst) \ 243 do { \ 244 switch ((src)->sa_family) { \ 245 case AF_INET: \ 246 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 247 ((const struct sockaddr_in *)(src))->sin_port, \ 248 ((const struct sockaddr_in *)(dst))->sin_port); \ 249 break; \ 250 case AF_INET6: \ 251 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ 252 ((const struct sockaddr_in6 *)(src))->sin6_port, \ 253 ((const struct sockaddr_in6 *)(dst))->sin6_port); \ 254 break; \ 255 default: \ 256 hash = 0; \ 257 } \ 258 } while (/*CONSTCOND*/0) 259 #endif /* INET6 */ 260 261 static struct pool syn_cache_pool; 262 263 /* 264 * We don't estimate RTT with SYNs, so each packet starts with the default 265 * RTT and each timer step has a fixed timeout value. 
266 */ 267 static inline void 268 syn_cache_timer_arm(struct syn_cache *sc) 269 { 270 271 TCPT_RANGESET(sc->sc_rxtcur, 272 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 273 TCPTV_REXMTMAX); 274 callout_reset(&sc->sc_timer, 275 sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc); 276 } 277 278 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) 279 280 static inline void 281 syn_cache_rm(struct syn_cache *sc) 282 { 283 TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, 284 sc, sc_bucketq); 285 sc->sc_tp = NULL; 286 LIST_REMOVE(sc, sc_tpq); 287 tcp_syn_cache[sc->sc_bucketidx].sch_length--; 288 callout_stop(&sc->sc_timer); 289 syn_cache_count--; 290 } 291 292 static inline void 293 syn_cache_put(struct syn_cache *sc) 294 { 295 if (sc->sc_ipopts) 296 (void) m_free(sc->sc_ipopts); 297 rtcache_free(&sc->sc_route); 298 sc->sc_flags |= SCF_DEAD; 299 if (!callout_invoking(&sc->sc_timer)) 300 callout_schedule(&(sc)->sc_timer, 1); 301 } 302 303 void 304 syn_cache_init(void) 305 { 306 int i; 307 308 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, 309 "synpl", NULL, IPL_SOFTNET); 310 311 /* Initialize the hash buckets. */ 312 for (i = 0; i < tcp_syn_cache_size; i++) 313 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); 314 } 315 316 void 317 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 318 { 319 struct syn_cache_head *scp; 320 struct syn_cache *sc2; 321 int s; 322 323 /* 324 * If there are no entries in the hash table, reinitialize 325 * the hash secrets. 326 */ 327 if (syn_cache_count == 0) { 328 syn_hash1 = cprng_fast32(); 329 syn_hash2 = cprng_fast32(); 330 } 331 332 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); 333 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; 334 scp = &tcp_syn_cache[sc->sc_bucketidx]; 335 336 /* 337 * Make sure that we don't overflow the per-bucket 338 * limit or the total cache size limit. 
339 */ 340 s = splsoftnet(); 341 if (scp->sch_length >= tcp_syn_bucket_limit) { 342 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); 343 /* 344 * The bucket is full. Toss the oldest element in the 345 * bucket. This will be the first entry in the bucket. 346 */ 347 sc2 = TAILQ_FIRST(&scp->sch_bucket); 348 #ifdef DIAGNOSTIC 349 /* 350 * This should never happen; we should always find an 351 * entry in our bucket. 352 */ 353 if (sc2 == NULL) 354 panic("syn_cache_insert: bucketoverflow: impossible"); 355 #endif 356 syn_cache_rm(sc2); 357 syn_cache_put(sc2); /* calls pool_put but see spl above */ 358 } else if (syn_cache_count >= tcp_syn_cache_limit) { 359 struct syn_cache_head *scp2, *sce; 360 361 TCP_STATINC(TCP_STAT_SC_OVERFLOWED); 362 /* 363 * The cache is full. Toss the oldest entry in the 364 * first non-empty bucket we can find. 365 * 366 * XXX We would really like to toss the oldest 367 * entry in the cache, but we hope that this 368 * condition doesn't happen very often. 369 */ 370 scp2 = scp; 371 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 372 sce = &tcp_syn_cache[tcp_syn_cache_size]; 373 for (++scp2; scp2 != scp; scp2++) { 374 if (scp2 >= sce) 375 scp2 = &tcp_syn_cache[0]; 376 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 377 break; 378 } 379 #ifdef DIAGNOSTIC 380 /* 381 * This should never happen; we should always find a 382 * non-empty bucket. 383 */ 384 if (scp2 == scp) 385 panic("syn_cache_insert: cacheoverflow: " 386 "impossible"); 387 #endif 388 } 389 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 390 syn_cache_rm(sc2); 391 syn_cache_put(sc2); /* calls pool_put but see spl above */ 392 } 393 394 /* 395 * Initialize the entry's timer. 396 */ 397 sc->sc_rxttot = 0; 398 sc->sc_rxtshift = 0; 399 syn_cache_timer_arm(sc); 400 401 /* Link it from tcpcb entry */ 402 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 403 404 /* Put it into the bucket. 
*/ 405 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 406 scp->sch_length++; 407 syn_cache_count++; 408 409 TCP_STATINC(TCP_STAT_SC_ADDED); 410 splx(s); 411 } 412 413 /* 414 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 415 * If we have retransmitted an entry the maximum number of times, expire 416 * that entry. 417 */ 418 static void 419 syn_cache_timer(void *arg) 420 { 421 struct syn_cache *sc = arg; 422 423 mutex_enter(softnet_lock); 424 KERNEL_LOCK(1, NULL); 425 426 callout_ack(&sc->sc_timer); 427 428 if (__predict_false(sc->sc_flags & SCF_DEAD)) { 429 TCP_STATINC(TCP_STAT_SC_DELAYED_FREE); 430 goto free; 431 } 432 433 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 434 /* Drop it -- too many retransmissions. */ 435 goto dropit; 436 } 437 438 /* 439 * Compute the total amount of time this entry has 440 * been on a queue. If this entry has been on longer 441 * than the keep alive timer would allow, expire it. 442 */ 443 sc->sc_rxttot += sc->sc_rxtcur; 444 if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS)) 445 goto dropit; 446 447 TCP_STATINC(TCP_STAT_SC_RETRANSMITTED); 448 (void)syn_cache_respond(sc); 449 450 /* Advance the timer back-off. 
*/ 451 sc->sc_rxtshift++; 452 syn_cache_timer_arm(sc); 453 454 goto out; 455 456 dropit: 457 TCP_STATINC(TCP_STAT_SC_TIMED_OUT); 458 syn_cache_rm(sc); 459 if (sc->sc_ipopts) 460 (void) m_free(sc->sc_ipopts); 461 rtcache_free(&sc->sc_route); 462 463 free: 464 callout_destroy(&sc->sc_timer); 465 pool_put(&syn_cache_pool, sc); 466 467 out: 468 KERNEL_UNLOCK_ONE(NULL); 469 mutex_exit(softnet_lock); 470 } 471 472 /* 473 * Remove syn cache created by the specified tcb entry, 474 * because this does not make sense to keep them 475 * (if there's no tcb entry, syn cache entry will never be used) 476 */ 477 void 478 syn_cache_cleanup(struct tcpcb *tp) 479 { 480 struct syn_cache *sc, *nsc; 481 int s; 482 483 s = splsoftnet(); 484 485 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { 486 nsc = LIST_NEXT(sc, sc_tpq); 487 488 #ifdef DIAGNOSTIC 489 if (sc->sc_tp != tp) 490 panic("invalid sc_tp in syn_cache_cleanup"); 491 #endif 492 syn_cache_rm(sc); 493 syn_cache_put(sc); /* calls pool_put but see spl above */ 494 } 495 /* just for safety */ 496 LIST_INIT(&tp->t_sc); 497 498 splx(s); 499 } 500 501 /* 502 * Find an entry in the syn cache. 503 */ 504 static struct syn_cache * 505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, 506 struct syn_cache_head **headp) 507 { 508 struct syn_cache *sc; 509 struct syn_cache_head *scp; 510 u_int32_t hash; 511 int s; 512 513 SYN_HASHALL(hash, src, dst); 514 515 scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; 516 *headp = scp; 517 s = splsoftnet(); 518 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; 519 sc = TAILQ_NEXT(sc, sc_bucketq)) { 520 if (sc->sc_hash != hash) 521 continue; 522 if (!memcmp(&sc->sc_src, src, src->sa_len) && 523 !memcmp(&sc->sc_dst, dst, dst->sa_len)) { 524 splx(s); 525 return (sc); 526 } 527 } 528 splx(s); 529 return (NULL); 530 } 531 532 /* 533 * This function gets called when we receive an ACK for a socket in the 534 * LISTEN state. 
We look up the connection in the syn cache, and if it's
 * there, we pull it out of the cache and turn it into a full-blown
 * connection in the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwey sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
#ifdef INET6
	struct in6pcb *in6p = NULL;
#endif
	struct tcpcb *tp;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return NULL;
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		/* Bogus ACK: re-send the SYN,ACK and keep the entry. */
		m_freem(m);
		(void)syn_cache_respond(sc);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	syn_cache_rm(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.  This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	oso = so;
	so = sonewconn(so, true);
	if (so == NULL)
		goto resetandabort;

	/* Pick up the pcb of the newly created socket. */
	switch (so->so_proto->pr_domain->dom_family) {
	case AF_INET:
		inp = sotoinpcb(so);
		break;
#ifdef INET6
	case AF_INET6:
		in6p = sotoin6pcb(so);
		break;
#endif
	}

	/* Bind the local address/port according to the peer's family. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp) {
			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute(m);
			in_pcbstate(inp, INP_BOUND);
			/* Fall back to the options cached with the SYN. */
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (in6p) {
			/* IPv4 packet to AF_INET6 socket: v4-mapped address */
			memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
			    &in6p->in6p_laddr.s6_addr32[3],
			    sizeof(((struct sockaddr_in *)dst)->sin_addr));
			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
			in6totcpcb(in6p)->t_family = AF_INET;
			if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
				in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
			else
				in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (in6p) {
			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
		break;
#endif
	}

#ifdef INET6
	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct in6pcb *oin6p = sotoin6pcb(oso);
		/* inherit socket options from the listening socket */
		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = NULL;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options,
		    mtod(m, struct ip6_hdr *), m);
	}
#endif

	/*
	 * Give the new socket our cached route reference.
	 */
	if (inp) {
		rtcache_copy(&inp->inp_route, &sc->sc_route);
		rtcache_free(&sc->sc_route);
	}
#ifdef INET6
	else {
		rtcache_copy(&in6p->in6p_route, &sc->sc_route);
		rtcache_free(&sc->sc_route);
	}
#endif

	/* Connect the pcb to the peer. */
	if (inp) {
		struct sockaddr_in sin;
		memcpy(&sin, src, src->sa_len);
		if (in_pcbconnect(inp, &sin, &lwp0)) {
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (in6p) {
		struct sockaddr_in6 sin6;
		memcpy(&sin6, src, src->sa_len);
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
		}
		if (in6_pcbconnect(in6p, &sin6, NULL)) {
			goto resetandabort;
		}
	}
#endif
	else {
		goto resetandabort;
	}

	/*
	 * One of inp/in6p is non-NULL here: the all-NULL case took the
	 * "goto resetandabort" above.
	 */
	if (inp)
		tp = intotcpcb(inp);
#ifdef INET6
	else if (in6p)
		tp = in6totcpcb(in6p);
#endif
	else
		tp = NULL;

	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/* r_scale == 15 is the sentinel for "no window scaling agreed". */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
	tp->ts_timebase = sc->sc_timebase;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
	TCP_STATINC(TCP_STAT_ACCEPTS);

	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
		tp->t_flags |= TF_WILL_SACK;

	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
		tp->t_flags |= TF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else {
		int ss = tcp_init_win;
		if (inp != NULL && in_localaddr(inp->inp_faddr))
			ss = tcp_init_win_local;
#ifdef INET6
		if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
			ss = tcp_init_win_local;
#endif
		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
	}

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;

	TCP_STATINC(TCP_STAT_SC_COMPLETED);
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	return so;

resetandabort:
	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	if (so != NULL) {
		(void) soqremque(so, 1);
		(void) soabort(so);
		mutex_enter(softnet_lock);
	}
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	TCP_STATINC(TCP_STAT_SC_ABORTED);
	return ((struct socket *)(-1));
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */

void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s = splsoftnet();

	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* Ignore RSTs whose sequence is outside [irs, irs+1]. */
	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
		splx(s);
		return;
	}
	syn_cache_rm(sc);
	TCP_STATINC(TCP_STAT_SC_RESET);
	syn_cache_put(sc);	/* calls pool_put but see spl above */
	splx(s);
}

/*
 * Handle an ICMP unreachable notification for a cached (embryonic)
 * connection.
 */
void
syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
    struct tcphdr *th)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss) {
		splx(s);
		return;
	}

	/*
	 * If we've
retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
		/* First error, or not enough retransmits yet: remember it. */
		sc->sc_flags |= SCF_UNREACH;
		splx(s);
		return;
	}

	syn_cache_rm(sc);
	TCP_STATINC(TCP_STAT_SC_UNREACH);
	syn_cache_put(sc);	/* calls pool_put but see spl above */
	splx(s);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add this to the syn
 * cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 *
 * Returns 1 if the SYN was handled (entry created or duplicate
 * answered), 0 if the caller should take over (bad options or
 * allocation failure).  The mbuf m is consumed on the 1 path.
 */
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
    int optlen, struct tcp_opt_info *oi)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;
	int s;

	tp = sototcpcb(so);

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	/*
	 * Parse the peer's TCP options into the scratch tcpcb tb.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
	{
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
			return 0;
	} else
		tb.t_flags = 0;

	switch (src->sa_family) {
	case AF_INET:
		/* Remember the IP options, if any. */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
		TCP_STATINC(TCP_STAT_SC_DUPESYN);
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void)m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		m_freem(m);
		if (syn_cache_respond(sc) == 0) {
			uint64_t *tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_SNDACKS]++;
			tcps[TCP_STAT_SNDTOTAL]++;
			TCP_STAT_PUTREF();
		}
		return 1;
	}

	s = splsoftnet();
	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
	splx(s);
	if (sc == NULL) {
		if (ipopts)
			(void)m_free(ipopts);
		return 0;
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memset(sc, 0, sizeof(struct syn_cache));
	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;
	switch (src->sa_family) {
	case AF_INET:
	    {
		struct sockaddr_in *srcin = (void *)src;
		struct sockaddr_in *dstin = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
		    &srcin->sin_addr, dstin->sin_port,
		    srcin->sin_port, sizeof(dstin->sin_addr));
		break;
	    }
#ifdef INET6
	case AF_INET6:
	    {
		struct sockaddr_in6 *srcin6 = (void *)src;
		struct sockaddr_in6 *dstin6 = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
		    &srcin6->sin6_addr, dstin6->sin6_port,
		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
		break;
	    }
#endif
	}
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		sc->sc_flags |= SCF_TIMESTAMP;
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		/* 15 == "no window scaling"; see syn_cache_get(). */
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
		sc->sc_flags |= SCF_SACK_PERMIT;

	/*
	 * ECN setup packet received.
	 */
	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
		sc->sc_flags |= SCF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	m_freem(m);
	if (syn_cache_respond(sc) == 0) {
		uint64_t *tcps = TCP_STAT_GETREF();
		tcps[TCP_STAT_SNDACKS]++;
		tcps[TCP_STAT_SNDTOTAL]++;
		TCP_STAT_PUTREF();
		syn_cache_insert(sc, tp);
	} else {
		s = splsoftnet();
		/*
		 * syn_cache_put() will try to schedule the timer, so
		 * we need to initialize it
		 */
		syn_cache_timer_arm(sc);
		syn_cache_put(sc);
		splx(s);
		TCP_STATINC(TCP_STAT_SC_DROPPED);
	}
	return 1;
}

/*
 * syn_cache_respond: (re)send SYN+ACK.
 *
 * Returns 0 on success.
1099 */ 1100 1101 static int 1102 syn_cache_respond(struct syn_cache *sc) 1103 { 1104 #ifdef INET6 1105 struct rtentry *rt = NULL; 1106 #endif 1107 struct route *ro; 1108 u_int8_t *optp; 1109 int optlen, error; 1110 u_int16_t tlen; 1111 struct ip *ip = NULL; 1112 #ifdef INET6 1113 struct ip6_hdr *ip6 = NULL; 1114 #endif 1115 struct tcpcb *tp; 1116 struct tcphdr *th; 1117 struct mbuf *m; 1118 u_int hlen; 1119 #ifdef TCP_SIGNATURE 1120 struct secasvar *sav = NULL; 1121 u_int8_t *sigp = NULL; 1122 #endif 1123 1124 ro = &sc->sc_route; 1125 switch (sc->sc_src.sa.sa_family) { 1126 case AF_INET: 1127 hlen = sizeof(struct ip); 1128 break; 1129 #ifdef INET6 1130 case AF_INET6: 1131 hlen = sizeof(struct ip6_hdr); 1132 break; 1133 #endif 1134 default: 1135 return EAFNOSUPPORT; 1136 } 1137 1138 /* Worst case scenario, since we don't know the option size yet. */ 1139 tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN; 1140 KASSERT(max_linkhdr + tlen <= MCLBYTES); 1141 1142 /* 1143 * Create the IP+TCP header from scratch. 1144 */ 1145 MGETHDR(m, M_DONTWAIT, MT_DATA); 1146 if (m && (max_linkhdr + tlen) > MHLEN) { 1147 MCLGET(m, M_DONTWAIT); 1148 if ((m->m_flags & M_EXT) == 0) { 1149 m_freem(m); 1150 m = NULL; 1151 } 1152 } 1153 if (m == NULL) 1154 return ENOBUFS; 1155 MCLAIM(m, &tcp_tx_mowner); 1156 1157 tp = sc->sc_tp; 1158 1159 /* Fixup the mbuf. 
*/ 1160 m->m_data += max_linkhdr; 1161 m_reset_rcvif(m); 1162 memset(mtod(m, void *), 0, tlen); 1163 1164 switch (sc->sc_src.sa.sa_family) { 1165 case AF_INET: 1166 ip = mtod(m, struct ip *); 1167 ip->ip_v = 4; 1168 ip->ip_dst = sc->sc_src.sin.sin_addr; 1169 ip->ip_src = sc->sc_dst.sin.sin_addr; 1170 ip->ip_p = IPPROTO_TCP; 1171 th = (struct tcphdr *)(ip + 1); 1172 th->th_dport = sc->sc_src.sin.sin_port; 1173 th->th_sport = sc->sc_dst.sin.sin_port; 1174 break; 1175 #ifdef INET6 1176 case AF_INET6: 1177 ip6 = mtod(m, struct ip6_hdr *); 1178 ip6->ip6_vfc = IPV6_VERSION; 1179 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 1180 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 1181 ip6->ip6_nxt = IPPROTO_TCP; 1182 /* ip6_plen will be updated in ip6_output() */ 1183 th = (struct tcphdr *)(ip6 + 1); 1184 th->th_dport = sc->sc_src.sin6.sin6_port; 1185 th->th_sport = sc->sc_dst.sin6.sin6_port; 1186 break; 1187 #endif 1188 default: 1189 panic("%s: impossible (1)", __func__); 1190 } 1191 1192 th->th_seq = htonl(sc->sc_iss); 1193 th->th_ack = htonl(sc->sc_irs + 1); 1194 th->th_flags = TH_SYN|TH_ACK; 1195 th->th_win = htons(sc->sc_win); 1196 /* th_x2, th_sum, th_urp already 0 from memset */ 1197 1198 /* Tack on the TCP options. */ 1199 optp = (u_int8_t *)(th + 1); 1200 optlen = 0; 1201 *optp++ = TCPOPT_MAXSEG; 1202 *optp++ = TCPOLEN_MAXSEG; 1203 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 1204 *optp++ = sc->sc_ourmaxseg & 0xff; 1205 optlen += TCPOLEN_MAXSEG; 1206 1207 if (sc->sc_request_r_scale != 15) { 1208 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 1209 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 1210 sc->sc_request_r_scale); 1211 optp += TCPOLEN_WINDOW + TCPOLEN_NOP; 1212 optlen += TCPOLEN_WINDOW + TCPOLEN_NOP; 1213 } 1214 1215 if (sc->sc_flags & SCF_SACK_PERMIT) { 1216 /* Let the peer know that we will SACK. 
*/ 1217 *optp++ = TCPOPT_SACK_PERMITTED; 1218 *optp++ = TCPOLEN_SACK_PERMITTED; 1219 optlen += TCPOLEN_SACK_PERMITTED; 1220 } 1221 1222 if (sc->sc_flags & SCF_TIMESTAMP) { 1223 while (optlen % 4 != 2) { 1224 optlen += TCPOLEN_NOP; 1225 *optp++ = TCPOPT_NOP; 1226 } 1227 *optp++ = TCPOPT_TIMESTAMP; 1228 *optp++ = TCPOLEN_TIMESTAMP; 1229 u_int32_t *lp = (u_int32_t *)(optp); 1230 /* Form timestamp option as shown in appendix A of RFC 1323. */ 1231 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 1232 *lp = htonl(sc->sc_timestamp); 1233 optp += TCPOLEN_TIMESTAMP - 2; 1234 optlen += TCPOLEN_TIMESTAMP; 1235 } 1236 1237 #ifdef TCP_SIGNATURE 1238 if (sc->sc_flags & SCF_SIGNATURE) { 1239 sav = tcp_signature_getsav(m); 1240 if (sav == NULL) { 1241 m_freem(m); 1242 return EPERM; 1243 } 1244 1245 *optp++ = TCPOPT_SIGNATURE; 1246 *optp++ = TCPOLEN_SIGNATURE; 1247 sigp = optp; 1248 memset(optp, 0, TCP_SIGLEN); 1249 optp += TCP_SIGLEN; 1250 optlen += TCPOLEN_SIGNATURE; 1251 } 1252 #endif 1253 1254 /* 1255 * Terminate and pad TCP options to a 4 byte boundary. 1256 * 1257 * According to RFC793: "The content of the header beyond the 1258 * End-of-Option option must be header padding (i.e., zero)." 1259 * And later: "The padding is composed of zeros." 1260 */ 1261 if (optlen % 4) { 1262 optlen += TCPOLEN_EOL; 1263 *optp++ = TCPOPT_EOL; 1264 } 1265 while (optlen % 4) { 1266 optlen += TCPOLEN_PAD; 1267 *optp++ = TCPOPT_PAD; 1268 } 1269 1270 /* Compute the actual values now that we've added the options. */ 1271 tlen = hlen + sizeof(struct tcphdr) + optlen; 1272 m->m_len = m->m_pkthdr.len = tlen; 1273 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1274 1275 #ifdef TCP_SIGNATURE 1276 if (sav) { 1277 (void)tcp_signature(m, th, hlen, sav, sigp); 1278 key_sa_recordxfer(sav, m); 1279 KEY_SA_UNREF(&sav); 1280 } 1281 #endif 1282 1283 /* 1284 * Send ECN SYN-ACK setup packet. 
1285 * Routes can be asymmetric, so, even if we receive a packet 1286 * with ECE and CWR set, we must not assume no one will block 1287 * the ECE packet we are about to send. 1288 */ 1289 if ((sc->sc_flags & SCF_ECN_PERMIT) && tp && 1290 SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { 1291 th->th_flags |= TH_ECE; 1292 TCP_STATINC(TCP_STAT_ECN_SHS); 1293 1294 /* 1295 * draft-ietf-tcpm-ecnsyn-00.txt 1296 * 1297 * "[...] a TCP node MAY respond to an ECN-setup 1298 * SYN packet by setting ECT in the responding 1299 * ECN-setup SYN/ACK packet, indicating to routers 1300 * that the SYN/ACK packet is ECN-Capable. 1301 * This allows a congested router along the path 1302 * to mark the packet instead of dropping the 1303 * packet as an indication of congestion." 1304 * 1305 * "[...] There can be a great benefit in setting 1306 * an ECN-capable codepoint in SYN/ACK packets [...] 1307 * Congestion is most likely to occur in 1308 * the server-to-client direction. As a result, 1309 * setting an ECN-capable codepoint in SYN/ACK 1310 * packets can reduce the occurrence of three-second 1311 * retransmit timeouts resulting from the drop 1312 * of SYN/ACK packets." 1313 * 1314 * Page 4 and 6, January 2006. 1315 */ 1316 1317 switch (sc->sc_src.sa.sa_family) { 1318 case AF_INET: 1319 ip->ip_tos |= IPTOS_ECN_ECT0; 1320 break; 1321 #ifdef INET6 1322 case AF_INET6: 1323 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 1324 break; 1325 #endif 1326 } 1327 TCP_STATINC(TCP_STAT_ECN_ECT); 1328 } 1329 1330 1331 /* 1332 * Compute the packet's checksum. 1333 * 1334 * Fill in some straggling IP bits. Note the stack expects 1335 * ip_len to be in host order, for convenience. 1336 */ 1337 switch (sc->sc_src.sa.sa_family) { 1338 case AF_INET: 1339 ip->ip_len = htons(tlen - hlen); 1340 th->th_sum = 0; 1341 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 1342 ip->ip_len = htons(tlen); 1343 ip->ip_ttl = ip_defttl; 1344 /* XXX tos? 
*/ 1345 break; 1346 #ifdef INET6 1347 case AF_INET6: 1348 ip6->ip6_plen = htons(tlen - hlen); 1349 th->th_sum = 0; 1350 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 1351 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 1352 ip6->ip6_vfc |= IPV6_VERSION; 1353 ip6->ip6_plen = htons(tlen - hlen); 1354 /* ip6_hlim will be initialized afterwards */ 1355 /* XXX flowlabel? */ 1356 break; 1357 #endif 1358 } 1359 1360 /* XXX use IPsec policy on listening socket, on SYN ACK */ 1361 tp = sc->sc_tp; 1362 1363 switch (sc->sc_src.sa.sa_family) { 1364 case AF_INET: 1365 error = ip_output(m, sc->sc_ipopts, ro, 1366 (ip_mtudisc ? IP_MTUDISC : 0), 1367 NULL, tp ? tp->t_inpcb : NULL); 1368 break; 1369 #ifdef INET6 1370 case AF_INET6: 1371 ip6->ip6_hlim = in6_selecthlim(NULL, 1372 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL); 1373 rtcache_unref(rt, ro); 1374 1375 error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, 1376 tp ? tp->t_in6pcb : NULL, NULL); 1377 break; 1378 #endif 1379 default: 1380 panic("%s: impossible (2)", __func__); 1381 } 1382 1383 return error; 1384 } 1385