1 /* $NetBSD: tcp_syncache.c,v 1.7 2024/06/29 12:59:08 riastradh Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 34 * 35 * NRL grants permission for redistribution and use in source and binary 36 * forms, with or without modification, of the software and documentation 37 * created at NRL provided that the following conditions are met: 38 * 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgements: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * This product includes software developed at the Information 49 * Technology Division, US Naval Research Laboratory. 50 * 4. Neither the name of the NRL nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 65 * 66 * The views and conclusions contained in the software and documentation 67 * are those of the authors and should not be interpreted as representing 68 * official policies, either expressed or implied, of the US Naval 69 * Research Laboratory (NRL). 
70 */ 71 72 /*- 73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, 74 * 2011 The NetBSD Foundation, Inc. 75 * All rights reserved. 76 * 77 * This code is derived from software contributed to The NetBSD Foundation 78 * by Coyote Point Systems, Inc. 79 * This code is derived from software contributed to The NetBSD Foundation 80 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 81 * Facility, NASA Ames Research Center. 82 * This code is derived from software contributed to The NetBSD Foundation 83 * by Charles M. Hannum. 84 * This code is derived from software contributed to The NetBSD Foundation 85 * by Rui Paulo. 86 * 87 * Redistribution and use in source and binary forms, with or without 88 * modification, are permitted provided that the following conditions 89 * are met: 90 * 1. Redistributions of source code must retain the above copyright 91 * notice, this list of conditions and the following disclaimer. 92 * 2. Redistributions in binary form must reproduce the above copyright 93 * notice, this list of conditions and the following disclaimer in the 94 * documentation and/or other materials provided with the distribution. 95 * 96 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 97 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 98 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 99 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 100 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 101 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 102 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 103 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 104 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 105 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 106 * POSSIBILITY OF SUCH DAMAGE. 
107 */ 108 109 /* 110 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 111 * The Regents of the University of California. All rights reserved. 112 * 113 * Redistribution and use in source and binary forms, with or without 114 * modification, are permitted provided that the following conditions 115 * are met: 116 * 1. Redistributions of source code must retain the above copyright 117 * notice, this list of conditions and the following disclaimer. 118 * 2. Redistributions in binary form must reproduce the above copyright 119 * notice, this list of conditions and the following disclaimer in the 120 * documentation and/or other materials provided with the distribution. 121 * 3. Neither the name of the University nor the names of its contributors 122 * may be used to endorse or promote products derived from this software 123 * without specific prior written permission. 124 * 125 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 126 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 127 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 128 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 129 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 130 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 131 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 132 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 133 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 134 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 135 * SUCH DAMAGE. 136 * 137 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 138 */ 139 140 /* 141 * TODO list for SYN cache stuff: 142 * 143 * Find room for a "state" field, which is needed to keep a 144 * compressed state for TIME_WAIT TCBs. 
It's been noted already 145 * that this is fairly important for very high-volume web and 146 * mail servers, which use a large number of short-lived 147 * connections. 148 */ 149 150 #include <sys/cdefs.h> 151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.7 2024/06/29 12:59:08 riastradh Exp $"); 152 153 #ifdef _KERNEL_OPT 154 #include "opt_inet.h" 155 #include "opt_ipsec.h" 156 #endif 157 158 #include <sys/param.h> 159 #include <sys/systm.h> 160 #include <sys/mbuf.h> 161 #include <sys/protosw.h> 162 #include <sys/socket.h> 163 #include <sys/socketvar.h> 164 #include <sys/errno.h> 165 #include <sys/syslog.h> 166 #include <sys/pool.h> 167 #include <sys/domain.h> 168 #include <sys/kernel.h> 169 #include <sys/lwp.h> /* for lwp0 */ 170 #include <sys/cprng.h> 171 172 #include <netinet/in.h> 173 #include <netinet/ip.h> 174 #include <netinet/in_pcb.h> 175 #include <netinet/in_var.h> 176 #include <netinet/ip_var.h> 177 178 #include <netinet/ip6.h> 179 #ifdef INET6 180 #include <netinet6/ip6_var.h> 181 #include <netinet6/in6_pcb.h> 182 #include <netinet6/ip6_var.h> 183 #include <netinet6/in6_var.h> 184 #endif 185 186 #include <netinet/tcp.h> 187 #include <netinet/tcp_fsm.h> 188 #include <netinet/tcp_seq.h> 189 #include <netinet/tcp_timer.h> 190 #include <netinet/tcp_var.h> 191 #include <netinet/tcp_private.h> 192 #include <netinet/tcp_syncache.h> 193 194 #ifdef TCP_SIGNATURE 195 #ifdef IPSEC 196 #include <netipsec/ipsec.h> 197 #include <netipsec/key.h> 198 #ifdef INET6 199 #include <netipsec/ipsec6.h> 200 #endif 201 #endif /* IPSEC*/ 202 #endif 203 204 static void syn_cache_timer(void *); 205 static struct syn_cache * 206 syn_cache_lookup(const struct sockaddr *, const struct sockaddr *, 207 struct syn_cache_head **); 208 static int syn_cache_respond(struct syn_cache *); 209 210 /* syn hash parameters */ 211 #define TCP_SYN_HASH_SIZE 293 212 #define TCP_SYN_BUCKET_SIZE 35 213 static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; 214 int tcp_syn_cache_limit = 
TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 215 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 216 static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; 217 218 /* 219 * TCP compressed state engine. Currently used to hold compressed 220 * state for SYN_RECEIVED. 221 */ 222 223 u_long syn_cache_count; 224 static u_int32_t syn_hash1, syn_hash2; 225 226 #define SYN_HASH(sa, sp, dp) \ 227 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 228 ((u_int32_t)(sp)))^syn_hash2))) 229 #ifndef INET6 230 #define SYN_HASHALL(hash, src, dst) \ 231 do { \ 232 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 233 ((const struct sockaddr_in *)(src))->sin_port, \ 234 ((const struct sockaddr_in *)(dst))->sin_port); \ 235 } while (/*CONSTCOND*/ 0) 236 #else 237 #define SYN_HASH6(sa, sp, dp) \ 238 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ 239 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ 240 & 0x7fffffff) 241 242 #define SYN_HASHALL(hash, src, dst) \ 243 do { \ 244 switch ((src)->sa_family) { \ 245 case AF_INET: \ 246 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ 247 ((const struct sockaddr_in *)(src))->sin_port, \ 248 ((const struct sockaddr_in *)(dst))->sin_port); \ 249 break; \ 250 case AF_INET6: \ 251 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ 252 ((const struct sockaddr_in6 *)(src))->sin6_port, \ 253 ((const struct sockaddr_in6 *)(dst))->sin6_port); \ 254 break; \ 255 default: \ 256 hash = 0; \ 257 } \ 258 } while (/*CONSTCOND*/0) 259 #endif /* INET6 */ 260 261 static struct pool syn_cache_pool; 262 263 /* 264 * We don't estimate RTT with SYNs, so each packet starts with the default 265 * RTT and each timer step has a fixed timeout value. 
 */
static inline void
syn_cache_timer_arm(struct syn_cache *sc)
{

	/*
	 * Clamp the backed-off retransmit interval into
	 * [TCPTV_MIN, TCPTV_REXMTMAX] and (re)schedule the callout.
	 * sc_rxtcur is in PR_SLOWHZ units, hence the hz conversion.
	 */
	TCPT_RANGESET(sc->sc_rxtcur,
	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
	    TCPTV_REXMTMAX);
	callout_reset(&sc->sc_timer,
	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
}

/* Age of the entry, in tcp_now ticks since sc_timebase. */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)

/*
 * syn_cache_rm: unlink an entry from its hash bucket and from the
 * owning tcpcb's list, stop its timer and drop the global count.
 * The entry itself is not freed here; see syn_cache_put().
 */
static inline void
syn_cache_rm(struct syn_cache *sc)
{
	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
	    sc, sc_bucketq);
	sc->sc_tp = NULL;
	LIST_REMOVE(sc, sc_tpq);
	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
	callout_stop(&sc->sc_timer);
	syn_cache_count--;
}

/*
 * syn_cache_put: release an entry's resources (IP options mbuf, cached
 * route) and arrange for the entry itself to be freed.  The pool_put is
 * deferred to the timer callout: we mark the entry SCF_DEAD and let
 * syn_cache_timer() do the actual free, avoiding a free while the
 * callout may be running.
 */
static inline void
syn_cache_put(struct syn_cache *sc)
{
	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);
	rtcache_free(&sc->sc_route);
	sc->sc_flags |= SCF_DEAD;
	/* If the callout is already invoking, it will see SCF_DEAD. */
	if (!callout_invoking(&sc->sc_timer))
		callout_schedule(&(sc)->sc_timer, 1);
}

/*
 * syn_cache_init: create the syn cache entry pool and initialize the
 * hash buckets.  Called once at TCP initialization time.
 */
void
syn_cache_init(void)
{
	int i;

	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
	    "synpl", NULL, IPL_SOFTNET);

	/* Initialize the hash buckets. */
	for (i = 0; i < tcp_syn_cache_size; i++)
		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
}

/*
 * syn_cache_insert: hash the entry's addresses, evict an old entry if a
 * bucket or the whole cache is full, arm the retransmit timer and link
 * the entry into its bucket and into the listening tcpcb's list.
 */
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = cprng_fast32();
		syn_hash2 = cprng_fast32();
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/* Scan circularly from the next bucket. */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	syn_cache_timer_arm(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	TCP_STATINC(TCP_STAT_SC_ADDED);
	splx(s);
}

/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire
 * that entry.
 *
 * This is also where SCF_DEAD entries (see syn_cache_put()) are finally
 * returned to the pool.
 */
static void
syn_cache_timer(void *arg)
{
	struct syn_cache *sc = arg;

	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	callout_ack(&sc->sc_timer);

	/* Deferred free requested by syn_cache_put(). */
	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
		goto free;
	}

	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
		/* Drop it -- too many retransmissions. */
		goto dropit;
	}

	/*
	 * Compute the total amount of time this entry has
	 * been on a queue.  If this entry has been on longer
	 * than the keep alive timer would allow, expire it.
	 */
	sc->sc_rxttot += sc->sc_rxtcur;
	if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
		goto dropit;

	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
	(void)syn_cache_respond(sc);

	/* Advance the timer back-off. */
	sc->sc_rxtshift++;
	syn_cache_timer_arm(sc);

	goto out;

 dropit:
	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
	syn_cache_rm(sc);
	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);
	rtcache_free(&sc->sc_route);

 free:
	callout_destroy(&sc->sc_timer);
	pool_put(&syn_cache_pool, sc);

 out:
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}

/*
 * Remove syn cache created by the specified tcb entry,
 * because this does not make sense to keep them
 * (if there's no tcb entry, syn cache entry will never be used)
 */
void
syn_cache_cleanup(struct tcpcb *tp)
{
	struct syn_cache *sc, *nsc;
	int s;

	s = splsoftnet();

	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
		nsc = LIST_NEXT(sc, sc_tpq);

#ifdef DIAGNOSTIC
		if (sc->sc_tp != tp)
			panic("invalid sc_tp in syn_cache_cleanup");
#endif
		syn_cache_rm(sc);
		syn_cache_put(sc);	/* calls pool_put but see spl above */
	}
	/* just for safety */
	LIST_INIT(&tp->t_sc);

	splx(s);
}

/*
 * Find an entry in the syn cache.
 *
 * On a hit, *headp is set to the bucket head the entry lives in.
 * The lookup runs at splsoftnet internally; the returned pointer is
 * only stable while the caller also blocks softnet processing.
 */
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
    struct syn_cache_head **headp)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	u_int32_t hash;
	int s;

	SYN_HASHALL(hash, src, dst);

	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
	*headp = scp;
	s = splsoftnet();
	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
		/* Cheap hash compare first, full address compare second. */
		if (sc->sc_hash != hash)
			continue;
		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
			splx(s);
			return (sc);
		}
	}
	splx(s);
	return (NULL);
}

/*
 * This function gets called when we receive an ACK for a socket in the
 * LISTEN state.
 * We look up the connection in the syn cache, and if it's
 * there, we pull it out of the cache and turn it into a full-blown
 * connection in the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
	struct tcpcb *tp;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return NULL;
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		m_freem(m);
		(void)syn_cache_respond(sc);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	syn_cache_rm(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	oso = so;
	so = sonewconn(so, true);
	if (so == NULL)
		goto resetandabort;

	inp = sotoinpcb(so);

	/* Set the local address/port on the new pcb from `dst'. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp->inp_af == AF_INET) {
			in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute(m);
			inpcb_set_state(inp, INP_BOUND);
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (inp->inp_af == AF_INET6) {
			/* IPv4 packet to AF_INET6 socket */
			memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
			in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
			    &in6p_laddr(inp).s6_addr32[3],
			    sizeof(((struct sockaddr_in *)dst)->sin_addr));
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			intotcpcb(inp)->t_family = AF_INET;
			if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
				inp->inp_flags |= IN6P_IPV6_V6ONLY;
			else
				inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
			inpcb_set_state(inp, INP_BOUND);
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_af == AF_INET6) {
			in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
			inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
			inpcb_set_state(inp, INP_BOUND);
		}
		break;
#endif
	}

#ifdef INET6
	if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct inpcb *oinp = sotoinpcb(oso);
		/* inherit socket options from the listening socket */
		inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
		if (inp->inp_flags & IN6P_CONTROLOPTS) {
			m_freem(inp->inp_options);
			inp->inp_options = NULL;
		}
		ip6_savecontrol(inp, &inp->inp_options,
		    mtod(m, struct ip6_hdr *), m);
	}
#endif

	/*
	 * Give the new socket our cached route reference.
	 */
	rtcache_copy(&inp->inp_route, &sc->sc_route);
	rtcache_free(&sc->sc_route);

	/* Connect the new pcb to the peer (`src'). */
	if (inp->inp_af == AF_INET) {
		struct sockaddr_in sin;
		memcpy(&sin, src, src->sa_len);
		if (inpcb_connect(inp, &sin, &lwp0)) {
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (inp->inp_af == AF_INET6) {
		struct sockaddr_in6 sin6;
		memcpy(&sin6, src, src->sa_len);
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
		}
		if (in6pcb_connect(inp, &sin6, NULL)) {
			goto resetandabort;
		}
	}
#endif
	else {
		goto resetandabort;
	}

	tp = intotcpcb(inp);

	/* Carry negotiated options over from the cache entry. */
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
	tp->ts_timebase = sc->sc_timebase;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
	TCP_STATINC(TCP_STAT_ACCEPTS);

	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
		tp->t_flags |= TF_WILL_SACK;

	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
		tp->t_flags |= TF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else {
		int ss = tcp_init_win;
		if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
			ss = tcp_init_win_local;
#ifdef INET6
		else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
			ss = tcp_init_win_local;
#endif
		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
	}

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;

	TCP_STATINC(TCP_STAT_SC_COMPLETED);
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	return so;

resetandabort:
	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	if (so != NULL) {
		(void) soqremque(so, 1);
		(void) soabort(so);
		mutex_enter(softnet_lock);
	}
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	TCP_STATINC(TCP_STAT_SC_ABORTED);
	return ((struct socket *)(-1));
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */

void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s = splsoftnet();

	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* Only accept a RST whose sequence number is in window. */
	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
		splx(s);
		return;
	}
	syn_cache_rm(sc);
	TCP_STATINC(TCP_STAT_SC_RESET);
	syn_cache_put(sc);	/* calls pool_put but see spl above */
	splx(s);
}

/*
 * syn_cache_unreach: an ICMP unreachable arrived for a connection that
 * may be in the syn cache; drop the entry only if it looks like a real,
 * persistent failure (see below).
 */
void
syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
    struct tcphdr *th)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss) {
		splx(s);
		return;
	}

	/*
	 * If we've retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
		sc->sc_flags |= SCF_UNREACH;
		splx(s);
		return;
	}

	syn_cache_rm(sc);
	TCP_STATINC(TCP_STAT_SC_UNREACH);
	syn_cache_put(sc);	/* calls pool_put but see spl above */
	splx(s);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add this to the syn
 * cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 *
 * Returns 1 when the SYN has been consumed (cached or duplicate),
 * 0 when the caller should treat it as a failure.
 */
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
    int optlen, struct tcp_opt_info *oi)
{
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;
	int s;

	tp = sototcpcb(so);

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	/*
	 * Parse the TCP options into the scratch tcpcb `tb' so we know
	 * what the peer offered (scale, timestamps, SACK, signature).
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
	{
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
			return 0;
	} else
		tb.t_flags = 0;

	switch (src->sa_family) {
	case AF_INET:
		/* Remember the IP options, if any. */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
		TCP_STATINC(TCP_STAT_SC_DUPESYN);
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void)m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		sc->sc_timestamp = tb.ts_recent;
		m_freem(m);
		if (syn_cache_respond(sc) == 0) {
			net_stat_ref_t tcps = TCP_STAT_GETREF();
			_NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
			_NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
			TCP_STAT_PUTREF();
		}
		return 1;
	}

	s = splsoftnet();
	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
	splx(s);
	if (sc == NULL) {
		if (ipopts)
			(void)m_free(ipopts);
		return 0;
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memset(sc, 0, sizeof(struct syn_cache));
	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;
	/* Generate our initial send sequence number per address family. */
	switch (src->sa_family) {
	case AF_INET:
	    {
		struct sockaddr_in *srcin = (void *)src;
		struct sockaddr_in *dstin = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
		    &srcin->sin_addr, dstin->sin_port,
		    srcin->sin_port, sizeof(dstin->sin_addr));
		break;
	    }
#ifdef INET6
	case AF_INET6:
	    {
		struct sockaddr_in6 *srcin6 = (void *)src;
		struct sockaddr_in6 *dstin6 = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
		    &srcin6->sin6_addr, dstin6->sin6_port,
		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
		break;
	    }
#endif
	}
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
	sc->sc_timestamp = tb.ts_recent;
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		sc->sc_flags |= SCF_TIMESTAMP;
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		/* 15 == "no window scaling negotiated". */
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
		sc->sc_flags |= SCF_SACK_PERMIT;

	/*
	 * ECN setup packet received.
	 */
	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
		sc->sc_flags |= SCF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	m_freem(m);
	if (syn_cache_respond(sc) == 0) {
		net_stat_ref_t tcps = TCP_STAT_GETREF();
		_NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
		_NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
		TCP_STAT_PUTREF();
		syn_cache_insert(sc, tp);
	} else {
		s = splsoftnet();
		/*
		 * syn_cache_put() will try to schedule the timer, so
		 * we need to initialize it
		 */
		syn_cache_timer_arm(sc);
		syn_cache_put(sc);
		splx(s);
		TCP_STATINC(TCP_STAT_SC_DROPPED);
	}
	return 1;
}

/*
 * syn_cache_respond: (re)send SYN+ACK.
 *
 * Returns 0 on success.
1072 */ 1073 1074 static int 1075 syn_cache_respond(struct syn_cache *sc) 1076 { 1077 #ifdef INET6 1078 struct rtentry *rt = NULL; 1079 #endif 1080 struct route *ro; 1081 u_int8_t *optp; 1082 int optlen, error; 1083 u_int16_t tlen; 1084 struct ip *ip = NULL; 1085 #ifdef INET6 1086 struct ip6_hdr *ip6 = NULL; 1087 #endif 1088 struct tcpcb *tp; 1089 struct tcphdr *th; 1090 struct mbuf *m; 1091 u_int hlen; 1092 #ifdef TCP_SIGNATURE 1093 struct secasvar *sav = NULL; 1094 u_int8_t *sigp = NULL; 1095 #endif 1096 1097 ro = &sc->sc_route; 1098 switch (sc->sc_src.sa.sa_family) { 1099 case AF_INET: 1100 hlen = sizeof(struct ip); 1101 break; 1102 #ifdef INET6 1103 case AF_INET6: 1104 hlen = sizeof(struct ip6_hdr); 1105 break; 1106 #endif 1107 default: 1108 return EAFNOSUPPORT; 1109 } 1110 1111 /* Worst case scenario, since we don't know the option size yet. */ 1112 tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN; 1113 KASSERT(max_linkhdr + tlen <= MCLBYTES); 1114 1115 /* 1116 * Create the IP+TCP header from scratch. 1117 */ 1118 MGETHDR(m, M_DONTWAIT, MT_DATA); 1119 if (m && (max_linkhdr + tlen) > MHLEN) { 1120 MCLGET(m, M_DONTWAIT); 1121 if ((m->m_flags & M_EXT) == 0) { 1122 m_freem(m); 1123 m = NULL; 1124 } 1125 } 1126 if (m == NULL) 1127 return ENOBUFS; 1128 MCLAIM(m, &tcp_tx_mowner); 1129 1130 tp = sc->sc_tp; 1131 1132 /* Fixup the mbuf. 
*/ 1133 m->m_data += max_linkhdr; 1134 m_reset_rcvif(m); 1135 memset(mtod(m, void *), 0, tlen); 1136 1137 switch (sc->sc_src.sa.sa_family) { 1138 case AF_INET: 1139 ip = mtod(m, struct ip *); 1140 ip->ip_v = 4; 1141 ip->ip_dst = sc->sc_src.sin.sin_addr; 1142 ip->ip_src = sc->sc_dst.sin.sin_addr; 1143 ip->ip_p = IPPROTO_TCP; 1144 th = (struct tcphdr *)(ip + 1); 1145 th->th_dport = sc->sc_src.sin.sin_port; 1146 th->th_sport = sc->sc_dst.sin.sin_port; 1147 break; 1148 #ifdef INET6 1149 case AF_INET6: 1150 ip6 = mtod(m, struct ip6_hdr *); 1151 ip6->ip6_vfc = IPV6_VERSION; 1152 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 1153 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 1154 ip6->ip6_nxt = IPPROTO_TCP; 1155 /* ip6_plen will be updated in ip6_output() */ 1156 th = (struct tcphdr *)(ip6 + 1); 1157 th->th_dport = sc->sc_src.sin6.sin6_port; 1158 th->th_sport = sc->sc_dst.sin6.sin6_port; 1159 break; 1160 #endif 1161 default: 1162 panic("%s: impossible (1)", __func__); 1163 } 1164 1165 th->th_seq = htonl(sc->sc_iss); 1166 th->th_ack = htonl(sc->sc_irs + 1); 1167 th->th_flags = TH_SYN|TH_ACK; 1168 th->th_win = htons(sc->sc_win); 1169 /* th_x2, th_sum, th_urp already 0 from memset */ 1170 1171 /* Tack on the TCP options. */ 1172 optp = (u_int8_t *)(th + 1); 1173 optlen = 0; 1174 *optp++ = TCPOPT_MAXSEG; 1175 *optp++ = TCPOLEN_MAXSEG; 1176 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 1177 *optp++ = sc->sc_ourmaxseg & 0xff; 1178 optlen += TCPOLEN_MAXSEG; 1179 1180 if (sc->sc_request_r_scale != 15) { 1181 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 1182 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 1183 sc->sc_request_r_scale); 1184 optp += TCPOLEN_WINDOW + TCPOLEN_NOP; 1185 optlen += TCPOLEN_WINDOW + TCPOLEN_NOP; 1186 } 1187 1188 if (sc->sc_flags & SCF_SACK_PERMIT) { 1189 /* Let the peer know that we will SACK. 
*/ 1190 *optp++ = TCPOPT_SACK_PERMITTED; 1191 *optp++ = TCPOLEN_SACK_PERMITTED; 1192 optlen += TCPOLEN_SACK_PERMITTED; 1193 } 1194 1195 if (sc->sc_flags & SCF_TIMESTAMP) { 1196 while (optlen % 4 != 2) { 1197 optlen += TCPOLEN_NOP; 1198 *optp++ = TCPOPT_NOP; 1199 } 1200 *optp++ = TCPOPT_TIMESTAMP; 1201 *optp++ = TCPOLEN_TIMESTAMP; 1202 u_int32_t *lp = (u_int32_t *)(optp); 1203 /* Form timestamp option as shown in appendix A of RFC 1323. */ 1204 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); 1205 *lp = htonl(sc->sc_timestamp); 1206 optp += TCPOLEN_TIMESTAMP - 2; 1207 optlen += TCPOLEN_TIMESTAMP; 1208 } 1209 1210 #ifdef TCP_SIGNATURE 1211 if (sc->sc_flags & SCF_SIGNATURE) { 1212 sav = tcp_signature_getsav(m); 1213 if (sav == NULL) { 1214 m_freem(m); 1215 return EPERM; 1216 } 1217 1218 *optp++ = TCPOPT_SIGNATURE; 1219 *optp++ = TCPOLEN_SIGNATURE; 1220 sigp = optp; 1221 memset(optp, 0, TCP_SIGLEN); 1222 optp += TCP_SIGLEN; 1223 optlen += TCPOLEN_SIGNATURE; 1224 } 1225 #endif 1226 1227 /* 1228 * Terminate and pad TCP options to a 4 byte boundary. 1229 * 1230 * According to RFC793: "The content of the header beyond the 1231 * End-of-Option option must be header padding (i.e., zero)." 1232 * And later: "The padding is composed of zeros." 1233 */ 1234 if (optlen % 4) { 1235 optlen += TCPOLEN_EOL; 1236 *optp++ = TCPOPT_EOL; 1237 } 1238 while (optlen % 4) { 1239 optlen += TCPOLEN_PAD; 1240 *optp++ = TCPOPT_PAD; 1241 } 1242 1243 /* Compute the actual values now that we've added the options. */ 1244 tlen = hlen + sizeof(struct tcphdr) + optlen; 1245 m->m_len = m->m_pkthdr.len = tlen; 1246 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1247 1248 #ifdef TCP_SIGNATURE 1249 if (sav) { 1250 (void)tcp_signature(m, th, hlen, sav, sigp); 1251 key_sa_recordxfer(sav, m); 1252 KEY_SA_UNREF(&sav); 1253 } 1254 #endif 1255 1256 /* 1257 * Send ECN SYN-ACK setup packet. 
1258 * Routes can be asymmetric, so, even if we receive a packet 1259 * with ECE and CWR set, we must not assume no one will block 1260 * the ECE packet we are about to send. 1261 */ 1262 if ((sc->sc_flags & SCF_ECN_PERMIT) && tp && 1263 SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { 1264 th->th_flags |= TH_ECE; 1265 TCP_STATINC(TCP_STAT_ECN_SHS); 1266 1267 /* 1268 * draft-ietf-tcpm-ecnsyn-00.txt 1269 * 1270 * "[...] a TCP node MAY respond to an ECN-setup 1271 * SYN packet by setting ECT in the responding 1272 * ECN-setup SYN/ACK packet, indicating to routers 1273 * that the SYN/ACK packet is ECN-Capable. 1274 * This allows a congested router along the path 1275 * to mark the packet instead of dropping the 1276 * packet as an indication of congestion." 1277 * 1278 * "[...] There can be a great benefit in setting 1279 * an ECN-capable codepoint in SYN/ACK packets [...] 1280 * Congestion is most likely to occur in 1281 * the server-to-client direction. As a result, 1282 * setting an ECN-capable codepoint in SYN/ACK 1283 * packets can reduce the occurrence of three-second 1284 * retransmit timeouts resulting from the drop 1285 * of SYN/ACK packets." 1286 * 1287 * Page 4 and 6, January 2006. 1288 */ 1289 1290 switch (sc->sc_src.sa.sa_family) { 1291 case AF_INET: 1292 ip->ip_tos |= IPTOS_ECN_ECT0; 1293 break; 1294 #ifdef INET6 1295 case AF_INET6: 1296 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 1297 break; 1298 #endif 1299 } 1300 TCP_STATINC(TCP_STAT_ECN_ECT); 1301 } 1302 1303 1304 /* 1305 * Compute the packet's checksum. 1306 * 1307 * Fill in some straggling IP bits. Note the stack expects 1308 * ip_len to be in host order, for convenience. 1309 */ 1310 switch (sc->sc_src.sa.sa_family) { 1311 case AF_INET: 1312 ip->ip_len = htons(tlen - hlen); 1313 th->th_sum = 0; 1314 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 1315 ip->ip_len = htons(tlen); 1316 ip->ip_ttl = ip_defttl; 1317 /* XXX tos? 
*/ 1318 break; 1319 #ifdef INET6 1320 case AF_INET6: 1321 ip6->ip6_plen = htons(tlen - hlen); 1322 th->th_sum = 0; 1323 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); 1324 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 1325 ip6->ip6_vfc |= IPV6_VERSION; 1326 ip6->ip6_plen = htons(tlen - hlen); 1327 /* ip6_hlim will be initialized afterwards */ 1328 /* XXX flowlabel? */ 1329 break; 1330 #endif 1331 } 1332 1333 /* XXX use IPsec policy on listening socket, on SYN ACK */ 1334 tp = sc->sc_tp; 1335 1336 switch (sc->sc_src.sa.sa_family) { 1337 case AF_INET: 1338 error = ip_output(m, sc->sc_ipopts, ro, 1339 (ip_mtudisc ? IP_MTUDISC : 0), 1340 NULL, tp ? tp->t_inpcb : NULL); 1341 break; 1342 #ifdef INET6 1343 case AF_INET6: 1344 ip6->ip6_hlim = in6pcb_selecthlim(NULL, 1345 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL); 1346 rtcache_unref(rt, ro); 1347 1348 error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, 1349 tp ? tp->t_inpcb : NULL, NULL); 1350 break; 1351 #endif 1352 default: 1353 panic("%s: impossible (2)", __func__); 1354 } 1355 1356 return error; 1357 } 1358