xref: /netbsd-src/sys/netinet/tcp_syncache.c (revision 023842dd7aa71afbdae78525ee699a0a8181af29)
1*023842ddSriastradh /*	$NetBSD: tcp_syncache.c,v 1.7 2024/06/29 12:59:08 riastradh Exp $	*/
23761620bSozaki-r 
33761620bSozaki-r /*
43761620bSozaki-r  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
53761620bSozaki-r  * All rights reserved.
63761620bSozaki-r  *
73761620bSozaki-r  * Redistribution and use in source and binary forms, with or without
83761620bSozaki-r  * modification, are permitted provided that the following conditions
93761620bSozaki-r  * are met:
103761620bSozaki-r  * 1. Redistributions of source code must retain the above copyright
113761620bSozaki-r  *    notice, this list of conditions and the following disclaimer.
123761620bSozaki-r  * 2. Redistributions in binary form must reproduce the above copyright
133761620bSozaki-r  *    notice, this list of conditions and the following disclaimer in the
143761620bSozaki-r  *    documentation and/or other materials provided with the distribution.
153761620bSozaki-r  * 3. Neither the name of the project nor the names of its contributors
163761620bSozaki-r  *    may be used to endorse or promote products derived from this software
173761620bSozaki-r  *    without specific prior written permission.
183761620bSozaki-r  *
193761620bSozaki-r  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
203761620bSozaki-r  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
213761620bSozaki-r  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
223761620bSozaki-r  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
233761620bSozaki-r  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
243761620bSozaki-r  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
253761620bSozaki-r  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
263761620bSozaki-r  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
273761620bSozaki-r  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
283761620bSozaki-r  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
293761620bSozaki-r  * SUCH DAMAGE.
303761620bSozaki-r  */
313761620bSozaki-r 
323761620bSozaki-r /*
333761620bSozaki-r  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
343761620bSozaki-r  *
353761620bSozaki-r  * NRL grants permission for redistribution and use in source and binary
363761620bSozaki-r  * forms, with or without modification, of the software and documentation
373761620bSozaki-r  * created at NRL provided that the following conditions are met:
383761620bSozaki-r  *
393761620bSozaki-r  * 1. Redistributions of source code must retain the above copyright
403761620bSozaki-r  *    notice, this list of conditions and the following disclaimer.
413761620bSozaki-r  * 2. Redistributions in binary form must reproduce the above copyright
423761620bSozaki-r  *    notice, this list of conditions and the following disclaimer in the
433761620bSozaki-r  *    documentation and/or other materials provided with the distribution.
443761620bSozaki-r  * 3. All advertising materials mentioning features or use of this software
453761620bSozaki-r  *    must display the following acknowledgements:
463761620bSozaki-r  *      This product includes software developed by the University of
473761620bSozaki-r  *      California, Berkeley and its contributors.
483761620bSozaki-r  *      This product includes software developed at the Information
493761620bSozaki-r  *      Technology Division, US Naval Research Laboratory.
503761620bSozaki-r  * 4. Neither the name of the NRL nor the names of its contributors
513761620bSozaki-r  *    may be used to endorse or promote products derived from this software
523761620bSozaki-r  *    without specific prior written permission.
533761620bSozaki-r  *
543761620bSozaki-r  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
553761620bSozaki-r  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
563761620bSozaki-r  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
573761620bSozaki-r  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
583761620bSozaki-r  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
593761620bSozaki-r  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
603761620bSozaki-r  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
613761620bSozaki-r  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
623761620bSozaki-r  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
633761620bSozaki-r  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
643761620bSozaki-r  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
653761620bSozaki-r  *
663761620bSozaki-r  * The views and conclusions contained in the software and documentation
673761620bSozaki-r  * are those of the authors and should not be interpreted as representing
683761620bSozaki-r  * official policies, either expressed or implied, of the US Naval
693761620bSozaki-r  * Research Laboratory (NRL).
703761620bSozaki-r  */
713761620bSozaki-r 
723761620bSozaki-r /*-
733761620bSozaki-r  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
743761620bSozaki-r  * 2011 The NetBSD Foundation, Inc.
753761620bSozaki-r  * All rights reserved.
763761620bSozaki-r  *
773761620bSozaki-r  * This code is derived from software contributed to The NetBSD Foundation
783761620bSozaki-r  * by Coyote Point Systems, Inc.
793761620bSozaki-r  * This code is derived from software contributed to The NetBSD Foundation
803761620bSozaki-r  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
813761620bSozaki-r  * Facility, NASA Ames Research Center.
823761620bSozaki-r  * This code is derived from software contributed to The NetBSD Foundation
833761620bSozaki-r  * by Charles M. Hannum.
843761620bSozaki-r  * This code is derived from software contributed to The NetBSD Foundation
853761620bSozaki-r  * by Rui Paulo.
863761620bSozaki-r  *
873761620bSozaki-r  * Redistribution and use in source and binary forms, with or without
883761620bSozaki-r  * modification, are permitted provided that the following conditions
893761620bSozaki-r  * are met:
903761620bSozaki-r  * 1. Redistributions of source code must retain the above copyright
913761620bSozaki-r  *    notice, this list of conditions and the following disclaimer.
923761620bSozaki-r  * 2. Redistributions in binary form must reproduce the above copyright
933761620bSozaki-r  *    notice, this list of conditions and the following disclaimer in the
943761620bSozaki-r  *    documentation and/or other materials provided with the distribution.
953761620bSozaki-r  *
963761620bSozaki-r  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
973761620bSozaki-r  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
983761620bSozaki-r  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
993761620bSozaki-r  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
1003761620bSozaki-r  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
1013761620bSozaki-r  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
1023761620bSozaki-r  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
1033761620bSozaki-r  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
1043761620bSozaki-r  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
1053761620bSozaki-r  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1063761620bSozaki-r  * POSSIBILITY OF SUCH DAMAGE.
1073761620bSozaki-r  */
1083761620bSozaki-r 
1093761620bSozaki-r /*
1103761620bSozaki-r  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
1113761620bSozaki-r  *	The Regents of the University of California.  All rights reserved.
1123761620bSozaki-r  *
1133761620bSozaki-r  * Redistribution and use in source and binary forms, with or without
1143761620bSozaki-r  * modification, are permitted provided that the following conditions
1153761620bSozaki-r  * are met:
1163761620bSozaki-r  * 1. Redistributions of source code must retain the above copyright
1173761620bSozaki-r  *    notice, this list of conditions and the following disclaimer.
1183761620bSozaki-r  * 2. Redistributions in binary form must reproduce the above copyright
1193761620bSozaki-r  *    notice, this list of conditions and the following disclaimer in the
1203761620bSozaki-r  *    documentation and/or other materials provided with the distribution.
1213761620bSozaki-r  * 3. Neither the name of the University nor the names of its contributors
1223761620bSozaki-r  *    may be used to endorse or promote products derived from this software
1233761620bSozaki-r  *    without specific prior written permission.
1243761620bSozaki-r  *
1253761620bSozaki-r  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1263761620bSozaki-r  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1273761620bSozaki-r  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1283761620bSozaki-r  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
1293761620bSozaki-r  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1303761620bSozaki-r  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1313761620bSozaki-r  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1323761620bSozaki-r  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1333761620bSozaki-r  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1343761620bSozaki-r  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
1353761620bSozaki-r  * SUCH DAMAGE.
1363761620bSozaki-r  *
1373761620bSozaki-r  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
1383761620bSozaki-r  */
1393761620bSozaki-r 
/*
 *	TODO list for SYN cache stuff:
 *
 *	Find room for a "state" field, which is needed to keep a
 *	compressed state for TIME_WAIT TCBs.  It's been noted already
 *	that this is fairly important for very high-volume web and
 *	mail servers, which use a large number of short-lived
 *	connections.
 */
1493761620bSozaki-r 
1503761620bSozaki-r #include <sys/cdefs.h>
151*023842ddSriastradh __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.7 2024/06/29 12:59:08 riastradh Exp $");
1523761620bSozaki-r 
1533761620bSozaki-r #ifdef _KERNEL_OPT
1543761620bSozaki-r #include "opt_inet.h"
1553761620bSozaki-r #include "opt_ipsec.h"
1563761620bSozaki-r #endif
1573761620bSozaki-r 
1583761620bSozaki-r #include <sys/param.h>
1593761620bSozaki-r #include <sys/systm.h>
1603761620bSozaki-r #include <sys/mbuf.h>
1613761620bSozaki-r #include <sys/protosw.h>
1623761620bSozaki-r #include <sys/socket.h>
1633761620bSozaki-r #include <sys/socketvar.h>
1643761620bSozaki-r #include <sys/errno.h>
1653761620bSozaki-r #include <sys/syslog.h>
1663761620bSozaki-r #include <sys/pool.h>
1673761620bSozaki-r #include <sys/domain.h>
1683761620bSozaki-r #include <sys/kernel.h>
1693761620bSozaki-r #include <sys/lwp.h> /* for lwp0 */
1703761620bSozaki-r #include <sys/cprng.h>
1713761620bSozaki-r 
1723761620bSozaki-r #include <netinet/in.h>
1733761620bSozaki-r #include <netinet/ip.h>
1743761620bSozaki-r #include <netinet/in_pcb.h>
1753761620bSozaki-r #include <netinet/in_var.h>
1763761620bSozaki-r #include <netinet/ip_var.h>
1773761620bSozaki-r 
1783761620bSozaki-r #include <netinet/ip6.h>
1793761620bSozaki-r #ifdef INET6
1803761620bSozaki-r #include <netinet6/ip6_var.h>
1813761620bSozaki-r #include <netinet6/in6_pcb.h>
1823761620bSozaki-r #include <netinet6/ip6_var.h>
1833761620bSozaki-r #include <netinet6/in6_var.h>
1843761620bSozaki-r #endif
1853761620bSozaki-r 
1863761620bSozaki-r #include <netinet/tcp.h>
1873761620bSozaki-r #include <netinet/tcp_fsm.h>
1883761620bSozaki-r #include <netinet/tcp_seq.h>
1893761620bSozaki-r #include <netinet/tcp_timer.h>
1903761620bSozaki-r #include <netinet/tcp_var.h>
1913761620bSozaki-r #include <netinet/tcp_private.h>
1923761620bSozaki-r #include <netinet/tcp_syncache.h>
1933761620bSozaki-r 
1943761620bSozaki-r #ifdef TCP_SIGNATURE
1953761620bSozaki-r #ifdef IPSEC
1963761620bSozaki-r #include <netipsec/ipsec.h>
1973761620bSozaki-r #include <netipsec/key.h>
1983761620bSozaki-r #ifdef INET6
1993761620bSozaki-r #include <netipsec/ipsec6.h>
2003761620bSozaki-r #endif
2013761620bSozaki-r #endif	/* IPSEC*/
2023761620bSozaki-r #endif
2033761620bSozaki-r 
2043761620bSozaki-r static void	syn_cache_timer(void *);
2059815eb9bSozaki-r static struct syn_cache *
2069815eb9bSozaki-r 		syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
2079815eb9bSozaki-r 		struct syn_cache_head **);
2089815eb9bSozaki-r static int	syn_cache_respond(struct syn_cache *);
2093761620bSozaki-r 
2103761620bSozaki-r /* syn hash parameters */
2113761620bSozaki-r #define	TCP_SYN_HASH_SIZE	293
2123761620bSozaki-r #define	TCP_SYN_BUCKET_SIZE	35
2133761620bSozaki-r static int	tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
2143761620bSozaki-r int		tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
2153761620bSozaki-r int		tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
2163761620bSozaki-r static struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
2173761620bSozaki-r 
2183761620bSozaki-r /*
2193761620bSozaki-r  * TCP compressed state engine.  Currently used to hold compressed
2203761620bSozaki-r  * state for SYN_RECEIVED.
2213761620bSozaki-r  */
2223761620bSozaki-r 
2233761620bSozaki-r u_long	syn_cache_count;
2243761620bSozaki-r static u_int32_t syn_hash1, syn_hash2;
2253761620bSozaki-r 
/*
 * SYN_HASH: hash an IPv4 source address (struct in_addr pointer) together
 * with the source and destination ports, mixed with the two hash secrets.
 * The result is not masked.
 */
#define	SYN_HASH(sa, sp, dp)						\
	((((sa)->s_addr ^ syn_hash1) *					\
	  (((((u_int32_t)(dp)) << 16) + ((u_int32_t)(sp))) ^ syn_hash2)))

#ifdef INET6
/*
 * SYN_HASH6: IPv6 counterpart of SYN_HASH.  Only the first and the last
 * 32-bit words of the address take part, and the result is masked to
 * 31 bits.
 */
#define	SYN_HASH6(sa, sp, dp)						\
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) *	\
	  (((((u_int32_t)(dp)) << 16) + ((u_int32_t)(sp))) ^ syn_hash2)) \
	 & 0x7fffffff)

/*
 * SYN_HASHALL: store into `hash' the hash of a (src, dst) sockaddr pair,
 * dispatching on the address family of `src'.  Unknown families hash to 0.
 */
#define	SYN_HASHALL(hash, src, dst)					\
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(					\
		    &((const struct sockaddr_in *)(src))->sin_addr,	\
		    ((const struct sockaddr_in *)(src))->sin_port,	\
		    ((const struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(					\
		    &((const struct sockaddr_in6 *)(src))->sin6_addr,	\
		    ((const struct sockaddr_in6 *)(src))->sin6_port,	\
		    ((const struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#else /* !INET6 */
/*
 * SYN_HASHALL: without INET6 every entry is IPv4, so hash the pair
 * with SYN_HASH unconditionally.
 */
#define	SYN_HASHALL(hash, src, dst)					\
do {									\
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
		((const struct sockaddr_in *)(src))->sin_port,		\
		((const struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#endif /* INET6 */
2603761620bSozaki-r 
2613761620bSozaki-r static struct pool syn_cache_pool;
2623761620bSozaki-r 
2633761620bSozaki-r /*
2643761620bSozaki-r  * We don't estimate RTT with SYNs, so each packet starts with the default
2653761620bSozaki-r  * RTT and each timer step has a fixed timeout value.
2663761620bSozaki-r  */
2673761620bSozaki-r static inline void
syn_cache_timer_arm(struct syn_cache * sc)2683761620bSozaki-r syn_cache_timer_arm(struct syn_cache *sc)
2693761620bSozaki-r {
2703761620bSozaki-r 
2713761620bSozaki-r 	TCPT_RANGESET(sc->sc_rxtcur,
2723761620bSozaki-r 	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
2733761620bSozaki-r 	    TCPTV_REXMTMAX);
2743761620bSozaki-r 	callout_reset(&sc->sc_timer,
2753761620bSozaki-r 	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
2763761620bSozaki-r }
2773761620bSozaki-r 
/* Timestamp for an entry: tcp_now relative to the entry's sc_timebase. */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
2793761620bSozaki-r 
2803761620bSozaki-r static inline void
syn_cache_rm(struct syn_cache * sc)2813761620bSozaki-r syn_cache_rm(struct syn_cache *sc)
2823761620bSozaki-r {
2833761620bSozaki-r 	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
2843761620bSozaki-r 	    sc, sc_bucketq);
2853761620bSozaki-r 	sc->sc_tp = NULL;
2863761620bSozaki-r 	LIST_REMOVE(sc, sc_tpq);
2873761620bSozaki-r 	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
2883761620bSozaki-r 	callout_stop(&sc->sc_timer);
2893761620bSozaki-r 	syn_cache_count--;
2903761620bSozaki-r }
2913761620bSozaki-r 
2923761620bSozaki-r static inline void
syn_cache_put(struct syn_cache * sc)2933761620bSozaki-r syn_cache_put(struct syn_cache *sc)
2943761620bSozaki-r {
2953761620bSozaki-r 	if (sc->sc_ipopts)
2963761620bSozaki-r 		(void) m_free(sc->sc_ipopts);
2973761620bSozaki-r 	rtcache_free(&sc->sc_route);
2983761620bSozaki-r 	sc->sc_flags |= SCF_DEAD;
2993761620bSozaki-r 	if (!callout_invoking(&sc->sc_timer))
3003761620bSozaki-r 		callout_schedule(&(sc)->sc_timer, 1);
3013761620bSozaki-r }
3023761620bSozaki-r 
3033761620bSozaki-r void
syn_cache_init(void)3043761620bSozaki-r syn_cache_init(void)
3053761620bSozaki-r {
3063761620bSozaki-r 	int i;
3073761620bSozaki-r 
3083761620bSozaki-r 	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3093761620bSozaki-r 	    "synpl", NULL, IPL_SOFTNET);
3103761620bSozaki-r 
3113761620bSozaki-r 	/* Initialize the hash buckets. */
3123761620bSozaki-r 	for (i = 0; i < tcp_syn_cache_size; i++)
3133761620bSozaki-r 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3143761620bSozaki-r }
3153761620bSozaki-r 
3163761620bSozaki-r void
syn_cache_insert(struct syn_cache * sc,struct tcpcb * tp)3173761620bSozaki-r syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3183761620bSozaki-r {
3193761620bSozaki-r 	struct syn_cache_head *scp;
3203761620bSozaki-r 	struct syn_cache *sc2;
3213761620bSozaki-r 	int s;
3223761620bSozaki-r 
3233761620bSozaki-r 	/*
3243761620bSozaki-r 	 * If there are no entries in the hash table, reinitialize
3253761620bSozaki-r 	 * the hash secrets.
3263761620bSozaki-r 	 */
3273761620bSozaki-r 	if (syn_cache_count == 0) {
3283761620bSozaki-r 		syn_hash1 = cprng_fast32();
3293761620bSozaki-r 		syn_hash2 = cprng_fast32();
3303761620bSozaki-r 	}
3313761620bSozaki-r 
3323761620bSozaki-r 	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
3333761620bSozaki-r 	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
3343761620bSozaki-r 	scp = &tcp_syn_cache[sc->sc_bucketidx];
3353761620bSozaki-r 
3363761620bSozaki-r 	/*
3373761620bSozaki-r 	 * Make sure that we don't overflow the per-bucket
3383761620bSozaki-r 	 * limit or the total cache size limit.
3393761620bSozaki-r 	 */
3403761620bSozaki-r 	s = splsoftnet();
3413761620bSozaki-r 	if (scp->sch_length >= tcp_syn_bucket_limit) {
3423761620bSozaki-r 		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
3433761620bSozaki-r 		/*
3443761620bSozaki-r 		 * The bucket is full.  Toss the oldest element in the
3453761620bSozaki-r 		 * bucket.  This will be the first entry in the bucket.
3463761620bSozaki-r 		 */
3473761620bSozaki-r 		sc2 = TAILQ_FIRST(&scp->sch_bucket);
3483761620bSozaki-r #ifdef DIAGNOSTIC
3493761620bSozaki-r 		/*
3503761620bSozaki-r 		 * This should never happen; we should always find an
3513761620bSozaki-r 		 * entry in our bucket.
3523761620bSozaki-r 		 */
3533761620bSozaki-r 		if (sc2 == NULL)
3543761620bSozaki-r 			panic("syn_cache_insert: bucketoverflow: impossible");
3553761620bSozaki-r #endif
3563761620bSozaki-r 		syn_cache_rm(sc2);
3573761620bSozaki-r 		syn_cache_put(sc2);	/* calls pool_put but see spl above */
3583761620bSozaki-r 	} else if (syn_cache_count >= tcp_syn_cache_limit) {
3593761620bSozaki-r 		struct syn_cache_head *scp2, *sce;
3603761620bSozaki-r 
3613761620bSozaki-r 		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
3623761620bSozaki-r 		/*
3633761620bSozaki-r 		 * The cache is full.  Toss the oldest entry in the
3643761620bSozaki-r 		 * first non-empty bucket we can find.
3653761620bSozaki-r 		 *
3663761620bSozaki-r 		 * XXX We would really like to toss the oldest
3673761620bSozaki-r 		 * entry in the cache, but we hope that this
3683761620bSozaki-r 		 * condition doesn't happen very often.
3693761620bSozaki-r 		 */
3703761620bSozaki-r 		scp2 = scp;
3713761620bSozaki-r 		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3723761620bSozaki-r 			sce = &tcp_syn_cache[tcp_syn_cache_size];
3733761620bSozaki-r 			for (++scp2; scp2 != scp; scp2++) {
3743761620bSozaki-r 				if (scp2 >= sce)
3753761620bSozaki-r 					scp2 = &tcp_syn_cache[0];
3763761620bSozaki-r 				if (! TAILQ_EMPTY(&scp2->sch_bucket))
3773761620bSozaki-r 					break;
3783761620bSozaki-r 			}
3793761620bSozaki-r #ifdef DIAGNOSTIC
3803761620bSozaki-r 			/*
3813761620bSozaki-r 			 * This should never happen; we should always find a
3823761620bSozaki-r 			 * non-empty bucket.
3833761620bSozaki-r 			 */
3843761620bSozaki-r 			if (scp2 == scp)
3853761620bSozaki-r 				panic("syn_cache_insert: cacheoverflow: "
3863761620bSozaki-r 				    "impossible");
3873761620bSozaki-r #endif
3883761620bSozaki-r 		}
3893761620bSozaki-r 		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3903761620bSozaki-r 		syn_cache_rm(sc2);
3913761620bSozaki-r 		syn_cache_put(sc2);	/* calls pool_put but see spl above */
3923761620bSozaki-r 	}
3933761620bSozaki-r 
3943761620bSozaki-r 	/*
3953761620bSozaki-r 	 * Initialize the entry's timer.
3963761620bSozaki-r 	 */
3973761620bSozaki-r 	sc->sc_rxttot = 0;
3983761620bSozaki-r 	sc->sc_rxtshift = 0;
3993761620bSozaki-r 	syn_cache_timer_arm(sc);
4003761620bSozaki-r 
4013761620bSozaki-r 	/* Link it from tcpcb entry */
4023761620bSozaki-r 	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
4033761620bSozaki-r 
4043761620bSozaki-r 	/* Put it into the bucket. */
4053761620bSozaki-r 	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
4063761620bSozaki-r 	scp->sch_length++;
4073761620bSozaki-r 	syn_cache_count++;
4083761620bSozaki-r 
4093761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_ADDED);
4103761620bSozaki-r 	splx(s);
4113761620bSozaki-r }
4123761620bSozaki-r 
4133761620bSozaki-r /*
4143761620bSozaki-r  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
4153761620bSozaki-r  * If we have retransmitted an entry the maximum number of times, expire
4163761620bSozaki-r  * that entry.
4173761620bSozaki-r  */
4183761620bSozaki-r static void
syn_cache_timer(void * arg)4193761620bSozaki-r syn_cache_timer(void *arg)
4203761620bSozaki-r {
4213761620bSozaki-r 	struct syn_cache *sc = arg;
4223761620bSozaki-r 
4233761620bSozaki-r 	mutex_enter(softnet_lock);
4243761620bSozaki-r 	KERNEL_LOCK(1, NULL);
4253761620bSozaki-r 
4263761620bSozaki-r 	callout_ack(&sc->sc_timer);
4273761620bSozaki-r 
4283761620bSozaki-r 	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
4293761620bSozaki-r 		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
4303761620bSozaki-r 		goto free;
4313761620bSozaki-r 	}
4323761620bSozaki-r 
4333761620bSozaki-r 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
4343761620bSozaki-r 		/* Drop it -- too many retransmissions. */
4353761620bSozaki-r 		goto dropit;
4363761620bSozaki-r 	}
4373761620bSozaki-r 
4383761620bSozaki-r 	/*
4393761620bSozaki-r 	 * Compute the total amount of time this entry has
4403761620bSozaki-r 	 * been on a queue.  If this entry has been on longer
4413761620bSozaki-r 	 * than the keep alive timer would allow, expire it.
4423761620bSozaki-r 	 */
4433761620bSozaki-r 	sc->sc_rxttot += sc->sc_rxtcur;
4443761620bSozaki-r 	if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
4453761620bSozaki-r 		goto dropit;
4463761620bSozaki-r 
4473761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
4483761620bSozaki-r 	(void)syn_cache_respond(sc);
4493761620bSozaki-r 
4503761620bSozaki-r 	/* Advance the timer back-off. */
4513761620bSozaki-r 	sc->sc_rxtshift++;
4523761620bSozaki-r 	syn_cache_timer_arm(sc);
4533761620bSozaki-r 
4543761620bSozaki-r 	goto out;
4553761620bSozaki-r 
4563761620bSozaki-r  dropit:
4573761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
4583761620bSozaki-r 	syn_cache_rm(sc);
4593761620bSozaki-r 	if (sc->sc_ipopts)
4603761620bSozaki-r 		(void) m_free(sc->sc_ipopts);
4613761620bSozaki-r 	rtcache_free(&sc->sc_route);
4623761620bSozaki-r 
4633761620bSozaki-r  free:
4643761620bSozaki-r 	callout_destroy(&sc->sc_timer);
4653761620bSozaki-r 	pool_put(&syn_cache_pool, sc);
4663761620bSozaki-r 
4673761620bSozaki-r  out:
4683761620bSozaki-r 	KERNEL_UNLOCK_ONE(NULL);
4693761620bSozaki-r 	mutex_exit(softnet_lock);
4703761620bSozaki-r }
4713761620bSozaki-r 
4723761620bSozaki-r /*
4733761620bSozaki-r  * Remove syn cache created by the specified tcb entry,
4743761620bSozaki-r  * because this does not make sense to keep them
4753761620bSozaki-r  * (if there's no tcb entry, syn cache entry will never be used)
4763761620bSozaki-r  */
4773761620bSozaki-r void
syn_cache_cleanup(struct tcpcb * tp)4783761620bSozaki-r syn_cache_cleanup(struct tcpcb *tp)
4793761620bSozaki-r {
4803761620bSozaki-r 	struct syn_cache *sc, *nsc;
4813761620bSozaki-r 	int s;
4823761620bSozaki-r 
4833761620bSozaki-r 	s = splsoftnet();
4843761620bSozaki-r 
4853761620bSozaki-r 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
4863761620bSozaki-r 		nsc = LIST_NEXT(sc, sc_tpq);
4873761620bSozaki-r 
4883761620bSozaki-r #ifdef DIAGNOSTIC
4893761620bSozaki-r 		if (sc->sc_tp != tp)
4903761620bSozaki-r 			panic("invalid sc_tp in syn_cache_cleanup");
4913761620bSozaki-r #endif
4923761620bSozaki-r 		syn_cache_rm(sc);
4933761620bSozaki-r 		syn_cache_put(sc);	/* calls pool_put but see spl above */
4943761620bSozaki-r 	}
4953761620bSozaki-r 	/* just for safety */
4963761620bSozaki-r 	LIST_INIT(&tp->t_sc);
4973761620bSozaki-r 
4983761620bSozaki-r 	splx(s);
4993761620bSozaki-r }
5003761620bSozaki-r 
5013761620bSozaki-r /*
5023761620bSozaki-r  * Find an entry in the syn cache.
5033761620bSozaki-r  */
5049815eb9bSozaki-r static struct syn_cache *
syn_cache_lookup(const struct sockaddr * src,const struct sockaddr * dst,struct syn_cache_head ** headp)5053761620bSozaki-r syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
5063761620bSozaki-r     struct syn_cache_head **headp)
5073761620bSozaki-r {
5083761620bSozaki-r 	struct syn_cache *sc;
5093761620bSozaki-r 	struct syn_cache_head *scp;
5103761620bSozaki-r 	u_int32_t hash;
5113761620bSozaki-r 	int s;
5123761620bSozaki-r 
5133761620bSozaki-r 	SYN_HASHALL(hash, src, dst);
5143761620bSozaki-r 
5153761620bSozaki-r 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
5163761620bSozaki-r 	*headp = scp;
5173761620bSozaki-r 	s = splsoftnet();
5183761620bSozaki-r 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
5193761620bSozaki-r 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
5203761620bSozaki-r 		if (sc->sc_hash != hash)
5213761620bSozaki-r 			continue;
5223761620bSozaki-r 		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
5233761620bSozaki-r 		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
5243761620bSozaki-r 			splx(s);
5253761620bSozaki-r 			return (sc);
5263761620bSozaki-r 		}
5273761620bSozaki-r 	}
5283761620bSozaki-r 	splx(s);
5293761620bSozaki-r 	return (NULL);
5303761620bSozaki-r }
5313761620bSozaki-r 
/*
 * This function gets called when we receive an ACK for a socket in the
 * LISTEN state.  We look up the connection in the syn cache, and if it's
 * there, we pull it out of the cache and turn it into a full-blown
 * connection in the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
5543761620bSozaki-r struct socket *
syn_cache_get(struct sockaddr * src,struct sockaddr * dst,struct tcphdr * th,struct socket * so,struct mbuf * m)5553761620bSozaki-r syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
5563761620bSozaki-r     struct tcphdr *th, struct socket *so, struct mbuf *m)
5573761620bSozaki-r {
5583761620bSozaki-r 	struct syn_cache *sc;
5593761620bSozaki-r 	struct syn_cache_head *scp;
5603761620bSozaki-r 	struct inpcb *inp = NULL;
5613761620bSozaki-r 	struct tcpcb *tp;
5623761620bSozaki-r 	int s;
5633761620bSozaki-r 	struct socket *oso;
5643761620bSozaki-r 
5653761620bSozaki-r 	s = splsoftnet();
5663761620bSozaki-r 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
5673761620bSozaki-r 		splx(s);
5683761620bSozaki-r 		return NULL;
5693761620bSozaki-r 	}
5703761620bSozaki-r 
5713761620bSozaki-r 	/*
5723761620bSozaki-r 	 * Verify the sequence and ack numbers.  Try getting the correct
5733761620bSozaki-r 	 * response again.
5743761620bSozaki-r 	 */
5753761620bSozaki-r 	if ((th->th_ack != sc->sc_iss + 1) ||
5763761620bSozaki-r 	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
5773761620bSozaki-r 	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
5783761620bSozaki-r 		m_freem(m);
5793761620bSozaki-r 		(void)syn_cache_respond(sc);
5803761620bSozaki-r 		splx(s);
5813761620bSozaki-r 		return ((struct socket *)(-1));
5823761620bSozaki-r 	}
5833761620bSozaki-r 
5843761620bSozaki-r 	/* Remove this cache entry */
5853761620bSozaki-r 	syn_cache_rm(sc);
5863761620bSozaki-r 	splx(s);
5873761620bSozaki-r 
5883761620bSozaki-r 	/*
5893761620bSozaki-r 	 * Ok, create the full blown connection, and set things up
5903761620bSozaki-r 	 * as they would have been set up if we had created the
5913761620bSozaki-r 	 * connection when the SYN arrived.  If we can't create
5923761620bSozaki-r 	 * the connection, abort it.
5933761620bSozaki-r 	 */
5943761620bSozaki-r 	/*
5953761620bSozaki-r 	 * inp still has the OLD in_pcb stuff, set the
5963761620bSozaki-r 	 * v6-related flags on the new guy, too.   This is
5973761620bSozaki-r 	 * done particularly for the case where an AF_INET6
5983761620bSozaki-r 	 * socket is bound only to a port, and a v4 connection
5993761620bSozaki-r 	 * comes in on that port.
6003761620bSozaki-r 	 * we also copy the flowinfo from the original pcb
6013761620bSozaki-r 	 * to the new one.
6023761620bSozaki-r 	 */
6033761620bSozaki-r 	oso = so;
6043761620bSozaki-r 	so = sonewconn(so, true);
6053761620bSozaki-r 	if (so == NULL)
6063761620bSozaki-r 		goto resetandabort;
6073761620bSozaki-r 
6083761620bSozaki-r 	inp = sotoinpcb(so);
6093761620bSozaki-r 
6103761620bSozaki-r 	switch (src->sa_family) {
6113761620bSozaki-r 	case AF_INET:
6120e390eeeSozaki-r 		if (inp->inp_af == AF_INET) {
613a071c829Sozaki-r 			in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
6143761620bSozaki-r 			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
6153761620bSozaki-r 			inp->inp_options = ip_srcroute(m);
6162ba9f052Sozaki-r 			inpcb_set_state(inp, INP_BOUND);
6173761620bSozaki-r 			if (inp->inp_options == NULL) {
6183761620bSozaki-r 				inp->inp_options = sc->sc_ipopts;
6193761620bSozaki-r 				sc->sc_ipopts = NULL;
6203761620bSozaki-r 			}
6213761620bSozaki-r 		}
6223761620bSozaki-r #ifdef INET6
6230e390eeeSozaki-r 		else if (inp->inp_af == AF_INET6) {
6243761620bSozaki-r 			/* IPv4 packet to AF_INET6 socket */
625a071c829Sozaki-r 			memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
626a071c829Sozaki-r 			in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
6273761620bSozaki-r 			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
628a071c829Sozaki-r 				&in6p_laddr(inp).s6_addr32[3],
6293761620bSozaki-r 				sizeof(((struct sockaddr_in *)dst)->sin_addr));
6300e390eeeSozaki-r 			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
6310e390eeeSozaki-r 			intotcpcb(inp)->t_family = AF_INET;
6320e390eeeSozaki-r 			if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
6330e390eeeSozaki-r 				inp->inp_flags |= IN6P_IPV6_V6ONLY;
6343761620bSozaki-r 			else
6350e390eeeSozaki-r 				inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
6362ba9f052Sozaki-r 			inpcb_set_state(inp, INP_BOUND);
6373761620bSozaki-r 		}
6383761620bSozaki-r #endif
6393761620bSozaki-r 		break;
6403761620bSozaki-r #ifdef INET6
6413761620bSozaki-r 	case AF_INET6:
6420e390eeeSozaki-r 		if (inp->inp_af == AF_INET6) {
643a071c829Sozaki-r 			in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
6440e390eeeSozaki-r 			inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
6452ba9f052Sozaki-r 			inpcb_set_state(inp, INP_BOUND);
6463761620bSozaki-r 		}
6473761620bSozaki-r 		break;
6483761620bSozaki-r #endif
6493761620bSozaki-r 	}
6503761620bSozaki-r 
6513761620bSozaki-r #ifdef INET6
6520e390eeeSozaki-r 	if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
6530e390eeeSozaki-r 		struct inpcb *oinp = sotoinpcb(oso);
6543761620bSozaki-r 		/* inherit socket options from the listening socket */
6550e390eeeSozaki-r 		inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
6560e390eeeSozaki-r 		if (inp->inp_flags & IN6P_CONTROLOPTS) {
6570e390eeeSozaki-r 			m_freem(inp->inp_options);
6580e390eeeSozaki-r 			inp->inp_options = NULL;
6593761620bSozaki-r 		}
6600e390eeeSozaki-r 		ip6_savecontrol(inp, &inp->inp_options,
6613761620bSozaki-r 		    mtod(m, struct ip6_hdr *), m);
6623761620bSozaki-r 	}
6633761620bSozaki-r #endif
6643761620bSozaki-r 
6653761620bSozaki-r 	/*
6663761620bSozaki-r 	 * Give the new socket our cached route reference.
6673761620bSozaki-r 	 */
6683761620bSozaki-r 	rtcache_copy(&inp->inp_route, &sc->sc_route);
6693761620bSozaki-r 	rtcache_free(&sc->sc_route);
6703761620bSozaki-r 
6710e390eeeSozaki-r 	if (inp->inp_af == AF_INET) {
6723761620bSozaki-r 		struct sockaddr_in sin;
6733761620bSozaki-r 		memcpy(&sin, src, src->sa_len);
6742ba9f052Sozaki-r 		if (inpcb_connect(inp, &sin, &lwp0)) {
6753761620bSozaki-r 			goto resetandabort;
6763761620bSozaki-r 		}
6773761620bSozaki-r 	}
6783761620bSozaki-r #ifdef INET6
6790e390eeeSozaki-r 	else if (inp->inp_af == AF_INET6) {
6803761620bSozaki-r 		struct sockaddr_in6 sin6;
6813761620bSozaki-r 		memcpy(&sin6, src, src->sa_len);
6823761620bSozaki-r 		if (src->sa_family == AF_INET) {
6833761620bSozaki-r 			/* IPv4 packet to AF_INET6 socket */
6843761620bSozaki-r 			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
6853761620bSozaki-r 		}
686b000e63fSozaki-r 		if (in6pcb_connect(inp, &sin6, NULL)) {
6873761620bSozaki-r 			goto resetandabort;
6883761620bSozaki-r 		}
6893761620bSozaki-r 	}
6903761620bSozaki-r #endif
6913761620bSozaki-r 	else {
6923761620bSozaki-r 		goto resetandabort;
6933761620bSozaki-r 	}
6943761620bSozaki-r 
6953761620bSozaki-r 	tp = intotcpcb(inp);
6963761620bSozaki-r 
6973761620bSozaki-r 	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
6983761620bSozaki-r 	if (sc->sc_request_r_scale != 15) {
6993761620bSozaki-r 		tp->requested_s_scale = sc->sc_requested_s_scale;
7003761620bSozaki-r 		tp->request_r_scale = sc->sc_request_r_scale;
7013761620bSozaki-r 		tp->snd_scale = sc->sc_requested_s_scale;
7023761620bSozaki-r 		tp->rcv_scale = sc->sc_request_r_scale;
7033761620bSozaki-r 		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
7043761620bSozaki-r 	}
7053761620bSozaki-r 	if (sc->sc_flags & SCF_TIMESTAMP)
7063761620bSozaki-r 		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
7073761620bSozaki-r 	tp->ts_timebase = sc->sc_timebase;
7083761620bSozaki-r 
7093761620bSozaki-r 	tp->t_template = tcp_template(tp);
7103761620bSozaki-r 	if (tp->t_template == 0) {
7113761620bSozaki-r 		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
7123761620bSozaki-r 		so = NULL;
7133761620bSozaki-r 		m_freem(m);
7143761620bSozaki-r 		goto abort;
7153761620bSozaki-r 	}
7163761620bSozaki-r 
7173761620bSozaki-r 	tp->iss = sc->sc_iss;
7183761620bSozaki-r 	tp->irs = sc->sc_irs;
7193761620bSozaki-r 	tcp_sendseqinit(tp);
7203761620bSozaki-r 	tcp_rcvseqinit(tp);
7213761620bSozaki-r 	tp->t_state = TCPS_SYN_RECEIVED;
7223761620bSozaki-r 	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
7233761620bSozaki-r 	TCP_STATINC(TCP_STAT_ACCEPTS);
7243761620bSozaki-r 
7253761620bSozaki-r 	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
7263761620bSozaki-r 		tp->t_flags |= TF_WILL_SACK;
7273761620bSozaki-r 
7283761620bSozaki-r 	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
7293761620bSozaki-r 		tp->t_flags |= TF_ECN_PERMIT;
7303761620bSozaki-r 
7313761620bSozaki-r #ifdef TCP_SIGNATURE
7323761620bSozaki-r 	if (sc->sc_flags & SCF_SIGNATURE)
7333761620bSozaki-r 		tp->t_flags |= TF_SIGNATURE;
7343761620bSozaki-r #endif
7353761620bSozaki-r 
7363761620bSozaki-r 	/* Initialize tp->t_ourmss before we deal with the peer's! */
7373761620bSozaki-r 	tp->t_ourmss = sc->sc_ourmaxseg;
7383761620bSozaki-r 	tcp_mss_from_peer(tp, sc->sc_peermaxseg);
7393761620bSozaki-r 
7403761620bSozaki-r 	/*
7413761620bSozaki-r 	 * Initialize the initial congestion window.  If we
7423761620bSozaki-r 	 * had to retransmit the SYN,ACK, we must initialize cwnd
7433761620bSozaki-r 	 * to 1 segment (i.e. the Loss Window).
7443761620bSozaki-r 	 */
7453761620bSozaki-r 	if (sc->sc_rxtshift)
7463761620bSozaki-r 		tp->snd_cwnd = tp->t_peermss;
7473761620bSozaki-r 	else {
7483761620bSozaki-r 		int ss = tcp_init_win;
749a071c829Sozaki-r 		if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
7503761620bSozaki-r 			ss = tcp_init_win_local;
7513761620bSozaki-r #ifdef INET6
752a071c829Sozaki-r 		else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
7533761620bSozaki-r 			ss = tcp_init_win_local;
7543761620bSozaki-r #endif
7553761620bSozaki-r 		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
7563761620bSozaki-r 	}
7573761620bSozaki-r 
7583761620bSozaki-r 	tcp_rmx_rtt(tp);
7593761620bSozaki-r 	tp->snd_wl1 = sc->sc_irs;
7603761620bSozaki-r 	tp->rcv_up = sc->sc_irs + 1;
7613761620bSozaki-r 
7623761620bSozaki-r 	/*
7633761620bSozaki-r 	 * This is what would have happened in tcp_output() when
7643761620bSozaki-r 	 * the SYN,ACK was sent.
7653761620bSozaki-r 	 */
7663761620bSozaki-r 	tp->snd_up = tp->snd_una;
7673761620bSozaki-r 	tp->snd_max = tp->snd_nxt = tp->iss+1;
7683761620bSozaki-r 	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
7693761620bSozaki-r 	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
7703761620bSozaki-r 		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
7713761620bSozaki-r 	tp->last_ack_sent = tp->rcv_nxt;
7723761620bSozaki-r 	tp->t_partialacks = -1;
7733761620bSozaki-r 	tp->t_dupacks = 0;
7743761620bSozaki-r 
7753761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_COMPLETED);
7763761620bSozaki-r 	s = splsoftnet();
7773761620bSozaki-r 	syn_cache_put(sc);
7783761620bSozaki-r 	splx(s);
7793761620bSozaki-r 	return so;
7803761620bSozaki-r 
7813761620bSozaki-r resetandabort:
7823761620bSozaki-r 	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
7833761620bSozaki-r abort:
7843761620bSozaki-r 	if (so != NULL) {
7853761620bSozaki-r 		(void) soqremque(so, 1);
7863761620bSozaki-r 		(void) soabort(so);
7873761620bSozaki-r 		mutex_enter(softnet_lock);
7883761620bSozaki-r 	}
7893761620bSozaki-r 	s = splsoftnet();
7903761620bSozaki-r 	syn_cache_put(sc);
7913761620bSozaki-r 	splx(s);
7923761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_ABORTED);
7933761620bSozaki-r 	return ((struct socket *)(-1));
7943761620bSozaki-r }
7953761620bSozaki-r 
7963761620bSozaki-r /*
7973761620bSozaki-r  * This function is called when we get a RST for a
7983761620bSozaki-r  * non-existent connection, so that we can see if the
7993761620bSozaki-r  * connection is in the syn cache.  If it is, zap it.
8003761620bSozaki-r  */
8013761620bSozaki-r 
8023761620bSozaki-r void
syn_cache_reset(struct sockaddr * src,struct sockaddr * dst,struct tcphdr * th)8033761620bSozaki-r syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
8043761620bSozaki-r {
8053761620bSozaki-r 	struct syn_cache *sc;
8063761620bSozaki-r 	struct syn_cache_head *scp;
8073761620bSozaki-r 	int s = splsoftnet();
8083761620bSozaki-r 
8093761620bSozaki-r 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
8103761620bSozaki-r 		splx(s);
8113761620bSozaki-r 		return;
8123761620bSozaki-r 	}
8133761620bSozaki-r 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
8143761620bSozaki-r 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
8153761620bSozaki-r 		splx(s);
8163761620bSozaki-r 		return;
8173761620bSozaki-r 	}
8183761620bSozaki-r 	syn_cache_rm(sc);
8193761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_RESET);
8203761620bSozaki-r 	syn_cache_put(sc);	/* calls pool_put but see spl above */
8213761620bSozaki-r 	splx(s);
8223761620bSozaki-r }
8233761620bSozaki-r 
8243761620bSozaki-r void
syn_cache_unreach(const struct sockaddr * src,const struct sockaddr * dst,struct tcphdr * th)8253761620bSozaki-r syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
8263761620bSozaki-r     struct tcphdr *th)
8273761620bSozaki-r {
8283761620bSozaki-r 	struct syn_cache *sc;
8293761620bSozaki-r 	struct syn_cache_head *scp;
8303761620bSozaki-r 	int s;
8313761620bSozaki-r 
8323761620bSozaki-r 	s = splsoftnet();
8333761620bSozaki-r 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
8343761620bSozaki-r 		splx(s);
8353761620bSozaki-r 		return;
8363761620bSozaki-r 	}
8373761620bSozaki-r 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
8383761620bSozaki-r 	if (ntohl(th->th_seq) != sc->sc_iss) {
8393761620bSozaki-r 		splx(s);
8403761620bSozaki-r 		return;
8413761620bSozaki-r 	}
8423761620bSozaki-r 
8433761620bSozaki-r 	/*
8443761620bSozaki-r 	 * If we've retransmitted 3 times and this is our second error,
8453761620bSozaki-r 	 * we remove the entry.  Otherwise, we allow it to continue on.
8463761620bSozaki-r 	 * This prevents us from incorrectly nuking an entry during a
8473761620bSozaki-r 	 * spurious network outage.
8483761620bSozaki-r 	 *
8493761620bSozaki-r 	 * See tcp_notify().
8503761620bSozaki-r 	 */
8513761620bSozaki-r 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
8523761620bSozaki-r 		sc->sc_flags |= SCF_UNREACH;
8533761620bSozaki-r 		splx(s);
8543761620bSozaki-r 		return;
8553761620bSozaki-r 	}
8563761620bSozaki-r 
8573761620bSozaki-r 	syn_cache_rm(sc);
8583761620bSozaki-r 	TCP_STATINC(TCP_STAT_SC_UNREACH);
8593761620bSozaki-r 	syn_cache_put(sc);	/* calls pool_put but see spl above */
8603761620bSozaki-r 	splx(s);
8613761620bSozaki-r }
8623761620bSozaki-r 
8633761620bSozaki-r /*
8643761620bSozaki-r  * Given a LISTEN socket and an inbound SYN request, add this to the syn
8653761620bSozaki-r  * cache, and send back a segment:
8663761620bSozaki-r  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
8673761620bSozaki-r  * to the source.
8683761620bSozaki-r  *
8693761620bSozaki-r  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
8703761620bSozaki-r  * Doing so would require that we hold onto the data and deliver it
8713761620bSozaki-r  * to the application.  However, if we are the target of a SYN-flood
8723761620bSozaki-r  * DoS attack, an attacker could send data which would eventually
8733761620bSozaki-r  * consume all available buffer space if it were ACKed.  By not ACKing
8743761620bSozaki-r  * the data, we avoid this DoS scenario.
8753761620bSozaki-r  */
8763761620bSozaki-r int
syn_cache_add(struct sockaddr * src,struct sockaddr * dst,struct tcphdr * th,unsigned int toff,struct socket * so,struct mbuf * m,u_char * optp,int optlen,struct tcp_opt_info * oi)8773761620bSozaki-r syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
8783761620bSozaki-r     unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
8793761620bSozaki-r     int optlen, struct tcp_opt_info *oi)
8803761620bSozaki-r {
8813761620bSozaki-r 	struct tcpcb tb, *tp;
8823761620bSozaki-r 	long win;
8833761620bSozaki-r 	struct syn_cache *sc;
8843761620bSozaki-r 	struct syn_cache_head *scp;
8853761620bSozaki-r 	struct mbuf *ipopts;
8863761620bSozaki-r 	int s;
8873761620bSozaki-r 
8883761620bSozaki-r 	tp = sototcpcb(so);
8893761620bSozaki-r 
8903761620bSozaki-r 	/*
8913761620bSozaki-r 	 * Initialize some local state.
8923761620bSozaki-r 	 */
8933761620bSozaki-r 	win = sbspace(&so->so_rcv);
8943761620bSozaki-r 	if (win > TCP_MAXWIN)
8953761620bSozaki-r 		win = TCP_MAXWIN;
8963761620bSozaki-r 
8973761620bSozaki-r #ifdef TCP_SIGNATURE
8983761620bSozaki-r 	if (optp || (tp->t_flags & TF_SIGNATURE))
8993761620bSozaki-r #else
9003761620bSozaki-r 	if (optp)
9013761620bSozaki-r #endif
9023761620bSozaki-r 	{
9033761620bSozaki-r 		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
9043761620bSozaki-r #ifdef TCP_SIGNATURE
9053761620bSozaki-r 		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
9063761620bSozaki-r #endif
9073761620bSozaki-r 		tb.t_state = TCPS_LISTEN;
9083761620bSozaki-r 		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
9093761620bSozaki-r 			return 0;
9103761620bSozaki-r 	} else
9113761620bSozaki-r 		tb.t_flags = 0;
9123761620bSozaki-r 
9133761620bSozaki-r 	switch (src->sa_family) {
9143761620bSozaki-r 	case AF_INET:
9153761620bSozaki-r 		/* Remember the IP options, if any. */
9163761620bSozaki-r 		ipopts = ip_srcroute(m);
9173761620bSozaki-r 		break;
9183761620bSozaki-r 	default:
9193761620bSozaki-r 		ipopts = NULL;
9203761620bSozaki-r 	}
9213761620bSozaki-r 
9223761620bSozaki-r 	/*
9233761620bSozaki-r 	 * See if we already have an entry for this connection.
9243761620bSozaki-r 	 * If we do, resend the SYN,ACK.  We do not count this
9253761620bSozaki-r 	 * as a retransmission (XXX though maybe we should).
9263761620bSozaki-r 	 */
9273761620bSozaki-r 	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
9283761620bSozaki-r 		TCP_STATINC(TCP_STAT_SC_DUPESYN);
9293761620bSozaki-r 		if (ipopts) {
9303761620bSozaki-r 			/*
9313761620bSozaki-r 			 * If we were remembering a previous source route,
9323761620bSozaki-r 			 * forget it and use the new one we've been given.
9333761620bSozaki-r 			 */
9343761620bSozaki-r 			if (sc->sc_ipopts)
9353761620bSozaki-r 				(void)m_free(sc->sc_ipopts);
9363761620bSozaki-r 			sc->sc_ipopts = ipopts;
9373761620bSozaki-r 		}
9383761620bSozaki-r 		sc->sc_timestamp = tb.ts_recent;
9393761620bSozaki-r 		m_freem(m);
9403761620bSozaki-r 		if (syn_cache_respond(sc) == 0) {
941*023842ddSriastradh 			net_stat_ref_t tcps = TCP_STAT_GETREF();
942*023842ddSriastradh 			_NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
943*023842ddSriastradh 			_NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
9443761620bSozaki-r 			TCP_STAT_PUTREF();
9453761620bSozaki-r 		}
9463761620bSozaki-r 		return 1;
9473761620bSozaki-r 	}
9483761620bSozaki-r 
9493761620bSozaki-r 	s = splsoftnet();
9503761620bSozaki-r 	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
9513761620bSozaki-r 	splx(s);
9523761620bSozaki-r 	if (sc == NULL) {
9533761620bSozaki-r 		if (ipopts)
9543761620bSozaki-r 			(void)m_free(ipopts);
9553761620bSozaki-r 		return 0;
9563761620bSozaki-r 	}
9573761620bSozaki-r 
9583761620bSozaki-r 	/*
9593761620bSozaki-r 	 * Fill in the cache, and put the necessary IP and TCP
9603761620bSozaki-r 	 * options into the reply.
9613761620bSozaki-r 	 */
9623761620bSozaki-r 	memset(sc, 0, sizeof(struct syn_cache));
9633761620bSozaki-r 	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
9643761620bSozaki-r 	memcpy(&sc->sc_src, src, src->sa_len);
9653761620bSozaki-r 	memcpy(&sc->sc_dst, dst, dst->sa_len);
9663761620bSozaki-r 	sc->sc_flags = 0;
9673761620bSozaki-r 	sc->sc_ipopts = ipopts;
9683761620bSozaki-r 	sc->sc_irs = th->th_seq;
9693761620bSozaki-r 	switch (src->sa_family) {
9703761620bSozaki-r 	case AF_INET:
9713761620bSozaki-r 	    {
9723761620bSozaki-r 		struct sockaddr_in *srcin = (void *)src;
9733761620bSozaki-r 		struct sockaddr_in *dstin = (void *)dst;
9743761620bSozaki-r 
9753761620bSozaki-r 		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
9763761620bSozaki-r 		    &srcin->sin_addr, dstin->sin_port,
9773761620bSozaki-r 		    srcin->sin_port, sizeof(dstin->sin_addr));
9783761620bSozaki-r 		break;
9793761620bSozaki-r 	    }
9803761620bSozaki-r #ifdef INET6
9813761620bSozaki-r 	case AF_INET6:
9823761620bSozaki-r 	    {
9833761620bSozaki-r 		struct sockaddr_in6 *srcin6 = (void *)src;
9843761620bSozaki-r 		struct sockaddr_in6 *dstin6 = (void *)dst;
9853761620bSozaki-r 
9863761620bSozaki-r 		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
9873761620bSozaki-r 		    &srcin6->sin6_addr, dstin6->sin6_port,
9883761620bSozaki-r 		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
9893761620bSozaki-r 		break;
9903761620bSozaki-r 	    }
9913761620bSozaki-r #endif
9923761620bSozaki-r 	}
9933761620bSozaki-r 	sc->sc_peermaxseg = oi->maxseg;
9943761620bSozaki-r 	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
9953761620bSozaki-r 	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
9963761620bSozaki-r 	sc->sc_win = win;
9973761620bSozaki-r 	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
9983761620bSozaki-r 	sc->sc_timestamp = tb.ts_recent;
9993761620bSozaki-r 	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
10003761620bSozaki-r 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
10013761620bSozaki-r 		sc->sc_flags |= SCF_TIMESTAMP;
10023761620bSozaki-r 	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
10033761620bSozaki-r 	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
10043761620bSozaki-r 		sc->sc_requested_s_scale = tb.requested_s_scale;
10053761620bSozaki-r 		sc->sc_request_r_scale = 0;
10063761620bSozaki-r 		/*
10073761620bSozaki-r 		 * Pick the smallest possible scaling factor that
10083761620bSozaki-r 		 * will still allow us to scale up to sb_max.
10093761620bSozaki-r 		 *
10103761620bSozaki-r 		 * We do this because there are broken firewalls that
10113761620bSozaki-r 		 * will corrupt the window scale option, leading to
10123761620bSozaki-r 		 * the other endpoint believing that our advertised
10133761620bSozaki-r 		 * window is unscaled.  At scale factors larger than
10143761620bSozaki-r 		 * 5 the unscaled window will drop below 1500 bytes,
10153761620bSozaki-r 		 * leading to serious problems when traversing these
10163761620bSozaki-r 		 * broken firewalls.
10173761620bSozaki-r 		 *
10183761620bSozaki-r 		 * With the default sbmax of 256K, a scale factor
10193761620bSozaki-r 		 * of 3 will be chosen by this algorithm.  Those who
10203761620bSozaki-r 		 * choose a larger sbmax should watch out
10213761620bSozaki-r 		 * for the compatibility problems mentioned above.
10223761620bSozaki-r 		 *
10233761620bSozaki-r 		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
10243761620bSozaki-r 		 * or <SYN,ACK>) segment itself is never scaled.
10253761620bSozaki-r 		 */
10263761620bSozaki-r 		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
10273761620bSozaki-r 		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
10283761620bSozaki-r 			sc->sc_request_r_scale++;
10293761620bSozaki-r 	} else {
10303761620bSozaki-r 		sc->sc_requested_s_scale = 15;
10313761620bSozaki-r 		sc->sc_request_r_scale = 15;
10323761620bSozaki-r 	}
10333761620bSozaki-r 	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
10343761620bSozaki-r 		sc->sc_flags |= SCF_SACK_PERMIT;
10353761620bSozaki-r 
10363761620bSozaki-r 	/*
10373761620bSozaki-r 	 * ECN setup packet received.
10383761620bSozaki-r 	 */
10393761620bSozaki-r 	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
10403761620bSozaki-r 		sc->sc_flags |= SCF_ECN_PERMIT;
10413761620bSozaki-r 
10423761620bSozaki-r #ifdef TCP_SIGNATURE
10433761620bSozaki-r 	if (tb.t_flags & TF_SIGNATURE)
10443761620bSozaki-r 		sc->sc_flags |= SCF_SIGNATURE;
10453761620bSozaki-r #endif
10463761620bSozaki-r 	sc->sc_tp = tp;
10473761620bSozaki-r 	m_freem(m);
10483761620bSozaki-r 	if (syn_cache_respond(sc) == 0) {
1049*023842ddSriastradh 		net_stat_ref_t tcps = TCP_STAT_GETREF();
1050*023842ddSriastradh 		_NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
1051*023842ddSriastradh 		_NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
10523761620bSozaki-r 		TCP_STAT_PUTREF();
10533761620bSozaki-r 		syn_cache_insert(sc, tp);
10543761620bSozaki-r 	} else {
10553761620bSozaki-r 		s = splsoftnet();
10563761620bSozaki-r 		/*
10573761620bSozaki-r 		 * syn_cache_put() will try to schedule the timer, so
10583761620bSozaki-r 		 * we need to initialize it
10593761620bSozaki-r 		 */
10603761620bSozaki-r 		syn_cache_timer_arm(sc);
10613761620bSozaki-r 		syn_cache_put(sc);
10623761620bSozaki-r 		splx(s);
10633761620bSozaki-r 		TCP_STATINC(TCP_STAT_SC_DROPPED);
10643761620bSozaki-r 	}
10653761620bSozaki-r 	return 1;
10663761620bSozaki-r }
10673761620bSozaki-r 
10683761620bSozaki-r /*
10693761620bSozaki-r  * syn_cache_respond: (re)send SYN+ACK.
10703761620bSozaki-r  *
10713761620bSozaki-r  * Returns 0 on success.
10723761620bSozaki-r  */
10733761620bSozaki-r 
10749815eb9bSozaki-r static int
syn_cache_respond(struct syn_cache * sc)10753761620bSozaki-r syn_cache_respond(struct syn_cache *sc)
10763761620bSozaki-r {
10773761620bSozaki-r #ifdef INET6
10783761620bSozaki-r 	struct rtentry *rt = NULL;
10793761620bSozaki-r #endif
10803761620bSozaki-r 	struct route *ro;
10813761620bSozaki-r 	u_int8_t *optp;
10823761620bSozaki-r 	int optlen, error;
10833761620bSozaki-r 	u_int16_t tlen;
10843761620bSozaki-r 	struct ip *ip = NULL;
10853761620bSozaki-r #ifdef INET6
10863761620bSozaki-r 	struct ip6_hdr *ip6 = NULL;
10873761620bSozaki-r #endif
10883761620bSozaki-r 	struct tcpcb *tp;
10893761620bSozaki-r 	struct tcphdr *th;
10903761620bSozaki-r 	struct mbuf *m;
10913761620bSozaki-r 	u_int hlen;
10923761620bSozaki-r #ifdef TCP_SIGNATURE
10933761620bSozaki-r 	struct secasvar *sav = NULL;
10943761620bSozaki-r 	u_int8_t *sigp = NULL;
10953761620bSozaki-r #endif
10963761620bSozaki-r 
10973761620bSozaki-r 	ro = &sc->sc_route;
10983761620bSozaki-r 	switch (sc->sc_src.sa.sa_family) {
10993761620bSozaki-r 	case AF_INET:
11003761620bSozaki-r 		hlen = sizeof(struct ip);
11013761620bSozaki-r 		break;
11023761620bSozaki-r #ifdef INET6
11033761620bSozaki-r 	case AF_INET6:
11043761620bSozaki-r 		hlen = sizeof(struct ip6_hdr);
11053761620bSozaki-r 		break;
11063761620bSozaki-r #endif
11073761620bSozaki-r 	default:
11083761620bSozaki-r 		return EAFNOSUPPORT;
11093761620bSozaki-r 	}
11103761620bSozaki-r 
11113761620bSozaki-r 	/* Worst case scenario, since we don't know the option size yet. */
11123761620bSozaki-r 	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
11133761620bSozaki-r 	KASSERT(max_linkhdr + tlen <= MCLBYTES);
11143761620bSozaki-r 
11153761620bSozaki-r 	/*
11163761620bSozaki-r 	 * Create the IP+TCP header from scratch.
11173761620bSozaki-r 	 */
11183761620bSozaki-r 	MGETHDR(m, M_DONTWAIT, MT_DATA);
11193761620bSozaki-r 	if (m && (max_linkhdr + tlen) > MHLEN) {
11203761620bSozaki-r 		MCLGET(m, M_DONTWAIT);
11213761620bSozaki-r 		if ((m->m_flags & M_EXT) == 0) {
11223761620bSozaki-r 			m_freem(m);
11233761620bSozaki-r 			m = NULL;
11243761620bSozaki-r 		}
11253761620bSozaki-r 	}
11263761620bSozaki-r 	if (m == NULL)
11273761620bSozaki-r 		return ENOBUFS;
11283761620bSozaki-r 	MCLAIM(m, &tcp_tx_mowner);
11293761620bSozaki-r 
11303761620bSozaki-r 	tp = sc->sc_tp;
11313761620bSozaki-r 
11323761620bSozaki-r 	/* Fixup the mbuf. */
11333761620bSozaki-r 	m->m_data += max_linkhdr;
11343761620bSozaki-r 	m_reset_rcvif(m);
11353761620bSozaki-r 	memset(mtod(m, void *), 0, tlen);
11363761620bSozaki-r 
11373761620bSozaki-r 	switch (sc->sc_src.sa.sa_family) {
11383761620bSozaki-r 	case AF_INET:
11393761620bSozaki-r 		ip = mtod(m, struct ip *);
11403761620bSozaki-r 		ip->ip_v = 4;
11413761620bSozaki-r 		ip->ip_dst = sc->sc_src.sin.sin_addr;
11423761620bSozaki-r 		ip->ip_src = sc->sc_dst.sin.sin_addr;
11433761620bSozaki-r 		ip->ip_p = IPPROTO_TCP;
11443761620bSozaki-r 		th = (struct tcphdr *)(ip + 1);
11453761620bSozaki-r 		th->th_dport = sc->sc_src.sin.sin_port;
11463761620bSozaki-r 		th->th_sport = sc->sc_dst.sin.sin_port;
11473761620bSozaki-r 		break;
11483761620bSozaki-r #ifdef INET6
11493761620bSozaki-r 	case AF_INET6:
11503761620bSozaki-r 		ip6 = mtod(m, struct ip6_hdr *);
11513761620bSozaki-r 		ip6->ip6_vfc = IPV6_VERSION;
11523761620bSozaki-r 		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
11533761620bSozaki-r 		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
11543761620bSozaki-r 		ip6->ip6_nxt = IPPROTO_TCP;
11553761620bSozaki-r 		/* ip6_plen will be updated in ip6_output() */
11563761620bSozaki-r 		th = (struct tcphdr *)(ip6 + 1);
11573761620bSozaki-r 		th->th_dport = sc->sc_src.sin6.sin6_port;
11583761620bSozaki-r 		th->th_sport = sc->sc_dst.sin6.sin6_port;
11593761620bSozaki-r 		break;
11603761620bSozaki-r #endif
11613761620bSozaki-r 	default:
11623761620bSozaki-r 		panic("%s: impossible (1)", __func__);
11633761620bSozaki-r 	}
11643761620bSozaki-r 
11653761620bSozaki-r 	th->th_seq = htonl(sc->sc_iss);
11663761620bSozaki-r 	th->th_ack = htonl(sc->sc_irs + 1);
11673761620bSozaki-r 	th->th_flags = TH_SYN|TH_ACK;
11683761620bSozaki-r 	th->th_win = htons(sc->sc_win);
11693761620bSozaki-r 	/* th_x2, th_sum, th_urp already 0 from memset */
11703761620bSozaki-r 
11713761620bSozaki-r 	/* Tack on the TCP options. */
11723761620bSozaki-r 	optp = (u_int8_t *)(th + 1);
11733761620bSozaki-r 	optlen = 0;
11743761620bSozaki-r 	*optp++ = TCPOPT_MAXSEG;
11753761620bSozaki-r 	*optp++ = TCPOLEN_MAXSEG;
11763761620bSozaki-r 	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
11773761620bSozaki-r 	*optp++ = sc->sc_ourmaxseg & 0xff;
11783761620bSozaki-r 	optlen += TCPOLEN_MAXSEG;
11793761620bSozaki-r 
11803761620bSozaki-r 	if (sc->sc_request_r_scale != 15) {
11813761620bSozaki-r 		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
11823761620bSozaki-r 		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
11833761620bSozaki-r 		    sc->sc_request_r_scale);
11843761620bSozaki-r 		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
11853761620bSozaki-r 		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
11863761620bSozaki-r 	}
11873761620bSozaki-r 
11883761620bSozaki-r 	if (sc->sc_flags & SCF_SACK_PERMIT) {
11893761620bSozaki-r 		/* Let the peer know that we will SACK. */
11903761620bSozaki-r 		*optp++ = TCPOPT_SACK_PERMITTED;
11913761620bSozaki-r 		*optp++ = TCPOLEN_SACK_PERMITTED;
11923761620bSozaki-r 		optlen += TCPOLEN_SACK_PERMITTED;
11933761620bSozaki-r 	}
11943761620bSozaki-r 
11953761620bSozaki-r 	if (sc->sc_flags & SCF_TIMESTAMP) {
11963761620bSozaki-r 		while (optlen % 4 != 2) {
11973761620bSozaki-r 			optlen += TCPOLEN_NOP;
11983761620bSozaki-r 			*optp++ = TCPOPT_NOP;
11993761620bSozaki-r 		}
12003761620bSozaki-r 		*optp++ = TCPOPT_TIMESTAMP;
12013761620bSozaki-r 		*optp++ = TCPOLEN_TIMESTAMP;
12023761620bSozaki-r 		u_int32_t *lp = (u_int32_t *)(optp);
12033761620bSozaki-r 		/* Form timestamp option as shown in appendix A of RFC 1323. */
12043761620bSozaki-r 		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
12053761620bSozaki-r 		*lp   = htonl(sc->sc_timestamp);
12063761620bSozaki-r 		optp += TCPOLEN_TIMESTAMP - 2;
12073761620bSozaki-r 		optlen += TCPOLEN_TIMESTAMP;
12083761620bSozaki-r 	}
12093761620bSozaki-r 
12103761620bSozaki-r #ifdef TCP_SIGNATURE
12113761620bSozaki-r 	if (sc->sc_flags & SCF_SIGNATURE) {
12123761620bSozaki-r 		sav = tcp_signature_getsav(m);
12133761620bSozaki-r 		if (sav == NULL) {
12143761620bSozaki-r 			m_freem(m);
12153761620bSozaki-r 			return EPERM;
12163761620bSozaki-r 		}
12173761620bSozaki-r 
12183761620bSozaki-r 		*optp++ = TCPOPT_SIGNATURE;
12193761620bSozaki-r 		*optp++ = TCPOLEN_SIGNATURE;
12203761620bSozaki-r 		sigp = optp;
12213761620bSozaki-r 		memset(optp, 0, TCP_SIGLEN);
12223761620bSozaki-r 		optp += TCP_SIGLEN;
12233761620bSozaki-r 		optlen += TCPOLEN_SIGNATURE;
12243761620bSozaki-r 	}
12253761620bSozaki-r #endif
12263761620bSozaki-r 
12273761620bSozaki-r 	/*
12283761620bSozaki-r 	 * Terminate and pad TCP options to a 4 byte boundary.
12293761620bSozaki-r 	 *
12303761620bSozaki-r 	 * According to RFC793: "The content of the header beyond the
12313761620bSozaki-r 	 * End-of-Option option must be header padding (i.e., zero)."
12323761620bSozaki-r 	 * And later: "The padding is composed of zeros."
12333761620bSozaki-r 	 */
12343761620bSozaki-r 	if (optlen % 4) {
12353761620bSozaki-r 		optlen += TCPOLEN_EOL;
12363761620bSozaki-r 		*optp++ = TCPOPT_EOL;
12373761620bSozaki-r 	}
12383761620bSozaki-r 	while (optlen % 4) {
12393761620bSozaki-r 		optlen += TCPOLEN_PAD;
12403761620bSozaki-r 		*optp++ = TCPOPT_PAD;
12413761620bSozaki-r 	}
12423761620bSozaki-r 
12433761620bSozaki-r 	/* Compute the actual values now that we've added the options. */
12443761620bSozaki-r 	tlen = hlen + sizeof(struct tcphdr) + optlen;
12453761620bSozaki-r 	m->m_len = m->m_pkthdr.len = tlen;
12463761620bSozaki-r 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
12473761620bSozaki-r 
12483761620bSozaki-r #ifdef TCP_SIGNATURE
12493761620bSozaki-r 	if (sav) {
12503761620bSozaki-r 		(void)tcp_signature(m, th, hlen, sav, sigp);
12513761620bSozaki-r 		key_sa_recordxfer(sav, m);
12523761620bSozaki-r 		KEY_SA_UNREF(&sav);
12533761620bSozaki-r 	}
12543761620bSozaki-r #endif
12553761620bSozaki-r 
12563761620bSozaki-r 	/*
12573761620bSozaki-r 	 * Send ECN SYN-ACK setup packet.
12583761620bSozaki-r 	 * Routes can be asymmetric, so, even if we receive a packet
12593761620bSozaki-r 	 * with ECE and CWR set, we must not assume no one will block
12603761620bSozaki-r 	 * the ECE packet we are about to send.
12613761620bSozaki-r 	 */
12623761620bSozaki-r 	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
12633761620bSozaki-r 	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
12643761620bSozaki-r 		th->th_flags |= TH_ECE;
12653761620bSozaki-r 		TCP_STATINC(TCP_STAT_ECN_SHS);
12663761620bSozaki-r 
12673761620bSozaki-r 		/*
12683761620bSozaki-r 		 * draft-ietf-tcpm-ecnsyn-00.txt
12693761620bSozaki-r 		 *
12703761620bSozaki-r 		 * "[...] a TCP node MAY respond to an ECN-setup
12713761620bSozaki-r 		 * SYN packet by setting ECT in the responding
12723761620bSozaki-r 		 * ECN-setup SYN/ACK packet, indicating to routers
12733761620bSozaki-r 		 * that the SYN/ACK packet is ECN-Capable.
12743761620bSozaki-r 		 * This allows a congested router along the path
12753761620bSozaki-r 		 * to mark the packet instead of dropping the
12763761620bSozaki-r 		 * packet as an indication of congestion."
12773761620bSozaki-r 		 *
12783761620bSozaki-r 		 * "[...] There can be a great benefit in setting
12793761620bSozaki-r 		 * an ECN-capable codepoint in SYN/ACK packets [...]
12803761620bSozaki-r 		 * Congestion is  most likely to occur in
12813761620bSozaki-r 		 * the server-to-client direction.  As a result,
12823761620bSozaki-r 		 * setting an ECN-capable codepoint in SYN/ACK
12833761620bSozaki-r 		 * packets can reduce the occurrence of three-second
12843761620bSozaki-r 		 * retransmit timeouts resulting from the drop
12853761620bSozaki-r 		 * of SYN/ACK packets."
12863761620bSozaki-r 		 *
12873761620bSozaki-r 		 * Page 4 and 6, January 2006.
12883761620bSozaki-r 		 */
12893761620bSozaki-r 
12903761620bSozaki-r 		switch (sc->sc_src.sa.sa_family) {
12913761620bSozaki-r 		case AF_INET:
12923761620bSozaki-r 			ip->ip_tos |= IPTOS_ECN_ECT0;
12933761620bSozaki-r 			break;
12943761620bSozaki-r #ifdef INET6
12953761620bSozaki-r 		case AF_INET6:
12963761620bSozaki-r 			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
12973761620bSozaki-r 			break;
12983761620bSozaki-r #endif
12993761620bSozaki-r 		}
13003761620bSozaki-r 		TCP_STATINC(TCP_STAT_ECN_ECT);
13013761620bSozaki-r 	}
13023761620bSozaki-r 
13033761620bSozaki-r 
13043761620bSozaki-r 	/*
13053761620bSozaki-r 	 * Compute the packet's checksum.
13063761620bSozaki-r 	 *
13073761620bSozaki-r 	 * Fill in some straggling IP bits.  Note that ip_len is stored
13083761620bSozaki-r 	 * in network byte order (hence the htons() below).
13093761620bSozaki-r 	 */
13103761620bSozaki-r 	switch (sc->sc_src.sa.sa_family) {
13113761620bSozaki-r 	case AF_INET:
13123761620bSozaki-r 		ip->ip_len = htons(tlen - hlen);
13133761620bSozaki-r 		th->th_sum = 0;
13143761620bSozaki-r 		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
13153761620bSozaki-r 		ip->ip_len = htons(tlen);
13163761620bSozaki-r 		ip->ip_ttl = ip_defttl;
13173761620bSozaki-r 		/* XXX tos? */
13183761620bSozaki-r 		break;
13193761620bSozaki-r #ifdef INET6
13203761620bSozaki-r 	case AF_INET6:
13213761620bSozaki-r 		ip6->ip6_plen = htons(tlen - hlen);
13223761620bSozaki-r 		th->th_sum = 0;
13233761620bSozaki-r 		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
13243761620bSozaki-r 		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
13253761620bSozaki-r 		ip6->ip6_vfc |= IPV6_VERSION;
13263761620bSozaki-r 		ip6->ip6_plen = htons(tlen - hlen);
13273761620bSozaki-r 		/* ip6_hlim will be initialized afterwards */
13283761620bSozaki-r 		/* XXX flowlabel? */
13293761620bSozaki-r 		break;
13303761620bSozaki-r #endif
13313761620bSozaki-r 	}
13323761620bSozaki-r 
13333761620bSozaki-r 	/* XXX use IPsec policy on listening socket, on SYN ACK */
13343761620bSozaki-r 	tp = sc->sc_tp;
13353761620bSozaki-r 
13363761620bSozaki-r 	switch (sc->sc_src.sa.sa_family) {
13373761620bSozaki-r 	case AF_INET:
13383761620bSozaki-r 		error = ip_output(m, sc->sc_ipopts, ro,
13393761620bSozaki-r 		    (ip_mtudisc ? IP_MTUDISC : 0),
13403761620bSozaki-r 		    NULL, tp ? tp->t_inpcb : NULL);
13413761620bSozaki-r 		break;
13423761620bSozaki-r #ifdef INET6
13433761620bSozaki-r 	case AF_INET6:
1344b000e63fSozaki-r 		ip6->ip6_hlim = in6pcb_selecthlim(NULL,
13453761620bSozaki-r 		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
13463761620bSozaki-r 		rtcache_unref(rt, ro);
13473761620bSozaki-r 
13483761620bSozaki-r 		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
13490e390eeeSozaki-r 		    tp ? tp->t_inpcb : NULL, NULL);
13503761620bSozaki-r 		break;
13513761620bSozaki-r #endif
13523761620bSozaki-r 	default:
13533761620bSozaki-r 		panic("%s: impossible (2)", __func__);
13543761620bSozaki-r 	}
13553761620bSozaki-r 
13563761620bSozaki-r 	return error;
13573761620bSozaki-r }
1358