xref: /netbsd-src/sys/netinet/tcp_syncache.c (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1 /*	$NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $	*/
2 
3 /*
4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the project nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
34  *
35  * NRL grants permission for redistribution and use in source and binary
36  * forms, with or without modification, of the software and documentation
37  * created at NRL provided that the following conditions are met:
38  *
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgements:
46  *      This product includes software developed by the University of
47  *      California, Berkeley and its contributors.
48  *      This product includes software developed at the Information
49  *      Technology Division, US Naval Research Laboratory.
50  * 4. Neither the name of the NRL nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  *
66  * The views and conclusions contained in the software and documentation
67  * are those of the authors and should not be interpreted as representing
68  * official policies, either expressed or implied, of the US Naval
69  * Research Laboratory (NRL).
70  */
71 
72 /*-
73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
74  * 2011 The NetBSD Foundation, Inc.
75  * All rights reserved.
76  *
77  * This code is derived from software contributed to The NetBSD Foundation
78  * by Coyote Point Systems, Inc.
79  * This code is derived from software contributed to The NetBSD Foundation
80  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
81  * Facility, NASA Ames Research Center.
82  * This code is derived from software contributed to The NetBSD Foundation
83  * by Charles M. Hannum.
84  * This code is derived from software contributed to The NetBSD Foundation
85  * by Rui Paulo.
86  *
87  * Redistribution and use in source and binary forms, with or without
88  * modification, are permitted provided that the following conditions
89  * are met:
90  * 1. Redistributions of source code must retain the above copyright
91  *    notice, this list of conditions and the following disclaimer.
92  * 2. Redistributions in binary form must reproduce the above copyright
93  *    notice, this list of conditions and the following disclaimer in the
94  *    documentation and/or other materials provided with the distribution.
95  *
96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
106  * POSSIBILITY OF SUCH DAMAGE.
107  */
108 
109 /*
110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
111  *	The Regents of the University of California.  All rights reserved.
112  *
113  * Redistribution and use in source and binary forms, with or without
114  * modification, are permitted provided that the following conditions
115  * are met:
116  * 1. Redistributions of source code must retain the above copyright
117  *    notice, this list of conditions and the following disclaimer.
118  * 2. Redistributions in binary form must reproduce the above copyright
119  *    notice, this list of conditions and the following disclaimer in the
120  *    documentation and/or other materials provided with the distribution.
121  * 3. Neither the name of the University nor the names of its contributors
122  *    may be used to endorse or promote products derived from this software
123  *    without specific prior written permission.
124  *
125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
135  * SUCH DAMAGE.
136  *
137  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
138  */
139 
140 /*
141  *	TODO list for SYN cache stuff:
142  *
143  *	Find room for a "state" field, which is needed to keep a
144  *	compressed state for TIME_WAIT TCBs.  It's been noted already
145  *	that this is fairly important for very high-volume web and
146  *	mail servers, which use a large number of short-lived
147  *	connections.
148  */
149 
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $");
152 
153 #ifdef _KERNEL_OPT
154 #include "opt_inet.h"
155 #include "opt_ipsec.h"
156 #endif
157 
158 #include <sys/param.h>
159 #include <sys/systm.h>
160 #include <sys/mbuf.h>
161 #include <sys/protosw.h>
162 #include <sys/socket.h>
163 #include <sys/socketvar.h>
164 #include <sys/errno.h>
165 #include <sys/syslog.h>
166 #include <sys/pool.h>
167 #include <sys/domain.h>
168 #include <sys/kernel.h>
169 #include <sys/lwp.h> /* for lwp0 */
170 #include <sys/cprng.h>
171 
172 #include <netinet/in.h>
173 #include <netinet/ip.h>
174 #include <netinet/in_pcb.h>
175 #include <netinet/in_var.h>
176 #include <netinet/ip_var.h>
177 
178 #include <netinet/ip6.h>
179 #ifdef INET6
180 #include <netinet6/ip6_var.h>
181 #include <netinet6/in6_pcb.h>
182 #include <netinet6/ip6_var.h>
183 #include <netinet6/in6_var.h>
184 #endif
185 
186 #include <netinet/tcp.h>
187 #include <netinet/tcp_fsm.h>
188 #include <netinet/tcp_seq.h>
189 #include <netinet/tcp_timer.h>
190 #include <netinet/tcp_var.h>
191 #include <netinet/tcp_private.h>
192 #include <netinet/tcp_syncache.h>
193 
194 #ifdef TCP_SIGNATURE
195 #ifdef IPSEC
196 #include <netipsec/ipsec.h>
197 #include <netipsec/key.h>
198 #ifdef INET6
199 #include <netipsec/ipsec6.h>
200 #endif
201 #endif	/* IPSEC*/
202 #endif
203 
/* Forward declarations for functions private to this file. */
static void	syn_cache_timer(void *);
static struct syn_cache *
		syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
		struct syn_cache_head **);
static int	syn_cache_respond(struct syn_cache *);

/* syn hash parameters */
#define	TCP_SYN_HASH_SIZE	293
#define	TCP_SYN_BUCKET_SIZE	35
/* Number of hash buckets actually in use (always TCP_SYN_HASH_SIZE here). */
static int	tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
/* Upper bound on the total number of cached entries across all buckets. */
int		tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
/* Upper bound on the number of entries in any single bucket. */
int		tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
static struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];

/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

/* Total number of entries currently in the cache (all buckets). */
u_long	syn_cache_count;
/*
 * Hash secrets, re-randomized whenever the cache drains to empty
 * (see syn_cache_insert()) to frustrate bucket-targeting attacks.
 */
static u_int32_t syn_hash1, syn_hash2;

/*
 * Hash an IPv4 source address plus the source/destination port pair,
 * keyed with the two random secrets.
 */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
		((const struct sockaddr_in *)(src))->sin_port,		\
		((const struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/*
 * IPv6 variant: mixes the first and last 32-bit words of the address;
 * masked to 31 bits.
 */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/*
 * Dispatch on address family; unknown families hash to 0 (they all
 * share bucket 0, which is harmless since lookups also compare the
 * full sockaddrs).
 */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
			((const struct sockaddr_in *)(src))->sin_port,	\
			((const struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
			((const struct sockaddr_in6 *)(src))->sin6_port,	\
			((const struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/* Backing pool for struct syn_cache allocations. */
static struct pool syn_cache_pool;
262 
263 /*
264  * We don't estimate RTT with SYNs, so each packet starts with the default
265  * RTT and each timer step has a fixed timeout value.
266  */
267 static inline void
268 syn_cache_timer_arm(struct syn_cache *sc)
269 {
270 
271 	TCPT_RANGESET(sc->sc_rxtcur,
272 	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
273 	    TCPTV_REXMTMAX);
274 	callout_reset(&sc->sc_timer,
275 	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
276 }
277 
/* Elapsed tcp_now ticks since the entry was created (sc_timebase). */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
279 
/*
 * Unlink a cache entry from both lists it sits on (the hash bucket and
 * its listener's per-tcpcb list), stop its timer, and update the bucket
 * and global counters.  Does NOT free the entry; the caller follows up
 * with syn_cache_put() (or pool_put via the timer path).
 */
static inline void
syn_cache_rm(struct syn_cache *sc)
{
	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
	    sc, sc_bucketq);
	/* Clear the back-pointer before leaving the per-tcpcb list. */
	sc->sc_tp = NULL;
	LIST_REMOVE(sc, sc_tpq);
	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
	callout_stop(&sc->sc_timer);
	syn_cache_count--;
}
291 
292 static inline void
293 syn_cache_put(struct syn_cache *sc)
294 {
295 	if (sc->sc_ipopts)
296 		(void) m_free(sc->sc_ipopts);
297 	rtcache_free(&sc->sc_route);
298 	sc->sc_flags |= SCF_DEAD;
299 	if (!callout_invoking(&sc->sc_timer))
300 		callout_schedule(&(sc)->sc_timer, 1);
301 }
302 
303 void
304 syn_cache_init(void)
305 {
306 	int i;
307 
308 	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
309 	    "synpl", NULL, IPL_SOFTNET);
310 
311 	/* Initialize the hash buckets. */
312 	for (i = 0; i < tcp_syn_cache_size; i++)
313 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
314 }
315 
/*
 * Insert a freshly built cache entry into the hash table and link it to
 * its listening tcpcb.  Enforces both the per-bucket and the global
 * size limits by evicting the oldest entry when either would overflow,
 * then arms the entry's retransmission timer.  Runs at splsoftnet.
 */
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = cprng_fast32();
		syn_hash2 = cprng_fast32();
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/*
			 * Scan forward from our bucket, wrapping at the
			 * end of the array, until we come back around
			 * to our own (empty) bucket or find a victim.
			 */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	syn_cache_timer_arm(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	TCP_STATINC(TCP_STAT_SC_ADDED);
	splx(s);
}
412 
/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire
 * that entry.
 *
 * Callout handler for a single cache entry (arg).  Also performs the
 * deferred free requested by syn_cache_put(): if the entry was marked
 * SCF_DEAD, we only destroy the callout and return it to the pool.
 */
static void
syn_cache_timer(void *arg)
{
	struct syn_cache *sc = arg;

	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	/* Acknowledge invocation so callout_invoking() reads false again. */
	callout_ack(&sc->sc_timer);

	/* Entry was released via syn_cache_put(); finish freeing it here. */
	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
		goto free;
	}

	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
		/* Drop it -- too many retransmissions. */
		goto dropit;
	}

	/*
	 * Compute the total amount of time this entry has
	 * been on a queue.  If this entry has been on longer
	 * than the keep alive timer would allow, expire it.
	 */
	sc->sc_rxttot += sc->sc_rxtcur;
	if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
		goto dropit;

	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
	(void)syn_cache_respond(sc);

	/* Advance the timer back-off. */
	sc->sc_rxtshift++;
	syn_cache_timer_arm(sc);

	goto out;

 dropit:
	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
	syn_cache_rm(sc);
	/*
	 * Free resources inline rather than via syn_cache_put(): that
	 * would reschedule this very callout.
	 */
	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);
	rtcache_free(&sc->sc_route);

 free:
	callout_destroy(&sc->sc_timer);
	pool_put(&syn_cache_pool, sc);

 out:
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
471 
472 /*
473  * Remove syn cache created by the specified tcb entry,
474  * because this does not make sense to keep them
475  * (if there's no tcb entry, syn cache entry will never be used)
476  */
477 void
478 syn_cache_cleanup(struct tcpcb *tp)
479 {
480 	struct syn_cache *sc, *nsc;
481 	int s;
482 
483 	s = splsoftnet();
484 
485 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
486 		nsc = LIST_NEXT(sc, sc_tpq);
487 
488 #ifdef DIAGNOSTIC
489 		if (sc->sc_tp != tp)
490 			panic("invalid sc_tp in syn_cache_cleanup");
491 #endif
492 		syn_cache_rm(sc);
493 		syn_cache_put(sc);	/* calls pool_put but see spl above */
494 	}
495 	/* just for safety */
496 	LIST_INIT(&tp->t_sc);
497 
498 	splx(s);
499 }
500 
501 /*
502  * Find an entry in the syn cache.
503  */
504 static struct syn_cache *
505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
506     struct syn_cache_head **headp)
507 {
508 	struct syn_cache *sc;
509 	struct syn_cache_head *scp;
510 	u_int32_t hash;
511 	int s;
512 
513 	SYN_HASHALL(hash, src, dst);
514 
515 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
516 	*headp = scp;
517 	s = splsoftnet();
518 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
519 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
520 		if (sc->sc_hash != hash)
521 			continue;
522 		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
523 		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
524 			splx(s);
525 			return (sc);
526 		}
527 	}
528 	splx(s);
529 	return (NULL);
530 }
531 
532 /*
533  * This function gets called when we receive an ACK for a socket in the
534  * LISTEN state. We look up the connection in the syn cache, and if it's
535  * there, we pull it out of the cache and turn it into a full-blown
536  * connection in the SYN-RECEIVED state.
537  *
538  * The return values may not be immediately obvious, and their effects
539  * can be subtle, so here they are:
540  *
541  *	NULL	SYN was not found in cache; caller should drop the
542  *		packet and send an RST.
543  *
544  *	-1	We were unable to create the new connection, and are
545  *		aborting it.  An ACK,RST is being sent to the peer
546  *		(unless we got screwey sequence numbers; see below),
547  *		because the 3-way handshake has been completed.  Caller
548  *		should not free the mbuf, since we may be using it.  If
549  *		we are not, we will free it.
550  *
551  *	Otherwise, the return value is a pointer to the new socket
552  *	associated with the connection.
553  */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
#ifdef INET6
	struct in6pcb *in6p = NULL;
#endif
	struct tcpcb *tp;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return NULL;
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		m_freem(m);
		(void)syn_cache_respond(sc);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	syn_cache_rm(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	oso = so;
	so = sonewconn(so, true);
	if (so == NULL)
		goto resetandabort;

	/* Pick up the new connection's PCB in family-specific form. */
	switch (so->so_proto->pr_domain->dom_family) {
	case AF_INET:
		inp = sotoinpcb(so);
		break;
#ifdef INET6
	case AF_INET6:
		in6p = sotoin6pcb(so);
		break;
#endif
	}

	/* Bind the local address/port on the new PCB. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp) {
			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute(m);
			in_pcbstate(inp, INP_BOUND);
			/* Fall back to the options cached from the SYN. */
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (in6p) {
			/* IPv4 packet to AF_INET6 socket */
			memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
			/* Build a v4-mapped address (::ffff:a.b.c.d). */
			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
				&in6p->in6p_laddr.s6_addr32[3],
				sizeof(((struct sockaddr_in *)dst)->sin_addr));
			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
			in6totcpcb(in6p)->t_family = AF_INET;
			if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
				in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
			else
				in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (in6p) {
			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
		break;
#endif
	}

#ifdef INET6
	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct in6pcb *oin6p = sotoin6pcb(oso);
		/* inherit socket options from the listening socket */
		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = NULL;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options,
		    mtod(m, struct ip6_hdr *), m);
	}
#endif

	/*
	 * Give the new socket our cached route reference.
	 */
	if (inp) {
		rtcache_copy(&inp->inp_route, &sc->sc_route);
		rtcache_free(&sc->sc_route);
	}
#ifdef INET6
	else {
		rtcache_copy(&in6p->in6p_route, &sc->sc_route);
		rtcache_free(&sc->sc_route);
	}
#endif

	/* Connect the new PCB to the peer (foreign address/port). */
	if (inp) {
		struct sockaddr_in sin;
		memcpy(&sin, src, src->sa_len);
		if (in_pcbconnect(inp, &sin, &lwp0)) {
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (in6p) {
		struct sockaddr_in6 sin6;
		memcpy(&sin6, src, src->sa_len);
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
		}
		if (in6_pcbconnect(in6p, &sin6, NULL)) {
			goto resetandabort;
		}
	}
#endif
	else {
		goto resetandabort;
	}

	/*
	 * NOTE(review): tp cannot actually be NULL below -- the
	 * inp == NULL && in6p == NULL case took resetandabort above.
	 */
	if (inp)
		tp = intotcpcb(inp);
#ifdef INET6
	else if (in6p)
		tp = in6totcpcb(in6p);
#endif
	else
		tp = NULL;

	/* Transfer negotiated options from the cache entry to the tcpcb. */
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/* request_r_scale == 15 is the sentinel for "no window scaling". */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
	tp->ts_timebase = sc->sc_timebase;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	/* Initialize sequence numbers and enter SYN_RECEIVED. */
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
	TCP_STATINC(TCP_STAT_ACCEPTS);

	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
		tp->t_flags |= TF_WILL_SACK;

	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
		tp->t_flags |= TF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else {
		int ss = tcp_init_win;
		if (inp != NULL && in_localaddr(inp->inp_faddr))
			ss = tcp_init_win_local;
#ifdef INET6
		if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
			ss = tcp_init_win_local;
#endif
		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
	}

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;

	TCP_STATINC(TCP_STAT_SC_COMPLETED);
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	return so;

resetandabort:
	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	if (so != NULL) {
		(void) soqremque(so, 1);
		(void) soabort(so);
		mutex_enter(softnet_lock);
	}
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	TCP_STATINC(TCP_STAT_SC_ABORTED);
	return ((struct socket *)(-1));
}
822 
823 /*
824  * This function is called when we get a RST for a
825  * non-existent connection, so that we can see if the
826  * connection is in the syn cache.  If it is, zap it.
827  */
828 
829 void
830 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
831 {
832 	struct syn_cache *sc;
833 	struct syn_cache_head *scp;
834 	int s = splsoftnet();
835 
836 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
837 		splx(s);
838 		return;
839 	}
840 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
841 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
842 		splx(s);
843 		return;
844 	}
845 	syn_cache_rm(sc);
846 	TCP_STATINC(TCP_STAT_SC_RESET);
847 	syn_cache_put(sc);	/* calls pool_put but see spl above */
848 	splx(s);
849 }
850 
851 void
852 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
853     struct tcphdr *th)
854 {
855 	struct syn_cache *sc;
856 	struct syn_cache_head *scp;
857 	int s;
858 
859 	s = splsoftnet();
860 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
861 		splx(s);
862 		return;
863 	}
864 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
865 	if (ntohl(th->th_seq) != sc->sc_iss) {
866 		splx(s);
867 		return;
868 	}
869 
870 	/*
871 	 * If we've retransmitted 3 times and this is our second error,
872 	 * we remove the entry.  Otherwise, we allow it to continue on.
873 	 * This prevents us from incorrectly nuking an entry during a
874 	 * spurious network outage.
875 	 *
876 	 * See tcp_notify().
877 	 */
878 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
879 		sc->sc_flags |= SCF_UNREACH;
880 		splx(s);
881 		return;
882 	}
883 
884 	syn_cache_rm(sc);
885 	TCP_STATINC(TCP_STAT_SC_UNREACH);
886 	syn_cache_put(sc);	/* calls pool_put but see spl above */
887 	splx(s);
888 }
889 
890 /*
891  * Given a LISTEN socket and an inbound SYN request, add this to the syn
892  * cache, and send back a segment:
893  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
894  * to the source.
895  *
896  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
897  * Doing so would require that we hold onto the data and deliver it
898  * to the application.  However, if we are the target of a SYN-flood
899  * DoS attack, an attacker could send data which would eventually
900  * consume all available buffer space if it were ACKed.  By not ACKing
901  * the data, we avoid this DoS scenario.
902  */
/*
 * syn_cache_add: record an incoming SYN in the syn cache and answer it
 * with a SYN,ACK, deferring full PCB/socket creation until the ACK of
 * the three-way handshake arrives.
 *
 * => src/dst: remote peer / local addresses of the connection.
 * => th: the TCP header of the received SYN.
 * => toff: offset of the TCP header within the mbuf chain m.
 * => so: the listening socket.
 * => optp/optlen: TCP options from the SYN, or optp == NULL if none.
 * => oi: parsed option info (carries the peer's advertised MSS).
 *
 * Returns 1 when the segment was consumed (entry created or duplicate
 * SYN re-answered); in that case m has been freed here.  Returns 0 on
 * failure (malformed options or out of memory); in that case m is left
 * for the caller to dispose of.
 */
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
    int optlen, struct tcp_opt_info *oi)
{
	struct tcpcb tb, *tp;	/* tb: scratch tcpcb used only to parse options */
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;	/* bucket, filled in by syn_cache_lookup() */
	struct mbuf *ipopts;
	int s;

	tp = sototcpcb(so);

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;	/* clamp to the unscaled 16-bit window limit */

	/*
	 * Parse the SYN's TCP options into the throwaway tcpcb tb, so the
	 * listener's real tcpcb is never modified.  With TCP_SIGNATURE we
	 * must run tcp_dooptions() even when no options are present, to
	 * enforce the signature requirement inherited from the listener.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
	{
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
		tb.t_state = TCPS_LISTEN;
		/* A negative return means malformed options: reject the SYN. */
		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
			return 0;
	} else
		tb.t_flags = 0;
	/*
	 * NOTE(review): on the else path only tb.t_flags is initialized;
	 * tb.ts_recent (copied into sc_timestamp below, on both the
	 * duplicate-SYN and new-entry paths) is then read uninitialized.
	 * Presumably benign because SCF_TIMESTAMP stays unset in that
	 * case so the value is never echoed -- worth confirming.
	 */

	switch (src->sa_family) {
	case AF_INET:
		/* Remember the IP options, if any. */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
		TCP_STATINC(TCP_STAT_SC_DUPESYN);
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void)m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		/* Refresh the peer's timestamp so our echo stays current. */
		sc->sc_timestamp = tb.ts_recent;
		m_freem(m);
		if (syn_cache_respond(sc) == 0) {
			uint64_t *tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_SNDACKS]++;
			tcps[TCP_STAT_SNDTOTAL]++;
			TCP_STAT_PUTREF();
		}
		return 1;
	}

	/* Allocate a fresh cache entry; the pool is protected at softnet. */
	s = splsoftnet();
	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
	splx(s);
	if (sc == NULL) {
		if (ipopts)
			(void)m_free(ipopts);
		return 0;
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memset(sc, 0, sizeof(struct syn_cache));
	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;	/* peer's initial sequence number */
	/* Generate our initial send sequence number per address family. */
	switch (src->sa_family) {
	case AF_INET:
	    {
		struct sockaddr_in *srcin = (void *)src;
		struct sockaddr_in *dstin = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
		    &srcin->sin_addr, dstin->sin_port,
		    srcin->sin_port, sizeof(dstin->sin_addr));
		break;
	    }
#ifdef INET6
	case AF_INET6:
	    {
		struct sockaddr_in6 *srcin6 = (void *)src;
		struct sockaddr_in6 *dstin6 = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
		    &srcin6->sin6_addr, dstin6->sin6_port,
		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
		break;
	    }
#endif
	}
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
	sc->sc_timestamp = tb.ts_recent;
	/* Enable timestamps only if both sides requested and received them. */
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		sc->sc_flags |= SCF_TIMESTAMP;
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		/* 15 is used as the "no window scaling" sentinel below. */
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
		sc->sc_flags |= SCF_SACK_PERMIT;

	/*
	 * ECN setup packet received.
	 */
	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
		sc->sc_flags |= SCF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	m_freem(m);
	if (syn_cache_respond(sc) == 0) {
		/* SYN,ACK sent: account for it and hash the entry in. */
		uint64_t *tcps = TCP_STAT_GETREF();
		tcps[TCP_STAT_SNDACKS]++;
		tcps[TCP_STAT_SNDTOTAL]++;
		TCP_STAT_PUTREF();
		syn_cache_insert(sc, tp);
	} else {
		s = splsoftnet();
		/*
		 * syn_cache_put() will try to schedule the timer, so
		 * we need to initialize it
		 */
		syn_cache_timer_arm(sc);
		syn_cache_put(sc);
		splx(s);
		TCP_STATINC(TCP_STAT_SC_DROPPED);
	}
	return 1;
}
1094 
/*
 * syn_cache_respond: (re)send SYN+ACK.
 *
 * Builds the reply (IPv4/IPv6 header + TCP header + options) from
 * scratch in a freshly allocated mbuf and transmits it through
 * ip_output()/ip6_output(), using the route and IP options cached
 * in the syn cache entry.
 *
 * Returns 0 on success, or an errno: EAFNOSUPPORT (unknown address
 * family), ENOBUFS (mbuf allocation failed), EPERM (no TCP-MD5 SA),
 * or whatever the output routine returned.
 */

static int
syn_cache_respond(struct syn_cache *sc)
{
#ifdef INET6
	struct rtentry *rt = NULL;
#endif
	struct route *ro;
	u_int8_t *optp;		/* write cursor into the TCP options area */
	int optlen, error;
	u_int16_t tlen;		/* total packet length (IP header included) */
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	u_int hlen;		/* IP header length for this address family */
#ifdef TCP_SIGNATURE
	struct secasvar *sav = NULL;
	u_int8_t *sigp = NULL;	/* where the TCP-MD5 digest goes, if any */
#endif

	ro = &sc->sc_route;
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		return EAFNOSUPPORT;
	}

	/* Worst case scenario, since we don't know the option size yet. */
	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
	KASSERT(max_linkhdr + tlen <= MCLBYTES);

	/*
	 * Create the IP+TCP header from scratch.
	 */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && (max_linkhdr + tlen) > MHLEN) {
		/* Doesn't fit in a header mbuf: attach a cluster. */
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return ENOBUFS;
	MCLAIM(m, &tcp_tx_mowner);

	tp = sc->sc_tp;

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;	/* leave room for the link-layer header */
	m_reset_rcvif(m);
	memset(mtod(m, void *), 0, tlen);

	/*
	 * Fill in the address-family-specific header.  The reply swaps
	 * the cached src/dst: sc_src is the remote peer, so it becomes
	 * the packet's destination.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_v = 4;
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		panic("%s: impossible (1)", __func__);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);	/* ACK the peer's SYN */
	th->th_flags = TH_SYN|TH_ACK;
	th->th_win = htons(sc->sc_win);
	/* th_x2, th_sum, th_urp already 0 from memset */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	optlen = 0;
	/* MSS option: always sent. */
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = TCPOLEN_MAXSEG;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;
	optlen += TCPOLEN_MAXSEG;

	/* Window scale (15 means scaling was not negotiated). */
	if (sc->sc_request_r_scale != 15) {
		/* Write NOP+WSCALE as one aligned 32-bit word. */
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
	}

	if (sc->sc_flags & SCF_SACK_PERMIT) {
		/* Let the peer know that we will SACK. */
		*optp++ = TCPOPT_SACK_PERMITTED;
		*optp++ = TCPOLEN_SACK_PERMITTED;
		optlen += TCPOLEN_SACK_PERMITTED;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		/*
		 * Pad so the two timestamp words end up 4-byte aligned
		 * (kind/len occupy the 2 bytes before the boundary).
		 */
		while (optlen % 4 != 2) {
			optlen += TCPOLEN_NOP;
			*optp++ = TCPOPT_NOP;
		}
		*optp++ = TCPOPT_TIMESTAMP;
		*optp++ = TCPOLEN_TIMESTAMP;
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TIMESTAMP - 2;	/* kind/len already written */
		optlen += TCPOLEN_TIMESTAMP;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		sav = tcp_signature_getsav(m);
		if (sav == NULL) {
			/* No security association: cannot sign, drop. */
			m_freem(m);
			return EPERM;
		}

		/* Reserve the digest bytes; filled in after th_off is set. */
		*optp++ = TCPOPT_SIGNATURE;
		*optp++ = TCPOLEN_SIGNATURE;
		sigp = optp;
		memset(optp, 0, TCP_SIGLEN);
		optp += TCP_SIGLEN;
		optlen += TCPOLEN_SIGNATURE;
	}
#endif

	/*
	 * Terminate and pad TCP options to a 4 byte boundary.
	 *
	 * According to RFC793: "The content of the header beyond the
	 * End-of-Option option must be header padding (i.e., zero)."
	 * And later: "The padding is composed of zeros."
	 */
	if (optlen % 4) {
		optlen += TCPOLEN_EOL;
		*optp++ = TCPOPT_EOL;
	}
	while (optlen % 4) {
		optlen += TCPOLEN_PAD;
		*optp++ = TCPOPT_PAD;
	}

	/* Compute the actual values now that we've added the options. */
	tlen = hlen + sizeof(struct tcphdr) + optlen;
	m->m_len = m->m_pkthdr.len = tlen;
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;

#ifdef TCP_SIGNATURE
	if (sav) {
		(void)tcp_signature(m, th, hlen, sav, sigp);
		key_sa_recordxfer(sav, m);
		KEY_SA_UNREF(&sav);
	}
#endif

	/*
	 * Send ECN SYN-ACK setup packet.
	 * Routes can be asymmetric, so, even if we receive a packet
	 * with ECE and CWR set, we must not assume no one will block
	 * the ECE packet we are about to send.
	 */
	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
		th->th_flags |= TH_ECE;
		TCP_STATINC(TCP_STAT_ECN_SHS);

		/*
		 * draft-ietf-tcpm-ecnsyn-00.txt
		 *
		 * "[...] a TCP node MAY respond to an ECN-setup
		 * SYN packet by setting ECT in the responding
		 * ECN-setup SYN/ACK packet, indicating to routers
		 * that the SYN/ACK packet is ECN-Capable.
		 * This allows a congested router along the path
		 * to mark the packet instead of dropping the
		 * packet as an indication of congestion."
		 *
		 * "[...] There can be a great benefit in setting
		 * an ECN-capable codepoint in SYN/ACK packets [...]
		 * Congestion is  most likely to occur in
		 * the server-to-client direction.  As a result,
		 * setting an ECN-capable codepoint in SYN/ACK
		 * packets can reduce the occurrence of three-second
		 * retransmit timeouts resulting from the drop
		 * of SYN/ACK packets."
		 *
		 * Page 4 and 6, January 2006.
		 */

		switch (sc->sc_src.sa.sa_family) {
		case AF_INET:
			ip->ip_tos |= IPTOS_ECN_ECT0;
			break;
#ifdef INET6
		case AF_INET6:
			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
			break;
#endif
		}
		TCP_STATINC(TCP_STAT_ECN_ECT);
	}


	/*
	 * Compute the packet's checksum.
	 *
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip->ip_len = htons(tlen);
		ip->ip_ttl = ip_defttl;
		/* XXX tos? */
		break;
#ifdef INET6
	case AF_INET6:
		/*
		 * NOTE(review): ip6_plen is stored twice with the same
		 * value here; the first store looks redundant, since
		 * in6_cksum() is passed the length explicitly and the
		 * comment above says ip6_output() updates ip6_plen
		 * anyway -- confirm before removing.
		 */
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* XXX flowlabel? */
		break;
#endif
	}

	/* XXX use IPsec policy on listening socket, on SYN ACK */
	tp = sc->sc_tp;	/* NOTE(review): re-load; tp was already set above */

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    NULL, tp ? tp->t_inpcb : NULL);
		break;
#ifdef INET6
	case AF_INET6:
		/* Pick a hop limit based on the (cached) outgoing route. */
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
		rtcache_unref(rt, ro);

		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
		    tp ? tp->t_in6pcb : NULL, NULL);
		break;
#endif
	default:
		panic("%s: impossible (2)", __func__);
	}

	return error;
}
1385