xref: /netbsd-src/sys/netinet/tcp_syncache.c (revision 023842dd7aa71afbdae78525ee699a0a8181af29)
1 /*	$NetBSD: tcp_syncache.c,v 1.7 2024/06/29 12:59:08 riastradh Exp $	*/
2 
3 /*
4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the project nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
34  *
35  * NRL grants permission for redistribution and use in source and binary
36  * forms, with or without modification, of the software and documentation
37  * created at NRL provided that the following conditions are met:
38  *
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgements:
46  *      This product includes software developed by the University of
47  *      California, Berkeley and its contributors.
48  *      This product includes software developed at the Information
49  *      Technology Division, US Naval Research Laboratory.
50  * 4. Neither the name of the NRL nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  *
66  * The views and conclusions contained in the software and documentation
67  * are those of the authors and should not be interpreted as representing
68  * official policies, either expressed or implied, of the US Naval
69  * Research Laboratory (NRL).
70  */
71 
72 /*-
73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
74  * 2011 The NetBSD Foundation, Inc.
75  * All rights reserved.
76  *
77  * This code is derived from software contributed to The NetBSD Foundation
78  * by Coyote Point Systems, Inc.
79  * This code is derived from software contributed to The NetBSD Foundation
80  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
81  * Facility, NASA Ames Research Center.
82  * This code is derived from software contributed to The NetBSD Foundation
83  * by Charles M. Hannum.
84  * This code is derived from software contributed to The NetBSD Foundation
85  * by Rui Paulo.
86  *
87  * Redistribution and use in source and binary forms, with or without
88  * modification, are permitted provided that the following conditions
89  * are met:
90  * 1. Redistributions of source code must retain the above copyright
91  *    notice, this list of conditions and the following disclaimer.
92  * 2. Redistributions in binary form must reproduce the above copyright
93  *    notice, this list of conditions and the following disclaimer in the
94  *    documentation and/or other materials provided with the distribution.
95  *
96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
106  * POSSIBILITY OF SUCH DAMAGE.
107  */
108 
109 /*
110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
111  *	The Regents of the University of California.  All rights reserved.
112  *
113  * Redistribution and use in source and binary forms, with or without
114  * modification, are permitted provided that the following conditions
115  * are met:
116  * 1. Redistributions of source code must retain the above copyright
117  *    notice, this list of conditions and the following disclaimer.
118  * 2. Redistributions in binary form must reproduce the above copyright
119  *    notice, this list of conditions and the following disclaimer in the
120  *    documentation and/or other materials provided with the distribution.
121  * 3. Neither the name of the University nor the names of its contributors
122  *    may be used to endorse or promote products derived from this software
123  *    without specific prior written permission.
124  *
125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
135  * SUCH DAMAGE.
136  *
137  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
138  */
139 
140 /*
141  *	TODO list for SYN cache stuff:
142  *
143  *	Find room for a "state" field, which is needed to keep a
144  *	compressed state for TIME_WAIT TCBs.  It's been noted already
145  *	that this is fairly important for very high-volume web and
146  *	mail servers, which use a large number of short-lived
147  *	connections.
148  */
149 
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.7 2024/06/29 12:59:08 riastradh Exp $");
152 
153 #ifdef _KERNEL_OPT
154 #include "opt_inet.h"
155 #include "opt_ipsec.h"
156 #endif
157 
158 #include <sys/param.h>
159 #include <sys/systm.h>
160 #include <sys/mbuf.h>
161 #include <sys/protosw.h>
162 #include <sys/socket.h>
163 #include <sys/socketvar.h>
164 #include <sys/errno.h>
165 #include <sys/syslog.h>
166 #include <sys/pool.h>
167 #include <sys/domain.h>
168 #include <sys/kernel.h>
169 #include <sys/lwp.h> /* for lwp0 */
170 #include <sys/cprng.h>
171 
172 #include <netinet/in.h>
173 #include <netinet/ip.h>
174 #include <netinet/in_pcb.h>
175 #include <netinet/in_var.h>
176 #include <netinet/ip_var.h>
177 
178 #include <netinet/ip6.h>
179 #ifdef INET6
180 #include <netinet6/ip6_var.h>
181 #include <netinet6/in6_pcb.h>
182 #include <netinet6/ip6_var.h>
183 #include <netinet6/in6_var.h>
184 #endif
185 
186 #include <netinet/tcp.h>
187 #include <netinet/tcp_fsm.h>
188 #include <netinet/tcp_seq.h>
189 #include <netinet/tcp_timer.h>
190 #include <netinet/tcp_var.h>
191 #include <netinet/tcp_private.h>
192 #include <netinet/tcp_syncache.h>
193 
194 #ifdef TCP_SIGNATURE
195 #ifdef IPSEC
196 #include <netipsec/ipsec.h>
197 #include <netipsec/key.h>
198 #ifdef INET6
199 #include <netipsec/ipsec6.h>
200 #endif
201 #endif	/* IPSEC*/
202 #endif
203 
/* Forward declarations of file-local helpers used before their definition. */
static void	syn_cache_timer(void *);
static struct syn_cache *
		syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
		struct syn_cache_head **);
static int	syn_cache_respond(struct syn_cache *);

/* syn hash parameters */
#define	TCP_SYN_HASH_SIZE	293
#define	TCP_SYN_BUCKET_SIZE	35
/* Number of buckets; used as the modulus when mapping a hash to a bucket. */
static int	tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
/* Cap on the total number of cached entries (enforced by syn_cache_insert). */
int		tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
/* Cap on the entries held by a single bucket (enforced by syn_cache_insert). */
int		tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
/* The hash table: one bucket head per slot. */
static struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
217 
218 /*
219  * TCP compressed state engine.  Currently used to hold compressed
220  * state for SYN_RECEIVED.
221  */
222 
/* Number of entries currently linked into the cache. */
u_long	syn_cache_count;
/* Hash secrets; re-randomized by syn_cache_insert() whenever the cache is empty. */
static u_int32_t syn_hash1, syn_hash2;

/*
 * Hash an IPv4 address and a source/destination port pair, mixing in
 * both hash secrets.
 */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
/*
 * Compute the hash for a (src, dst) sockaddr pair.  Without INET6 the
 * addresses are always treated as struct sockaddr_in.
 */
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
		((const struct sockaddr_in *)(src))->sin_port,		\
		((const struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/*
 * IPv6 variant: only the first and last 32-bit words of the address are
 * folded in, and the result is masked to 31 bits.
 */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/*
 * Compute the hash for a (src, dst) sockaddr pair, selecting the IPv4 or
 * IPv6 hash from src's address family.  Any other family hashes to 0.
 */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
			((const struct sockaddr_in *)(src))->sin_port,	\
			((const struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
			((const struct sockaddr_in6 *)(src))->sin6_port,	\
			((const struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/* Backing pool for struct syn_cache; set up in syn_cache_init(). */
static struct pool syn_cache_pool;
262 
263 /*
264  * We don't estimate RTT with SYNs, so each packet starts with the default
265  * RTT and each timer step has a fixed timeout value.
266  */
267 static inline void
syn_cache_timer_arm(struct syn_cache * sc)268 syn_cache_timer_arm(struct syn_cache *sc)
269 {
270 
271 	TCPT_RANGESET(sc->sc_rxtcur,
272 	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
273 	    TCPTV_REXMTMAX);
274 	callout_reset(&sc->sc_timer,
275 	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
276 }
277 
/* Value of tcp_now relative to the entry's own time base (sc_timebase). */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
279 
280 static inline void
syn_cache_rm(struct syn_cache * sc)281 syn_cache_rm(struct syn_cache *sc)
282 {
283 	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
284 	    sc, sc_bucketq);
285 	sc->sc_tp = NULL;
286 	LIST_REMOVE(sc, sc_tpq);
287 	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
288 	callout_stop(&sc->sc_timer);
289 	syn_cache_count--;
290 }
291 
292 static inline void
syn_cache_put(struct syn_cache * sc)293 syn_cache_put(struct syn_cache *sc)
294 {
295 	if (sc->sc_ipopts)
296 		(void) m_free(sc->sc_ipopts);
297 	rtcache_free(&sc->sc_route);
298 	sc->sc_flags |= SCF_DEAD;
299 	if (!callout_invoking(&sc->sc_timer))
300 		callout_schedule(&(sc)->sc_timer, 1);
301 }
302 
303 void
syn_cache_init(void)304 syn_cache_init(void)
305 {
306 	int i;
307 
308 	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
309 	    "synpl", NULL, IPL_SOFTNET);
310 
311 	/* Initialize the hash buckets. */
312 	for (i = 0; i < tcp_syn_cache_size; i++)
313 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
314 }
315 
316 void
syn_cache_insert(struct syn_cache * sc,struct tcpcb * tp)317 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
318 {
319 	struct syn_cache_head *scp;
320 	struct syn_cache *sc2;
321 	int s;
322 
323 	/*
324 	 * If there are no entries in the hash table, reinitialize
325 	 * the hash secrets.
326 	 */
327 	if (syn_cache_count == 0) {
328 		syn_hash1 = cprng_fast32();
329 		syn_hash2 = cprng_fast32();
330 	}
331 
332 	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
333 	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
334 	scp = &tcp_syn_cache[sc->sc_bucketidx];
335 
336 	/*
337 	 * Make sure that we don't overflow the per-bucket
338 	 * limit or the total cache size limit.
339 	 */
340 	s = splsoftnet();
341 	if (scp->sch_length >= tcp_syn_bucket_limit) {
342 		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
343 		/*
344 		 * The bucket is full.  Toss the oldest element in the
345 		 * bucket.  This will be the first entry in the bucket.
346 		 */
347 		sc2 = TAILQ_FIRST(&scp->sch_bucket);
348 #ifdef DIAGNOSTIC
349 		/*
350 		 * This should never happen; we should always find an
351 		 * entry in our bucket.
352 		 */
353 		if (sc2 == NULL)
354 			panic("syn_cache_insert: bucketoverflow: impossible");
355 #endif
356 		syn_cache_rm(sc2);
357 		syn_cache_put(sc2);	/* calls pool_put but see spl above */
358 	} else if (syn_cache_count >= tcp_syn_cache_limit) {
359 		struct syn_cache_head *scp2, *sce;
360 
361 		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
362 		/*
363 		 * The cache is full.  Toss the oldest entry in the
364 		 * first non-empty bucket we can find.
365 		 *
366 		 * XXX We would really like to toss the oldest
367 		 * entry in the cache, but we hope that this
368 		 * condition doesn't happen very often.
369 		 */
370 		scp2 = scp;
371 		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
372 			sce = &tcp_syn_cache[tcp_syn_cache_size];
373 			for (++scp2; scp2 != scp; scp2++) {
374 				if (scp2 >= sce)
375 					scp2 = &tcp_syn_cache[0];
376 				if (! TAILQ_EMPTY(&scp2->sch_bucket))
377 					break;
378 			}
379 #ifdef DIAGNOSTIC
380 			/*
381 			 * This should never happen; we should always find a
382 			 * non-empty bucket.
383 			 */
384 			if (scp2 == scp)
385 				panic("syn_cache_insert: cacheoverflow: "
386 				    "impossible");
387 #endif
388 		}
389 		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
390 		syn_cache_rm(sc2);
391 		syn_cache_put(sc2);	/* calls pool_put but see spl above */
392 	}
393 
394 	/*
395 	 * Initialize the entry's timer.
396 	 */
397 	sc->sc_rxttot = 0;
398 	sc->sc_rxtshift = 0;
399 	syn_cache_timer_arm(sc);
400 
401 	/* Link it from tcpcb entry */
402 	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
403 
404 	/* Put it into the bucket. */
405 	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
406 	scp->sch_length++;
407 	syn_cache_count++;
408 
409 	TCP_STATINC(TCP_STAT_SC_ADDED);
410 	splx(s);
411 }
412 
413 /*
414  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
415  * If we have retransmitted an entry the maximum number of times, expire
416  * that entry.
417  */
418 static void
syn_cache_timer(void * arg)419 syn_cache_timer(void *arg)
420 {
421 	struct syn_cache *sc = arg;
422 
423 	mutex_enter(softnet_lock);
424 	KERNEL_LOCK(1, NULL);
425 
426 	callout_ack(&sc->sc_timer);
427 
428 	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
429 		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
430 		goto free;
431 	}
432 
433 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
434 		/* Drop it -- too many retransmissions. */
435 		goto dropit;
436 	}
437 
438 	/*
439 	 * Compute the total amount of time this entry has
440 	 * been on a queue.  If this entry has been on longer
441 	 * than the keep alive timer would allow, expire it.
442 	 */
443 	sc->sc_rxttot += sc->sc_rxtcur;
444 	if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
445 		goto dropit;
446 
447 	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
448 	(void)syn_cache_respond(sc);
449 
450 	/* Advance the timer back-off. */
451 	sc->sc_rxtshift++;
452 	syn_cache_timer_arm(sc);
453 
454 	goto out;
455 
456  dropit:
457 	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
458 	syn_cache_rm(sc);
459 	if (sc->sc_ipopts)
460 		(void) m_free(sc->sc_ipopts);
461 	rtcache_free(&sc->sc_route);
462 
463  free:
464 	callout_destroy(&sc->sc_timer);
465 	pool_put(&syn_cache_pool, sc);
466 
467  out:
468 	KERNEL_UNLOCK_ONE(NULL);
469 	mutex_exit(softnet_lock);
470 }
471 
472 /*
473  * Remove syn cache created by the specified tcb entry,
474  * because this does not make sense to keep them
475  * (if there's no tcb entry, syn cache entry will never be used)
476  */
477 void
syn_cache_cleanup(struct tcpcb * tp)478 syn_cache_cleanup(struct tcpcb *tp)
479 {
480 	struct syn_cache *sc, *nsc;
481 	int s;
482 
483 	s = splsoftnet();
484 
485 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
486 		nsc = LIST_NEXT(sc, sc_tpq);
487 
488 #ifdef DIAGNOSTIC
489 		if (sc->sc_tp != tp)
490 			panic("invalid sc_tp in syn_cache_cleanup");
491 #endif
492 		syn_cache_rm(sc);
493 		syn_cache_put(sc);	/* calls pool_put but see spl above */
494 	}
495 	/* just for safety */
496 	LIST_INIT(&tp->t_sc);
497 
498 	splx(s);
499 }
500 
501 /*
502  * Find an entry in the syn cache.
503  */
504 static struct syn_cache *
syn_cache_lookup(const struct sockaddr * src,const struct sockaddr * dst,struct syn_cache_head ** headp)505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
506     struct syn_cache_head **headp)
507 {
508 	struct syn_cache *sc;
509 	struct syn_cache_head *scp;
510 	u_int32_t hash;
511 	int s;
512 
513 	SYN_HASHALL(hash, src, dst);
514 
515 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
516 	*headp = scp;
517 	s = splsoftnet();
518 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
519 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
520 		if (sc->sc_hash != hash)
521 			continue;
522 		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
523 		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
524 			splx(s);
525 			return (sc);
526 		}
527 	}
528 	splx(s);
529 	return (NULL);
530 }
531 
532 /*
533  * This function gets called when we receive an ACK for a socket in the
534  * LISTEN state. We look up the connection in the syn cache, and if it's
535  * there, we pull it out of the cache and turn it into a full-blown
536  * connection in the SYN-RECEIVED state.
537  *
538  * The return values may not be immediately obvious, and their effects
539  * can be subtle, so here they are:
540  *
541  *	NULL	SYN was not found in cache; caller should drop the
542  *		packet and send an RST.
543  *
544  *	-1	We were unable to create the new connection, and are
545  *		aborting it.  An ACK,RST is being sent to the peer
546  *		(unless we got screwy sequence numbers; see below),
547  *		because the 3-way handshake has been completed.  Caller
548  *		should not free the mbuf, since we may be using it.  If
549  *		we are not, we will free it.
550  *
551  *	Otherwise, the return value is a pointer to the new socket
552  *	associated with the connection.
553  */
/*
 * Parameters:
 *	src, dst	peer and local address of the incoming segment;
 *			together they form the cache lookup key.
 *	th		TCP header of the incoming segment.
 *	so		the listening socket.
 *	m		the mbuf holding the incoming segment.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
	struct tcpcb *tp;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return NULL;
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		/*
		 * The segment is consumed here and the SYN,ACK is sent
		 * again; the entry itself stays in the cache.
		 */
		m_freem(m);
		(void)syn_cache_respond(sc);
		splx(s);
		return ((struct socket *)(-1));
	}

	/*
	 * Remove this cache entry.  It is only unlinked here; every exit
	 * path below hands it to syn_cache_put().
	 */
	syn_cache_rm(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	/* From here on `oso' is the listening socket and `so' the new one. */
	oso = so;
	so = sonewconn(so, true);
	if (so == NULL)
		goto resetandabort;

	inp = sotoinpcb(so);

	/* Bind the new pcb to the local address/port the segment was sent to. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp->inp_af == AF_INET) {
			in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute(m);
			inpcb_set_state(inp, INP_BOUND);
			/*
			 * No source route on this segment: fall back to the
			 * options saved in the cache entry, which gives up
			 * its reference to them.
			 */
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (inp->inp_af == AF_INET6) {
			/* IPv4 packet to AF_INET6 socket */
			/* Build the local address as ::ffff:<IPv4 address>. */
			memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
			in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
				&in6p_laddr(inp).s6_addr32[3],
				sizeof(((struct sockaddr_in *)dst)->sin_addr));
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			intotcpcb(inp)->t_family = AF_INET;
			/* Mirror the listener's IN6P_IPV6_V6ONLY setting. */
			if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
				inp->inp_flags |= IN6P_IPV6_V6ONLY;
			else
				inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
			inpcb_set_state(inp, INP_BOUND);
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (inp->inp_af == AF_INET6) {
			in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
			inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
			inpcb_set_state(inp, INP_BOUND);
		}
		break;
#endif
	}

#ifdef INET6
	if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct inpcb *oinp = sotoinpcb(oso);
		/* inherit socket options from the listening socket */
		inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
		if (inp->inp_flags & IN6P_CONTROLOPTS) {
			m_freem(inp->inp_options);
			inp->inp_options = NULL;
		}
		ip6_savecontrol(inp, &inp->inp_options,
		    mtod(m, struct ip6_hdr *), m);
	}
#endif

	/*
	 * Give the new socket our cached route reference.
	 */
	rtcache_copy(&inp->inp_route, &sc->sc_route);
	rtcache_free(&sc->sc_route);

	/* Connect the new pcb to the peer address. */
	if (inp->inp_af == AF_INET) {
		struct sockaddr_in sin;
		memcpy(&sin, src, src->sa_len);
		if (inpcb_connect(inp, &sin, &lwp0)) {
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (inp->inp_af == AF_INET6) {
		struct sockaddr_in6 sin6;
		memcpy(&sin6, src, src->sa_len);
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
		}
		if (in6pcb_connect(inp, &sin6, NULL)) {
			goto resetandabort;
		}
	}
#endif
	else {
		goto resetandabort;
	}

	tp = intotcpcb(inp);

	/* Of the listener's TCP flags only TF_NODELAY is inherited. */
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/*
	 * NOTE(review): 15 looks like a "window scaling not negotiated"
	 * marker set when the entry was created -- confirm against
	 * syn_cache_add().
	 */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
	tp->ts_timebase = sc->sc_timebase;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		/* Clearing so keeps the abort path from touching it again. */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	/* Restore the sequence number state recorded with the SYN. */
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
	TCP_STATINC(TCP_STAT_ACCEPTS);

	/* Options recorded in the entry apply only if still enabled globally. */
	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
		tp->t_flags |= TF_WILL_SACK;

	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
		tp->t_flags |= TF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else {
		int ss = tcp_init_win;
		/* Peers on a local network get the "local" initial window. */
		if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
			ss = tcp_init_win_local;
#ifdef INET6
		else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
			ss = tcp_init_win_local;
#endif
		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
	}

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;

	TCP_STATINC(TCP_STAT_SC_COMPLETED);
	/* Success: the cache entry is no longer needed. */
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	return so;

resetandabort:
	/* Answer the peer with an RST; m is handed to tcp_respond(). */
	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	if (so != NULL) {
		(void) soqremque(so, 1);
		(void) soabort(so);
		/*
		 * NOTE(review): presumably soabort() returns with
		 * softnet_lock released, hence the re-acquire -- confirm.
		 */
		mutex_enter(softnet_lock);
	}
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	TCP_STATINC(TCP_STAT_SC_ABORTED);
	return ((struct socket *)(-1));
}
795 
796 /*
797  * This function is called when we get a RST for a
798  * non-existent connection, so that we can see if the
799  * connection is in the syn cache.  If it is, zap it.
800  */
801 
802 void
syn_cache_reset(struct sockaddr * src,struct sockaddr * dst,struct tcphdr * th)803 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
804 {
805 	struct syn_cache *sc;
806 	struct syn_cache_head *scp;
807 	int s = splsoftnet();
808 
809 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
810 		splx(s);
811 		return;
812 	}
813 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
814 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
815 		splx(s);
816 		return;
817 	}
818 	syn_cache_rm(sc);
819 	TCP_STATINC(TCP_STAT_SC_RESET);
820 	syn_cache_put(sc);	/* calls pool_put but see spl above */
821 	splx(s);
822 }
823 
824 void
syn_cache_unreach(const struct sockaddr * src,const struct sockaddr * dst,struct tcphdr * th)825 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
826     struct tcphdr *th)
827 {
828 	struct syn_cache *sc;
829 	struct syn_cache_head *scp;
830 	int s;
831 
832 	s = splsoftnet();
833 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
834 		splx(s);
835 		return;
836 	}
837 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
838 	if (ntohl(th->th_seq) != sc->sc_iss) {
839 		splx(s);
840 		return;
841 	}
842 
843 	/*
844 	 * If we've retransmitted 3 times and this is our second error,
845 	 * we remove the entry.  Otherwise, we allow it to continue on.
846 	 * This prevents us from incorrectly nuking an entry during a
847 	 * spurious network outage.
848 	 *
849 	 * See tcp_notify().
850 	 */
851 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
852 		sc->sc_flags |= SCF_UNREACH;
853 		splx(s);
854 		return;
855 	}
856 
857 	syn_cache_rm(sc);
858 	TCP_STATINC(TCP_STAT_SC_UNREACH);
859 	syn_cache_put(sc);	/* calls pool_put but see spl above */
860 	splx(s);
861 }
862 
863 /*
864  * Given a LISTEN socket and an inbound SYN request, add this to the syn
865  * cache, and send back a segment:
866  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
867  * to the source.
868  *
869  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
870  * Doing so would require that we hold onto the data and deliver it
871  * to the application.  However, if we are the target of a SYN-flood
872  * DoS attack, an attacker could send data which would eventually
873  * consume all available buffer space if it were ACKed.  By not ACKing
874  * the data, we avoid this DoS scenario.
875  */
876 int
syn_cache_add(struct sockaddr * src,struct sockaddr * dst,struct tcphdr * th,unsigned int toff,struct socket * so,struct mbuf * m,u_char * optp,int optlen,struct tcp_opt_info * oi)877 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
878     unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
879     int optlen, struct tcp_opt_info *oi)
880 {
881 	struct tcpcb tb, *tp;
882 	long win;
883 	struct syn_cache *sc;
884 	struct syn_cache_head *scp;
885 	struct mbuf *ipopts;
886 	int s;
887 
888 	tp = sototcpcb(so);
889 
890 	/*
891 	 * Initialize some local state.
892 	 */
893 	win = sbspace(&so->so_rcv);
894 	if (win > TCP_MAXWIN)
895 		win = TCP_MAXWIN;
896 
897 #ifdef TCP_SIGNATURE
898 	if (optp || (tp->t_flags & TF_SIGNATURE))
899 #else
900 	if (optp)
901 #endif
902 	{
903 		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
904 #ifdef TCP_SIGNATURE
905 		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
906 #endif
907 		tb.t_state = TCPS_LISTEN;
908 		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
909 			return 0;
910 	} else
911 		tb.t_flags = 0;
912 
913 	switch (src->sa_family) {
914 	case AF_INET:
915 		/* Remember the IP options, if any. */
916 		ipopts = ip_srcroute(m);
917 		break;
918 	default:
919 		ipopts = NULL;
920 	}
921 
922 	/*
923 	 * See if we already have an entry for this connection.
924 	 * If we do, resend the SYN,ACK.  We do not count this
925 	 * as a retransmission (XXX though maybe we should).
926 	 */
927 	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
928 		TCP_STATINC(TCP_STAT_SC_DUPESYN);
929 		if (ipopts) {
930 			/*
931 			 * If we were remembering a previous source route,
932 			 * forget it and use the new one we've been given.
933 			 */
934 			if (sc->sc_ipopts)
935 				(void)m_free(sc->sc_ipopts);
936 			sc->sc_ipopts = ipopts;
937 		}
938 		sc->sc_timestamp = tb.ts_recent;
939 		m_freem(m);
940 		if (syn_cache_respond(sc) == 0) {
941 			net_stat_ref_t tcps = TCP_STAT_GETREF();
942 			_NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
943 			_NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
944 			TCP_STAT_PUTREF();
945 		}
946 		return 1;
947 	}
948 
949 	s = splsoftnet();
950 	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
951 	splx(s);
952 	if (sc == NULL) {
953 		if (ipopts)
954 			(void)m_free(ipopts);
955 		return 0;
956 	}
957 
958 	/*
959 	 * Fill in the cache, and put the necessary IP and TCP
960 	 * options into the reply.
961 	 */
962 	memset(sc, 0, sizeof(struct syn_cache));
963 	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
964 	memcpy(&sc->sc_src, src, src->sa_len);
965 	memcpy(&sc->sc_dst, dst, dst->sa_len);
966 	sc->sc_flags = 0;
967 	sc->sc_ipopts = ipopts;
968 	sc->sc_irs = th->th_seq;
969 	switch (src->sa_family) {
970 	case AF_INET:
971 	    {
972 		struct sockaddr_in *srcin = (void *)src;
973 		struct sockaddr_in *dstin = (void *)dst;
974 
975 		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
976 		    &srcin->sin_addr, dstin->sin_port,
977 		    srcin->sin_port, sizeof(dstin->sin_addr));
978 		break;
979 	    }
980 #ifdef INET6
981 	case AF_INET6:
982 	    {
983 		struct sockaddr_in6 *srcin6 = (void *)src;
984 		struct sockaddr_in6 *dstin6 = (void *)dst;
985 
986 		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
987 		    &srcin6->sin6_addr, dstin6->sin6_port,
988 		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
989 		break;
990 	    }
991 #endif
992 	}
993 	sc->sc_peermaxseg = oi->maxseg;
994 	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
995 	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
996 	sc->sc_win = win;
997 	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
998 	sc->sc_timestamp = tb.ts_recent;
999 	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
1000 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
1001 		sc->sc_flags |= SCF_TIMESTAMP;
1002 	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1003 	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1004 		sc->sc_requested_s_scale = tb.requested_s_scale;
1005 		sc->sc_request_r_scale = 0;
1006 		/*
1007 		 * Pick the smallest possible scaling factor that
1008 		 * will still allow us to scale up to sb_max.
1009 		 *
1010 		 * We do this because there are broken firewalls that
1011 		 * will corrupt the window scale option, leading to
1012 		 * the other endpoint believing that our advertised
1013 		 * window is unscaled.  At scale factors larger than
1014 		 * 5 the unscaled window will drop below 1500 bytes,
1015 		 * leading to serious problems when traversing these
1016 		 * broken firewalls.
1017 		 *
1018 		 * With the default sbmax of 256K, a scale factor
1019 		 * of 3 will be chosen by this algorithm.  Those who
1020 		 * choose a larger sbmax should watch out
1021 		 * for the compatibility problems mentioned above.
1022 		 *
1023 		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
1024 		 * or <SYN,ACK>) segment itself is never scaled.
1025 		 */
1026 		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
1027 		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
1028 			sc->sc_request_r_scale++;
1029 	} else {
1030 		sc->sc_requested_s_scale = 15;
1031 		sc->sc_request_r_scale = 15;
1032 	}
1033 	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
1034 		sc->sc_flags |= SCF_SACK_PERMIT;
1035 
1036 	/*
1037 	 * ECN setup packet received.
1038 	 */
1039 	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
1040 		sc->sc_flags |= SCF_ECN_PERMIT;
1041 
1042 #ifdef TCP_SIGNATURE
1043 	if (tb.t_flags & TF_SIGNATURE)
1044 		sc->sc_flags |= SCF_SIGNATURE;
1045 #endif
1046 	sc->sc_tp = tp;
1047 	m_freem(m);
1048 	if (syn_cache_respond(sc) == 0) {
1049 		net_stat_ref_t tcps = TCP_STAT_GETREF();
1050 		_NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
1051 		_NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
1052 		TCP_STAT_PUTREF();
1053 		syn_cache_insert(sc, tp);
1054 	} else {
1055 		s = splsoftnet();
1056 		/*
1057 		 * syn_cache_put() will try to schedule the timer, so
1058 		 * we need to initialize it
1059 		 */
1060 		syn_cache_timer_arm(sc);
1061 		syn_cache_put(sc);
1062 		splx(s);
1063 		TCP_STATINC(TCP_STAT_SC_DROPPED);
1064 	}
1065 	return 1;
1066 }
1067 
1068 /*
1069  * syn_cache_respond: (re)send SYN+ACK.
1070  *
1071  * Returns 0 on success.
1072  */
1073 
1074 static int
syn_cache_respond(struct syn_cache * sc)1075 syn_cache_respond(struct syn_cache *sc)
1076 {
1077 #ifdef INET6
1078 	struct rtentry *rt = NULL;
1079 #endif
1080 	struct route *ro;
1081 	u_int8_t *optp;
1082 	int optlen, error;
1083 	u_int16_t tlen;
1084 	struct ip *ip = NULL;
1085 #ifdef INET6
1086 	struct ip6_hdr *ip6 = NULL;
1087 #endif
1088 	struct tcpcb *tp;
1089 	struct tcphdr *th;
1090 	struct mbuf *m;
1091 	u_int hlen;
1092 #ifdef TCP_SIGNATURE
1093 	struct secasvar *sav = NULL;
1094 	u_int8_t *sigp = NULL;
1095 #endif
1096 
1097 	ro = &sc->sc_route;
1098 	switch (sc->sc_src.sa.sa_family) {
1099 	case AF_INET:
1100 		hlen = sizeof(struct ip);
1101 		break;
1102 #ifdef INET6
1103 	case AF_INET6:
1104 		hlen = sizeof(struct ip6_hdr);
1105 		break;
1106 #endif
1107 	default:
1108 		return EAFNOSUPPORT;
1109 	}
1110 
1111 	/* Worst case scenario, since we don't know the option size yet. */
1112 	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
1113 	KASSERT(max_linkhdr + tlen <= MCLBYTES);
1114 
1115 	/*
1116 	 * Create the IP+TCP header from scratch.
1117 	 */
1118 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1119 	if (m && (max_linkhdr + tlen) > MHLEN) {
1120 		MCLGET(m, M_DONTWAIT);
1121 		if ((m->m_flags & M_EXT) == 0) {
1122 			m_freem(m);
1123 			m = NULL;
1124 		}
1125 	}
1126 	if (m == NULL)
1127 		return ENOBUFS;
1128 	MCLAIM(m, &tcp_tx_mowner);
1129 
1130 	tp = sc->sc_tp;
1131 
1132 	/* Fixup the mbuf. */
1133 	m->m_data += max_linkhdr;
1134 	m_reset_rcvif(m);
1135 	memset(mtod(m, void *), 0, tlen);
1136 
1137 	switch (sc->sc_src.sa.sa_family) {
1138 	case AF_INET:
1139 		ip = mtod(m, struct ip *);
1140 		ip->ip_v = 4;
1141 		ip->ip_dst = sc->sc_src.sin.sin_addr;
1142 		ip->ip_src = sc->sc_dst.sin.sin_addr;
1143 		ip->ip_p = IPPROTO_TCP;
1144 		th = (struct tcphdr *)(ip + 1);
1145 		th->th_dport = sc->sc_src.sin.sin_port;
1146 		th->th_sport = sc->sc_dst.sin.sin_port;
1147 		break;
1148 #ifdef INET6
1149 	case AF_INET6:
1150 		ip6 = mtod(m, struct ip6_hdr *);
1151 		ip6->ip6_vfc = IPV6_VERSION;
1152 		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
1153 		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
1154 		ip6->ip6_nxt = IPPROTO_TCP;
1155 		/* ip6_plen will be updated in ip6_output() */
1156 		th = (struct tcphdr *)(ip6 + 1);
1157 		th->th_dport = sc->sc_src.sin6.sin6_port;
1158 		th->th_sport = sc->sc_dst.sin6.sin6_port;
1159 		break;
1160 #endif
1161 	default:
1162 		panic("%s: impossible (1)", __func__);
1163 	}
1164 
1165 	th->th_seq = htonl(sc->sc_iss);
1166 	th->th_ack = htonl(sc->sc_irs + 1);
1167 	th->th_flags = TH_SYN|TH_ACK;
1168 	th->th_win = htons(sc->sc_win);
1169 	/* th_x2, th_sum, th_urp already 0 from memset */
1170 
1171 	/* Tack on the TCP options. */
1172 	optp = (u_int8_t *)(th + 1);
1173 	optlen = 0;
1174 	*optp++ = TCPOPT_MAXSEG;
1175 	*optp++ = TCPOLEN_MAXSEG;
1176 	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
1177 	*optp++ = sc->sc_ourmaxseg & 0xff;
1178 	optlen += TCPOLEN_MAXSEG;
1179 
1180 	if (sc->sc_request_r_scale != 15) {
1181 		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
1182 		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
1183 		    sc->sc_request_r_scale);
1184 		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
1185 		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
1186 	}
1187 
1188 	if (sc->sc_flags & SCF_SACK_PERMIT) {
1189 		/* Let the peer know that we will SACK. */
1190 		*optp++ = TCPOPT_SACK_PERMITTED;
1191 		*optp++ = TCPOLEN_SACK_PERMITTED;
1192 		optlen += TCPOLEN_SACK_PERMITTED;
1193 	}
1194 
1195 	if (sc->sc_flags & SCF_TIMESTAMP) {
1196 		while (optlen % 4 != 2) {
1197 			optlen += TCPOLEN_NOP;
1198 			*optp++ = TCPOPT_NOP;
1199 		}
1200 		*optp++ = TCPOPT_TIMESTAMP;
1201 		*optp++ = TCPOLEN_TIMESTAMP;
1202 		u_int32_t *lp = (u_int32_t *)(optp);
1203 		/* Form timestamp option as shown in appendix A of RFC 1323. */
1204 		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
1205 		*lp   = htonl(sc->sc_timestamp);
1206 		optp += TCPOLEN_TIMESTAMP - 2;
1207 		optlen += TCPOLEN_TIMESTAMP;
1208 	}
1209 
1210 #ifdef TCP_SIGNATURE
1211 	if (sc->sc_flags & SCF_SIGNATURE) {
1212 		sav = tcp_signature_getsav(m);
1213 		if (sav == NULL) {
1214 			m_freem(m);
1215 			return EPERM;
1216 		}
1217 
1218 		*optp++ = TCPOPT_SIGNATURE;
1219 		*optp++ = TCPOLEN_SIGNATURE;
1220 		sigp = optp;
1221 		memset(optp, 0, TCP_SIGLEN);
1222 		optp += TCP_SIGLEN;
1223 		optlen += TCPOLEN_SIGNATURE;
1224 	}
1225 #endif
1226 
1227 	/*
1228 	 * Terminate and pad TCP options to a 4 byte boundary.
1229 	 *
1230 	 * According to RFC793: "The content of the header beyond the
1231 	 * End-of-Option option must be header padding (i.e., zero)."
1232 	 * And later: "The padding is composed of zeros."
1233 	 */
1234 	if (optlen % 4) {
1235 		optlen += TCPOLEN_EOL;
1236 		*optp++ = TCPOPT_EOL;
1237 	}
1238 	while (optlen % 4) {
1239 		optlen += TCPOLEN_PAD;
1240 		*optp++ = TCPOPT_PAD;
1241 	}
1242 
1243 	/* Compute the actual values now that we've added the options. */
1244 	tlen = hlen + sizeof(struct tcphdr) + optlen;
1245 	m->m_len = m->m_pkthdr.len = tlen;
1246 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1247 
1248 #ifdef TCP_SIGNATURE
1249 	if (sav) {
1250 		(void)tcp_signature(m, th, hlen, sav, sigp);
1251 		key_sa_recordxfer(sav, m);
1252 		KEY_SA_UNREF(&sav);
1253 	}
1254 #endif
1255 
1256 	/*
1257 	 * Send ECN SYN-ACK setup packet.
1258 	 * Routes can be asymmetric, so, even if we receive a packet
1259 	 * with ECE and CWR set, we must not assume no one will block
1260 	 * the ECE packet we are about to send.
1261 	 */
1262 	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
1263 	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
1264 		th->th_flags |= TH_ECE;
1265 		TCP_STATINC(TCP_STAT_ECN_SHS);
1266 
1267 		/*
1268 		 * draft-ietf-tcpm-ecnsyn-00.txt
1269 		 *
1270 		 * "[...] a TCP node MAY respond to an ECN-setup
1271 		 * SYN packet by setting ECT in the responding
1272 		 * ECN-setup SYN/ACK packet, indicating to routers
1273 		 * that the SYN/ACK packet is ECN-Capable.
1274 		 * This allows a congested router along the path
1275 		 * to mark the packet instead of dropping the
1276 		 * packet as an indication of congestion."
1277 		 *
1278 		 * "[...] There can be a great benefit in setting
1279 		 * an ECN-capable codepoint in SYN/ACK packets [...]
1280 		 * Congestion is  most likely to occur in
1281 		 * the server-to-client direction.  As a result,
1282 		 * setting an ECN-capable codepoint in SYN/ACK
1283 		 * packets can reduce the occurrence of three-second
1284 		 * retransmit timeouts resulting from the drop
1285 		 * of SYN/ACK packets."
1286 		 *
1287 		 * Page 4 and 6, January 2006.
1288 		 */
1289 
1290 		switch (sc->sc_src.sa.sa_family) {
1291 		case AF_INET:
1292 			ip->ip_tos |= IPTOS_ECN_ECT0;
1293 			break;
1294 #ifdef INET6
1295 		case AF_INET6:
1296 			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
1297 			break;
1298 #endif
1299 		}
1300 		TCP_STATINC(TCP_STAT_ECN_ECT);
1301 	}
1302 
1303 
1304 	/*
1305 	 * Compute the packet's checksum.
1306 	 *
1307 	 * Fill in some straggling IP bits.  Note the stack expects
1308 	 * ip_len to be in host order, for convenience.
1309 	 */
1310 	switch (sc->sc_src.sa.sa_family) {
1311 	case AF_INET:
1312 		ip->ip_len = htons(tlen - hlen);
1313 		th->th_sum = 0;
1314 		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1315 		ip->ip_len = htons(tlen);
1316 		ip->ip_ttl = ip_defttl;
1317 		/* XXX tos? */
1318 		break;
1319 #ifdef INET6
1320 	case AF_INET6:
1321 		ip6->ip6_plen = htons(tlen - hlen);
1322 		th->th_sum = 0;
1323 		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1324 		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
1325 		ip6->ip6_vfc |= IPV6_VERSION;
1326 		ip6->ip6_plen = htons(tlen - hlen);
1327 		/* ip6_hlim will be initialized afterwards */
1328 		/* XXX flowlabel? */
1329 		break;
1330 #endif
1331 	}
1332 
1333 	/* XXX use IPsec policy on listening socket, on SYN ACK */
1334 	tp = sc->sc_tp;
1335 
1336 	switch (sc->sc_src.sa.sa_family) {
1337 	case AF_INET:
1338 		error = ip_output(m, sc->sc_ipopts, ro,
1339 		    (ip_mtudisc ? IP_MTUDISC : 0),
1340 		    NULL, tp ? tp->t_inpcb : NULL);
1341 		break;
1342 #ifdef INET6
1343 	case AF_INET6:
1344 		ip6->ip6_hlim = in6pcb_selecthlim(NULL,
1345 		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
1346 		rtcache_unref(rt, ro);
1347 
1348 		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
1349 		    tp ? tp->t_inpcb : NULL, NULL);
1350 		break;
1351 #endif
1352 	default:
1353 		panic("%s: impossible (2)", __func__);
1354 	}
1355 
1356 	return error;
1357 }
1358