xref: /netbsd-src/sys/netinet/tcp_input.c (revision aaf4ece63a859a04e37cf3a7229b5fab0157cc06)
1 /*	$NetBSD: tcp_input.c,v 1.237 2005/11/15 18:39:46 dsl Exp $	*/
2 
3 /*
4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the project nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
34  *
35  * NRL grants permission for redistribution and use in source and binary
36  * forms, with or without modification, of the software and documentation
37  * created at NRL provided that the following conditions are met:
38  *
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgements:
46  *      This product includes software developed by the University of
47  *      California, Berkeley and its contributors.
48  *      This product includes software developed at the Information
49  *      Technology Division, US Naval Research Laboratory.
50  * 4. Neither the name of the NRL nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  *
66  * The views and conclusions contained in the software and documentation
67  * are those of the authors and should not be interpreted as representing
68  * official policies, either expressed or implied, of the US Naval
69  * Research Laboratory (NRL).
70  */
71 
72 /*-
73  * Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
74  * All rights reserved.
75  *
76  * This code is derived from software contributed to The NetBSD Foundation
77  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78  * Facility, NASA Ames Research Center.
79  * This code is derived from software contributed to The NetBSD Foundation
80  * by Charles M. Hannum.
81  *
82  * Redistribution and use in source and binary forms, with or without
83  * modification, are permitted provided that the following conditions
84  * are met:
85  * 1. Redistributions of source code must retain the above copyright
86  *    notice, this list of conditions and the following disclaimer.
87  * 2. Redistributions in binary form must reproduce the above copyright
88  *    notice, this list of conditions and the following disclaimer in the
89  *    documentation and/or other materials provided with the distribution.
90  * 3. All advertising materials mentioning features or use of this software
91  *    must display the following acknowledgement:
92  *	This product includes software developed by the NetBSD
93  *	Foundation, Inc. and its contributors.
94  * 4. Neither the name of The NetBSD Foundation nor the names of its
95  *    contributors may be used to endorse or promote products derived
96  *    from this software without specific prior written permission.
97  *
98  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
99  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
102  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108  * POSSIBILITY OF SUCH DAMAGE.
109  */
110 
111 /*
112  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
113  *	The Regents of the University of California.  All rights reserved.
114  *
115  * Redistribution and use in source and binary forms, with or without
116  * modification, are permitted provided that the following conditions
117  * are met:
118  * 1. Redistributions of source code must retain the above copyright
119  *    notice, this list of conditions and the following disclaimer.
120  * 2. Redistributions in binary form must reproduce the above copyright
121  *    notice, this list of conditions and the following disclaimer in the
122  *    documentation and/or other materials provided with the distribution.
123  * 3. Neither the name of the University nor the names of its contributors
124  *    may be used to endorse or promote products derived from this software
125  *    without specific prior written permission.
126  *
127  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
128  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
129  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
130  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
131  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
132  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
133  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
134  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
135  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
136  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
137  * SUCH DAMAGE.
138  *
139  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
140  */
141 
142 /*
143  *	TODO list for SYN cache stuff:
144  *
145  *	Find room for a "state" field, which is needed to keep a
146  *	compressed state for TIME_WAIT TCBs.  It's been noted already
147  *	that this is fairly important for very high-volume web and
148  *	mail servers, which use a large number of short-lived
149  *	connections.
150  */
151 
152 #include <sys/cdefs.h>
153 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.237 2005/11/15 18:39:46 dsl Exp $");
154 
155 #include "opt_inet.h"
156 #include "opt_ipsec.h"
157 #include "opt_inet_csum.h"
158 #include "opt_tcp_debug.h"
159 
160 #include <sys/param.h>
161 #include <sys/systm.h>
162 #include <sys/malloc.h>
163 #include <sys/mbuf.h>
164 #include <sys/protosw.h>
165 #include <sys/socket.h>
166 #include <sys/socketvar.h>
167 #include <sys/errno.h>
168 #include <sys/syslog.h>
169 #include <sys/pool.h>
170 #include <sys/domain.h>
171 #include <sys/kernel.h>
172 #ifdef TCP_SIGNATURE
173 #include <sys/md5.h>
174 #endif
175 
176 #include <net/if.h>
177 #include <net/route.h>
178 #include <net/if_types.h>
179 
180 #include <netinet/in.h>
181 #include <netinet/in_systm.h>
182 #include <netinet/ip.h>
183 #include <netinet/in_pcb.h>
184 #include <netinet/in_var.h>
185 #include <netinet/ip_var.h>
186 #include <netinet/in_offload.h>
187 
188 #ifdef INET6
189 #ifndef INET
190 #include <netinet/in.h>
191 #endif
192 #include <netinet/ip6.h>
193 #include <netinet6/ip6_var.h>
194 #include <netinet6/in6_pcb.h>
195 #include <netinet6/ip6_var.h>
196 #include <netinet6/in6_var.h>
197 #include <netinet/icmp6.h>
198 #include <netinet6/nd6.h>
199 #endif
200 
201 #ifndef INET6
202 /* always need ip6.h for IP6_EXTHDR_GET */
203 #include <netinet/ip6.h>
204 #endif
205 
206 #include <netinet/tcp.h>
207 #include <netinet/tcp_fsm.h>
208 #include <netinet/tcp_seq.h>
209 #include <netinet/tcp_timer.h>
210 #include <netinet/tcp_var.h>
211 #include <netinet/tcpip.h>
212 #include <netinet/tcp_debug.h>
213 
214 #include <machine/stdarg.h>
215 
216 #ifdef IPSEC
217 #include <netinet6/ipsec.h>
218 #include <netkey/key.h>
219 #endif /*IPSEC*/
220 #ifdef INET6
221 #include "faith.h"
222 #if defined(NFAITH) && NFAITH > 0
223 #include <net/if_faith.h>
224 #endif
225 #endif	/* INET6 */
226 
227 #ifdef FAST_IPSEC
228 #include <netipsec/ipsec.h>
229 #include <netipsec/ipsec_var.h>			/* XXX ipsecstat namespace */
230 #include <netipsec/key.h>
231 #ifdef INET6
232 #include <netipsec/ipsec6.h>
233 #endif
234 #endif	/* FAST_IPSEC*/
235 
/*
 * File-scope state for the TCP input path.  None of these are referenced
 * in the part of the file visible here, so the notes below are inferred
 * from the names only and should be confirmed at the use sites.
 */
236 int	tcprexmtthresh = 3;	/* NOTE(review): presumably the duplicate-ACK retransmit threshold -- confirm */
237 int	tcp_log_refused;	/* NOTE(review): presumably enables tcp4/tcp6_log_refused() -- confirm */
238 
/* NOTE(review): count/last-time pairs, presumably for packets-per-second limiting of RSTs and dropped ACKs -- confirm */
239 static int tcp_rst_ppslim_count = 0;
240 static struct timeval tcp_rst_ppslim_last;
241 static int tcp_ackdrop_ppslim_count = 0;
242 static struct timeval tcp_ackdrop_ppslim_last;
243 
/*
 * 24 days expressed in seconds (24 * 24 * 60 * 60), multiplied by
 * PR_SLOWHZ and evaluated in unsigned arithmetic (note the 24U).
 * NOTE(review): presumably the idle period after which a PAWS timestamp
 * is treated as stale -- confirm at the use site.
 */
244 #define TCP_PAWS_IDLE	(24U * 24 * 60 * 60 * PR_SLOWHZ)
245 
246 /*
 * for modulo comparisons of timestamps: the difference (a)-(b) is
 * computed first and then interpreted as a signed int, so the result
 * reflects the relative order of a and b rather than their absolute
 * magnitude.
 */
247 #define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
248 #define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
249 
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 *
 * ND6_HINT(tp): when built with INET6 and tp is a non-NULL AF_INET6
 * control block whose in6pcb has a cached route, pass that route to
 * nd6_nud_hint().  Without INET6 the macro expands to nothing.
 *
 * The argument is parenthesized at every use so that any pointer
 * expression may be passed; note that it is evaluated more than once.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_in6pcb && (tp)->t_family == AF_INET6 && \
	    (tp)->t_in6pcb->in6p_route.ro_rt) { \
		nd6_nud_hint((tp)->t_in6pcb->in6p_route.ro_rt, NULL, 0); \
	} \
} while (/*CONSTCOND*/ 0)
#else
#define ND6_HINT(tp)
#endif
264 
/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 *
 * tp: TCP control block whose t_flags are updated.
 * th: TCP header of the segment being processed (only th_flags is read).
 *
 * Both arguments are parenthesized at every use so that any pointer
 * expression may be passed; note that tp is evaluated more than once.
 */
#define	TCP_SETUP_ACK(tp, th) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \
		(tp)->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (/*CONSTCOND*/ 0)
279 
/*
 * ICMP_CHECK(tp, th, acked): path-MTU bookkeeping driven by an ACK.
 *
 * tp:    control block whose t_flags and t_pmtud_mss_acked are updated.
 * th:    TCP header; only th_ack is read.
 * acked: byte count compared against tp->t_pmtud_mss_acked.
 *
 * All three arguments are evaluated more than once.
 */
280 #define ICMP_CHECK(tp, th, acked) \
281 do { \
282 	/* \
283 	 * If we had a pending ICMP message that \
284 	 * refers to data that have just been  \
285 	 * acknowledged, disregard the recorded ICMP \
286 	 * message. \
287 	 */ \
288 	if (((tp)->t_flags & TF_PMTUD_PEND) && \
289 	    SEQ_GT((th)->th_ack, (tp)->t_pmtud_th_seq)) \
290 		(tp)->t_flags &= ~TF_PMTUD_PEND; \
291 \
292 	/* \
293 	 * Keep track of the largest chunk of data \
294 	 * acknowledged since last PMTU update \
295 	 */ \
296 	if ((tp)->t_pmtud_mss_acked < (acked)) \
297 		(tp)->t_pmtud_mss_acked = (acked); \
298 } while (/*CONSTCOND*/ 0)
299 
300 /*
301  * Convert TCP protocol fields to host order for easier processing.
 * Applies NTOHL to th_seq and th_ack and NTOHS to th_win and th_urp
 * of the header th; the other header fields are not touched.
302  */
303 #define	TCP_FIELDS_TO_HOST(th)						\
304 do {									\
305 	NTOHL((th)->th_seq);						\
306 	NTOHL((th)->th_ack);						\
307 	NTOHS((th)->th_win);						\
308 	NTOHS((th)->th_urp);						\
309 } while (/*CONSTCOND*/ 0)
310 
311 /*
312  * ... and reverse the above.
 * Applies HTONL/HTONS to the same four fields (th_seq, th_ack, th_win,
 * th_urp).
313  */
314 #define	TCP_FIELDS_TO_NET(th)						\
315 do {									\
316 	HTONL((th)->th_seq);						\
317 	HTONL((th)->th_ack);						\
318 	HTONS((th)->th_win);						\
319 	HTONS((th)->th_urp);						\
320 } while (/*CONSTCOND*/ 0)
321 
/*
 * Optional checksum event counters.  With TCP_CSUM_COUNTERS defined, the
 * per-family counters below are declared (they are defined elsewhere)
 * and TCP_CSUM_COUNTER_INCR(ev) increments ev->ev_count; otherwise
 * TCP_CSUM_COUNTER_INCR(ev) expands to nothing.
 */
322 #ifdef TCP_CSUM_COUNTERS
323 #include <sys/device.h>
324 
325 #if defined(INET)
326 extern struct evcnt tcp_hwcsum_ok;
327 extern struct evcnt tcp_hwcsum_bad;
328 extern struct evcnt tcp_hwcsum_data;
329 extern struct evcnt tcp_swcsum;
330 #endif /* defined(INET) */
331 #if defined(INET6)
332 extern struct evcnt tcp6_hwcsum_ok;
333 extern struct evcnt tcp6_hwcsum_bad;
334 extern struct evcnt tcp6_hwcsum_data;
335 extern struct evcnt tcp6_swcsum;
336 #endif /* defined(INET6) */
337 
338 #define	TCP_CSUM_COUNTER_INCR(ev)	(ev)->ev_count++
339 
340 #else
341 
342 #define	TCP_CSUM_COUNTER_INCR(ev)	/* nothing */
343 
344 #endif /* TCP_CSUM_COUNTERS */
345 
/*
 * Optional reassembly event counters, bumped from tcp_reass().  With
 * TCP_REASS_COUNTERS defined, the counters below are declared (they are
 * defined elsewhere) and TCP_REASS_COUNTER_INCR(ev) increments
 * ev->ev_count; otherwise the macro expands to nothing.
 */
346 #ifdef TCP_REASS_COUNTERS
347 #include <sys/device.h>
348 
349 extern struct evcnt tcp_reass_;
350 extern struct evcnt tcp_reass_empty;
351 extern struct evcnt tcp_reass_iteration[8];	/* indexed by loop count in tcp_reass(); slot 0 is used for counts above 7 */
352 extern struct evcnt tcp_reass_prependfirst;
353 extern struct evcnt tcp_reass_prepend;
354 extern struct evcnt tcp_reass_insert;
355 extern struct evcnt tcp_reass_inserttail;
356 extern struct evcnt tcp_reass_append;
357 extern struct evcnt tcp_reass_appendtail;
358 extern struct evcnt tcp_reass_overlaptail;
359 extern struct evcnt tcp_reass_overlapfront;
360 extern struct evcnt tcp_reass_segdup;
361 extern struct evcnt tcp_reass_fragdup;
362 
363 #define	TCP_REASS_COUNTER_INCR(ev)	(ev)->ev_count++
364 
365 #else
366 
367 #define	TCP_REASS_COUNTER_INCR(ev)	/* nothing */
368 
369 #endif /* TCP_REASS_COUNTERS */
370 
/* Per-family helpers that log a connection attempt; defined later in this file. */
371 #ifdef INET
372 static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
373 #endif
374 #ifdef INET6
375 static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
376 #endif
377 
/*
 * TRAVERSE(x): follow m_next links until x refers to the last element of
 * its chain.  x is assigned to, so it must be a modifiable lvalue, and it
 * must not be NULL on entry.
 */
378 #define	TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next
379 
/* Pool of struct ipqent entries, used by tcpipqent_alloc() and tcpipqent_free(). */
380 POOL_INIT(tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl", NULL);
381 
382 struct ipqent *
383 tcpipqent_alloc()
384 {
385 	struct ipqent *ipqe;
386 	int s;
387 
388 	s = splvm();
389 	ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
390 	splx(s);
391 
392 	return ipqe;
393 }
394 
395 void
396 tcpipqent_free(struct ipqent *ipqe)
397 {
398 	int s;
399 
400 	s = splvm();
401 	pool_put(&tcpipqent_pool, ipqe);
402 	splx(s);
403 }
404 
/*
 * Queue an out-of-order TCP segment on tp's reassembly queue and hand
 * in-sequence data to the socket.
 *
 * tp:   control block owning the sequence-ordered queue (segq), the
 *       recency-ordered queue (timeq) and the entry count (t_segqlen).
 * th:   header of the received segment; th_seq and th_flags are copied
 *       into locals up front.  Passing th == 0 skips queueing entirely
 *       and only runs the "present" step.
 * m:    mbuf chain holding the segment data.  When th != 0 it is always
 *       consumed: stored in a queue entry, concatenated onto an existing
 *       entry's chain, or freed (duplicate data / no queue entry).
 * tlen: pointer to the segment length; it is only read.
 *
 * The segment is merged with any queue entries it touches or overlaps
 * (overlapping bytes are trimmed with m_adj()), so that it ends up in a
 * single entry; a queue entry freed by a merge is recycled in tiqe
 * instead of allocating a new one.  Whatever entry finally holds the
 * data is placed at the head of timeq.
 *
 * In the "present" step at most one entry -- the head of segq, and only
 * if it starts exactly at tp->rcv_nxt -- is removed, rcv_nxt is advanced
 * over it, and its data is appended to so->so_rcv (or freed if the
 * socket has SS_CANTRCVMORE set).
 *
 * Returns the TH_FIN bit of the entry that was presented, or 0 when
 * nothing was presented (including the duplicate-segment and
 * allocation-failure exits).
 *
 * NOTE(review): so stays NULL if tp has neither an inpcb nor (with
 * INET6) an in6pcb, and it is dereferenced unconditionally in the
 * "present" step -- presumably callers guarantee one is set; confirm.
 */
405 int
406 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
407 {
408 	struct ipqent *p, *q, *nq, *tiqe = NULL;
409 	struct socket *so = NULL;
410 	int pkt_flags;
411 	tcp_seq pkt_seq;
412 	unsigned pkt_len;
413 	u_long rcvpartdupbyte = 0;
414 	u_long rcvoobyte;
415 #ifdef TCP_REASS_COUNTERS
416 	u_int count = 0;
417 #endif
418 
419 	if (tp->t_inpcb)
420 		so = tp->t_inpcb->inp_socket;
421 #ifdef INET6
422 	else if (tp->t_in6pcb)
423 		so = tp->t_in6pcb->in6p_socket;
424 #endif
425 
426 	TCP_REASS_LOCK_CHECK(tp);
427 
428 	/*
429 	 * Call with th==0 after becoming established to
430 	 * force pre-ESTABLISHED data up to user socket.
431 	 */
432 	if (th == 0)
433 		goto present;
434 
	/* Bytes to be accounted as out-of-order; reduced below by any trimmed overlap. */
435 	rcvoobyte = *tlen;
436 	/*
437 	 * Copy these to local variables because the tcpiphdr
438 	 * gets munged while we are collapsing mbufs.
439 	 */
440 	pkt_seq = th->th_seq;
441 	pkt_len = *tlen;
442 	pkt_flags = th->th_flags;
443 
444 	TCP_REASS_COUNTER_INCR(&tcp_reass_);
445 
446 	if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
447 		/*
448 		 * When we miss a packet, the vast majority of time we get
449 		 * packets that follow it in order.  So optimize for that.
450 		 */
451 		if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
452 			p->ipqe_len += pkt_len;
453 			p->ipqe_flags |= pkt_flags;
454 			m_cat(p->ipre_mlast, m);
455 			TRAVERSE(p->ipre_mlast);
456 			m = NULL;
457 			tiqe = p;
			/* p stays on segq; it is only re-queued at the head of timeq below. */
458 			TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
459 			TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
460 			goto skip_replacement;
461 		}
462 		/*
463 		 * While we're here, if the pkt is completely beyond
464 		 * anything we have, just insert it at the tail.
465 		 */
466 		if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
467 			TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
			/* p is the current tail, so the new entry is inserted after it. */
468 			goto insert_it;
469 		}
470 	}
471 
472 	q = TAILQ_FIRST(&tp->segq);
473 
474 	if (q != NULL) {
475 		/*
476 		 * If this segment immediately precedes the first out-of-order
477 		 * block, simply slap the segment in front of it and (mostly)
478 		 * skip the complicated logic.
479 		 */
480 		if (pkt_seq + pkt_len == q->ipqe_seq) {
481 			q->ipqe_seq = pkt_seq;
482 			q->ipqe_len += pkt_len;
483 			q->ipqe_flags |= pkt_flags;
484 			m_cat(m, q->ipqe_m);
485 			q->ipqe_m = m;
486 			q->ipre_mlast = m; /* last mbuf may have changed */
487 			TRAVERSE(q->ipre_mlast);
488 			tiqe = q;
489 			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
490 			TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
491 			goto skip_replacement;
492 		}
493 	} else {
494 		TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
495 	}
496 
497 	/*
498 	 * Find a segment which begins after this one does.
	 * nq is fetched before q is examined because q may be unlinked
	 * (at free_ipqe) during the iteration.
499 	 */
500 	for (p = NULL; q != NULL; q = nq) {
501 		nq = TAILQ_NEXT(q, ipqe_q);
502 #ifdef TCP_REASS_COUNTERS
503 		count++;
504 #endif
505 		/*
506 		 * If the received segment is just right after this
507 		 * fragment, merge the two together and then check
508 		 * for further overlaps.
509 		 */
510 		if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
511 #ifdef TCPREASS_DEBUG
512 			printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
513 			       tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
514 			       q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
515 #endif
516 			pkt_len += q->ipqe_len;
517 			pkt_flags |= q->ipqe_flags;
518 			pkt_seq = q->ipqe_seq;
519 			m_cat(q->ipre_mlast, m);
520 			TRAVERSE(q->ipre_mlast);
			/* From here on the "received segment" is q's chain plus the new data. */
521 			m = q->ipqe_m;
522 			TCP_REASS_COUNTER_INCR(&tcp_reass_append);
523 			goto free_ipqe;
524 		}
525 		/*
526 		 * If the received segment is completely past this
527 		 * fragment, we need to go the next fragment.
528 		 */
529 		if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
530 			p = q;
531 			continue;
532 		}
533 		/*
534 		 * If the fragment is past the received segment,
535 		 * it (or any following) can't be concatenated.
536 		 */
537 		if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
538 			TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
539 			break;
540 		}
541 
542 		/*
543 		 * We've received all the data in this segment before.
544 		 * mark it as a duplicate and return.
545 		 */
546 		if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
547 		    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
548 			tcpstat.tcps_rcvduppack++;
549 			tcpstat.tcps_rcvdupbyte += pkt_len;
550 			tcp_new_dsack(tp, pkt_seq, pkt_len);
551 			m_freem(m);
552 			if (tiqe != NULL) {
553 				tcpipqent_free(tiqe);
554 			}
555 			TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
556 			return (0);
557 		}
558 		/*
559 		 * Received segment completely overlaps this fragment
560 		 * so we drop the fragment (this keeps the temporal
561 		 * ordering of segments correct).
562 		 */
563 		if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
564 		    SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
565 			rcvpartdupbyte += q->ipqe_len;
566 			m_freem(q->ipqe_m);
567 			TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
568 			goto free_ipqe;
569 		}
570 		/*
571 		 * RX'ed segment extends past the end of the
572 		 * fragment.  Drop the overlapping bytes.  Then
573 		 * merge the fragment and segment then treat as
574 		 * a longer received packet.
575 		 */
576 		if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
577 		    SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq))  {
578 			int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
579 #ifdef TCPREASS_DEBUG
580 			printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
581 			       tp, overlap,
582 			       pkt_seq, pkt_seq + pkt_len, pkt_len);
583 #endif
			/* Positive count: trim from the front of the new data. */
584 			m_adj(m, overlap);
585 			rcvpartdupbyte += overlap;
586 			m_cat(q->ipre_mlast, m);
587 			TRAVERSE(q->ipre_mlast);
588 			m = q->ipqe_m;
589 			pkt_seq = q->ipqe_seq;
590 			pkt_len += q->ipqe_len - overlap;
591 			rcvoobyte -= overlap;
592 			TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
593 			goto free_ipqe;
594 		}
595 		/*
596 		 * RX'ed segment extends past the front of the
597 		 * fragment.  Drop the overlapping bytes on the
598 		 * received packet.  The packet will then be
599 		 * concatenated with this fragment a bit later.
600 		 */
601 		if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
602 		    SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len))  {
603 			int overlap = pkt_seq + pkt_len - q->ipqe_seq;
604 #ifdef TCPREASS_DEBUG
605 			printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
606 			       tp, overlap,
607 			       pkt_seq, pkt_seq + pkt_len, pkt_len);
608 #endif
			/* Negative count: trim from the end of the new data. */
609 			m_adj(m, -overlap);
610 			pkt_len -= overlap;
611 			rcvpartdupbyte += overlap;
612 			TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
613 			rcvoobyte -= overlap;
614 		}
615 		/*
616 		 * If the received segment immediately precedes this
617 		 * fragment then tack the fragment onto this segment
618 		 * and reinsert the data.
619 		 */
620 		if (q->ipqe_seq == pkt_seq + pkt_len) {
621 #ifdef TCPREASS_DEBUG
622 			printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
623 			       tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
624 			       pkt_seq, pkt_seq + pkt_len, pkt_len);
625 #endif
626 			pkt_len += q->ipqe_len;
627 			pkt_flags |= q->ipqe_flags;
628 			m_cat(m, q->ipqe_m);
629 			TAILQ_REMOVE(&tp->segq, q, ipqe_q);
630 			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
631 			tp->t_segqlen--;
632 			KASSERT(tp->t_segqlen >= 0);
633 			KASSERT(tp->t_segqlen != 0 ||
634 			    (TAILQ_EMPTY(&tp->segq) &&
635 			    TAILQ_EMPTY(&tp->timeq)));
636 			if (tiqe == NULL) {
637 				tiqe = q;
638 			} else {
639 				tcpipqent_free(q);
640 			}
641 			TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
642 			break;
643 		}
644 		/*
645 		 * If the fragment is before the segment, remember it.
646 		 * When this loop is terminated, p will contain the
647 		 * pointer to fragment that is right before the received
648 		 * segment.
649 		 */
650 		if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
651 			p = q;
652 
653 		continue;
654 
655 		/*
656 		 * This is a common operation.  It also will allow
657 		 * to save doing a malloc/free in most instances.
		 * q's data has been merged into m or freed above; unlink q
		 * from both queues and keep the first such entry in tiqe for
		 * reuse, freeing any further ones.
658 		 */
659 	  free_ipqe:
660 		TAILQ_REMOVE(&tp->segq, q, ipqe_q);
661 		TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
662 		tp->t_segqlen--;
663 		KASSERT(tp->t_segqlen >= 0);
664 		KASSERT(tp->t_segqlen != 0 ||
665 		    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
666 		if (tiqe == NULL) {
667 			tiqe = q;
668 		} else {
669 			tcpipqent_free(q);
670 		}
671 	}
672 
673 #ifdef TCP_REASS_COUNTERS
	/* Histogram of loop iterations: slots 1..7 by count, slot 0 for more than 7. */
674 	if (count > 7)
675 		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
676 	else if (count > 0)
677 		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
678 #endif
679 
680     insert_it:
681 
682 	/*
683 	 * Allocate a new queue entry since the received segment did not
684 	 * collapse onto any other out-of-order block; thus we are allocating
685 	 * a new block.  If it had collapsed, tiqe would not be NULL and
686 	 * we would be reusing it.
687 	 * XXX If we can't, just drop the packet.  XXX
688 	 */
689 	if (tiqe == NULL) {
690 		tiqe = tcpipqent_alloc();
691 		if (tiqe == NULL) {
692 			tcpstat.tcps_rcvmemdrop++;
693 			m_freem(m);
694 			return (0);
695 		}
696 	}
697 
698 	/*
699 	 * Update the counters.
700 	 */
701 	tcpstat.tcps_rcvoopack++;
702 	tcpstat.tcps_rcvoobyte += rcvoobyte;
703 	if (rcvpartdupbyte) {
704 	    tcpstat.tcps_rcvpartduppack++;
705 	    tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
706 	}
707 
708 	/*
709 	 * Insert the new fragment queue entry into both queues.
	 * ipre_mlast is set to the head of the chain here, not its tail;
	 * the paths that append to an entry call TRAVERSE() on it after
	 * m_cat().
710 	 */
711 	tiqe->ipqe_m = m;
712 	tiqe->ipre_mlast = m;
713 	tiqe->ipqe_seq = pkt_seq;
714 	tiqe->ipqe_len = pkt_len;
715 	tiqe->ipqe_flags = pkt_flags;
716 	if (p == NULL) {
717 		TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
718 #ifdef TCPREASS_DEBUG
719 		if (tiqe->ipqe_seq != tp->rcv_nxt)
720 			printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
721 			       tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
722 #endif
723 	} else {
724 		TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
725 #ifdef TCPREASS_DEBUG
726 		printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
727 		       tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
728 		       p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
729 #endif
730 	}
731 	tp->t_segqlen++;
732 
733 skip_replacement:
734 
	/* The entry that now holds the new data goes to the head of timeq. */
735 	TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
736 
737 present:
738 	/*
739 	 * Present data to user, advancing rcv_nxt through
740 	 * completed sequence space.
741 	 */
742 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
743 		return (0);
744 	q = TAILQ_FIRST(&tp->segq);
745 	if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
746 		return (0);
	/* In SYN_RECEIVED only a zero-length entry is passed up. */
747 	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
748 		return (0);
749 
750 	tp->rcv_nxt += q->ipqe_len;
	/* pkt_flags is reused as the return value: just the FIN bit of this entry. */
751 	pkt_flags = q->ipqe_flags & TH_FIN;
752 	ND6_HINT(tp);
753 
754 	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
755 	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
756 	tp->t_segqlen--;
757 	KASSERT(tp->t_segqlen >= 0);
758 	KASSERT(tp->t_segqlen != 0 ||
759 	    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
760 	if (so->so_state & SS_CANTRCVMORE)
761 		m_freem(q->ipqe_m);
762 	else
763 		sbappendstream(&so->so_rcv, q->ipqe_m);
764 	tcpipqent_free(q);
765 	sorwakeup(so);
766 	return (pkt_flags);
767 }
768 
#ifdef INET6
/*
 * IPv6 entry point for TCP input.
 *
 * A packet flagged M_ANYCAST6 is not processed as TCP: an ICMPv6
 * destination-unreachable (address) error is generated for it, pointing
 * at the offset of ip6_dst within the IPv6 header.  Any other packet is
 * handed to tcp_input() with the offset *offp and proto.
 *
 * Always returns IPPROTO_DONE.  *mp is only read.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6;

	if ((m->m_flags & M_ANYCAST6) == 0) {
		tcp_input(m, *offp, proto);
		return IPPROTO_DONE;
	}

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_len < sizeof(struct ip6_hdr)) {
		/* Make the IPv6 header contiguous before looking at it. */
		m = m_pullup(m, sizeof(struct ip6_hdr));
		if (m == NULL) {
			tcpstat.tcps_rcvshort++;
			return IPPROTO_DONE;
		}
	}
	ip6 = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
	return IPPROTO_DONE;
}
#endif
797 
#ifdef INET
/*
 * Log, at LOG_INFO, a connection attempt described by an IPv4 header and
 * a TCP header.  When ip is NULL both addresses are reported as
 * "(unknown)".  The ports are taken from th and converted with ntohs().
 */
static void
tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
{
	/* Room for a dotted quad plus the terminating NUL (16 bytes). */
	char src[sizeof "255.255.255.255"];
	char dst[sizeof "255.255.255.255"];

	if (ip == NULL) {
		strlcpy(src, "(unknown)", sizeof(src));
		strlcpy(dst, "(unknown)", sizeof(dst));
	} else {
		/* Copy each inet_ntoa() result out before the next call. */
		strlcpy(src, inet_ntoa(ip->ip_src), sizeof(src));
		strlcpy(dst, inet_ntoa(ip->ip_dst), sizeof(dst));
	}

	log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d\n",
	    dst, ntohs(th->th_dport), src, ntohs(th->th_sport));
}
#endif
819 
#ifdef INET6
/*
 * Log, at LOG_INFO, a connection attempt described by an IPv6 header and
 * a TCP header.  When ip6 is NULL both addresses are reported as
 * "(unknown v6)".  The ports are taken from th and converted with
 * ntohs().
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char src[INET6_ADDRSTRLEN];
	char dst[INET6_ADDRSTRLEN];

	if (ip6 == NULL) {
		strlcpy(src, "(unknown v6)", sizeof(src));
		strlcpy(dst, "(unknown v6)", sizeof(dst));
	} else {
		/* Copy each ip6_sprintf() result out before the next call. */
		strlcpy(src, ip6_sprintf(&ip6->ip6_src), sizeof(src));
		strlcpy(dst, ip6_sprintf(&ip6->ip6_dst), sizeof(dst));
	}

	log(LOG_INFO, "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    dst, ntohs(th->th_dport), src, ntohs(th->th_sport));
}
#endif
841 
842 /*
843  * Checksum extended TCP header and data.
844  */
845 int
846 tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th, int toff,
847     int off, int tlen)
848 {
849 
850 	/*
851 	 * XXX it's better to record and check if this mbuf is
852 	 * already checked.
853 	 */
854 
855 	switch (af) {
856 #ifdef INET
857 	case AF_INET:
858 		switch (m->m_pkthdr.csum_flags &
859 			((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
860 			 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
861 		case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
862 			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
863 			goto badcsum;
864 
865 		case M_CSUM_TCPv4|M_CSUM_DATA: {
866 			u_int32_t hw_csum = m->m_pkthdr.csum_data;
867 
868 			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
869 			if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
870 				const struct ip *ip =
871 				    mtod(m, const struct ip *);
872 
873 				hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
874 				    ip->ip_dst.s_addr,
875 				    htons(hw_csum + tlen + off + IPPROTO_TCP));
876 			}
877 			if ((hw_csum ^ 0xffff) != 0)
878 				goto badcsum;
879 			break;
880 		}
881 
882 		case M_CSUM_TCPv4:
883 			/* Checksum was okay. */
884 			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
885 			break;
886 
887 		default:
888 			/*
889 			 * Must compute it ourselves.  Maybe skip checksum
890 			 * on loopback interfaces.
891 			 */
892 			if (__predict_true(!(m->m_pkthdr.rcvif->if_flags &
893 					     IFF_LOOPBACK) ||
894 					   tcp_do_loopback_cksum)) {
895 				TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
896 				if (in4_cksum(m, IPPROTO_TCP, toff,
897 					      tlen + off) != 0)
898 					goto badcsum;
899 			}
900 			break;
901 		}
902 		break;
903 #endif /* INET4 */
904 
905 #ifdef INET6
906 	case AF_INET6:
907 		switch (m->m_pkthdr.csum_flags &
908 			((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv6) |
909 			 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
910 		case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD:
911 			TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
912 			goto badcsum;
913 
914 #if 0 /* notyet */
915 		case M_CSUM_TCPv6|M_CSUM_DATA:
916 #endif
917 
918 		case M_CSUM_TCPv6:
919 			/* Checksum was okay. */
920 			TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
921 			break;
922 
923 		default:
924 			/*
925 			 * Must compute it ourselves.  Maybe skip checksum
926 			 * on loopback interfaces.
927 			 */
928 			if (__predict_true((m->m_flags & M_LOOP) == 0 ||
929 			    tcp_do_loopback_cksum)) {
930 				TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
931 				if (in6_cksum(m, IPPROTO_TCP, toff,
932 				    tlen + off) != 0)
933 					goto badcsum;
934 			}
935 		}
936 		break;
937 #endif /* INET6 */
938 	}
939 
940 	return 0;
941 
942 badcsum:
943 	tcpstat.tcps_rcvbadsum++;
944 	return -1;
945 }
946 
947 /*
948  * TCP input routine, follows pages 65-76 of RFC 793 very closely.
949  */
950 void
951 tcp_input(struct mbuf *m, ...)
952 {
953 	struct tcphdr *th;
954 	struct ip *ip;
955 	struct inpcb *inp;
956 #ifdef INET6
957 	struct ip6_hdr *ip6;
958 	struct in6pcb *in6p;
959 #endif
960 	u_int8_t *optp = NULL;
961 	int optlen = 0;
962 	int len, tlen, toff, hdroptlen = 0;
963 	struct tcpcb *tp = 0;
964 	int tiflags;
965 	struct socket *so = NULL;
966 	int todrop, dupseg, acked, ourfinisacked, needoutput = 0;
967 #ifdef TCP_DEBUG
968 	short ostate = 0;
969 #endif
970 	int iss = 0;
971 	u_long tiwin;
972 	struct tcp_opt_info opti;
973 	int off, iphlen;
974 	va_list ap;
975 	int af;		/* af on the wire */
976 	struct mbuf *tcp_saveti = NULL;
977 	uint32_t ts_rtt;
978 
979 	MCLAIM(m, &tcp_rx_mowner);
980 	va_start(ap, m);
981 	toff = va_arg(ap, int);
982 	(void)va_arg(ap, int);		/* ignore value, advance ap */
983 	va_end(ap);
984 
985 	tcpstat.tcps_rcvtotal++;
986 
987 	bzero(&opti, sizeof(opti));
988 	opti.ts_present = 0;
989 	opti.maxseg = 0;
990 
991 	/*
992 	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
993 	 *
994 	 * TCP is, by definition, unicast, so we reject all
995 	 * multicast outright.
996 	 *
997 	 * Note, there are additional src/dst address checks in
998 	 * the AF-specific code below.
999 	 */
1000 	if (m->m_flags & (M_BCAST|M_MCAST)) {
1001 		/* XXX stat */
1002 		goto drop;
1003 	}
1004 #ifdef INET6
1005 	if (m->m_flags & M_ANYCAST6) {
1006 		/* XXX stat */
1007 		goto drop;
1008 	}
1009 #endif
1010 
1011 	/*
1012 	 * Get IP and TCP header.
1013 	 * Note: IP leaves IP header in first mbuf.
1014 	 */
1015 	ip = mtod(m, struct ip *);
1016 #ifdef INET6
1017 	ip6 = NULL;
1018 #endif
1019 	switch (ip->ip_v) {
1020 #ifdef INET
1021 	case 4:
1022 		af = AF_INET;
1023 		iphlen = sizeof(struct ip);
1024 		ip = mtod(m, struct ip *);
1025 		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1026 			sizeof(struct tcphdr));
1027 		if (th == NULL) {
1028 			tcpstat.tcps_rcvshort++;
1029 			return;
1030 		}
1031 		/* We do the checksum after PCB lookup... */
1032 		len = ntohs(ip->ip_len);
1033 		tlen = len - toff;
1034 		break;
1035 #endif
1036 #ifdef INET6
1037 	case 6:
1038 		ip = NULL;
1039 		iphlen = sizeof(struct ip6_hdr);
1040 		af = AF_INET6;
1041 		ip6 = mtod(m, struct ip6_hdr *);
1042 		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
1043 			sizeof(struct tcphdr));
1044 		if (th == NULL) {
1045 			tcpstat.tcps_rcvshort++;
1046 			return;
1047 		}
1048 
1049 		/* Be proactive about malicious use of IPv4 mapped address */
1050 		if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
1051 		    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
1052 			/* XXX stat */
1053 			goto drop;
1054 		}
1055 
1056 		/*
1057 		 * Be proactive about an unspecified IPv6 source address.
1058 		 * As we use all-zero to indicate an unbound/unconnected pcb,
1059 		 * an unspecified IPv6 address can be used to confuse us.
1060 		 *
1061 		 * Note that packets with an unspecified IPv6 destination are
1062 		 * already dropped in ip6_input.
1063 		 */
1064 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1065 			/* XXX stat */
1066 			goto drop;
1067 		}
1068 
1069 		/*
1070 		 * Make sure destination address is not multicast.
1071 		 * Source address checked in ip6_input().
1072 		 */
1073 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
1074 			/* XXX stat */
1075 			goto drop;
1076 		}
1077 
1078 		/* We do the checksum after PCB lookup... */
1079 		len = m->m_pkthdr.len;
1080 		tlen = len - toff;
1081 		break;
1082 #endif
1083 	default:
1084 		m_freem(m);
1085 		return;
1086 	}
1087 
1088 	KASSERT(TCP_HDR_ALIGNED_P(th));
1089 
1090 	/*
1091 	 * Check that TCP offset makes sense,
1092 	 * pull out TCP options and adjust length.		XXX
1093 	 */
1094 	off = th->th_off << 2;
1095 	if (off < sizeof (struct tcphdr) || off > tlen) {
1096 		tcpstat.tcps_rcvbadoff++;
1097 		goto drop;
1098 	}
1099 	tlen -= off;
1100 
1101 	/*
1102 	 * tcp_input() has been modified to use tlen to mean the TCP data
1103 	 * length throughout the function.  Other functions can use
1104 	 * m->m_pkthdr.len as the basis for calculating the TCP data length.
1105 	 * rja
1106 	 */
1107 
1108 	if (off > sizeof (struct tcphdr)) {
1109 		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
1110 		if (th == NULL) {
1111 			tcpstat.tcps_rcvshort++;
1112 			return;
1113 		}
1114 		/*
1115 		 * NOTE: ip/ip6 will not be affected by m_pulldown()
1116 		 * (as they're before toff) and we don't need to update those.
1117 		 */
1118 		KASSERT(TCP_HDR_ALIGNED_P(th));
1119 		optlen = off - sizeof (struct tcphdr);
1120 		optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
1121 		/*
1122 		 * Do quick retrieval of timestamp options ("options
1123 		 * prediction?").  If timestamp is the only option and it's
1124 		 * formatted as recommended in RFC 1323 appendix A, we
1125 		 * quickly get the values now and not bother calling
1126 		 * tcp_dooptions(), etc.
1127 		 */
1128 		if ((optlen == TCPOLEN_TSTAMP_APPA ||
1129 		     (optlen > TCPOLEN_TSTAMP_APPA &&
1130 			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1131 		     *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1132 		     (th->th_flags & TH_SYN) == 0) {
1133 			opti.ts_present = 1;
1134 			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
1135 			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
1136 			optp = NULL;	/* we've parsed the options */
1137 		}
1138 	}
1139 	tiflags = th->th_flags;
1140 
1141 	/*
1142 	 * Locate pcb for segment.
1143 	 */
1144 findpcb:
1145 	inp = NULL;
1146 #ifdef INET6
1147 	in6p = NULL;
1148 #endif
1149 	switch (af) {
1150 #ifdef INET
1151 	case AF_INET:
1152 		inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
1153 		    ip->ip_dst, th->th_dport);
1154 		if (inp == 0) {
1155 			++tcpstat.tcps_pcbhashmiss;
1156 			inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
1157 		}
1158 #ifdef INET6
1159 		if (inp == 0) {
1160 			struct in6_addr s, d;
1161 
1162 			/* mapped addr case */
1163 			bzero(&s, sizeof(s));
1164 			s.s6_addr16[5] = htons(0xffff);
1165 			bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
1166 			bzero(&d, sizeof(d));
1167 			d.s6_addr16[5] = htons(0xffff);
1168 			bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
1169 			in6p = in6_pcblookup_connect(&tcbtable, &s,
1170 			    th->th_sport, &d, th->th_dport, 0);
1171 			if (in6p == 0) {
1172 				++tcpstat.tcps_pcbhashmiss;
1173 				in6p = in6_pcblookup_bind(&tcbtable, &d,
1174 				    th->th_dport, 0);
1175 			}
1176 		}
1177 #endif
1178 #ifndef INET6
1179 		if (inp == 0)
1180 #else
1181 		if (inp == 0 && in6p == 0)
1182 #endif
1183 		{
1184 			++tcpstat.tcps_noport;
1185 			if (tcp_log_refused &&
1186 			    (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1187 				tcp4_log_refused(ip, th);
1188 			}
1189 			TCP_FIELDS_TO_HOST(th);
1190 			goto dropwithreset_ratelim;
1191 		}
1192 #if defined(IPSEC) || defined(FAST_IPSEC)
1193 		if (inp && (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 &&
1194 		    ipsec4_in_reject(m, inp)) {
1195 			ipsecstat.in_polvio++;
1196 			goto drop;
1197 		}
1198 #ifdef INET6
1199 		else if (in6p &&
1200 		    (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
1201 		    ipsec4_in_reject_so(m, in6p->in6p_socket)) {
1202 			ipsecstat.in_polvio++;
1203 			goto drop;
1204 		}
1205 #endif
1206 #endif /*IPSEC*/
1207 		break;
1208 #endif /*INET*/
1209 #ifdef INET6
1210 	case AF_INET6:
1211 	    {
1212 		int faith;
1213 
1214 #if defined(NFAITH) && NFAITH > 0
1215 		faith = faithprefix(&ip6->ip6_dst);
1216 #else
1217 		faith = 0;
1218 #endif
1219 		in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
1220 		    th->th_sport, &ip6->ip6_dst, th->th_dport, faith);
1221 		if (in6p == NULL) {
1222 			++tcpstat.tcps_pcbhashmiss;
1223 			in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
1224 				th->th_dport, faith);
1225 		}
1226 		if (in6p == NULL) {
1227 			++tcpstat.tcps_noport;
1228 			if (tcp_log_refused &&
1229 			    (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1230 				tcp6_log_refused(ip6, th);
1231 			}
1232 			TCP_FIELDS_TO_HOST(th);
1233 			goto dropwithreset_ratelim;
1234 		}
1235 #if defined(IPSEC) || defined(FAST_IPSEC)
1236 		if ((in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
1237 		    ipsec6_in_reject(m, in6p)) {
1238 			ipsec6stat.in_polvio++;
1239 			goto drop;
1240 		}
1241 #endif /*IPSEC*/
1242 		break;
1243 	    }
1244 #endif
1245 	}
1246 
1247 	/*
1248 	 * If the state is CLOSED (i.e., TCB does not exist) then
1249 	 * all data in the incoming segment is discarded.
1250 	 * If the TCB exists but is in CLOSED state, it is embryonic,
1251 	 * but should either do a listen or a connect soon.
1252 	 */
1253 	tp = NULL;
1254 	so = NULL;
1255 	if (inp) {
1256 		tp = intotcpcb(inp);
1257 		so = inp->inp_socket;
1258 	}
1259 #ifdef INET6
1260 	else if (in6p) {
1261 		tp = in6totcpcb(in6p);
1262 		so = in6p->in6p_socket;
1263 	}
1264 #endif
1265 	if (tp == 0) {
1266 		TCP_FIELDS_TO_HOST(th);
1267 		goto dropwithreset_ratelim;
1268 	}
1269 	if (tp->t_state == TCPS_CLOSED)
1270 		goto drop;
1271 
1272 	/*
1273 	 * Checksum extended TCP header and data.
1274 	 */
1275 	if (tcp_input_checksum(af, m, th, toff, off, tlen))
1276 		goto badcsum;
1277 
1278 	TCP_FIELDS_TO_HOST(th);
1279 
1280 	/* Unscale the window into a 32-bit value. */
1281 	if ((tiflags & TH_SYN) == 0)
1282 		tiwin = th->th_win << tp->snd_scale;
1283 	else
1284 		tiwin = th->th_win;
1285 
1286 #ifdef INET6
1287 	/* save packet options if user wanted */
1288 	if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
1289 		if (in6p->in6p_options) {
1290 			m_freem(in6p->in6p_options);
1291 			in6p->in6p_options = 0;
1292 		}
1293 		ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
1294 	}
1295 #endif
1296 
1297 	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1298 		union syn_cache_sa src;
1299 		union syn_cache_sa dst;
1300 
1301 		bzero(&src, sizeof(src));
1302 		bzero(&dst, sizeof(dst));
1303 		switch (af) {
1304 #ifdef INET
1305 		case AF_INET:
1306 			src.sin.sin_len = sizeof(struct sockaddr_in);
1307 			src.sin.sin_family = AF_INET;
1308 			src.sin.sin_addr = ip->ip_src;
1309 			src.sin.sin_port = th->th_sport;
1310 
1311 			dst.sin.sin_len = sizeof(struct sockaddr_in);
1312 			dst.sin.sin_family = AF_INET;
1313 			dst.sin.sin_addr = ip->ip_dst;
1314 			dst.sin.sin_port = th->th_dport;
1315 			break;
1316 #endif
1317 #ifdef INET6
1318 		case AF_INET6:
1319 			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
1320 			src.sin6.sin6_family = AF_INET6;
1321 			src.sin6.sin6_addr = ip6->ip6_src;
1322 			src.sin6.sin6_port = th->th_sport;
1323 
1324 			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
1325 			dst.sin6.sin6_family = AF_INET6;
1326 			dst.sin6.sin6_addr = ip6->ip6_dst;
1327 			dst.sin6.sin6_port = th->th_dport;
1328 			break;
1329 #endif /* INET6 */
1330 		default:
1331 			goto badsyn;	/*sanity*/
1332 		}
1333 
1334 		if (so->so_options & SO_DEBUG) {
1335 #ifdef TCP_DEBUG
1336 			ostate = tp->t_state;
1337 #endif
1338 
1339 			tcp_saveti = NULL;
1340 			if (iphlen + sizeof(struct tcphdr) > MHLEN)
1341 				goto nosave;
1342 
1343 			if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
1344 				tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
1345 				if (!tcp_saveti)
1346 					goto nosave;
1347 			} else {
1348 				MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
1349 				if (!tcp_saveti)
1350 					goto nosave;
1351 				MCLAIM(m, &tcp_mowner);
1352 				tcp_saveti->m_len = iphlen;
1353 				m_copydata(m, 0, iphlen,
1354 				    mtod(tcp_saveti, caddr_t));
1355 			}
1356 
1357 			if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
1358 				m_freem(tcp_saveti);
1359 				tcp_saveti = NULL;
1360 			} else {
1361 				tcp_saveti->m_len += sizeof(struct tcphdr);
1362 				bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen,
1363 				    sizeof(struct tcphdr));
1364 			}
1365 	nosave:;
1366 		}
1367 		if (so->so_options & SO_ACCEPTCONN) {
1368 			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1369 				if (tiflags & TH_RST) {
1370 					syn_cache_reset(&src.sa, &dst.sa, th);
1371 				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
1372 				    (TH_ACK|TH_SYN)) {
1373 					/*
1374 					 * Received a SYN,ACK.  This should
1375 					 * never happen while we are in
1376 					 * LISTEN.  Send an RST.
1377 					 */
1378 					goto badsyn;
1379 				} else if (tiflags & TH_ACK) {
1380 					so = syn_cache_get(&src.sa, &dst.sa,
1381 						th, toff, tlen, so, m);
1382 					if (so == NULL) {
1383 						/*
1384 						 * We don't have a SYN for
1385 						 * this ACK; send an RST.
1386 						 */
1387 						goto badsyn;
1388 					} else if (so ==
1389 					    (struct socket *)(-1)) {
1390 						/*
1391 						 * We were unable to create
1392 						 * the connection.  If the
1393 						 * 3-way handshake was
1394 						 * completed, an RST has
1395 						 * been sent to the peer.
1396 						 * Since the mbuf might be
1397 						 * in use for the reply,
1398 						 * do not free it.
1399 						 */
1400 						m = NULL;
1401 					} else {
1402 						/*
1403 						 * We have created a
1404 						 * full-blown connection.
1405 						 */
1406 						tp = NULL;
1407 						inp = NULL;
1408 #ifdef INET6
1409 						in6p = NULL;
1410 #endif
1411 						switch (so->so_proto->pr_domain->dom_family) {
1412 #ifdef INET
1413 						case AF_INET:
1414 							inp = sotoinpcb(so);
1415 							tp = intotcpcb(inp);
1416 							break;
1417 #endif
1418 #ifdef INET6
1419 						case AF_INET6:
1420 							in6p = sotoin6pcb(so);
1421 							tp = in6totcpcb(in6p);
1422 							break;
1423 #endif
1424 						}
1425 						if (tp == NULL)
1426 							goto badsyn;	/*XXX*/
1427 						tiwin <<= tp->snd_scale;
1428 						goto after_listen;
1429 					}
1430 				} else {
1431 					/*
1432 					 * None of RST, SYN or ACK was set.
1433 					 * This is an invalid packet for a
1434 					 * TCB in LISTEN state.  Send a RST.
1435 					 */
1436 					goto badsyn;
1437 				}
1438 			} else {
1439 				/*
1440 				 * Received a SYN.
1441 				 */
1442 
1443 #ifdef INET6
1444 				/*
1445 				 * If deprecated address is forbidden, we do
1446 				 * not accept SYN to deprecated interface
1447 				 * address to prevent any new inbound
1448 				 * connection from getting established.
1449 				 * When we do not accept SYN, we send a TCP
1450 				 * RST, with deprecated source address (instead
1451 				 * of dropping it).  We compromise it as it is
1452 				 * much better for peer to send a RST, and
1453 				 * RST will be the final packet for the
1454 				 * exchange.
1455 				 *
1456 				 * If we do not forbid deprecated addresses, we
1457 				 * accept the SYN packet.  RFC2462 does not
1458 				 * suggest dropping SYN in this case.
1459 				 * If we decipher RFC2462 5.5.4, it says like
1460 				 * this:
1461 				 * 1. use of deprecated addr with existing
1462 				 *    communication is okay - "SHOULD continue
1463 				 *    to be used"
1464 				 * 2. use of it with new communication:
1465 				 *   (2a) "SHOULD NOT be used if alternate
1466 				 *        address with sufficient scope is
1467 				 *        available"
1468 				 *   (2b) nothing mentioned otherwise.
1469 				 * Here we fall into (2b) case as we have no
1470 				 * choice in our source address selection - we
1471 				 * must obey the peer.
1472 				 *
1473 				 * The wording in RFC2462 is confusing, and
1474 				 * there are multiple descriptions of
1475 				 * deprecated address handling - worse, they
1476 				 * are not exactly the same.  I believe 5.5.4
1477 				 * is the best one, so we follow 5.5.4.
1478 				 */
1479 				if (af == AF_INET6 && !ip6_use_deprecated) {
1480 					struct in6_ifaddr *ia6;
1481 					if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
1482 					    &ip6->ip6_dst)) &&
1483 					    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1484 						tp = NULL;
1485 						goto dropwithreset;
1486 					}
1487 				}
1488 #endif
1489 
1490 #ifdef IPSEC
1491 				switch (af) {
1492 #ifdef INET
1493 				case AF_INET:
1494 					if (ipsec4_in_reject_so(m, so)) {
1495 						ipsecstat.in_polvio++;
1496 						tp = NULL;
1497 						goto dropwithreset;
1498 					}
1499 					break;
1500 #endif
1501 #ifdef INET6
1502 				case AF_INET6:
1503 					if (ipsec6_in_reject_so(m, so)) {
1504 						ipsec6stat.in_polvio++;
1505 						tp = NULL;
1506 						goto dropwithreset;
1507 					}
1508 					break;
1509 #endif
1510 				}
1511 #endif
1512 
1513 				/*
1514 				 * LISTEN socket received a SYN
1515 				 * from itself?  This can't possibly
1516 				 * be valid; drop the packet.
1517 				 */
1518 				if (th->th_sport == th->th_dport) {
1519 					int i;
1520 
1521 					switch (af) {
1522 #ifdef INET
1523 					case AF_INET:
1524 						i = in_hosteq(ip->ip_src, ip->ip_dst);
1525 						break;
1526 #endif
1527 #ifdef INET6
1528 					case AF_INET6:
1529 						i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
1530 						break;
1531 #endif
1532 					default:
1533 						i = 1;
1534 					}
1535 					if (i) {
1536 						tcpstat.tcps_badsyn++;
1537 						goto drop;
1538 					}
1539 				}
1540 
1541 				/*
1542 				 * SYN looks ok; create compressed TCP
1543 				 * state for it.
1544 				 */
1545 				if (so->so_qlen <= so->so_qlimit &&
1546 				    syn_cache_add(&src.sa, &dst.sa, th, tlen,
1547 						so, m, optp, optlen, &opti))
1548 					m = NULL;
1549 			}
1550 			goto drop;
1551 		}
1552 	}
1553 
1554 after_listen:
1555 #ifdef DIAGNOSTIC
1556 	/*
1557 	 * Should not happen now that all embryonic connections
1558 	 * are handled with compressed state.
1559 	 */
1560 	if (tp->t_state == TCPS_LISTEN)
1561 		panic("tcp_input: TCPS_LISTEN");
1562 #endif
1563 
1564 	/*
1565 	 * Segment received on connection.
1566 	 * Reset idle time and keep-alive timer.
1567 	 */
1568 	tp->t_rcvtime = tcp_now;
1569 	if (TCPS_HAVEESTABLISHED(tp->t_state))
1570 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1571 
1572 	/*
1573 	 * Process options.
1574 	 */
1575 #ifdef TCP_SIGNATURE
1576 	if (optp || (tp->t_flags & TF_SIGNATURE))
1577 #else
1578 	if (optp)
1579 #endif
1580 		if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
1581 			goto drop;
1582 
1583 	if (TCP_SACK_ENABLED(tp)) {
1584 		tcp_del_sackholes(tp, th);
1585 	}
1586 
1587 	if (opti.ts_present && opti.ts_ecr) {
1588 		/*
1589 		 * Calculate the RTT from the returned time stamp and the
1590 		 * connection's time base.  If the time stamp is later than
1591 		 * the current time, or is extremely old, fall back to non-1323
1592 		 * RTT calculation.  Since ts_ecr is unsigned, we can test both
1593 		 * at the same time.
1594 		 */
1595 		ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
1596 		if (ts_rtt > TCP_PAWS_IDLE)
1597 			ts_rtt = 0;
1598 	} else {
1599 		ts_rtt = 0;
1600 	}
1601 
1602 	/*
1603 	 * Header prediction: check for the two common cases
1604 	 * of a uni-directional data xfer.  If the packet has
1605 	 * no control flags, is in-sequence, the window didn't
1606 	 * change and we're not retransmitting, it's a
1607 	 * candidate.  If the length is zero and the ack moved
1608 	 * forward, we're the sender side of the xfer.  Just
1609 	 * free the data acked & wake any higher level process
1610 	 * that was blocked waiting for space.  If the length
1611 	 * is non-zero and the ack didn't move, we're the
1612 	 * receiver side.  If we're getting packets in-order
1613 	 * (the reassembly queue is empty), add the data to
1614 	 * the socket buffer and note that we need a delayed ack.
1615 	 */
1616 	if (tp->t_state == TCPS_ESTABLISHED &&
1617 	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1618 	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1619 	    th->th_seq == tp->rcv_nxt &&
1620 	    tiwin && tiwin == tp->snd_wnd &&
1621 	    tp->snd_nxt == tp->snd_max) {
1622 
1623 		/*
1624 		 * If last ACK falls within this segment's sequence numbers,
1625 		 *  record the timestamp.
1626 		 * NOTE:
1627 		 * 1) That the test incorporates suggestions from the latest
1628 		 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
1629 		 * 2) That updating only on newer timestamps interferes with
1630 		 *    our earlier PAWS tests, so this check should be solely
1631 		 *    predicated on the sequence space of this segment.
1632 		 * 3) That we modify the segment boundary check to be
1633 		 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
1634 		 *    instead of RFC1323's
1635 		 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
1636 		 *    This modified check allows us to overcome RFC1323's
1637 		 *    limitations as described in Stevens TCP/IP Illustrated
1638 		 *    Vol. 2 p.869. In such cases, we can still calculate the
1639 		 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
1640 		 */
1641 		if (opti.ts_present &&
1642 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1643 		    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1644 		    ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
1645 			tp->ts_recent_age = tcp_now;
1646 			tp->ts_recent = opti.ts_val;
1647 		}
1648 
1649 		if (tlen == 0) {
1650 			/* Ack prediction. */
1651 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
1652 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
1653 			    tp->snd_cwnd >= tp->snd_wnd &&
1654 			    tp->t_partialacks < 0) {
1655 				/*
1656 				 * this is a pure ack for outstanding data.
1657 				 */
1658 				++tcpstat.tcps_predack;
1659 				if (ts_rtt)
1660 					tcp_xmit_timer(tp, ts_rtt);
1661 				else if (tp->t_rtttime &&
1662 				    SEQ_GT(th->th_ack, tp->t_rtseq))
1663 					tcp_xmit_timer(tp,
1664 					  tcp_now - tp->t_rtttime);
1665 				acked = th->th_ack - tp->snd_una;
1666 				tcpstat.tcps_rcvackpack++;
1667 				tcpstat.tcps_rcvackbyte += acked;
1668 				ND6_HINT(tp);
1669 
1670 				if (acked > (tp->t_lastoff - tp->t_inoff))
1671 					tp->t_lastm = NULL;
1672 				sbdrop(&so->so_snd, acked);
1673 				tp->t_lastoff -= acked;
1674 
1675 				ICMP_CHECK(tp, th, acked);
1676 
1677 				tp->snd_una = th->th_ack;
1678 				tp->snd_fack = tp->snd_una;
1679 				if (SEQ_LT(tp->snd_high, tp->snd_una))
1680 					tp->snd_high = tp->snd_una;
1681 				m_freem(m);
1682 
1683 				/*
1684 				 * If all outstanding data are acked, stop
1685 				 * retransmit timer, otherwise restart timer
1686 				 * using current (possibly backed-off) value.
1687 				 * If process is waiting for space,
1688 				 * wakeup/selwakeup/signal.  If data
1689 				 * are ready to send, let tcp_output
1690 				 * decide between more output or persist.
1691 				 */
1692 				if (tp->snd_una == tp->snd_max)
1693 					TCP_TIMER_DISARM(tp, TCPT_REXMT);
1694 				else if (TCP_TIMER_ISARMED(tp,
1695 				    TCPT_PERSIST) == 0)
1696 					TCP_TIMER_ARM(tp, TCPT_REXMT,
1697 					    tp->t_rxtcur);
1698 
1699 				sowwakeup(so);
1700 				if (so->so_snd.sb_cc)
1701 					(void) tcp_output(tp);
1702 				if (tcp_saveti)
1703 					m_freem(tcp_saveti);
1704 				return;
1705 			}
1706 		} else if (th->th_ack == tp->snd_una &&
1707 		    TAILQ_FIRST(&tp->segq) == NULL &&
1708 		    tlen <= sbspace(&so->so_rcv)) {
1709 			/*
1710 			 * this is a pure, in-sequence data packet
1711 			 * with nothing on the reassembly queue and
1712 			 * we have enough buffer space to take it.
1713 			 */
1714 			++tcpstat.tcps_preddat;
1715 			tp->rcv_nxt += tlen;
1716 			tcpstat.tcps_rcvpack++;
1717 			tcpstat.tcps_rcvbyte += tlen;
1718 			ND6_HINT(tp);
1719 			/*
1720 			 * Drop TCP, IP headers and TCP options then add data
1721 			 * to socket buffer.
1722 			 */
1723 			if (so->so_state & SS_CANTRCVMORE)
1724 				m_freem(m);
1725 			else {
1726 				m_adj(m, toff + off);
1727 				sbappendstream(&so->so_rcv, m);
1728 			}
1729 			sorwakeup(so);
1730 			TCP_SETUP_ACK(tp, th);
1731 			if (tp->t_flags & TF_ACKNOW)
1732 				(void) tcp_output(tp);
1733 			if (tcp_saveti)
1734 				m_freem(tcp_saveti);
1735 			return;
1736 		}
1737 	}
1738 
1739 	/*
1740 	 * Compute mbuf offset to TCP data segment.
1741 	 */
1742 	hdroptlen = toff + off;
1743 
1744 	/*
1745 	 * Calculate amount of space in receive window,
1746 	 * and then do TCP input processing.
1747 	 * Receive window is amount of space in rcv queue,
1748 	 * but not less than advertised window.
1749 	 */
1750 	{ int win;
1751 
1752 	win = sbspace(&so->so_rcv);
1753 	if (win < 0)
1754 		win = 0;
1755 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1756 	}
1757 
1758 	switch (tp->t_state) {
1759 	case TCPS_LISTEN:
1760 		/*
1761 		 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1762 		 */
1763 		if (m->m_flags & (M_BCAST|M_MCAST))
1764 			goto drop;
1765 		switch (af) {
1766 #ifdef INET6
1767 		case AF_INET6:
1768 			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
1769 				goto drop;
1770 			break;
1771 #endif /* INET6 */
1772 		case AF_INET:
1773 			if (IN_MULTICAST(ip->ip_dst.s_addr) ||
1774 			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1775 				goto drop;
1776 			break;
1777 		}
1778 		break;
1779 
1780 	/*
1781 	 * If the state is SYN_SENT:
1782 	 *	if seg contains an ACK, but not for our SYN, drop the input.
1783 	 *	if seg contains a RST, then drop the connection.
1784 	 *	if seg does not contain SYN, then drop it.
1785 	 * Otherwise this is an acceptable SYN segment
1786 	 *	initialize tp->rcv_nxt and tp->irs
1787 	 *	if seg contains ack then advance tp->snd_una
1788 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1789 	 *	arrange for segment to be acked (eventually)
1790 	 *	continue processing rest of data/controls, beginning with URG
1791 	 */
1792 	case TCPS_SYN_SENT:
1793 		if ((tiflags & TH_ACK) &&
1794 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
1795 		     SEQ_GT(th->th_ack, tp->snd_max)))
1796 			goto dropwithreset;
1797 		if (tiflags & TH_RST) {
1798 			if (tiflags & TH_ACK)
1799 				tp = tcp_drop(tp, ECONNREFUSED);
1800 			goto drop;
1801 		}
1802 		if ((tiflags & TH_SYN) == 0)
1803 			goto drop;
1804 		if (tiflags & TH_ACK) {
1805 			tp->snd_una = th->th_ack;
1806 			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1807 				tp->snd_nxt = tp->snd_una;
1808 			if (SEQ_LT(tp->snd_high, tp->snd_una))
1809 				tp->snd_high = tp->snd_una;
1810 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
1811 		}
1812 		tp->irs = th->th_seq;
1813 		tcp_rcvseqinit(tp);
1814 		tp->t_flags |= TF_ACKNOW;
1815 		tcp_mss_from_peer(tp, opti.maxseg);
1816 
1817 		/*
1818 		 * Initialize the initial congestion window.  If we
1819 		 * had to retransmit the SYN, we must initialize cwnd
1820 		 * to 1 segment (i.e. the Loss Window).
1821 		 */
1822 		if (tp->t_flags & TF_SYN_REXMT)
1823 			tp->snd_cwnd = tp->t_peermss;
1824 		else {
1825 			int ss = tcp_init_win;
1826 #ifdef INET
1827 			if (inp != NULL && in_localaddr(inp->inp_faddr))
1828 				ss = tcp_init_win_local;
1829 #endif
1830 #ifdef INET6
1831 			if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
1832 				ss = tcp_init_win_local;
1833 #endif
1834 			tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
1835 		}
1836 
1837 		tcp_rmx_rtt(tp);
1838 		if (tiflags & TH_ACK) {
1839 			tcpstat.tcps_connects++;
1840 			soisconnected(so);
1841 			tcp_established(tp);
1842 			/* Do window scaling on this connection? */
1843 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1844 			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1845 				tp->snd_scale = tp->requested_s_scale;
1846 				tp->rcv_scale = tp->request_r_scale;
1847 			}
1848 			TCP_REASS_LOCK(tp);
1849 			(void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
1850 			TCP_REASS_UNLOCK(tp);
1851 			/*
1852 			 * if we didn't have to retransmit the SYN,
1853 			 * use its rtt as our initial srtt & rtt var.
1854 			 */
1855 			if (tp->t_rtttime)
1856 				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1857 		} else
1858 			tp->t_state = TCPS_SYN_RECEIVED;
1859 
1860 		/*
1861 		 * Advance th->th_seq to correspond to first data byte.
1862 		 * If data, trim to stay within window,
1863 		 * dropping FIN if necessary.
1864 		 */
1865 		th->th_seq++;
1866 		if (tlen > tp->rcv_wnd) {
1867 			todrop = tlen - tp->rcv_wnd;
1868 			m_adj(m, -todrop);
1869 			tlen = tp->rcv_wnd;
1870 			tiflags &= ~TH_FIN;
1871 			tcpstat.tcps_rcvpackafterwin++;
1872 			tcpstat.tcps_rcvbyteafterwin += todrop;
1873 		}
1874 		tp->snd_wl1 = th->th_seq - 1;
1875 		tp->rcv_up = th->th_seq;
1876 		goto step6;
1877 
1878 	/*
1879 	 * If the state is SYN_RECEIVED:
1880 	 *	If seg contains an ACK, but not for our SYN, drop the input
1881 	 *	and generate an RST.  See page 36, rfc793
1882 	 */
1883 	case TCPS_SYN_RECEIVED:
1884 		if ((tiflags & TH_ACK) &&
1885 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
1886 		     SEQ_GT(th->th_ack, tp->snd_max)))
1887 			goto dropwithreset;
1888 		break;
1889 	}
1890 
1891 	/*
1892 	 * States other than LISTEN or SYN_SENT.
1893 	 * First check timestamp, if present.
1894 	 * Then check that at least some bytes of segment are within
1895 	 * receive window.  If segment begins before rcv_nxt,
1896 	 * drop leading data (and SYN); if nothing left, just ack.
1897 	 *
1898 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1899 	 * and it's less than ts_recent, drop it.
1900 	 */
1901 	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
1902 	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {
1903 
1904 		/* Check to see if ts_recent is over 24 days old.  */
1905 		if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
1906 			/*
1907 			 * Invalidate ts_recent.  If this segment updates
1908 			 * ts_recent, the age will be reset later and ts_recent
1909 			 * will get a valid value.  If it does not, setting
1910 			 * ts_recent to zero will at least satisfy the
1911 			 * requirement that zero be placed in the timestamp
1912 			 * echo reply when ts_recent isn't valid.  The
1913 			 * age isn't reset until we get a valid ts_recent
1914 			 * because we don't want out-of-order segments to be
1915 			 * dropped when ts_recent is old.
1916 			 */
1917 			tp->ts_recent = 0;
1918 		} else {
1919 			tcpstat.tcps_rcvduppack++;
1920 			tcpstat.tcps_rcvdupbyte += tlen;
1921 			tcpstat.tcps_pawsdrop++;
1922 			tcp_new_dsack(tp, th->th_seq, tlen);
1923 			goto dropafterack;
1924 		}
1925 	}
1926 
1927 	todrop = tp->rcv_nxt - th->th_seq;
1928 	dupseg = FALSE;
1929 	if (todrop > 0) {
1930 		if (tiflags & TH_SYN) {
1931 			tiflags &= ~TH_SYN;
1932 			th->th_seq++;
1933 			if (th->th_urp > 1)
1934 				th->th_urp--;
1935 			else {
1936 				tiflags &= ~TH_URG;
1937 				th->th_urp = 0;
1938 			}
1939 			todrop--;
1940 		}
1941 		if (todrop > tlen ||
1942 		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
1943 			/*
1944 			 * Any valid FIN or RST must be to the left of the
1945 			 * window.  At this point the FIN or RST must be a
1946 			 * duplicate or out of sequence; drop it.
1947 			 */
1948 			if (tiflags & TH_RST)
1949 				goto drop;
1950 			tiflags &= ~(TH_FIN|TH_RST);
1951 			/*
1952 			 * Send an ACK to resynchronize and drop any data.
1953 			 * But keep on processing for RST or ACK.
1954 			 */
1955 			tp->t_flags |= TF_ACKNOW;
1956 			todrop = tlen;
1957 			dupseg = TRUE;
1958 			tcpstat.tcps_rcvdupbyte += todrop;
1959 			tcpstat.tcps_rcvduppack++;
1960 		} else if ((tiflags & TH_RST) &&
1961 			   th->th_seq != tp->last_ack_sent) {
1962 			/*
1963 			 * Test for reset before adjusting the sequence
1964 			 * number for overlapping data.
1965 			 */
1966 			goto dropafterack_ratelim;
1967 		} else {
1968 			tcpstat.tcps_rcvpartduppack++;
1969 			tcpstat.tcps_rcvpartdupbyte += todrop;
1970 		}
1971 		tcp_new_dsack(tp, th->th_seq, todrop);
1972 		hdroptlen += todrop;	/*drop from head afterwards*/
1973 		th->th_seq += todrop;
1974 		tlen -= todrop;
1975 		if (th->th_urp > todrop)
1976 			th->th_urp -= todrop;
1977 		else {
1978 			tiflags &= ~TH_URG;
1979 			th->th_urp = 0;
1980 		}
1981 	}
1982 
1983 	/*
1984 	 * If new data are received on a connection after the
1985 	 * user processes are gone, then RST the other end.
1986 	 */
1987 	if ((so->so_state & SS_NOFDREF) &&
1988 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1989 		tp = tcp_close(tp);
1990 		tcpstat.tcps_rcvafterclose++;
1991 		goto dropwithreset;
1992 	}
1993 
1994 	/*
1995 	 * If segment ends after window, drop trailing data
1996 	 * (and PUSH and FIN); if nothing left, just ACK.
1997 	 */
1998 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1999 	if (todrop > 0) {
2000 		tcpstat.tcps_rcvpackafterwin++;
2001 		if (todrop >= tlen) {
2002 			/*
2003 			 * The segment actually starts after the window.
2004 			 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
2005 			 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
2006 			 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
2007 			 */
2008 			tcpstat.tcps_rcvbyteafterwin += tlen;
2009 			/*
2010 			 * If a new connection request is received
2011 			 * while in TIME_WAIT, drop the old connection
2012 			 * and start over if the sequence numbers
2013 			 * are above the previous ones.
2014 			 *
2015 			 * NOTE: We will checksum the packet again, and
2016 			 * so we need to put the header fields back into
2017 			 * network order!
2018 			 * XXX This kind of sucks, but we don't expect
2019 			 * XXX this to happen very often, so maybe it
2020 			 * XXX doesn't matter so much.
2021 			 */
2022 			if (tiflags & TH_SYN &&
2023 			    tp->t_state == TCPS_TIME_WAIT &&
2024 			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2025 				iss = tcp_new_iss(tp, tp->snd_nxt);
2026 				tp = tcp_close(tp);
2027 				TCP_FIELDS_TO_NET(th);
2028 				goto findpcb;
2029 			}
2030 			/*
2031 			 * If window is closed can only take segments at
2032 			 * window edge, and have to drop data and PUSH from
2033 			 * incoming segments.  Continue processing, but
2034 			 * remember to ack.  Otherwise, drop segment
2035 			 * and (if not RST) ack.
2036 			 */
2037 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2038 				tp->t_flags |= TF_ACKNOW;
2039 				tcpstat.tcps_rcvwinprobe++;
2040 			} else
2041 				goto dropafterack;
2042 		} else
2043 			tcpstat.tcps_rcvbyteafterwin += todrop;
2044 		m_adj(m, -todrop);
2045 		tlen -= todrop;
2046 		tiflags &= ~(TH_PUSH|TH_FIN);
2047 	}
2048 
2049 	/*
2050 	 * If last ACK falls within this segment's sequence numbers,
2051 	 * and the timestamp is newer, record it.
2052 	 */
2053 	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
2054 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2055 	    SEQ_LT(tp->last_ack_sent, th->th_seq + tlen +
2056 		   ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
2057 		tp->ts_recent_age = tcp_now;
2058 		tp->ts_recent = opti.ts_val;
2059 	}
2060 
2061 	/*
2062 	 * If the RST bit is set examine the state:
2063 	 *    SYN_RECEIVED STATE:
2064 	 *	If passive open, return to LISTEN state.
2065 	 *	If active open, inform user that connection was refused.
2066 	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
2067 	 *	Inform user that connection was reset, and close tcb.
2068 	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
2069 	 *	Close the tcb.
2070 	 */
2071 	if (tiflags & TH_RST) {
2072 		if (th->th_seq != tp->last_ack_sent)
2073 			goto dropafterack_ratelim;
2074 
2075 		switch (tp->t_state) {
2076 		case TCPS_SYN_RECEIVED:
2077 			so->so_error = ECONNREFUSED;
2078 			goto close;
2079 
2080 		case TCPS_ESTABLISHED:
2081 		case TCPS_FIN_WAIT_1:
2082 		case TCPS_FIN_WAIT_2:
2083 		case TCPS_CLOSE_WAIT:
2084 			so->so_error = ECONNRESET;
2085 		close:
2086 			tp->t_state = TCPS_CLOSED;
2087 			tcpstat.tcps_drops++;
2088 			tp = tcp_close(tp);
2089 			goto drop;
2090 
2091 		case TCPS_CLOSING:
2092 		case TCPS_LAST_ACK:
2093 		case TCPS_TIME_WAIT:
2094 			tp = tcp_close(tp);
2095 			goto drop;
2096 		}
2097 	}
2098 
2099 	/*
2100 	 * Since we've covered the SYN-SENT and SYN-RECEIVED states above
2101 	 * we must be in a synchronized state.  RFC793 states (under RST
2102 	 * generation) that any unacceptable segment (an out-of-order SYN
2103 	 * qualifies) received in a synchronized state must elicit only an
2104 	 * empty acknowledgment segment ... and the connection remains in
2105 	 * the same state.
2106 	 */
2107 	if (tiflags & TH_SYN) {
2108 		if (tp->rcv_nxt == th->th_seq) {
2109 			tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
2110 			    TH_ACK);
2111 			if (tcp_saveti)
2112 				m_freem(tcp_saveti);
2113 			return;
2114 		}
2115 
2116 		goto dropafterack_ratelim;
2117 	}
2118 
2119 	/*
2120 	 * If the ACK bit is off we drop the segment and return.
2121 	 */
2122 	if ((tiflags & TH_ACK) == 0) {
2123 		if (tp->t_flags & TF_ACKNOW)
2124 			goto dropafterack;
2125 		else
2126 			goto drop;
2127 	}
2128 
2129 	/*
2130 	 * Ack processing.
2131 	 */
2132 	switch (tp->t_state) {
2133 
2134 	/*
2135 	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
2136 	 * ESTABLISHED state and continue processing, otherwise
2137 	 * send an RST.
2138 	 */
2139 	case TCPS_SYN_RECEIVED:
2140 		if (SEQ_GT(tp->snd_una, th->th_ack) ||
2141 		    SEQ_GT(th->th_ack, tp->snd_max))
2142 			goto dropwithreset;
2143 		tcpstat.tcps_connects++;
2144 		soisconnected(so);
2145 		tcp_established(tp);
2146 		/* Do window scaling? */
2147 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2148 		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2149 			tp->snd_scale = tp->requested_s_scale;
2150 			tp->rcv_scale = tp->request_r_scale;
2151 		}
2152 		TCP_REASS_LOCK(tp);
2153 		(void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
2154 		TCP_REASS_UNLOCK(tp);
2155 		tp->snd_wl1 = th->th_seq - 1;
2156 		/* fall into ... */
2157 
2158 	/*
2159 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2160 	 * ACKs.  If the ack is in the range
2161 	 *	tp->snd_una < th->th_ack <= tp->snd_max
2162 	 * then advance tp->snd_una to th->th_ack and drop
2163 	 * data from the retransmission queue.  If this ACK reflects
2164 	 * more up to date window information we update our window information.
2165 	 */
2166 	case TCPS_ESTABLISHED:
2167 	case TCPS_FIN_WAIT_1:
2168 	case TCPS_FIN_WAIT_2:
2169 	case TCPS_CLOSE_WAIT:
2170 	case TCPS_CLOSING:
2171 	case TCPS_LAST_ACK:
2172 	case TCPS_TIME_WAIT:
2173 
2174 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2175 			if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) {
2176 				tcpstat.tcps_rcvdupack++;
2177 				/*
2178 				 * If we have outstanding data (other than
2179 				 * a window probe), this is a completely
2180 				 * duplicate ack (ie, window info didn't
2181 				 * change), the ack is the biggest we've
2182 				 * seen and we've seen exactly our rexmt
2183 				 * threshhold of them, assume a packet
2184 				 * has been dropped and retransmit it.
2185 				 * Kludge snd_nxt & the congestion
2186 				 * window so we send only this one
2187 				 * packet.
2188 				 *
2189 				 * We know we're losing at the current
2190 				 * window size so do congestion avoidance
2191 				 * (set ssthresh to half the current window
2192 				 * and pull our congestion window back to
2193 				 * the new ssthresh).
2194 				 *
2195 				 * Dup acks mean that packets have left the
2196 				 * network (they're now cached at the receiver)
2197 				 * so bump cwnd by the amount in the receiver
2198 				 * to keep a constant cwnd packets in the
2199 				 * network.
2200 				 *
2201 				 * If we are using TCP/SACK, then enter
2202 				 * Fast Recovery if the receiver SACKs
2203 				 * data that is tcprexmtthresh * MSS
2204 				 * bytes past the last ACKed segment,
2205 				 * irrespective of the number of DupAcks.
2206 				 */
2207 				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
2208 				    th->th_ack != tp->snd_una)
2209 					tp->t_dupacks = 0;
2210 				else if (tp->t_partialacks < 0 &&
2211 					 (++tp->t_dupacks == tcprexmtthresh ||
2212 					 TCP_FACK_FASTRECOV(tp))) {
2213 					tcp_seq onxt;
2214 					u_int win;
2215 
2216 					if (tcp_do_newreno &&
2217 					    SEQ_LT(th->th_ack, tp->snd_high)) {
2218 						/*
2219 						 * False fast retransmit after
2220 						 * timeout.  Do not enter fast
2221 						 * recovery.
2222 						 */
2223 						tp->t_dupacks = 0;
2224 						break;
2225 					}
2226 
2227 					onxt = tp->snd_nxt;
2228 					win = min(tp->snd_wnd, tp->snd_cwnd) /
2229 					    2 /	tp->t_segsz;
2230 					if (win < 2)
2231 						win = 2;
2232 					tp->snd_ssthresh = win * tp->t_segsz;
2233 					tp->snd_recover = tp->snd_max;
2234 					tp->t_partialacks = 0;
2235 					TCP_TIMER_DISARM(tp, TCPT_REXMT);
2236 					tp->t_rtttime = 0;
2237 					if (TCP_SACK_ENABLED(tp)) {
2238 						tp->t_dupacks = tcprexmtthresh;
2239 						tp->sack_newdata = tp->snd_nxt;
2240 						tp->snd_cwnd = tp->t_segsz;
2241 						(void) tcp_output(tp);
2242 						goto drop;
2243 					}
2244 					tp->snd_nxt = th->th_ack;
2245 					tp->snd_cwnd = tp->t_segsz;
2246 					(void) tcp_output(tp);
2247 					tp->snd_cwnd = tp->snd_ssthresh +
2248 					       tp->t_segsz * tp->t_dupacks;
2249 					if (SEQ_GT(onxt, tp->snd_nxt))
2250 						tp->snd_nxt = onxt;
2251 					goto drop;
2252 				} else if (tp->t_dupacks > tcprexmtthresh) {
2253 					tp->snd_cwnd += tp->t_segsz;
2254 					(void) tcp_output(tp);
2255 					goto drop;
2256 				}
2257 			} else {
2258 				/*
2259 				 * If the ack appears to be very old, only
2260 				 * allow data that is in-sequence.  This
2261 				 * makes it somewhat more difficult to insert
2262 				 * forged data by guessing sequence numbers.
2263 				 * Send an ack to try to update the send
2264 				 * sequence number on the other side.
2265 				 */
2266 				if (tlen && th->th_seq != tp->rcv_nxt &&
2267 				    SEQ_LT(th->th_ack,
2268 				    tp->snd_una - tp->max_sndwnd))
2269 					goto dropafterack;
2270 			}
2271 			break;
2272 		}
2273 		/*
2274 		 * If the congestion window was inflated to account
2275 		 * for the other side's cached packets, retract it.
2276 		 */
2277 		if (TCP_SACK_ENABLED(tp))
2278 			tcp_sack_newack(tp, th);
2279 		else if (tcp_do_newreno)
2280 			tcp_newreno_newack(tp, th);
2281 		else
2282 			tcp_reno_newack(tp, th);
2283 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
2284 			tcpstat.tcps_rcvacktoomuch++;
2285 			goto dropafterack;
2286 		}
2287 		acked = th->th_ack - tp->snd_una;
2288 		tcpstat.tcps_rcvackpack++;
2289 		tcpstat.tcps_rcvackbyte += acked;
2290 
2291 		/*
2292 		 * If we have a timestamp reply, update smoothed
2293 		 * round trip time.  If no timestamp is present but
2294 		 * transmit timer is running and timed sequence
2295 		 * number was acked, update smoothed round trip time.
2296 		 * Since we now have an rtt measurement, cancel the
2297 		 * timer backoff (cf., Phil Karn's retransmit alg.).
2298 		 * Recompute the initial retransmit timer.
2299 		 */
2300 		if (ts_rtt)
2301 			tcp_xmit_timer(tp, ts_rtt);
2302 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2303 			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2304 
2305 		/*
2306 		 * If all outstanding data is acked, stop retransmit
2307 		 * timer and remember to restart (more output or persist).
2308 		 * If there is more data to be acked, restart retransmit
2309 		 * timer, using current (possibly backed-off) value.
2310 		 */
2311 		if (th->th_ack == tp->snd_max) {
2312 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
2313 			needoutput = 1;
2314 		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
2315 			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2316 		/*
2317 		 * When new data is acked, open the congestion window.
2318 		 * If the window gives us less than ssthresh packets
2319 		 * in flight, open exponentially (segsz per packet).
2320 		 * Otherwise open linearly: segsz per window
2321 		 * (segsz^2 / cwnd per packet).
2322 		 *
2323 		 * If we are still in fast recovery (meaning we are using
2324 		 * NewReno and we have only received partial acks), do not
2325 		 * inflate the window yet.
2326 		 */
2327 		if (tp->t_partialacks < 0) {
2328 			u_int cw = tp->snd_cwnd;
2329 			u_int incr = tp->t_segsz;
2330 
2331 			if (cw >= tp->snd_ssthresh)
2332 				incr = incr * incr / cw;
2333 			tp->snd_cwnd = min(cw + incr,
2334 			    TCP_MAXWIN << tp->snd_scale);
2335 		}
2336 		ND6_HINT(tp);
2337 		if (acked > so->so_snd.sb_cc) {
2338 			tp->snd_wnd -= so->so_snd.sb_cc;
2339 			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2340 			ourfinisacked = 1;
2341 		} else {
2342 			if (acked > (tp->t_lastoff - tp->t_inoff))
2343 				tp->t_lastm = NULL;
2344 			sbdrop(&so->so_snd, acked);
2345 			tp->t_lastoff -= acked;
2346 			tp->snd_wnd -= acked;
2347 			ourfinisacked = 0;
2348 		}
2349 		sowwakeup(so);
2350 
2351 		ICMP_CHECK(tp, th, acked);
2352 
2353 		tp->snd_una = th->th_ack;
2354 		if (SEQ_GT(tp->snd_una, tp->snd_fack))
2355 			tp->snd_fack = tp->snd_una;
2356 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2357 			tp->snd_nxt = tp->snd_una;
2358 		if (SEQ_LT(tp->snd_high, tp->snd_una))
2359 			tp->snd_high = tp->snd_una;
2360 
2361 		switch (tp->t_state) {
2362 
2363 		/*
2364 		 * In FIN_WAIT_1 STATE in addition to the processing
2365 		 * for the ESTABLISHED state if our FIN is now acknowledged
2366 		 * then enter FIN_WAIT_2.
2367 		 */
2368 		case TCPS_FIN_WAIT_1:
2369 			if (ourfinisacked) {
2370 				/*
2371 				 * If we can't receive any more
2372 				 * data, then closing user can proceed.
2373 				 * Starting the timer is contrary to the
2374 				 * specification, but if we don't get a FIN
2375 				 * we'll hang forever.
2376 				 */
2377 				if (so->so_state & SS_CANTRCVMORE) {
2378 					soisdisconnected(so);
2379 					if (tcp_maxidle > 0)
2380 						TCP_TIMER_ARM(tp, TCPT_2MSL,
2381 						    tcp_maxidle);
2382 				}
2383 				tp->t_state = TCPS_FIN_WAIT_2;
2384 			}
2385 			break;
2386 
2387 	 	/*
2388 		 * In CLOSING STATE in addition to the processing for
2389 		 * the ESTABLISHED state if the ACK acknowledges our FIN
2390 		 * then enter the TIME-WAIT state, otherwise ignore
2391 		 * the segment.
2392 		 */
2393 		case TCPS_CLOSING:
2394 			if (ourfinisacked) {
2395 				tp->t_state = TCPS_TIME_WAIT;
2396 				tcp_canceltimers(tp);
2397 				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2398 				soisdisconnected(so);
2399 			}
2400 			break;
2401 
2402 		/*
2403 		 * In LAST_ACK, we may still be waiting for data to drain
2404 		 * and/or to be acked, as well as for the ack of our FIN.
2405 		 * If our FIN is now acknowledged, delete the TCB,
2406 		 * enter the closed state and return.
2407 		 */
2408 		case TCPS_LAST_ACK:
2409 			if (ourfinisacked) {
2410 				tp = tcp_close(tp);
2411 				goto drop;
2412 			}
2413 			break;
2414 
2415 		/*
2416 		 * In TIME_WAIT state the only thing that should arrive
2417 		 * is a retransmission of the remote FIN.  Acknowledge
2418 		 * it and restart the finack timer.
2419 		 */
2420 		case TCPS_TIME_WAIT:
2421 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2422 			goto dropafterack;
2423 		}
2424 	}
2425 
2426 step6:
2427 	/*
2428 	 * Update window information.
2429 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2430 	 */
2431 	if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2432 	    (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) ||
2433 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) {
2434 		/* keep track of pure window updates */
2435 		if (tlen == 0 &&
2436 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2437 			tcpstat.tcps_rcvwinupd++;
2438 		tp->snd_wnd = tiwin;
2439 		tp->snd_wl1 = th->th_seq;
2440 		tp->snd_wl2 = th->th_ack;
2441 		if (tp->snd_wnd > tp->max_sndwnd)
2442 			tp->max_sndwnd = tp->snd_wnd;
2443 		needoutput = 1;
2444 	}
2445 
2446 	/*
2447 	 * Process segments with URG.
2448 	 */
2449 	if ((tiflags & TH_URG) && th->th_urp &&
2450 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2451 		/*
2452 		 * This is a kludge, but if we receive and accept
2453 		 * random urgent pointers, we'll crash in
2454 		 * soreceive.  It's hard to imagine someone
2455 		 * actually wanting to send this much urgent data.
2456 		 */
2457 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2458 			th->th_urp = 0;			/* XXX */
2459 			tiflags &= ~TH_URG;		/* XXX */
2460 			goto dodata;			/* XXX */
2461 		}
2462 		/*
2463 		 * If this segment advances the known urgent pointer,
2464 		 * then mark the data stream.  This should not happen
2465 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2466 		 * a FIN has been received from the remote side.
2467 		 * In these states we ignore the URG.
2468 		 *
2469 		 * According to RFC961 (Assigned Protocols),
2470 		 * the urgent pointer points to the last octet
2471 		 * of urgent data.  We continue, however,
2472 		 * to consider it to indicate the first octet
2473 		 * of data past the urgent section as the original
2474 		 * spec states (in one of two places).
2475 		 */
2476 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2477 			tp->rcv_up = th->th_seq + th->th_urp;
2478 			so->so_oobmark = so->so_rcv.sb_cc +
2479 			    (tp->rcv_up - tp->rcv_nxt) - 1;
2480 			if (so->so_oobmark == 0)
2481 				so->so_state |= SS_RCVATMARK;
2482 			sohasoutofband(so);
2483 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2484 		}
2485 		/*
2486 		 * Remove out of band data so doesn't get presented to user.
2487 		 * This can happen independent of advancing the URG pointer,
2488 		 * but if two URG's are pending at once, some out-of-band
2489 		 * data may creep in... ick.
2490 		 */
2491 		if (th->th_urp <= (u_int16_t) tlen
2492 #ifdef SO_OOBINLINE
2493 		     && (so->so_options & SO_OOBINLINE) == 0
2494 #endif
2495 		     )
2496 			tcp_pulloutofband(so, th, m, hdroptlen);
2497 	} else
2498 		/*
2499 		 * If no out of band data is expected,
2500 		 * pull receive urgent pointer along
2501 		 * with the receive window.
2502 		 */
2503 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2504 			tp->rcv_up = tp->rcv_nxt;
2505 dodata:							/* XXX */
2506 
2507 	/*
2508 	 * Process the segment text, merging it into the TCP sequencing queue,
2509 	 * and arranging for acknowledgement of receipt if necessary.
2510 	 * This process logically involves adjusting tp->rcv_wnd as data
2511 	 * is presented to the user (this happens in tcp_usrreq.c,
2512 	 * case PRU_RCVD).  If a FIN has already been received on this
2513 	 * connection then we just ignore the text.
2514 	 */
2515 	if ((tlen || (tiflags & TH_FIN)) &&
2516 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2517 		/*
2518 		 * Insert segment ti into reassembly queue of tcp with
2519 		 * control block tp.  Return TH_FIN if reassembly now includes
2520 		 * a segment with FIN.  The macro form does the common case
2521 		 * inline (segment is the next to be received on an
2522 		 * established connection, and the queue is empty),
2523 		 * avoiding linkage into and removal from the queue and
2524 		 * repetition of various conversions.
2525 		 * Set DELACK for segments received in order, but ack
2526 		 * immediately when segments are out of order
2527 		 * (so fast retransmit can work).
2528 		 */
2529 		/* NOTE: this was TCP_REASS() macro, but used only once */
2530 		TCP_REASS_LOCK(tp);
2531 		if (th->th_seq == tp->rcv_nxt &&
2532 		    TAILQ_FIRST(&tp->segq) == NULL &&
2533 		    tp->t_state == TCPS_ESTABLISHED) {
2534 			TCP_SETUP_ACK(tp, th);
2535 			tp->rcv_nxt += tlen;
2536 			tiflags = th->th_flags & TH_FIN;
2537 			tcpstat.tcps_rcvpack++;
2538 			tcpstat.tcps_rcvbyte += tlen;
2539 			ND6_HINT(tp);
2540 			if (so->so_state & SS_CANTRCVMORE)
2541 				m_freem(m);
2542 			else {
2543 				m_adj(m, hdroptlen);
2544 				sbappendstream(&(so)->so_rcv, m);
2545 			}
2546 			sorwakeup(so);
2547 		} else {
2548 			m_adj(m, hdroptlen);
2549 			tiflags = tcp_reass(tp, th, m, &tlen);
2550 			tp->t_flags |= TF_ACKNOW;
2551 		}
2552 		TCP_REASS_UNLOCK(tp);
2553 
2554 		/*
2555 		 * Note the amount of data that peer has sent into
2556 		 * our window, in order to estimate the sender's
2557 		 * buffer size.
2558 		 */
2559 		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2560 	} else {
2561 		m_freem(m);
2562 		m = NULL;
2563 		tiflags &= ~TH_FIN;
2564 	}
2565 
2566 	/*
2567 	 * If FIN is received ACK the FIN and let the user know
2568 	 * that the connection is closing.  Ignore a FIN received before
2569 	 * the connection is fully established.
2570 	 */
2571 	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2572 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2573 			socantrcvmore(so);
2574 			tp->t_flags |= TF_ACKNOW;
2575 			tp->rcv_nxt++;
2576 		}
2577 		switch (tp->t_state) {
2578 
2579 	 	/*
2580 		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2581 		 */
2582 		case TCPS_ESTABLISHED:
2583 			tp->t_state = TCPS_CLOSE_WAIT;
2584 			break;
2585 
2586 	 	/*
2587 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2588 		 * enter the CLOSING state.
2589 		 */
2590 		case TCPS_FIN_WAIT_1:
2591 			tp->t_state = TCPS_CLOSING;
2592 			break;
2593 
2594 	 	/*
2595 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2596 		 * starting the time-wait timer, turning off the other
2597 		 * standard timers.
2598 		 */
2599 		case TCPS_FIN_WAIT_2:
2600 			tp->t_state = TCPS_TIME_WAIT;
2601 			tcp_canceltimers(tp);
2602 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2603 			soisdisconnected(so);
2604 			break;
2605 
2606 		/*
2607 		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2608 		 */
2609 		case TCPS_TIME_WAIT:
2610 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2611 			break;
2612 		}
2613 	}
2614 #ifdef TCP_DEBUG
2615 	if (so->so_options & SO_DEBUG)
2616 		tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
2617 #endif
2618 
2619 	/*
2620 	 * Return any desired output.
2621 	 */
2622 	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2623 		(void) tcp_output(tp);
2624 	}
2625 	if (tcp_saveti)
2626 		m_freem(tcp_saveti);
2627 	return;
2628 
2629 badsyn:
2630 	/*
2631 	 * Received a bad SYN.  Increment counters and dropwithreset.
2632 	 */
2633 	tcpstat.tcps_badsyn++;
2634 	tp = NULL;
2635 	goto dropwithreset;
2636 
2637 dropafterack:
2638 	/*
2639 	 * Generate an ACK dropping incoming segment if it occupies
2640 	 * sequence space, where the ACK reflects our state.
2641 	 */
2642 	if (tiflags & TH_RST)
2643 		goto drop;
2644 	goto dropafterack2;
2645 
2646 dropafterack_ratelim:
2647 	/*
2648 	 * We may want to rate-limit ACKs against SYN/RST attack.
2649 	 */
2650 	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2651 	    tcp_ackdrop_ppslim) == 0) {
2652 		/* XXX stat */
2653 		goto drop;
2654 	}
2655 	/* ...fall into dropafterack2... */
2656 
2657 dropafterack2:
2658 	m_freem(m);
2659 	tp->t_flags |= TF_ACKNOW;
2660 	(void) tcp_output(tp);
2661 	if (tcp_saveti)
2662 		m_freem(tcp_saveti);
2663 	return;
2664 
2665 dropwithreset_ratelim:
2666 	/*
2667 	 * We may want to rate-limit RSTs in certain situations,
2668 	 * particularly if we are sending an RST in response to
2669 	 * an attempt to connect to or otherwise communicate with
2670 	 * a port for which we have no socket.
2671 	 */
2672 	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2673 	    tcp_rst_ppslim) == 0) {
2674 		/* XXX stat */
2675 		goto drop;
2676 	}
2677 	/* ...fall into dropwithreset... */
2678 
2679 dropwithreset:
2680 	/*
2681 	 * Generate a RST, dropping incoming segment.
2682 	 * Make ACK acceptable to originator of segment.
2683 	 */
2684 	if (tiflags & TH_RST)
2685 		goto drop;
2686 
2687 	switch (af) {
2688 #ifdef INET6
2689 	case AF_INET6:
2690 		/* For following calls to tcp_respond */
2691 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
2692 			goto drop;
2693 		break;
2694 #endif /* INET6 */
2695 	case AF_INET:
2696 		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
2697 		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2698 			goto drop;
2699 	}
2700 
2701 	if (tiflags & TH_ACK)
2702 		(void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
2703 	else {
2704 		if (tiflags & TH_SYN)
2705 			tlen++;
2706 		(void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
2707 		    TH_RST|TH_ACK);
2708 	}
2709 	if (tcp_saveti)
2710 		m_freem(tcp_saveti);
2711 	return;
2712 
2713 badcsum:
2714 drop:
2715 	/*
2716 	 * Drop space held by incoming segment and return.
2717 	 */
2718 	if (tp) {
2719 		if (tp->t_inpcb)
2720 			so = tp->t_inpcb->inp_socket;
2721 #ifdef INET6
2722 		else if (tp->t_in6pcb)
2723 			so = tp->t_in6pcb->in6p_socket;
2724 #endif
2725 		else
2726 			so = NULL;
2727 #ifdef TCP_DEBUG
2728 		if (so && (so->so_options & SO_DEBUG) != 0)
2729 			tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
2730 #endif
2731 	}
2732 	if (tcp_saveti)
2733 		m_freem(tcp_saveti);
2734 	m_freem(m);
2735 	return;
2736 }
2737 
2738 #ifdef TCP_SIGNATURE
2739 int
2740 tcp_signature_apply(void *fstate, caddr_t data, u_int len)
2741 {
2742 
2743 	MD5Update(fstate, (u_char *)data, len);
2744 	return (0);
2745 }
2746 
2747 struct secasvar *
2748 tcp_signature_getsav(struct mbuf *m, struct tcphdr *th)
2749 {
2750 	struct secasvar *sav;
2751 #ifdef FAST_IPSEC
2752 	union sockaddr_union dst;
2753 #endif
2754 	struct ip *ip;
2755 	struct ip6_hdr *ip6;
2756 
2757 	ip = mtod(m, struct ip *);
2758 	switch (ip->ip_v) {
2759 	case 4:
2760 		ip = mtod(m, struct ip *);
2761 		ip6 = NULL;
2762 		break;
2763 	case 6:
2764 		ip = NULL;
2765 		ip6 = mtod(m, struct ip6_hdr *);
2766 		break;
2767 	default:
2768 		return (NULL);
2769 	}
2770 
2771 #ifdef FAST_IPSEC
2772 	/* Extract the destination from the IP header in the mbuf. */
2773 	bzero(&dst, sizeof(union sockaddr_union));
2774 	dst.sa.sa_len = sizeof(struct sockaddr_in);
2775 	dst.sa.sa_family = AF_INET;
2776 	dst.sin.sin_addr = ip->ip_dst;
2777 
2778 	/*
2779 	 * Look up an SADB entry which matches the address of the peer.
2780 	 */
2781 	sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
2782 #else
2783 	if (ip)
2784 		sav = key_allocsa(AF_INET, (caddr_t)&ip->ip_src,
2785 		    (caddr_t)&ip->ip_dst, IPPROTO_TCP,
2786 		    htonl(TCP_SIG_SPI), 0, 0);
2787 	else
2788 		sav = key_allocsa(AF_INET6, (caddr_t)&ip6->ip6_src,
2789 		    (caddr_t)&ip6->ip6_dst, IPPROTO_TCP,
2790 		    htonl(TCP_SIG_SPI), 0, 0);
2791 #endif
2792 
2793 	return (sav);	/* freesav must be performed by caller */
2794 }
2795 
2796 int
2797 tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff,
2798     struct secasvar *sav, char *sig)
2799 {
2800 	MD5_CTX ctx;
2801 	struct ip *ip;
2802 	struct ipovly *ipovly;
2803 	struct ip6_hdr *ip6;
2804 	struct ippseudo ippseudo;
2805 	struct ip6_hdr_pseudo ip6pseudo;
2806 	struct tcphdr th0;
2807 	int l, tcphdrlen;
2808 
2809 	if (sav == NULL)
2810 		return (-1);
2811 
2812 	tcphdrlen = th->th_off * 4;
2813 
2814 	switch (mtod(m, struct ip *)->ip_v) {
2815 	case 4:
2816 		ip = mtod(m, struct ip *);
2817 		ip6 = NULL;
2818 		break;
2819 	case 6:
2820 		ip = NULL;
2821 		ip6 = mtod(m, struct ip6_hdr *);
2822 		break;
2823 	default:
2824 		return (-1);
2825 	}
2826 
2827 	MD5Init(&ctx);
2828 
2829 	if (ip) {
2830 		memset(&ippseudo, 0, sizeof(ippseudo));
2831 		ipovly = (struct ipovly *)ip;
2832 		ippseudo.ippseudo_src = ipovly->ih_src;
2833 		ippseudo.ippseudo_dst = ipovly->ih_dst;
2834 		ippseudo.ippseudo_pad = 0;
2835 		ippseudo.ippseudo_p = IPPROTO_TCP;
2836 		ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
2837 		MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo));
2838 	} else {
2839 		memset(&ip6pseudo, 0, sizeof(ip6pseudo));
2840 		ip6pseudo.ip6ph_src = ip6->ip6_src;
2841 		in6_clearscope(&ip6pseudo.ip6ph_src);
2842 		ip6pseudo.ip6ph_dst = ip6->ip6_dst;
2843 		in6_clearscope(&ip6pseudo.ip6ph_dst);
2844 		ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
2845 		ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
2846 		MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo));
2847 	}
2848 
2849 	th0 = *th;
2850 	th0.th_sum = 0;
2851 	MD5Update(&ctx, (char *)&th0, sizeof(th0));
2852 
2853 	l = m->m_pkthdr.len - thoff - tcphdrlen;
2854 	if (l > 0)
2855 		m_apply(m, thoff + tcphdrlen,
2856 		    m->m_pkthdr.len - thoff - tcphdrlen,
2857 		    tcp_signature_apply, &ctx);
2858 
2859 	MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
2860 	MD5Final(sig, &ctx);
2861 
2862 	return (0);
2863 }
2864 #endif
2865 
2866 int
2867 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
2868     struct mbuf *m, int toff, struct tcp_opt_info *oi)
2869 {
2870 	u_int16_t mss;
2871 	int opt, optlen = 0;
2872 #ifdef TCP_SIGNATURE
2873 	caddr_t sigp = NULL;
2874 	char sigbuf[TCP_SIGLEN];
2875 	struct secasvar *sav = NULL;
2876 #endif
2877 
2878 	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
2879 		opt = cp[0];
2880 		if (opt == TCPOPT_EOL)
2881 			break;
2882 		if (opt == TCPOPT_NOP)
2883 			optlen = 1;
2884 		else {
2885 			if (cnt < 2)
2886 				break;
2887 			optlen = cp[1];
2888 			if (optlen < 2 || optlen > cnt)
2889 				break;
2890 		}
2891 		switch (opt) {
2892 
2893 		default:
2894 			continue;
2895 
2896 		case TCPOPT_MAXSEG:
2897 			if (optlen != TCPOLEN_MAXSEG)
2898 				continue;
2899 			if (!(th->th_flags & TH_SYN))
2900 				continue;
2901 			if (TCPS_HAVERCVDSYN(tp->t_state))
2902 				continue;
2903 			bcopy(cp + 2, &mss, sizeof(mss));
2904 			oi->maxseg = ntohs(mss);
2905 			break;
2906 
2907 		case TCPOPT_WINDOW:
2908 			if (optlen != TCPOLEN_WINDOW)
2909 				continue;
2910 			if (!(th->th_flags & TH_SYN))
2911 				continue;
2912 			if (TCPS_HAVERCVDSYN(tp->t_state))
2913 				continue;
2914 			tp->t_flags |= TF_RCVD_SCALE;
2915 			tp->requested_s_scale = cp[2];
2916 			if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
2917 #if 0	/*XXX*/
2918 				char *p;
2919 
2920 				if (ip)
2921 					p = ntohl(ip->ip_src);
2922 #ifdef INET6
2923 				else if (ip6)
2924 					p = ip6_sprintf(&ip6->ip6_src);
2925 #endif
2926 				else
2927 					p = "(unknown)";
2928 				log(LOG_ERR, "TCP: invalid wscale %d from %s, "
2929 				    "assuming %d\n",
2930 				    tp->requested_s_scale, p,
2931 				    TCP_MAX_WINSHIFT);
2932 #else
2933 				log(LOG_ERR, "TCP: invalid wscale %d, "
2934 				    "assuming %d\n",
2935 				    tp->requested_s_scale,
2936 				    TCP_MAX_WINSHIFT);
2937 #endif
2938 				tp->requested_s_scale = TCP_MAX_WINSHIFT;
2939 			}
2940 			break;
2941 
2942 		case TCPOPT_TIMESTAMP:
2943 			if (optlen != TCPOLEN_TIMESTAMP)
2944 				continue;
2945 			oi->ts_present = 1;
2946 			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
2947 			NTOHL(oi->ts_val);
2948 			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
2949 			NTOHL(oi->ts_ecr);
2950 
2951 			if (!(th->th_flags & TH_SYN))
2952 				continue;
2953 			if (TCPS_HAVERCVDSYN(tp->t_state))
2954 				continue;
2955 			/*
2956 			 * A timestamp received in a SYN makes
2957 			 * it ok to send timestamp requests and replies.
2958 			 */
2959 			tp->t_flags |= TF_RCVD_TSTMP;
2960 			tp->ts_recent = oi->ts_val;
2961 			tp->ts_recent_age = tcp_now;
2962                         break;
2963 
2964 		case TCPOPT_SACK_PERMITTED:
2965 			if (optlen != TCPOLEN_SACK_PERMITTED)
2966 				continue;
2967 			if (!(th->th_flags & TH_SYN))
2968 				continue;
2969 			if (TCPS_HAVERCVDSYN(tp->t_state))
2970 				continue;
2971 			if (tcp_do_sack) {
2972 				tp->t_flags |= TF_SACK_PERMIT;
2973 				tp->t_flags |= TF_WILL_SACK;
2974 			}
2975 			break;
2976 
2977 		case TCPOPT_SACK:
2978 			tcp_sack_option(tp, th, cp, optlen);
2979 			break;
2980 #ifdef TCP_SIGNATURE
2981 		case TCPOPT_SIGNATURE:
2982 			if (optlen != TCPOLEN_SIGNATURE)
2983 				continue;
2984 			if (sigp && bcmp(sigp, cp + 2, TCP_SIGLEN))
2985 				return (-1);
2986 
2987 			sigp = sigbuf;
2988 			memcpy(sigbuf, cp + 2, TCP_SIGLEN);
2989 			memset(cp + 2, 0, TCP_SIGLEN);
2990 			tp->t_flags |= TF_SIGNATURE;
2991 			break;
2992 #endif
2993 		}
2994 	}
2995 
2996 #ifdef TCP_SIGNATURE
2997 	if (tp->t_flags & TF_SIGNATURE) {
2998 
2999 		sav = tcp_signature_getsav(m, th);
3000 
3001 		if (sav == NULL && tp->t_state == TCPS_LISTEN)
3002 			return (-1);
3003 	}
3004 
3005 	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
3006 		if (sav == NULL)
3007 			return (-1);
3008 #ifdef FAST_IPSEC
3009 		KEY_FREESAV(&sav);
3010 #else
3011 		key_freesav(sav);
3012 #endif
3013 		return (-1);
3014 	}
3015 
3016 	if (sigp) {
3017 		char sig[TCP_SIGLEN];
3018 
3019 		TCP_FIELDS_TO_NET(th);
3020 		if (tcp_signature(m, th, toff, sav, sig) < 0) {
3021 			TCP_FIELDS_TO_HOST(th);
3022 			if (sav == NULL)
3023 				return (-1);
3024 #ifdef FAST_IPSEC
3025 			KEY_FREESAV(&sav);
3026 #else
3027 			key_freesav(sav);
3028 #endif
3029 			return (-1);
3030 		}
3031 		TCP_FIELDS_TO_HOST(th);
3032 
3033 		if (bcmp(sig, sigp, TCP_SIGLEN)) {
3034 			tcpstat.tcps_badsig++;
3035 			if (sav == NULL)
3036 				return (-1);
3037 #ifdef FAST_IPSEC
3038 			KEY_FREESAV(&sav);
3039 #else
3040 			key_freesav(sav);
3041 #endif
3042 			return (-1);
3043 		} else
3044 			tcpstat.tcps_goodsig++;
3045 
3046 		key_sa_recordxfer(sav, m);
3047 #ifdef FAST_IPSEC
3048 		KEY_FREESAV(&sav);
3049 #else
3050 		key_freesav(sav);
3051 #endif
3052 	}
3053 #endif
3054 
3055 	return (0);
3056 }
3057 
3058 /*
3059  * Pull out of band byte out of a segment so
3060  * it doesn't appear in the user's data queue.
3061  * It is still reflected in the segment length for
3062  * sequencing purposes.
3063  */
3064 void
3065 tcp_pulloutofband(struct socket *so, struct tcphdr *th,
3066     struct mbuf *m, int off)
3067 {
3068 	int cnt = off + th->th_urp - 1;
3069 
3070 	while (cnt >= 0) {
3071 		if (m->m_len > cnt) {
3072 			char *cp = mtod(m, caddr_t) + cnt;
3073 			struct tcpcb *tp = sototcpcb(so);
3074 
3075 			tp->t_iobc = *cp;
3076 			tp->t_oobflags |= TCPOOB_HAVEDATA;
3077 			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
3078 			m->m_len--;
3079 			return;
3080 		}
3081 		cnt -= m->m_len;
3082 		m = m->m_next;
3083 		if (m == 0)
3084 			break;
3085 	}
3086 	panic("tcp_pulloutofband");
3087 }
3088 
3089 /*
3090  * Collect new round-trip time estimate
3091  * and update averages and current timeout.
3092  */
3093 void
3094 tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
3095 {
3096 	int32_t delta;
3097 
3098 	tcpstat.tcps_rttupdated++;
3099 	if (tp->t_srtt != 0) {
3100 		/*
3101 		 * srtt is stored as fixed point with 3 bits after the
3102 		 * binary point (i.e., scaled by 8).  The following magic
3103 		 * is equivalent to the smoothing algorithm in rfc793 with
3104 		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3105 		 * point).  Adjust rtt to origin 0.
3106 		 */
3107 		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
3108 		if ((tp->t_srtt += delta) <= 0)
3109 			tp->t_srtt = 1 << 2;
3110 		/*
3111 		 * We accumulate a smoothed rtt variance (actually, a
3112 		 * smoothed mean difference), then set the retransmit
3113 		 * timer to smoothed rtt + 4 times the smoothed variance.
3114 		 * rttvar is stored as fixed point with 2 bits after the
3115 		 * binary point (scaled by 4).  The following is
3116 		 * equivalent to rfc793 smoothing with an alpha of .75
3117 		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
3118 		 * rfc793's wired-in beta.
3119 		 */
3120 		if (delta < 0)
3121 			delta = -delta;
3122 		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
3123 		if ((tp->t_rttvar += delta) <= 0)
3124 			tp->t_rttvar = 1 << 2;
3125 	} else {
3126 		/*
3127 		 * No rtt measurement yet - use the unsmoothed rtt.
3128 		 * Set the variance to half the rtt (so our first
3129 		 * retransmit happens at 3*rtt).
3130 		 */
3131 		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
3132 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
3133 	}
3134 	tp->t_rtttime = 0;
3135 	tp->t_rxtshift = 0;
3136 
3137 	/*
3138 	 * the retransmit should happen at rtt + 4 * rttvar.
3139 	 * Because of the way we do the smoothing, srtt and rttvar
3140 	 * will each average +1/2 tick of bias.  When we compute
3141 	 * the retransmit timer, we want 1/2 tick of rounding and
3142 	 * 1 extra tick because of +-1/2 tick uncertainty in the
3143 	 * firing of the timer.  The bias will give us exactly the
3144 	 * 1.5 tick we need.  But, because the bias is
3145 	 * statistical, we have to test that we don't drop below
3146 	 * the minimum feasible timer (which is 2 ticks).
3147 	 */
3148 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3149 	    max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3150 
3151 	/*
3152 	 * We received an ack for a packet that wasn't retransmitted;
3153 	 * it is probably safe to discard any error indications we've
3154 	 * received recently.  This isn't quite right, but close enough
3155 	 * for now (a route might have failed after we sent a segment,
3156 	 * and the return path might not be symmetrical).
3157 	 */
3158 	tp->t_softerror = 0;
3159 }
3160 
3161 void
3162 tcp_reno_newack(struct tcpcb *tp, struct tcphdr *th)
3163 {
3164 	if (tp->t_partialacks < 0) {
3165 		/*
3166 		 * We were not in fast recovery.  Reset the duplicate ack
3167 		 * counter.
3168 		 */
3169 		tp->t_dupacks = 0;
3170 	} else {
3171 		/*
3172 		 * Clamp the congestion window to the crossover point and
3173 		 * exit fast recovery.
3174 		 */
3175 		if (tp->snd_cwnd > tp->snd_ssthresh)
3176 			tp->snd_cwnd = tp->snd_ssthresh;
3177 		tp->t_partialacks = -1;
3178 		tp->t_dupacks = 0;
3179 	}
3180 }
3181 
3182 /*
3183  * Implement the NewReno response to a new ack, checking for partial acks in
3184  * fast recovery.
3185  */
3186 void
3187 tcp_newreno_newack(struct tcpcb *tp, struct tcphdr *th)
3188 {
3189 	if (tp->t_partialacks < 0) {
3190 		/*
3191 		 * We were not in fast recovery.  Reset the duplicate ack
3192 		 * counter.
3193 		 */
3194 		tp->t_dupacks = 0;
3195 	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3196 		/*
3197 		 * This is a partial ack.  Retransmit the first unacknowledged
3198 		 * segment and deflate the congestion window by the amount of
3199 		 * acknowledged data.  Do not exit fast recovery.
3200 		 */
3201 		tcp_seq onxt = tp->snd_nxt;
3202 		u_long ocwnd = tp->snd_cwnd;
3203 
3204 		/*
3205 		 * snd_una has not yet been updated and the socket's send
3206 		 * buffer has not yet drained off the ACK'd data, so we
3207 		 * have to leave snd_una as it was to get the correct data
3208 		 * offset in tcp_output().
3209 		 */
3210 		if (++tp->t_partialacks == 1)
3211 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
3212 		tp->t_rtttime = 0;
3213 		tp->snd_nxt = th->th_ack;
3214 		/*
3215 		 * Set snd_cwnd to one segment beyond ACK'd offset.  snd_una
3216 		 * is not yet updated when we're called.
3217 		 */
3218 		tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
3219 		(void) tcp_output(tp);
3220 		tp->snd_cwnd = ocwnd;
3221 		if (SEQ_GT(onxt, tp->snd_nxt))
3222 			tp->snd_nxt = onxt;
3223 		/*
3224 		 * Partial window deflation.  Relies on fact that tp->snd_una
3225 		 * not updated yet.
3226 		 */
3227 		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
3228 	} else {
3229 		/*
3230 		 * Complete ack.  Inflate the congestion window to ssthresh
3231 		 * and exit fast recovery.
3232 		 *
3233 		 * Window inflation should have left us with approx.
3234 		 * snd_ssthresh outstanding data.  But in case we
3235 		 * would be inclined to send a burst, better to do
3236 		 * it via the slow start mechanism.
3237 		 */
3238 		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
3239 			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
3240 			    + tp->t_segsz;
3241 		else
3242 			tp->snd_cwnd = tp->snd_ssthresh;
3243 		tp->t_partialacks = -1;
3244 		tp->t_dupacks = 0;
3245 	}
3246 }
3247 
3248 
/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

/* Number of entries currently held in the syn cache. */
u_long	syn_cache_count;
/* Hash secrets; re-randomized by syn_cache_insert() when the cache is empty. */
u_int32_t syn_hash1, syn_hash2;

/*
 * IPv4 hash: (source address ^ syn_hash1) multiplied by
 * (((dp << 16) + sp) ^ syn_hash2).
 */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
/*
 * Compute the hash for a (src, dst) sockaddr pair into `hash'.
 * IPv4-only build: both are treated as struct sockaddr_in.
 */
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
		((const struct sockaddr_in *)(src))->sin_port,		\
		((const struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/*
 * IPv6 hash: same shape as SYN_HASH, but mixes only 32-bit words 0 and
 * 3 of the source address, and masks the result to 31 bits.
 */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/*
 * Compute the hash for a (src, dst) sockaddr pair into `hash',
 * dispatching on the address family of src.  Unknown families hash
 * to 0.
 */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
			((const struct sockaddr_in *)(src))->sin_port,	\
			((const struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
			((const struct sockaddr_in6 *)(src))->sin6_port,	\
			((const struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/*
 * Unlink an entry from the cache: take it off its hash bucket queue
 * and off its tcpcb's list, clear its sc_tp back pointer, stop its
 * timer, and adjust the per-bucket and global counts.  The entry
 * itself is not freed; see SYN_CACHE_PUT.
 */
#define	SYN_CACHE_RM(sc)						\
do {									\
	TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,	\
	    (sc), sc_bucketq);						\
	(sc)->sc_tp = NULL;						\
	LIST_REMOVE((sc), sc_tpq);					\
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
	callout_stop(&(sc)->sc_timer);					\
	syn_cache_count--;						\
} while (/*CONSTCOND*/0)

/*
 * Release an entry's resources: free any saved IP options mbuf and
 * drop the cached route reference.  If the entry's timer callout is
 * currently being invoked, the entry is only flagged SCF_DEAD (the
 * timer handler, syn_cache_timer(), returns it to the pool when it
 * sees that flag); otherwise it goes back to the pool right away.
 */
#define	SYN_CACHE_PUT(sc)						\
do {									\
	if ((sc)->sc_ipopts)						\
		(void) m_free((sc)->sc_ipopts);				\
	if ((sc)->sc_route4.ro_rt != NULL)				\
		RTFREE((sc)->sc_route4.ro_rt);				\
	if (callout_invoking(&(sc)->sc_timer))				\
		(sc)->sc_flags |= SCF_DEAD;				\
	else								\
		pool_put(&syn_cache_pool, (sc));			\
} while (/*CONSTCOND*/0)

/* Pool from which struct syn_cache entries are allocated. */
POOL_INIT(syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, "synpl", NULL);

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 *
 * Sets sc_rxtcur from TCPTV_SRTTDFLT scaled by the backoff factor for
 * the current sc_rxtshift (clamped to [TCPTV_MIN, TCPTV_REXMTMAX]) and
 * (re)schedules syn_cache_timer() for the entry.
 */
#define	SYN_CACHE_TIMER_ARM(sc)						\
do {									\
	TCPT_RANGESET((sc)->sc_rxtcur,					\
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
	    TCPTV_REXMTMAX);						\
	callout_reset(&(sc)->sc_timer,					\
	    (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc));	\
} while (/*CONSTCOND*/0)

/* tcp_now relative to the entry's own time base (sc_timebase). */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
3331 
3332 void
3333 syn_cache_init(void)
3334 {
3335 	int i;
3336 
3337 	/* Initialize the hash buckets. */
3338 	for (i = 0; i < tcp_syn_cache_size; i++)
3339 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3340 }
3341 
3342 void
3343 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3344 {
3345 	struct syn_cache_head *scp;
3346 	struct syn_cache *sc2;
3347 	int s;
3348 
3349 	/*
3350 	 * If there are no entries in the hash table, reinitialize
3351 	 * the hash secrets.
3352 	 */
3353 	if (syn_cache_count == 0) {
3354 		syn_hash1 = arc4random();
3355 		syn_hash2 = arc4random();
3356 	}
3357 
3358 	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
3359 	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
3360 	scp = &tcp_syn_cache[sc->sc_bucketidx];
3361 
3362 	/*
3363 	 * Make sure that we don't overflow the per-bucket
3364 	 * limit or the total cache size limit.
3365 	 */
3366 	s = splsoftnet();
3367 	if (scp->sch_length >= tcp_syn_bucket_limit) {
3368 		tcpstat.tcps_sc_bucketoverflow++;
3369 		/*
3370 		 * The bucket is full.  Toss the oldest element in the
3371 		 * bucket.  This will be the first entry in the bucket.
3372 		 */
3373 		sc2 = TAILQ_FIRST(&scp->sch_bucket);
3374 #ifdef DIAGNOSTIC
3375 		/*
3376 		 * This should never happen; we should always find an
3377 		 * entry in our bucket.
3378 		 */
3379 		if (sc2 == NULL)
3380 			panic("syn_cache_insert: bucketoverflow: impossible");
3381 #endif
3382 		SYN_CACHE_RM(sc2);
3383 		SYN_CACHE_PUT(sc2);
3384 	} else if (syn_cache_count >= tcp_syn_cache_limit) {
3385 		struct syn_cache_head *scp2, *sce;
3386 
3387 		tcpstat.tcps_sc_overflowed++;
3388 		/*
3389 		 * The cache is full.  Toss the oldest entry in the
3390 		 * first non-empty bucket we can find.
3391 		 *
3392 		 * XXX We would really like to toss the oldest
3393 		 * entry in the cache, but we hope that this
3394 		 * condition doesn't happen very often.
3395 		 */
3396 		scp2 = scp;
3397 		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3398 			sce = &tcp_syn_cache[tcp_syn_cache_size];
3399 			for (++scp2; scp2 != scp; scp2++) {
3400 				if (scp2 >= sce)
3401 					scp2 = &tcp_syn_cache[0];
3402 				if (! TAILQ_EMPTY(&scp2->sch_bucket))
3403 					break;
3404 			}
3405 #ifdef DIAGNOSTIC
3406 			/*
3407 			 * This should never happen; we should always find a
3408 			 * non-empty bucket.
3409 			 */
3410 			if (scp2 == scp)
3411 				panic("syn_cache_insert: cacheoverflow: "
3412 				    "impossible");
3413 #endif
3414 		}
3415 		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3416 		SYN_CACHE_RM(sc2);
3417 		SYN_CACHE_PUT(sc2);
3418 	}
3419 
3420 	/*
3421 	 * Initialize the entry's timer.
3422 	 */
3423 	sc->sc_rxttot = 0;
3424 	sc->sc_rxtshift = 0;
3425 	SYN_CACHE_TIMER_ARM(sc);
3426 
3427 	/* Link it from tcpcb entry */
3428 	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
3429 
3430 	/* Put it into the bucket. */
3431 	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
3432 	scp->sch_length++;
3433 	syn_cache_count++;
3434 
3435 	tcpstat.tcps_sc_added++;
3436 	splx(s);
3437 }
3438 
3439 /*
3440  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3441  * If we have retransmitted an entry the maximum number of times, expire
3442  * that entry.
3443  */
3444 void
3445 syn_cache_timer(void *arg)
3446 {
3447 	struct syn_cache *sc = arg;
3448 	int s;
3449 
3450 	s = splsoftnet();
3451 	callout_ack(&sc->sc_timer);
3452 
3453 	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
3454 		tcpstat.tcps_sc_delayed_free++;
3455 		pool_put(&syn_cache_pool, sc);
3456 		splx(s);
3457 		return;
3458 	}
3459 
3460 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3461 		/* Drop it -- too many retransmissions. */
3462 		goto dropit;
3463 	}
3464 
3465 	/*
3466 	 * Compute the total amount of time this entry has
3467 	 * been on a queue.  If this entry has been on longer
3468 	 * than the keep alive timer would allow, expire it.
3469 	 */
3470 	sc->sc_rxttot += sc->sc_rxtcur;
3471 	if (sc->sc_rxttot >= TCPTV_KEEP_INIT)
3472 		goto dropit;
3473 
3474 	tcpstat.tcps_sc_retransmitted++;
3475 	(void) syn_cache_respond(sc, NULL);
3476 
3477 	/* Advance the timer back-off. */
3478 	sc->sc_rxtshift++;
3479 	SYN_CACHE_TIMER_ARM(sc);
3480 
3481 	splx(s);
3482 	return;
3483 
3484  dropit:
3485 	tcpstat.tcps_sc_timed_out++;
3486 	SYN_CACHE_RM(sc);
3487 	SYN_CACHE_PUT(sc);
3488 	splx(s);
3489 }
3490 
3491 /*
3492  * Remove syn cache created by the specified tcb entry,
3493  * because this does not make sense to keep them
3494  * (if there's no tcb entry, syn cache entry will never be used)
3495  */
3496 void
3497 syn_cache_cleanup(struct tcpcb *tp)
3498 {
3499 	struct syn_cache *sc, *nsc;
3500 	int s;
3501 
3502 	s = splsoftnet();
3503 
3504 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3505 		nsc = LIST_NEXT(sc, sc_tpq);
3506 
3507 #ifdef DIAGNOSTIC
3508 		if (sc->sc_tp != tp)
3509 			panic("invalid sc_tp in syn_cache_cleanup");
3510 #endif
3511 		SYN_CACHE_RM(sc);
3512 		SYN_CACHE_PUT(sc);
3513 	}
3514 	/* just for safety */
3515 	LIST_INIT(&tp->t_sc);
3516 
3517 	splx(s);
3518 }
3519 
3520 /*
3521  * Find an entry in the syn cache.
3522  */
3523 struct syn_cache *
3524 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
3525     struct syn_cache_head **headp)
3526 {
3527 	struct syn_cache *sc;
3528 	struct syn_cache_head *scp;
3529 	u_int32_t hash;
3530 	int s;
3531 
3532 	SYN_HASHALL(hash, src, dst);
3533 
3534 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3535 	*headp = scp;
3536 	s = splsoftnet();
3537 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3538 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
3539 		if (sc->sc_hash != hash)
3540 			continue;
3541 		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3542 		    !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3543 			splx(s);
3544 			return (sc);
3545 		}
3546 	}
3547 	splx(s);
3548 	return (NULL);
3549 }
3550 
/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * src/dst are the peer and local addresses of the segment, th its TCP
 * header, so the listening socket and m the mbuf holding the segment.
 * hlen and tlen are not used here.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, unsigned int hlen, unsigned int tlen,
    struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
#ifdef INET6
	struct in6pcb *in6p = NULL;
#endif
	struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 *
	 * The ACK must be exactly sc_iss + 1 and the sequence number
	 * must lie in (sc_irs, sc_irs + 1 + sc_win]; otherwise the
	 * entry stays in the cache, m is handed to syn_cache_respond()
	 * and -1 is returned.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		(void) syn_cache_respond(sc, m);
		splx(s);
		return ((struct socket *)(-1));
	}

	/*
	 * Remove this cache entry.  From here on sc is owned by this
	 * function and is released with SYN_CACHE_PUT on every exit path.
	 */
	SYN_CACHE_RM(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	oso = so;		/* keep the listening socket around */
	so = sonewconn(so, SS_ISCONNECTED);
	if (so == NULL)
		goto resetandabort;

	/* Pick up the new socket's pcb according to the socket's family. */
	switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET
	case AF_INET:
		inp = sotoinpcb(so);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		in6p = sotoin6pcb(so);
		break;
#endif
	}
	/* Fill in the local address and port according to the packet's family. */
	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		if (inp) {
			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute();
			in_pcbstate(inp, INP_BOUND);
			/*
			 * No source route on this packet: fall back to
			 * the options saved in the cache entry, taking
			 * ownership of that mbuf away from sc.
			 */
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (in6p) {
			/* IPv4 packet to AF_INET6 socket */
			/* Build the IPv4-mapped (::ffff:a.b.c.d) local address. */
			bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr));
			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
				&in6p->in6p_laddr.s6_addr32[3],
				sizeof(((struct sockaddr_in *)dst)->sin_addr));
			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
			in6totcpcb(in6p)->t_family = AF_INET;
			/* Mirror the listener's V6ONLY setting. */
			if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
				in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
			else
				in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
#endif
		break;
#endif
#ifdef INET6
	case AF_INET6:
		if (in6p) {
			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
		break;
#endif
	}
#ifdef INET6
	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct in6pcb *oin6p = sotoin6pcb(oso);
		/* inherit socket options from the listening socket */
		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = 0;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options,
			mtod(m, struct ip6_hdr *), m);
	}
#endif

#if defined(IPSEC) || defined(FAST_IPSEC)
	/*
	 * we make a copy of policy, instead of sharing the policy,
	 * for better behavior in terms of SA lookup and dead SA removal.
	 */
	if (inp) {
		/* copy old policy into new socket's */
		if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
			printf("tcp_input: could not copy policy\n");
	}
#ifdef INET6
	else if (in6p) {
		/* copy old policy into new socket's */
		if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp,
		    in6p->in6p_sp))
			printf("tcp_input: could not copy policy\n");
	}
#endif
#endif

	/*
	 * Give the new socket our cached route reference.
	 * Clearing sc_route4.ro_rt afterwards keeps SYN_CACHE_PUT from
	 * releasing the route that the pcb now holds.
	 */
	if (inp)
		inp->inp_route = sc->sc_route4;		/* struct assignment */
#ifdef INET6
	else
		in6p->in6p_route = sc->sc_route6;
#endif
	sc->sc_route4.ro_rt = NULL;

	/* Temporary mbuf carrying the peer's address for the pcb connect call. */
	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL)
		goto resetandabort;
	MCLAIM(am, &tcp_mowner);
	am->m_len = src->sa_len;
	bcopy(src, mtod(am, caddr_t), src->sa_len);
	if (inp) {
		if (in_pcbconnect(inp, am, NULL)) {
			(void) m_free(am);
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (in6p) {
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			/* Rewrite the peer address as an IPv4-mapped sockaddr_in6. */
			struct sockaddr_in6 *sin6;
			sin6 = mtod(am, struct sockaddr_in6 *);
			am->m_len = sizeof(*sin6);
			bzero(sin6, sizeof(*sin6));
			sin6->sin6_family = AF_INET6;
			sin6->sin6_len = sizeof(*sin6);
			sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
			sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)src)->sin_addr,
				&sin6->sin6_addr.s6_addr32[3],
				sizeof(sin6->sin6_addr.s6_addr32[3]));
		}
		if (in6_pcbconnect(in6p, am, NULL)) {
			(void) m_free(am);
			goto resetandabort;
		}
	}
#endif
	else {
		/* Neither kind of pcb was found for the new socket. */
		(void) m_free(am);
		goto resetandabort;
	}
	(void) m_free(am);

	/*
	 * Locate the new connection's tcpcb.  The final else cannot be
	 * reached with both pcbs NULL, since that case already jumped
	 * to resetandabort above.
	 */
	if (inp)
		tp = intotcpcb(inp);
#ifdef INET6
	else if (in6p)
		tp = in6totcpcb(in6p);
#endif
	else
		tp = NULL;
	/* Of the listener's flags, only TF_NODELAY is carried over. */
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/* A request_r_scale of 15 marks "no window scaling" (see syn_cache_add). */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
	tp->ts_timebase = sc->sc_timebase;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		/* so is gone; clear it so the abort path skips soabort(). */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	/* Restore the sequence state recorded when the SYN was cached. */
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
	tcpstat.tcps_accepts++;

	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
		tp->t_flags |= TF_WILL_SACK;

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else {
		int ss = tcp_init_win;
#ifdef INET
		if (inp != NULL && in_localaddr(inp->inp_faddr))
			ss = tcp_init_win_local;
#endif
#ifdef INET6
		if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
			ss = tcp_init_win_local;
#endif
		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
	}

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;
	/* Start outside fast recovery. */
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;

	tcpstat.tcps_sc_completed++;
	SYN_CACHE_PUT(sc);
	return (so);

resetandabort:
	/* Hand m to tcp_respond() to send a RST. */
	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	/* Tear down the half-built socket (if any) and release the entry. */
	if (so != NULL)
		(void) soabort(so);
	SYN_CACHE_PUT(sc);
	tcpstat.tcps_sc_aborted++;
	return ((struct socket *)(-1));
}
3869 
3870 /*
3871  * This function is called when we get a RST for a
3872  * non-existent connection, so that we can see if the
3873  * connection is in the syn cache.  If it is, zap it.
3874  */
3875 
3876 void
3877 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
3878 {
3879 	struct syn_cache *sc;
3880 	struct syn_cache_head *scp;
3881 	int s = splsoftnet();
3882 
3883 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3884 		splx(s);
3885 		return;
3886 	}
3887 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3888 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
3889 		splx(s);
3890 		return;
3891 	}
3892 	SYN_CACHE_RM(sc);
3893 	splx(s);
3894 	tcpstat.tcps_sc_reset++;
3895 	SYN_CACHE_PUT(sc);
3896 }
3897 
3898 void
3899 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
3900     struct tcphdr *th)
3901 {
3902 	struct syn_cache *sc;
3903 	struct syn_cache_head *scp;
3904 	int s;
3905 
3906 	s = splsoftnet();
3907 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3908 		splx(s);
3909 		return;
3910 	}
3911 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3912 	if (ntohl (th->th_seq) != sc->sc_iss) {
3913 		splx(s);
3914 		return;
3915 	}
3916 
3917 	/*
3918 	 * If we've retransmitted 3 times and this is our second error,
3919 	 * we remove the entry.  Otherwise, we allow it to continue on.
3920 	 * This prevents us from incorrectly nuking an entry during a
3921 	 * spurious network outage.
3922 	 *
3923 	 * See tcp_notify().
3924 	 */
3925 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3926 		sc->sc_flags |= SCF_UNREACH;
3927 		splx(s);
3928 		return;
3929 	}
3930 
3931 	SYN_CACHE_RM(sc);
3932 	splx(s);
3933 	tcpstat.tcps_sc_unreach++;
3934 	SYN_CACHE_PUT(sc);
3935 }
3936 
3937 /*
3938  * Given a LISTEN socket and an inbound SYN request, add
3939  * this to the syn cache, and send back a segment:
3940  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3941  * to the source.
3942  *
3943  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3944  * Doing so would require that we hold onto the data and deliver it
3945  * to the application.  However, if we are the target of a SYN-flood
3946  * DoS attack, an attacker could send data which would eventually
3947  * consume all available buffer space if it were ACKed.  By not ACKing
3948  * the data, we avoid this DoS scenario.
3949  */
3950 
3951 int
3952 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3953     unsigned int hlen, struct socket *so, struct mbuf *m, u_char *optp,
3954     int optlen, struct tcp_opt_info *oi)
3955 {
3956 	struct tcpcb tb, *tp;
3957 	long win;
3958 	struct syn_cache *sc;
3959 	struct syn_cache_head *scp;
3960 	struct mbuf *ipopts;
3961 	struct tcp_opt_info opti;
3962 
3963 	tp = sototcpcb(so);
3964 
3965 	bzero(&opti, sizeof(opti));
3966 
3967 	/*
3968 	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
3969 	 *
3970 	 * Note this check is performed in tcp_input() very early on.
3971 	 */
3972 
3973 	/*
3974 	 * Initialize some local state.
3975 	 */
3976 	win = sbspace(&so->so_rcv);
3977 	if (win > TCP_MAXWIN)
3978 		win = TCP_MAXWIN;
3979 
3980 	switch (src->sa_family) {
3981 #ifdef INET
3982 	case AF_INET:
3983 		/*
3984 		 * Remember the IP options, if any.
3985 		 */
3986 		ipopts = ip_srcroute();
3987 		break;
3988 #endif
3989 	default:
3990 		ipopts = NULL;
3991 	}
3992 
3993 #ifdef TCP_SIGNATURE
3994 	if (optp || (tp->t_flags & TF_SIGNATURE))
3995 #else
3996 	if (optp)
3997 #endif
3998 	{
3999 		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
4000 #ifdef TCP_SIGNATURE
4001 		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
4002 #endif
4003 		tb.t_state = TCPS_LISTEN;
4004 		if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len -
4005 		    sizeof(struct tcphdr) - optlen - hlen, oi) < 0)
4006 			return (0);
4007 	} else
4008 		tb.t_flags = 0;
4009 
4010 	/*
4011 	 * See if we already have an entry for this connection.
4012 	 * If we do, resend the SYN,ACK.  We do not count this
4013 	 * as a retransmission (XXX though maybe we should).
4014 	 */
4015 	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
4016 		tcpstat.tcps_sc_dupesyn++;
4017 		if (ipopts) {
4018 			/*
4019 			 * If we were remembering a previous source route,
4020 			 * forget it and use the new one we've been given.
4021 			 */
4022 			if (sc->sc_ipopts)
4023 				(void) m_free(sc->sc_ipopts);
4024 			sc->sc_ipopts = ipopts;
4025 		}
4026 		sc->sc_timestamp = tb.ts_recent;
4027 		if (syn_cache_respond(sc, m) == 0) {
4028 			tcpstat.tcps_sndacks++;
4029 			tcpstat.tcps_sndtotal++;
4030 		}
4031 		return (1);
4032 	}
4033 
4034 	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
4035 	if (sc == NULL) {
4036 		if (ipopts)
4037 			(void) m_free(ipopts);
4038 		return (0);
4039 	}
4040 
4041 	/*
4042 	 * Fill in the cache, and put the necessary IP and TCP
4043 	 * options into the reply.
4044 	 */
4045 	bzero(sc, sizeof(struct syn_cache));
4046 	callout_init(&sc->sc_timer);
4047 	bcopy(src, &sc->sc_src, src->sa_len);
4048 	bcopy(dst, &sc->sc_dst, dst->sa_len);
4049 	sc->sc_flags = 0;
4050 	sc->sc_ipopts = ipopts;
4051 	sc->sc_irs = th->th_seq;
4052 	switch (src->sa_family) {
4053 #ifdef INET
4054 	case AF_INET:
4055 	    {
4056 		struct sockaddr_in *srcin = (void *) src;
4057 		struct sockaddr_in *dstin = (void *) dst;
4058 
4059 		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
4060 		    &srcin->sin_addr, dstin->sin_port,
4061 		    srcin->sin_port, sizeof(dstin->sin_addr), 0);
4062 		break;
4063 	    }
4064 #endif /* INET */
4065 #ifdef INET6
4066 	case AF_INET6:
4067 	    {
4068 		struct sockaddr_in6 *srcin6 = (void *) src;
4069 		struct sockaddr_in6 *dstin6 = (void *) dst;
4070 
4071 		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
4072 		    &srcin6->sin6_addr, dstin6->sin6_port,
4073 		    srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0);
4074 		break;
4075 	    }
4076 #endif /* INET6 */
4077 	}
4078 	sc->sc_peermaxseg = oi->maxseg;
4079 	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
4080 						m->m_pkthdr.rcvif : NULL,
4081 						sc->sc_src.sa.sa_family);
4082 	sc->sc_win = win;
4083 	sc->sc_timebase = tcp_now;	/* see tcp_newtcpcb() */
4084 	sc->sc_timestamp = tb.ts_recent;
4085 	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
4086 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
4087 		sc->sc_flags |= SCF_TIMESTAMP;
4088 	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
4089 	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
4090 		sc->sc_requested_s_scale = tb.requested_s_scale;
4091 		sc->sc_request_r_scale = 0;
4092 		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
4093 		    TCP_MAXWIN << sc->sc_request_r_scale <
4094 		    so->so_rcv.sb_hiwat)
4095 			sc->sc_request_r_scale++;
4096 	} else {
4097 		sc->sc_requested_s_scale = 15;
4098 		sc->sc_request_r_scale = 15;
4099 	}
4100 	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
4101 		sc->sc_flags |= SCF_SACK_PERMIT;
4102 #ifdef TCP_SIGNATURE
4103 	if (tb.t_flags & TF_SIGNATURE)
4104 		sc->sc_flags |= SCF_SIGNATURE;
4105 #endif
4106 	sc->sc_tp = tp;
4107 	if (syn_cache_respond(sc, m) == 0) {
4108 		syn_cache_insert(sc, tp);
4109 		tcpstat.tcps_sndacks++;
4110 		tcpstat.tcps_sndtotal++;
4111 	} else {
4112 		SYN_CACHE_PUT(sc);
4113 		tcpstat.tcps_sc_dropped++;
4114 	}
4115 	return (1);
4116 }
4117 
4118 int
4119 syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
4120 {
4121 	struct route *ro;
4122 	u_int8_t *optp;
4123 	int optlen, error;
4124 	u_int16_t tlen;
4125 	struct ip *ip = NULL;
4126 #ifdef INET6
4127 	struct ip6_hdr *ip6 = NULL;
4128 #endif
4129 	struct tcpcb *tp;
4130 	struct tcphdr *th;
4131 	u_int hlen;
4132 	struct socket *so;
4133 
4134 	switch (sc->sc_src.sa.sa_family) {
4135 	case AF_INET:
4136 		hlen = sizeof(struct ip);
4137 		ro = &sc->sc_route4;
4138 		break;
4139 #ifdef INET6
4140 	case AF_INET6:
4141 		hlen = sizeof(struct ip6_hdr);
4142 		ro = (struct route *)&sc->sc_route6;
4143 		break;
4144 #endif
4145 	default:
4146 		if (m)
4147 			m_freem(m);
4148 		return (EAFNOSUPPORT);
4149 	}
4150 
4151 	/* Compute the size of the TCP options. */
4152 	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
4153 	    ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) +
4154 #ifdef TCP_SIGNATURE
4155 	    ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) +
4156 #endif
4157 	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
4158 
4159 	tlen = hlen + sizeof(struct tcphdr) + optlen;
4160 
4161 	/*
4162 	 * Create the IP+TCP header from scratch.
4163 	 */
4164 	if (m)
4165 		m_freem(m);
4166 #ifdef DIAGNOSTIC
4167 	if (max_linkhdr + tlen > MCLBYTES)
4168 		return (ENOBUFS);
4169 #endif
4170 	MGETHDR(m, M_DONTWAIT, MT_DATA);
4171 	if (m && tlen > MHLEN) {
4172 		MCLGET(m, M_DONTWAIT);
4173 		if ((m->m_flags & M_EXT) == 0) {
4174 			m_freem(m);
4175 			m = NULL;
4176 		}
4177 	}
4178 	if (m == NULL)
4179 		return (ENOBUFS);
4180 	MCLAIM(m, &tcp_tx_mowner);
4181 
4182 	/* Fixup the mbuf. */
4183 	m->m_data += max_linkhdr;
4184 	m->m_len = m->m_pkthdr.len = tlen;
4185 	if (sc->sc_tp) {
4186 		tp = sc->sc_tp;
4187 		if (tp->t_inpcb)
4188 			so = tp->t_inpcb->inp_socket;
4189 #ifdef INET6
4190 		else if (tp->t_in6pcb)
4191 			so = tp->t_in6pcb->in6p_socket;
4192 #endif
4193 		else
4194 			so = NULL;
4195 	} else
4196 		so = NULL;
4197 	m->m_pkthdr.rcvif = NULL;
4198 	memset(mtod(m, u_char *), 0, tlen);
4199 
4200 	switch (sc->sc_src.sa.sa_family) {
4201 	case AF_INET:
4202 		ip = mtod(m, struct ip *);
4203 		ip->ip_v = 4;
4204 		ip->ip_dst = sc->sc_src.sin.sin_addr;
4205 		ip->ip_src = sc->sc_dst.sin.sin_addr;
4206 		ip->ip_p = IPPROTO_TCP;
4207 		th = (struct tcphdr *)(ip + 1);
4208 		th->th_dport = sc->sc_src.sin.sin_port;
4209 		th->th_sport = sc->sc_dst.sin.sin_port;
4210 		break;
4211 #ifdef INET6
4212 	case AF_INET6:
4213 		ip6 = mtod(m, struct ip6_hdr *);
4214 		ip6->ip6_vfc = IPV6_VERSION;
4215 		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
4216 		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
4217 		ip6->ip6_nxt = IPPROTO_TCP;
4218 		/* ip6_plen will be updated in ip6_output() */
4219 		th = (struct tcphdr *)(ip6 + 1);
4220 		th->th_dport = sc->sc_src.sin6.sin6_port;
4221 		th->th_sport = sc->sc_dst.sin6.sin6_port;
4222 		break;
4223 #endif
4224 	default:
4225 		th = NULL;
4226 	}
4227 
4228 	th->th_seq = htonl(sc->sc_iss);
4229 	th->th_ack = htonl(sc->sc_irs + 1);
4230 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
4231 	th->th_flags = TH_SYN|TH_ACK;
4232 	th->th_win = htons(sc->sc_win);
4233 	/* th_sum already 0 */
4234 	/* th_urp already 0 */
4235 
4236 	/* Tack on the TCP options. */
4237 	optp = (u_int8_t *)(th + 1);
4238 	*optp++ = TCPOPT_MAXSEG;
4239 	*optp++ = 4;
4240 	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
4241 	*optp++ = sc->sc_ourmaxseg & 0xff;
4242 
4243 	if (sc->sc_request_r_scale != 15) {
4244 		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
4245 		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
4246 		    sc->sc_request_r_scale);
4247 		optp += 4;
4248 	}
4249 
4250 	if (sc->sc_flags & SCF_TIMESTAMP) {
4251 		u_int32_t *lp = (u_int32_t *)(optp);
4252 		/* Form timestamp option as shown in appendix A of RFC 1323. */
4253 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
4254 		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
4255 		*lp   = htonl(sc->sc_timestamp);
4256 		optp += TCPOLEN_TSTAMP_APPA;
4257 	}
4258 
4259 	if (sc->sc_flags & SCF_SACK_PERMIT) {
4260 		u_int8_t *p = optp;
4261 
4262 		/* Let the peer know that we will SACK. */
4263 		p[0] = TCPOPT_SACK_PERMITTED;
4264 		p[1] = 2;
4265 		p[2] = TCPOPT_NOP;
4266 		p[3] = TCPOPT_NOP;
4267 		optp += 4;
4268 	}
4269 
4270 #ifdef TCP_SIGNATURE
4271 	if (sc->sc_flags & SCF_SIGNATURE) {
4272 		struct secasvar *sav;
4273 		u_int8_t *sigp;
4274 
4275 		sav = tcp_signature_getsav(m, th);
4276 
4277 		if (sav == NULL) {
4278 			if (m)
4279 				m_freem(m);
4280 			return (EPERM);
4281 		}
4282 
4283 		*optp++ = TCPOPT_SIGNATURE;
4284 		*optp++ = TCPOLEN_SIGNATURE;
4285 		sigp = optp;
4286 		bzero(optp, TCP_SIGLEN);
4287 		optp += TCP_SIGLEN;
4288 		*optp++ = TCPOPT_NOP;
4289 		*optp++ = TCPOPT_EOL;
4290 
4291 		(void)tcp_signature(m, th, hlen, sav, sigp);
4292 
4293 		key_sa_recordxfer(sav, m);
4294 #ifdef FAST_IPSEC
4295 		KEY_FREESAV(&sav);
4296 #else
4297 		key_freesav(sav);
4298 #endif
4299 	}
4300 #endif
4301 
4302 	/* Compute the packet's checksum. */
4303 	switch (sc->sc_src.sa.sa_family) {
4304 	case AF_INET:
4305 		ip->ip_len = htons(tlen - hlen);
4306 		th->th_sum = 0;
4307 		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4308 		break;
4309 #ifdef INET6
4310 	case AF_INET6:
4311 		ip6->ip6_plen = htons(tlen - hlen);
4312 		th->th_sum = 0;
4313 		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4314 		break;
4315 #endif
4316 	}
4317 
4318 	/*
4319 	 * Fill in some straggling IP bits.  Note the stack expects
4320 	 * ip_len to be in host order, for convenience.
4321 	 */
4322 	switch (sc->sc_src.sa.sa_family) {
4323 #ifdef INET
4324 	case AF_INET:
4325 		ip->ip_len = htons(tlen);
4326 		ip->ip_ttl = ip_defttl;
4327 		/* XXX tos? */
4328 		break;
4329 #endif
4330 #ifdef INET6
4331 	case AF_INET6:
4332 		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
4333 		ip6->ip6_vfc |= IPV6_VERSION;
4334 		ip6->ip6_plen = htons(tlen - hlen);
4335 		/* ip6_hlim will be initialized afterwards */
4336 		/* XXX flowlabel? */
4337 		break;
4338 #endif
4339 	}
4340 
4341 	/* XXX use IPsec policy on listening socket, on SYN ACK */
4342 	tp = sc->sc_tp;
4343 
4344 	switch (sc->sc_src.sa.sa_family) {
4345 #ifdef INET
4346 	case AF_INET:
4347 		error = ip_output(m, sc->sc_ipopts, ro,
4348 		    (ip_mtudisc ? IP_MTUDISC : 0),
4349 		    (struct ip_moptions *)NULL, so);
4350 		break;
4351 #endif
4352 #ifdef INET6
4353 	case AF_INET6:
4354 		ip6->ip6_hlim = in6_selecthlim(NULL,
4355 				ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
4356 
4357 		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
4358 			(struct ip6_moptions *)0, so, NULL);
4359 		break;
4360 #endif
4361 	default:
4362 		error = EAFNOSUPPORT;
4363 		break;
4364 	}
4365 	return (error);
4366 }
4367