xref: /netbsd-src/sys/netinet/tcp_input.c (revision 7863ba460b0a05b553c754e5dbc29247dddec322)
1 /*	$NetBSD: tcp_input.c,v 1.404 2018/04/03 09:03:59 maxv Exp $	*/
2 
3 /*
4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the project nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
34  *
35  * NRL grants permission for redistribution and use in source and binary
36  * forms, with or without modification, of the software and documentation
37  * created at NRL provided that the following conditions are met:
38  *
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgements:
46  *      This product includes software developed by the University of
47  *      California, Berkeley and its contributors.
48  *      This product includes software developed at the Information
49  *      Technology Division, US Naval Research Laboratory.
50  * 4. Neither the name of the NRL nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  *
66  * The views and conclusions contained in the software and documentation
67  * are those of the authors and should not be interpreted as representing
68  * official policies, either expressed or implied, of the US Naval
69  * Research Laboratory (NRL).
70  */
71 
72 /*-
73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
74  * 2011 The NetBSD Foundation, Inc.
75  * All rights reserved.
76  *
77  * This code is derived from software contributed to The NetBSD Foundation
78  * by Coyote Point Systems, Inc.
79  * This code is derived from software contributed to The NetBSD Foundation
80  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
81  * Facility, NASA Ames Research Center.
82  * This code is derived from software contributed to The NetBSD Foundation
83  * by Charles M. Hannum.
84  * This code is derived from software contributed to The NetBSD Foundation
85  * by Rui Paulo.
86  *
87  * Redistribution and use in source and binary forms, with or without
88  * modification, are permitted provided that the following conditions
89  * are met:
90  * 1. Redistributions of source code must retain the above copyright
91  *    notice, this list of conditions and the following disclaimer.
92  * 2. Redistributions in binary form must reproduce the above copyright
93  *    notice, this list of conditions and the following disclaimer in the
94  *    documentation and/or other materials provided with the distribution.
95  *
96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
106  * POSSIBILITY OF SUCH DAMAGE.
107  */
108 
109 /*
110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
111  *	The Regents of the University of California.  All rights reserved.
112  *
113  * Redistribution and use in source and binary forms, with or without
114  * modification, are permitted provided that the following conditions
115  * are met:
116  * 1. Redistributions of source code must retain the above copyright
117  *    notice, this list of conditions and the following disclaimer.
118  * 2. Redistributions in binary form must reproduce the above copyright
119  *    notice, this list of conditions and the following disclaimer in the
120  *    documentation and/or other materials provided with the distribution.
121  * 3. Neither the name of the University nor the names of its contributors
122  *    may be used to endorse or promote products derived from this software
123  *    without specific prior written permission.
124  *
125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
135  * SUCH DAMAGE.
136  *
137  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
138  */
139 
140 /*
141  *	TODO list for SYN cache stuff:
142  *
143  *	Find room for a "state" field, which is needed to keep a
144  *	compressed state for TIME_WAIT TCBs.  It's been noted already
145  *	that this is fairly important for very high-volume web and
146  *	mail servers, which use a large number of short-lived
147  *	connections.
148  */
149 
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.404 2018/04/03 09:03:59 maxv Exp $");
152 
153 #ifdef _KERNEL_OPT
154 #include "opt_inet.h"
155 #include "opt_ipsec.h"
156 #include "opt_inet_csum.h"
157 #include "opt_tcp_debug.h"
158 #endif
159 
160 #include <sys/param.h>
161 #include <sys/systm.h>
162 #include <sys/malloc.h>
163 #include <sys/mbuf.h>
164 #include <sys/protosw.h>
165 #include <sys/socket.h>
166 #include <sys/socketvar.h>
167 #include <sys/errno.h>
168 #include <sys/syslog.h>
169 #include <sys/pool.h>
170 #include <sys/domain.h>
171 #include <sys/kernel.h>
172 #ifdef TCP_SIGNATURE
173 #include <sys/md5.h>
174 #endif
175 #include <sys/lwp.h> /* for lwp0 */
176 #include <sys/cprng.h>
177 
178 #include <net/if.h>
179 #include <net/if_types.h>
180 
181 #include <netinet/in.h>
182 #include <netinet/in_systm.h>
183 #include <netinet/ip.h>
184 #include <netinet/in_pcb.h>
185 #include <netinet/in_var.h>
186 #include <netinet/ip_var.h>
187 #include <netinet/in_offload.h>
188 
189 #ifdef INET6
190 #include <netinet/ip6.h>
191 #include <netinet6/ip6_var.h>
192 #include <netinet6/in6_pcb.h>
193 #include <netinet6/ip6_var.h>
194 #include <netinet6/in6_var.h>
195 #include <netinet/icmp6.h>
196 #include <netinet6/nd6.h>
197 #ifdef TCP_SIGNATURE
198 #include <netinet6/scope6_var.h>
199 #endif
200 #endif
201 
202 #ifndef INET6
203 /* always need ip6.h for IP6_EXTHDR_GET */
204 #include <netinet/ip6.h>
205 #endif
206 
207 #include <netinet/tcp.h>
208 #include <netinet/tcp_fsm.h>
209 #include <netinet/tcp_seq.h>
210 #include <netinet/tcp_timer.h>
211 #include <netinet/tcp_var.h>
212 #include <netinet/tcp_private.h>
213 #include <netinet/tcpip.h>
214 #include <netinet/tcp_congctl.h>
215 #include <netinet/tcp_debug.h>
216 
217 #ifdef INET6
218 #include "faith.h"
219 #if defined(NFAITH) && NFAITH > 0
220 #include <net/if_faith.h>
221 #endif
222 #endif
223 
224 #ifdef IPSEC
225 #include <netipsec/ipsec.h>
226 #include <netipsec/ipsec_var.h>
227 #include <netipsec/key.h>
228 #ifdef INET6
229 #include <netipsec/ipsec6.h>
230 #endif
231 #endif	/* IPSEC*/
232 
233 #include <netinet/tcp_vtw.h>
234 
235 int	tcprexmtthresh = 3;
236 int	tcp_log_refused;
237 
238 int	tcp_do_autorcvbuf = 1;
239 int	tcp_autorcvbuf_inc = 16 * 1024;
240 int	tcp_autorcvbuf_max = 256 * 1024;
241 int	tcp_msl = (TCPTV_MSL / PR_SLOWHZ);
242 
243 static int tcp_rst_ppslim_count = 0;
244 static struct timeval tcp_rst_ppslim_last;
245 static int tcp_ackdrop_ppslim_count = 0;
246 static struct timeval tcp_ackdrop_ppslim_last;
247 
248 static void syn_cache_timer(void *);
249 
250 #define TCP_PAWS_IDLE	(24U * 24 * 60 * 60 * PR_SLOWHZ)
251 
252 /* for modulo comparisons of timestamps */
253 #define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
254 #define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
255 
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint:
 * tell ND that the peer is alive because we made forward progress.
 */
#ifdef INET6
static inline void
nd6_hint(struct tcpcb *tp)
{
	struct rtentry *rt;

	/*
	 * Bail out early unless we really have an IPv6 PCB.  The previous
	 * code evaluated &tp->t_in6pcb->in6p_route for the rtcache_unref()
	 * call even when tp or tp->t_in6pcb was NULL (the guard only
	 * protected nd6_nud_hint()), which is undefined behavior.
	 */
	if (tp == NULL || tp->t_in6pcb == NULL || tp->t_family != AF_INET6)
		return;

	rt = rtcache_validate(&tp->t_in6pcb->in6p_route);
	if (rt != NULL) {
		nd6_nud_hint(rt);
		rtcache_unref(rt, &tp->t_in6pcb->in6p_route);
	}
}
#else
static inline void
nd6_hint(struct tcpcb *tp)
{
}
#endif
276 
277 /*
278  * Compute ACK transmission behavior.  Delay the ACK unless
279  * we have already delayed an ACK (must send an ACK every two segments).
280  * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
281  * option is enabled.
282  */
283 static void
284 tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th)
285 {
286 
287 	if (tp->t_flags & TF_DELACK ||
288 	    (tcp_ack_on_push && th->th_flags & TH_PUSH))
289 		tp->t_flags |= TF_ACKNOW;
290 	else
291 		TCP_SET_DELACK(tp);
292 }
293 
294 static void
295 icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked)
296 {
297 
298 	/*
299 	 * If we had a pending ICMP message that refers to data that have
300 	 * just been acknowledged, disregard the recorded ICMP message.
301 	 */
302 	if ((tp->t_flags & TF_PMTUD_PEND) &&
303 	    SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
304 		tp->t_flags &= ~TF_PMTUD_PEND;
305 
306 	/*
307 	 * Keep track of the largest chunk of data
308 	 * acknowledged since last PMTU update
309 	 */
310 	if (tp->t_pmtud_mss_acked < acked)
311 		tp->t_pmtud_mss_acked = acked;
312 }
313 
314 /*
315  * Convert TCP protocol fields to host order for easier processing.
316  */
317 static void
318 tcp_fields_to_host(struct tcphdr *th)
319 {
320 
321 	NTOHL(th->th_seq);
322 	NTOHL(th->th_ack);
323 	NTOHS(th->th_win);
324 	NTOHS(th->th_urp);
325 }
326 
327 /*
328  * ... and reverse the above.
329  */
330 static void
331 tcp_fields_to_net(struct tcphdr *th)
332 {
333 
334 	HTONL(th->th_seq);
335 	HTONL(th->th_ack);
336 	HTONS(th->th_win);
337 	HTONS(th->th_urp);
338 }
339 
340 static void
341 tcp_urp_drop(struct tcphdr *th, int todrop, int *tiflags)
342 {
343 	if (th->th_urp > todrop) {
344 		th->th_urp -= todrop;
345 	} else {
346 		*tiflags &= ~TH_URG;
347 		th->th_urp = 0;
348 	}
349 }
350 
351 #ifdef TCP_CSUM_COUNTERS
352 #include <sys/device.h>
353 
354 extern struct evcnt tcp_hwcsum_ok;
355 extern struct evcnt tcp_hwcsum_bad;
356 extern struct evcnt tcp_hwcsum_data;
357 extern struct evcnt tcp_swcsum;
358 #if defined(INET6)
359 extern struct evcnt tcp6_hwcsum_ok;
360 extern struct evcnt tcp6_hwcsum_bad;
361 extern struct evcnt tcp6_hwcsum_data;
362 extern struct evcnt tcp6_swcsum;
363 #endif /* defined(INET6) */
364 
365 #define	TCP_CSUM_COUNTER_INCR(ev)	(ev)->ev_count++
366 
367 #else
368 
369 #define	TCP_CSUM_COUNTER_INCR(ev)	/* nothing */
370 
371 #endif /* TCP_CSUM_COUNTERS */
372 
373 #ifdef TCP_REASS_COUNTERS
374 #include <sys/device.h>
375 
376 extern struct evcnt tcp_reass_;
377 extern struct evcnt tcp_reass_empty;
378 extern struct evcnt tcp_reass_iteration[8];
379 extern struct evcnt tcp_reass_prependfirst;
380 extern struct evcnt tcp_reass_prepend;
381 extern struct evcnt tcp_reass_insert;
382 extern struct evcnt tcp_reass_inserttail;
383 extern struct evcnt tcp_reass_append;
384 extern struct evcnt tcp_reass_appendtail;
385 extern struct evcnt tcp_reass_overlaptail;
386 extern struct evcnt tcp_reass_overlapfront;
387 extern struct evcnt tcp_reass_segdup;
388 extern struct evcnt tcp_reass_fragdup;
389 
390 #define	TCP_REASS_COUNTER_INCR(ev)	(ev)->ev_count++
391 
392 #else
393 
394 #define	TCP_REASS_COUNTER_INCR(ev)	/* nothing */
395 
396 #endif /* TCP_REASS_COUNTERS */
397 
398 static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *,
399     int);
400 static int tcp_dooptions(struct tcpcb *, const u_char *, int,
401     struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
402 
403 static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
404 #ifdef INET6
405 static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
406 #endif
407 
408 #define	TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next
409 
410 #if defined(MBUFTRACE)
411 struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass");
412 #endif /* defined(MBUFTRACE) */
413 
414 static struct pool tcpipqent_pool;
415 
/*
 * One-time initialization of the pool from which TCP reassembly queue
 * entries (struct ipqent) are allocated.  The pool is created at
 * IPL_VM to match the splvm() protection used by the alloc/free
 * routines below.
 */
void
tcpipqent_init(void)
{

	pool_init(&tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl",
	    NULL, IPL_VM);
}
423 
424 struct ipqent *
425 tcpipqent_alloc(void)
426 {
427 	struct ipqent *ipqe;
428 	int s;
429 
430 	s = splvm();
431 	ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
432 	splx(s);
433 
434 	return ipqe;
435 }
436 
437 void
438 tcpipqent_free(struct ipqent *ipqe)
439 {
440 	int s;
441 
442 	s = splvm();
443 	pool_put(&tcpipqent_pool, ipqe);
444 	splx(s);
445 }
446 
/*
 * Insert the segment described by "th" and "m" (tlen bytes of payload)
 * into the reassembly queue of the connection with control block "tp".
 * Return TH_FIN if reassembly now includes a segment with FIN,
 * otherwise 0.
 *
 * May also be called with th == NULL after the connection becomes
 * established, to force queued pre-ESTABLISHED data up to the user
 * socket.
 *
 * The caller must hold the reassembly lock (TCP_REASS_LOCK_CHECK);
 * this function releases it (TCP_REASS_UNLOCK) before returning.
 */
static int
tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int tlen)
{
	struct ipqent *p, *q, *nq, *tiqe = NULL;
	struct socket *so = NULL;
	int pkt_flags;
	tcp_seq pkt_seq;
	unsigned pkt_len;
	u_long rcvpartdupbyte = 0;
	u_long rcvoobyte;
#ifdef TCP_REASS_COUNTERS
	u_int count = 0;
#endif
	uint64_t *tcps;

	/* Find the socket through whichever PCB flavor this tcpcb uses. */
	if (tp->t_inpcb)
		so = tp->t_inpcb->inp_socket;
#ifdef INET6
	else if (tp->t_in6pcb)
		so = tp->t_in6pcb->in6p_socket;
#endif

	TCP_REASS_LOCK_CHECK(tp);

	/*
	 * Call with th==NULL after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == NULL)
		goto present;

	m_claimm(m, &tcp_reass_mowner);

	rcvoobyte = tlen;
	/*
	 * Copy these to local variables because the TCP header gets munged
	 * while we are collapsing mbufs.
	 */
	pkt_seq = th->th_seq;
	pkt_len = tlen;
	pkt_flags = th->th_flags;

	TCP_REASS_COUNTER_INCR(&tcp_reass_);

	if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
		/*
		 * When we miss a packet, the vast majority of time we get
		 * packets that follow it in order.  So optimize for that.
		 */
		if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
			p->ipqe_len += pkt_len;
			p->ipqe_flags |= pkt_flags;
			m_cat(p->ipre_mlast, m);
			TRAVERSE(p->ipre_mlast);
			m = NULL;
			tiqe = p;
			TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
			TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
			goto skip_replacement;
		}
		/*
		 * While we're here, if the pkt is completely beyond
		 * anything we have, just insert it at the tail.
		 */
		if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
			TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
			goto insert_it;
		}
	}

	q = TAILQ_FIRST(&tp->segq);

	if (q != NULL) {
		/*
		 * If this segment immediately precedes the first out-of-order
		 * block, simply slap the segment in front of it and (mostly)
		 * skip the complicated logic.
		 */
		if (pkt_seq + pkt_len == q->ipqe_seq) {
			q->ipqe_seq = pkt_seq;
			q->ipqe_len += pkt_len;
			q->ipqe_flags |= pkt_flags;
			m_cat(m, q->ipqe_m);
			q->ipqe_m = m;
			q->ipre_mlast = m; /* last mbuf may have changed */
			TRAVERSE(q->ipre_mlast);
			tiqe = q;
			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
			TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
			goto skip_replacement;
		}
	} else {
		TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
	}

	/*
	 * Find a segment which begins after this one does.
	 * Note: merged fragments freed along the way are remembered in
	 * "tiqe" so they can be reused for the final insertion instead
	 * of doing a fresh pool allocation.
	 */
	for (p = NULL; q != NULL; q = nq) {
		nq = TAILQ_NEXT(q, ipqe_q);
#ifdef TCP_REASS_COUNTERS
		count++;
#endif

		/*
		 * If the received segment is just right after this
		 * fragment, merge the two together and then check
		 * for further overlaps.
		 */
		if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			pkt_seq = q->ipqe_seq;
			m_cat(q->ipre_mlast, m);
			TRAVERSE(q->ipre_mlast);
			m = q->ipqe_m;
			TCP_REASS_COUNTER_INCR(&tcp_reass_append);
			goto free_ipqe;
		}

		/*
		 * If the received segment is completely past this
		 * fragment, we need to go to the next fragment.
		 */
		if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			p = q;
			continue;
		}

		/*
		 * If the fragment is past the received segment,
		 * it (or any following) can't be concatenated.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
			TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
			break;
		}

		/*
		 * We've received all the data in this segment before.
		 * Mark it as a duplicate and return.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_RCVDUPPACK]++;
			tcps[TCP_STAT_RCVDUPBYTE] += pkt_len;
			TCP_STAT_PUTREF();
			tcp_new_dsack(tp, pkt_seq, pkt_len);
			m_freem(m);
			if (tiqe != NULL) {
				tcpipqent_free(tiqe);
			}
			TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
			goto out;
		}

		/*
		 * Received segment completely overlaps this fragment
		 * so we drop the fragment (this keeps the temporal
		 * ordering of segments correct).
		 */
		if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			rcvpartdupbyte += q->ipqe_len;
			m_freem(q->ipqe_m);
			TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
			goto free_ipqe;
		}

		/*
		 * Received segment extends past the end of the fragment.
		 * Drop the overlapping bytes, merge the fragment and
		 * segment, and treat as a longer received packet.
		 */
		if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
		    SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq))  {
			int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
			m_adj(m, overlap);
			rcvpartdupbyte += overlap;
			m_cat(q->ipre_mlast, m);
			TRAVERSE(q->ipre_mlast);
			m = q->ipqe_m;
			pkt_seq = q->ipqe_seq;
			pkt_len += q->ipqe_len - overlap;
			rcvoobyte -= overlap;
			TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
			goto free_ipqe;
		}

		/*
		 * Received segment extends past the front of the fragment.
		 * Drop the overlapping bytes on the received packet. The
		 * packet will then be concatenated with this fragment a
		 * bit later.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
		    SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len))  {
			int overlap = pkt_seq + pkt_len - q->ipqe_seq;
			m_adj(m, -overlap);
			pkt_len -= overlap;
			rcvpartdupbyte += overlap;
			TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
			rcvoobyte -= overlap;
		}

		/*
		 * If the received segment immediately precedes this
		 * fragment then tack the fragment onto this segment
		 * and reinsert the data.
		 */
		if (q->ipqe_seq == pkt_seq + pkt_len) {
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			m_cat(m, q->ipqe_m);
			TAILQ_REMOVE(&tp->segq, q, ipqe_q);
			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
			tp->t_segqlen--;
			KASSERT(tp->t_segqlen >= 0);
			KASSERT(tp->t_segqlen != 0 ||
			    (TAILQ_EMPTY(&tp->segq) &&
			    TAILQ_EMPTY(&tp->timeq)));
			if (tiqe == NULL) {
				tiqe = q;
			} else {
				tcpipqent_free(q);
			}
			TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
			break;
		}

		/*
		 * If the fragment is before the segment, remember it.
		 * When this loop is terminated, p will contain the
		 * pointer to the fragment that is right before the
		 * received segment.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
			p = q;

		/* Skip the free_ipqe tail below on normal iterations. */
		continue;

		/*
		 * This is a common operation.  It also will allow
		 * to save doing a malloc/free in most instances.
		 */
	  free_ipqe:
		TAILQ_REMOVE(&tp->segq, q, ipqe_q);
		TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
		tp->t_segqlen--;
		KASSERT(tp->t_segqlen >= 0);
		KASSERT(tp->t_segqlen != 0 ||
		    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
		if (tiqe == NULL) {
			tiqe = q;
		} else {
			tcpipqent_free(q);
		}
	}

#ifdef TCP_REASS_COUNTERS
	if (count > 7)
		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
	else if (count > 0)
		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
#endif

insert_it:
	/*
	 * Allocate a new queue entry (block) since the received segment
	 * did not collapse onto any other out-of-order block. If it had
	 * collapsed, tiqe would not be NULL and we would be reusing it.
	 *
	 * If the allocation fails, drop the packet.
	 */
	if (tiqe == NULL) {
		tiqe = tcpipqent_alloc();
		if (tiqe == NULL) {
			TCP_STATINC(TCP_STAT_RCVMEMDROP);
			m_freem(m);
			goto out;
		}
	}

	/*
	 * Update the counters.
	 */
	tp->t_rcvoopack++;
	tcps = TCP_STAT_GETREF();
	tcps[TCP_STAT_RCVOOPACK]++;
	tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
	if (rcvpartdupbyte) {
	    tcps[TCP_STAT_RCVPARTDUPPACK]++;
	    tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte;
	}
	TCP_STAT_PUTREF();

	/*
	 * Insert the new fragment queue entry into both queues.
	 */
	tiqe->ipqe_m = m;
	tiqe->ipre_mlast = m;
	tiqe->ipqe_seq = pkt_seq;
	tiqe->ipqe_len = pkt_len;
	tiqe->ipqe_flags = pkt_flags;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
	}
	tp->t_segqlen++;

skip_replacement:
	/* Newest entry goes to the head of the time-ordered queue. */
	TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		goto out;
	q = TAILQ_FIRST(&tp->segq);
	/* Nothing deliverable unless the first block starts at rcv_nxt. */
	if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
		goto out;
	/* Don't deliver payload while still in SYN_RECEIVED. */
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
		goto out;

	tp->rcv_nxt += q->ipqe_len;
	/* Report FIN (and only FIN) back to the caller. */
	pkt_flags = q->ipqe_flags & TH_FIN;
	nd6_hint(tp);

	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
	tp->t_segqlen--;
	KASSERT(tp->t_segqlen >= 0);
	KASSERT(tp->t_segqlen != 0 ||
	    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
	if (so->so_state & SS_CANTRCVMORE)
		m_freem(q->ipqe_m);
	else
		sbappendstream(&so->so_rcv, q->ipqe_m);
	tcpipqent_free(q);
	TCP_REASS_UNLOCK(tp);
	sorwakeup(so);
	return pkt_flags;

out:
	TCP_REASS_UNLOCK(tp);
	return 0;
}
803 
#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;

	/*
	 * Per draft-itojun-ipv6-tcp-to-anycast: a TCP segment addressed
	 * to an anycast address is answered with an ICMPv6 destination
	 * unreachable, not handed to tcp_input().
	 * (XXX better place to put this in?)
	 */
	if ((m->m_flags & M_ANYCAST6) != 0) {
		struct ip6_hdr *ip6;

		if (m->m_len < sizeof(struct ip6_hdr) &&
		    (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
			TCP_STATINC(TCP_STAT_RCVSHORT);
			return IPPROTO_DONE;
		}
		ip6 = mtod(m, struct ip6_hdr *);
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
		    (char *)&ip6->ip6_dst - (char *)ip6);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif
832 
833 static void
834 tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
835 {
836 	char src[INET_ADDRSTRLEN];
837 	char dst[INET_ADDRSTRLEN];
838 
839 	if (ip) {
840 		in_print(src, sizeof(src), &ip->ip_src);
841 		in_print(dst, sizeof(dst), &ip->ip_dst);
842 	} else {
843 		strlcpy(src, "(unknown)", sizeof(src));
844 		strlcpy(dst, "(unknown)", sizeof(dst));
845 	}
846 	log(LOG_INFO,
847 	    "Connection attempt to TCP %s:%d from %s:%d\n",
848 	    dst, ntohs(th->th_dport),
849 	    src, ntohs(th->th_sport));
850 }
851 
#ifdef INET6
/*
 * Log a refused IPv6 TCP connection attempt.  "ip6" may be NULL when
 * the IPv6 header is not available.
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];

	if (ip6 == NULL) {
		strlcpy(src, "(unknown v6)", sizeof(src));
		strlcpy(dst, "(unknown v6)", sizeof(dst));
	} else {
		in6_print(src, sizeof(src), &ip6->ip6_src);
		in6_print(dst, sizeof(dst), &ip6->ip6_dst);
	}
	log(LOG_INFO,
	    "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    dst, ntohs(th->th_dport),
	    src, ntohs(th->th_sport));
}
#endif
872 
/*
 * Checksum extended TCP header and data.
 *
 * Verify the TCP checksum of the segment in "m", preferring hardware
 * checksum-offload results recorded in the mbuf (when the receiving
 * interface advertises them via if_csum_flags_rx) and falling back to
 * a software checksum otherwise.  "toff" is the offset of the TCP
 * header within the packet; "tlen + off" is the number of bytes
 * covered by the checksum (TCP header plus payload).
 *
 * Returns 0 when the checksum is good, -1 when it is bad (the mbuf is
 * NOT freed here; callers handle that).
 */
int
tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th,
    int toff, int off, int tlen)
{
	struct ifnet *rcvif;
	int s;

	/*
	 * XXX it's better to record and check if this mbuf is
	 * already checked.
	 */

	rcvif = m_get_rcvif(m, &s);
	if (__predict_false(rcvif == NULL))
		goto badcsum; /* XXX */

	switch (af) {
	case AF_INET:
		/*
		 * Mask the mbuf's csum flags down to the cases the
		 * interface can actually report, plus the "bad" and
		 * "data" qualifiers.
		 */
		switch (m->m_pkthdr.csum_flags &
			((rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
			 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
		case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
			/* Hardware says the checksum is bad. */
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
			goto badcsum;

		case M_CSUM_TCPv4|M_CSUM_DATA: {
			/* Hardware supplied a raw checksum value. */
			u_int32_t hw_csum = m->m_pkthdr.csum_data;

			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
			if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
				/*
				 * The hardware did not include the
				 * pseudo-header; fold it in now.
				 */
				const struct ip *ip =
				    mtod(m, const struct ip *);

				hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htons(hw_csum + tlen + off + IPPROTO_TCP));
			}
			/* A valid checksum verifies to 0xffff. */
			if ((hw_csum ^ 0xffff) != 0)
				goto badcsum;
			break;
		}

		case M_CSUM_TCPv4:
			/* Checksum was okay. */
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
			break;

		default:
			/*
			 * Must compute it ourselves.  Maybe skip checksum
			 * on loopback interfaces.
			 */
			if (__predict_true(!(rcvif->if_flags & IFF_LOOPBACK) ||
					   tcp_do_loopback_cksum)) {
				TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
				if (in4_cksum(m, IPPROTO_TCP, toff,
					      tlen + off) != 0)
					goto badcsum;
			}
			break;
		}
		break;

#ifdef INET6
	case AF_INET6:
		switch (m->m_pkthdr.csum_flags &
			((rcvif->if_csum_flags_rx & M_CSUM_TCPv6) |
			 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
		case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD:
			TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
			goto badcsum;

#if 0 /* notyet */
		case M_CSUM_TCPv6|M_CSUM_DATA:
#endif

		case M_CSUM_TCPv6:
			/* Checksum was okay. */
			TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
			break;

		default:
			/*
			 * Must compute it ourselves.  Maybe skip checksum
			 * on loopback interfaces.
			 */
			if (__predict_true((m->m_flags & M_LOOP) == 0 ||
			    tcp_do_loopback_cksum)) {
				TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
				if (in6_cksum(m, IPPROTO_TCP, toff,
				    tlen + off) != 0)
					goto badcsum;
			}
		}
		break;
#endif /* INET6 */
	}
	m_put_rcvif(rcvif, &s);

	return 0;

badcsum:
	m_put_rcvif(rcvif, &s);
	TCP_STATINC(TCP_STAT_RCVBADSUM);
	return -1;
}
982 
983 /*
984  * When a packet arrives addressed to a vestigial tcpbp, we
985  * nevertheless have to respond to it per the spec.
986  *
987  * This code is duplicated from the one in tcp_input().
988  */
static void tcp_vtw_input(struct tcphdr *th, vestigial_inpcb_t *vp,
    struct mbuf *m, int tlen)
{
	int tiflags;		/* local copy of the segment's TCP flags */
	int todrop;		/* bytes of the segment to trim */
	uint32_t t_flags = 0;	/* stand-in for tp->t_flags; no real tcpcb here */
	uint64_t *tcps;

	tiflags = th->th_flags;
	/* Amount of the segment that lies before our receive window. */
	todrop  = vp->rcv_nxt - th->th_seq;

	if (todrop > 0) {
		/*
		 * A duplicate SYN occupies one sequence number; strip it
		 * (and adjust the urgent pointer) before trimming data.
		 */
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			th->th_seq++;
			tcp_urp_drop(th, 1, &tiflags);
			todrop--;
		}
		if (todrop > tlen ||
		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN or RST must be to the left of the
			 * window.  At this point the FIN or RST must be a
			 * duplicate or out of sequence; drop it.
			 */
			if (tiflags & TH_RST)
				goto drop;
			tiflags &= ~(TH_FIN|TH_RST);

			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_RCVDUPPACK] += 1;
			tcps[TCP_STAT_RCVDUPBYTE] += todrop;
			TCP_STAT_PUTREF();
		} else if ((tiflags & TH_RST) &&
		    th->th_seq != vp->rcv_nxt) {
			/*
			 * Test for reset before adjusting the sequence
			 * number for overlapping data.
			 */
			goto dropafterack_ratelim;
		} else {
			/* Segment partially overlaps data already received. */
			tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_RCVPARTDUPPACK] += 1;
			tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
			TCP_STAT_PUTREF();
		}

//		tcp_new_dsack(tp, th->th_seq, todrop);
//		hdroptlen += todrop;	/*drop from head afterwards*/

		/* Advance past the duplicate portion of the segment. */
		th->th_seq += todrop;
		tlen -= todrop;
		tcp_urp_drop(th, todrop, &tiflags);
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if (tlen) {
		TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (vp->rcv_nxt + vp->rcv_wnd);

	if (todrop > 0) {
		TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
		if (todrop >= tlen) {
			/*
			 * The segment actually starts after the window.
			 * th->th_seq + tlen - vp->rcv_nxt - vp->rcv_wnd >= tlen
			 * th->th_seq - vp->rcv_nxt - vp->rcv_wnd >= 0
			 * th->th_seq >= vp->rcv_nxt + vp->rcv_wnd
			 */
			TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);

			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if ((tiflags & TH_SYN) &&
			    SEQ_GT(th->th_seq, vp->rcv_nxt)) {
				/*
				 * We only support this in the !NOFDREF case, which
				 * is to say: not here.
				 */
				goto dropwithreset;
			}

			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and (if not RST) ack.
			 */
			if (vp->rcv_wnd == 0 && th->th_seq == vp->rcv_nxt) {
				t_flags |= TF_ACKNOW;
				TCP_STATINC(TCP_STAT_RCVWINPROBE);
			} else {
				goto dropafterack;
			}
		} else {
			TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
		}
		/* Trim the out-of-window tail from the mbuf chain. */
		m_adj(m, -todrop);
		tlen -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	if (tiflags & TH_RST) {
		/* Only an exactly in-sequence RST is acceptable here. */
		if (th->th_seq != vp->rcv_nxt)
			goto dropafterack_ratelim;

		/* Valid RST: tear down the vestigial time-wait entry. */
		vtw_del(vp->ctl, vp->vtw);
		goto drop;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (t_flags & TF_ACKNOW)
			goto dropafterack;
		goto drop;
	}

	/*
	 * In TIME_WAIT state the only thing that should arrive
	 * is a retransmission of the remote FIN.  Acknowledge
	 * it and restart the finack timer.
	 */
	vtw_restart(vp);
	goto dropafterack;

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	goto dropafterack2;

dropafterack_ratelim:
	/*
	 * We may want to rate-limit ACKs against SYN/RST attack.
	 */
	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
	    tcp_ackdrop_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropafterack2... */

dropafterack2:
	/* tcp_respond() consumes m, so no m_freem() on this path. */
	(void)tcp_respond(0, m, m, th, th->th_seq + tlen, th->th_ack, TH_ACK);
	return;

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 */
	if (tiflags & TH_RST)
		goto drop;

	if (tiflags & TH_ACK) {
		tcp_respond(0, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
	} else {
		/* A SYN occupies one sequence number; ACK past it. */
		if (tiflags & TH_SYN)
			++tlen;
		(void)tcp_respond(0, m, m, th, th->th_seq + tlen, (tcp_seq)0,
		    TH_RST|TH_ACK);
	}
	return;
drop:
	m_freem(m);
}
1181 
1182 /*
1183  * TCP input routine, follows pages 65-76 of RFC 793 very closely.
1184  */
1185 void
1186 tcp_input(struct mbuf *m, ...)
1187 {
1188 	struct tcphdr *th;
1189 	struct ip *ip;
1190 	struct inpcb *inp;
1191 #ifdef INET6
1192 	struct ip6_hdr *ip6;
1193 	struct in6pcb *in6p;
1194 #endif
1195 	u_int8_t *optp = NULL;
1196 	int optlen = 0;
1197 	int len, tlen, toff, hdroptlen = 0;
1198 	struct tcpcb *tp = NULL;
1199 	int tiflags;
1200 	struct socket *so = NULL;
1201 	int todrop, acked, ourfinisacked, needoutput = 0;
1202 	bool dupseg;
1203 #ifdef TCP_DEBUG
1204 	short ostate = 0;
1205 #endif
1206 	u_long tiwin;
1207 	struct tcp_opt_info opti;
1208 	int off, iphlen;
1209 	va_list ap;
1210 	int af;		/* af on the wire */
1211 	struct mbuf *tcp_saveti = NULL;
1212 	uint32_t ts_rtt;
1213 	uint8_t iptos;
1214 	uint64_t *tcps;
1215 	vestigial_inpcb_t vestige;
1216 
1217 	vestige.valid = 0;
1218 
1219 	MCLAIM(m, &tcp_rx_mowner);
1220 	va_start(ap, m);
1221 	toff = va_arg(ap, int);
1222 	(void)va_arg(ap, int);		/* ignore value, advance ap */
1223 	va_end(ap);
1224 
1225 	TCP_STATINC(TCP_STAT_RCVTOTAL);
1226 
1227 	memset(&opti, 0, sizeof(opti));
1228 	opti.ts_present = 0;
1229 	opti.maxseg = 0;
1230 
1231 	/*
1232 	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
1233 	 *
1234 	 * TCP is, by definition, unicast, so we reject all
1235 	 * multicast outright.
1236 	 *
1237 	 * Note, there are additional src/dst address checks in
1238 	 * the AF-specific code below.
1239 	 */
1240 	if (m->m_flags & (M_BCAST|M_MCAST)) {
1241 		/* XXX stat */
1242 		goto drop;
1243 	}
1244 #ifdef INET6
1245 	if (m->m_flags & M_ANYCAST6) {
1246 		/* XXX stat */
1247 		goto drop;
1248 	}
1249 #endif
1250 
1251 	IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, sizeof(struct tcphdr));
1252 	if (th == NULL) {
1253 		TCP_STATINC(TCP_STAT_RCVSHORT);
1254 		return;
1255 	}
1256 
1257 	/*
1258 	 * Get IP and TCP header.
1259 	 * Note: IP leaves IP header in first mbuf.
1260 	 */
1261 	ip = mtod(m, struct ip *);
1262 	switch (ip->ip_v) {
1263 	case 4:
1264 #ifdef INET6
1265 		ip6 = NULL;
1266 #endif
1267 		af = AF_INET;
1268 		iphlen = sizeof(struct ip);
1269 
1270 		if (IN_MULTICAST(ip->ip_dst.s_addr) ||
1271 		    in_broadcast(ip->ip_dst, m_get_rcvif_NOMPSAFE(m)))
1272 			goto drop;
1273 
1274 		/* We do the checksum after PCB lookup... */
1275 		len = ntohs(ip->ip_len);
1276 		tlen = len - toff;
1277 		iptos = ip->ip_tos;
1278 		break;
1279 #ifdef INET6
1280 	case 6:
1281 		ip = NULL;
1282 		iphlen = sizeof(struct ip6_hdr);
1283 		af = AF_INET6;
1284 		ip6 = mtod(m, struct ip6_hdr *);
1285 
1286 		/*
1287 		 * Be proactive about unspecified IPv6 address in source.
1288 		 * As we use all-zero to indicate unbounded/unconnected pcb,
1289 		 * unspecified IPv6 address can be used to confuse us.
1290 		 *
		 * Note that packets with an unspecified IPv6 destination are
		 * already dropped in ip6_input.
1293 		 */
1294 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1295 			/* XXX stat */
1296 			goto drop;
1297 		}
1298 
1299 		/*
1300 		 * Make sure destination address is not multicast.
1301 		 * Source address checked in ip6_input().
1302 		 */
1303 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
1304 			/* XXX stat */
1305 			goto drop;
1306 		}
1307 
1308 		/* We do the checksum after PCB lookup... */
1309 		len = m->m_pkthdr.len;
1310 		tlen = len - toff;
1311 		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
1312 		break;
1313 #endif
1314 	default:
1315 		m_freem(m);
1316 		return;
1317 	}
1318 
1319 	/*
1320 	 * Enforce alignment requirements that are violated in
1321 	 * some cases, see kern/50766 for details.
1322 	 */
1323 	if (TCP_HDR_ALIGNED_P(th) == 0) {
1324 		m = m_copyup(m, toff + sizeof(struct tcphdr), 0);
1325 		if (m == NULL) {
1326 			TCP_STATINC(TCP_STAT_RCVSHORT);
1327 			return;
1328 		}
1329 		ip = mtod(m, struct ip *);
1330 #ifdef INET6
1331 		ip6 = mtod(m, struct ip6_hdr *);
1332 #endif
1333 		th = (struct tcphdr *)(mtod(m, char *) + toff);
1334 	}
1335 	KASSERT(TCP_HDR_ALIGNED_P(th));
1336 
1337 	/*
1338 	 * Check that TCP offset makes sense, pull out TCP options and
1339 	 * adjust length.
1340 	 */
1341 	off = th->th_off << 2;
1342 	if (off < sizeof(struct tcphdr) || off > tlen) {
1343 		TCP_STATINC(TCP_STAT_RCVBADOFF);
1344 		goto drop;
1345 	}
1346 	tlen -= off;
1347 
1348 	if (off > sizeof(struct tcphdr)) {
1349 		IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
1350 		if (th == NULL) {
1351 			TCP_STATINC(TCP_STAT_RCVSHORT);
1352 			return;
1353 		}
1354 		KASSERT(TCP_HDR_ALIGNED_P(th));
1355 		optlen = off - sizeof(struct tcphdr);
1356 		optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
1357 
1358 		/*
1359 		 * Do quick retrieval of timestamp options.
1360 		 *
1361 		 * If timestamp is the only option and it's formatted as
1362 		 * recommended in RFC 1323 appendix A, we quickly get the
1363 		 * values now and don't bother calling tcp_dooptions(),
1364 		 * etc.
1365 		 */
1366 		if ((optlen == TCPOLEN_TSTAMP_APPA ||
1367 		     (optlen > TCPOLEN_TSTAMP_APPA &&
1368 		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1369 		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1370 		    (th->th_flags & TH_SYN) == 0) {
1371 			opti.ts_present = 1;
1372 			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
1373 			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
1374 			optp = NULL;	/* we've parsed the options */
1375 		}
1376 	}
1377 	tiflags = th->th_flags;
1378 
1379 	/*
1380 	 * Checksum extended TCP header and data
1381 	 */
1382 	if (tcp_input_checksum(af, m, th, toff, off, tlen))
1383 		goto badcsum;
1384 
1385 	/*
1386 	 * Locate pcb for segment.
1387 	 */
1388 findpcb:
1389 	inp = NULL;
1390 #ifdef INET6
1391 	in6p = NULL;
1392 #endif
1393 	switch (af) {
1394 	case AF_INET:
1395 		inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
1396 		    ip->ip_dst, th->th_dport, &vestige);
1397 		if (inp == NULL && !vestige.valid) {
1398 			TCP_STATINC(TCP_STAT_PCBHASHMISS);
1399 			inp = in_pcblookup_bind(&tcbtable, ip->ip_dst,
1400 			    th->th_dport);
1401 		}
1402 #ifdef INET6
1403 		if (inp == NULL && !vestige.valid) {
1404 			struct in6_addr s, d;
1405 
1406 			/* mapped addr case */
1407 			in6_in_2_v4mapin6(&ip->ip_src, &s);
1408 			in6_in_2_v4mapin6(&ip->ip_dst, &d);
1409 			in6p = in6_pcblookup_connect(&tcbtable, &s,
1410 			    th->th_sport, &d, th->th_dport, 0, &vestige);
1411 			if (in6p == 0 && !vestige.valid) {
1412 				TCP_STATINC(TCP_STAT_PCBHASHMISS);
1413 				in6p = in6_pcblookup_bind(&tcbtable, &d,
1414 				    th->th_dport, 0);
1415 			}
1416 		}
1417 #endif
1418 #ifndef INET6
1419 		if (inp == NULL && !vestige.valid)
1420 #else
1421 		if (inp == NULL && in6p == NULL && !vestige.valid)
1422 #endif
1423 		{
1424 			TCP_STATINC(TCP_STAT_NOPORT);
1425 			if (tcp_log_refused &&
1426 			    (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1427 				tcp4_log_refused(ip, th);
1428 			}
1429 			tcp_fields_to_host(th);
1430 			goto dropwithreset_ratelim;
1431 		}
1432 #if defined(IPSEC)
1433 		if (ipsec_used) {
1434 			if (inp && ipsec_in_reject(m, inp)) {
1435 				goto drop;
1436 			}
1437 #ifdef INET6
1438 			else if (in6p && ipsec_in_reject(m, in6p)) {
1439 				goto drop;
1440 			}
1441 #endif
1442 		}
1443 #endif /*IPSEC*/
1444 		break;
1445 #ifdef INET6
1446 	case AF_INET6:
1447 	    {
1448 		int faith;
1449 
1450 #if defined(NFAITH) && NFAITH > 0
1451 		faith = faithprefix(&ip6->ip6_dst);
1452 #else
1453 		faith = 0;
1454 #endif
1455 		in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
1456 		    th->th_sport, &ip6->ip6_dst, th->th_dport, faith, &vestige);
1457 		if (!in6p && !vestige.valid) {
1458 			TCP_STATINC(TCP_STAT_PCBHASHMISS);
1459 			in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
1460 			    th->th_dport, faith);
1461 		}
1462 		if (!in6p && !vestige.valid) {
1463 			TCP_STATINC(TCP_STAT_NOPORT);
1464 			if (tcp_log_refused &&
1465 			    (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1466 				tcp6_log_refused(ip6, th);
1467 			}
1468 			tcp_fields_to_host(th);
1469 			goto dropwithreset_ratelim;
1470 		}
1471 #if defined(IPSEC)
1472 		if (ipsec_used && in6p && ipsec_in_reject(m, in6p)) {
1473 			goto drop;
1474 		}
1475 #endif
1476 		break;
1477 	    }
1478 #endif
1479 	}
1480 
1481 	tcp_fields_to_host(th);
1482 
1483 	/*
1484 	 * If the state is CLOSED (i.e., TCB does not exist) then
1485 	 * all data in the incoming segment is discarded.
1486 	 * If the TCB exists but is in CLOSED state, it is embryonic,
1487 	 * but should either do a listen or a connect soon.
1488 	 */
1489 	tp = NULL;
1490 	so = NULL;
1491 	if (inp) {
1492 		/* Check the minimum TTL for socket. */
1493 		if (ip->ip_ttl < inp->inp_ip_minttl)
1494 			goto drop;
1495 
1496 		tp = intotcpcb(inp);
1497 		so = inp->inp_socket;
1498 	}
1499 #ifdef INET6
1500 	else if (in6p) {
1501 		tp = in6totcpcb(in6p);
1502 		so = in6p->in6p_socket;
1503 	}
1504 #endif
1505 	else if (vestige.valid) {
1506 		/* We do not support the resurrection of vtw tcpcps. */
1507 		tcp_vtw_input(th, &vestige, m, tlen);
1508 		m = NULL;
1509 		goto drop;
1510 	}
1511 
1512 	if (tp == NULL)
1513 		goto dropwithreset_ratelim;
1514 	if (tp->t_state == TCPS_CLOSED)
1515 		goto drop;
1516 
1517 	KASSERT(so->so_lock == softnet_lock);
1518 	KASSERT(solocked(so));
1519 
1520 	/* Unscale the window into a 32-bit value. */
1521 	if ((tiflags & TH_SYN) == 0)
1522 		tiwin = th->th_win << tp->snd_scale;
1523 	else
1524 		tiwin = th->th_win;
1525 
1526 #ifdef INET6
1527 	/* save packet options if user wanted */
1528 	if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
1529 		if (in6p->in6p_options) {
1530 			m_freem(in6p->in6p_options);
1531 			in6p->in6p_options = NULL;
1532 		}
1533 		KASSERT(ip6 != NULL);
1534 		ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
1535 	}
1536 #endif
1537 
1538 	if (so->so_options & SO_DEBUG) {
1539 #ifdef TCP_DEBUG
1540 		ostate = tp->t_state;
1541 #endif
1542 
1543 		tcp_saveti = NULL;
1544 		if (iphlen + sizeof(struct tcphdr) > MHLEN)
1545 			goto nosave;
1546 
1547 		if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
1548 			tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
1549 			if (tcp_saveti == NULL)
1550 				goto nosave;
1551 		} else {
1552 			MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
1553 			if (tcp_saveti == NULL)
1554 				goto nosave;
1555 			MCLAIM(m, &tcp_mowner);
1556 			tcp_saveti->m_len = iphlen;
1557 			m_copydata(m, 0, iphlen,
1558 			    mtod(tcp_saveti, void *));
1559 		}
1560 
1561 		if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
1562 			m_freem(tcp_saveti);
1563 			tcp_saveti = NULL;
1564 		} else {
1565 			tcp_saveti->m_len += sizeof(struct tcphdr);
1566 			memcpy(mtod(tcp_saveti, char *) + iphlen, th,
1567 			    sizeof(struct tcphdr));
1568 		}
1569 nosave:;
1570 	}
1571 
1572 	if (so->so_options & SO_ACCEPTCONN) {
1573 		union syn_cache_sa src;
1574 		union syn_cache_sa dst;
1575 
1576 		KASSERT(tp->t_state == TCPS_LISTEN);
1577 
1578 		memset(&src, 0, sizeof(src));
1579 		memset(&dst, 0, sizeof(dst));
1580 		switch (af) {
1581 		case AF_INET:
1582 			src.sin.sin_len = sizeof(struct sockaddr_in);
1583 			src.sin.sin_family = AF_INET;
1584 			src.sin.sin_addr = ip->ip_src;
1585 			src.sin.sin_port = th->th_sport;
1586 
1587 			dst.sin.sin_len = sizeof(struct sockaddr_in);
1588 			dst.sin.sin_family = AF_INET;
1589 			dst.sin.sin_addr = ip->ip_dst;
1590 			dst.sin.sin_port = th->th_dport;
1591 			break;
1592 #ifdef INET6
1593 		case AF_INET6:
1594 			src.sin6.sin6_len = sizeof(struct sockaddr_in6);
1595 			src.sin6.sin6_family = AF_INET6;
1596 			src.sin6.sin6_addr = ip6->ip6_src;
1597 			src.sin6.sin6_port = th->th_sport;
1598 
1599 			dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
1600 			dst.sin6.sin6_family = AF_INET6;
1601 			dst.sin6.sin6_addr = ip6->ip6_dst;
1602 			dst.sin6.sin6_port = th->th_dport;
1603 			break;
1604 #endif
1605 		}
1606 
1607 		if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1608 			if (tiflags & TH_RST) {
1609 				syn_cache_reset(&src.sa, &dst.sa, th);
1610 			} else if ((tiflags & (TH_ACK|TH_SYN)) ==
1611 			    (TH_ACK|TH_SYN)) {
1612 				/*
1613 				 * Received a SYN,ACK. This should never
1614 				 * happen while we are in LISTEN. Send an RST.
1615 				 */
1616 				goto badsyn;
1617 			} else if (tiflags & TH_ACK) {
1618 				so = syn_cache_get(&src.sa, &dst.sa, th, so, m);
1619 				if (so == NULL) {
1620 					/*
1621 					 * We don't have a SYN for this ACK;
1622 					 * send an RST.
1623 					 */
1624 					goto badsyn;
1625 				} else if (so == (struct socket *)(-1)) {
1626 					/*
1627 					 * We were unable to create the
1628 					 * connection. If the 3-way handshake
1629 					 * was completed, and RST has been
1630 					 * sent to the peer. Since the mbuf
1631 					 * might be in use for the reply, do
1632 					 * not free it.
1633 					 */
1634 					m = NULL;
1635 				} else {
1636 					/*
1637 					 * We have created a full-blown
1638 					 * connection.
1639 					 */
1640 					tp = NULL;
1641 					inp = NULL;
1642 #ifdef INET6
1643 					in6p = NULL;
1644 #endif
1645 					switch (so->so_proto->pr_domain->dom_family) {
1646 					case AF_INET:
1647 						inp = sotoinpcb(so);
1648 						tp = intotcpcb(inp);
1649 						break;
1650 #ifdef INET6
1651 					case AF_INET6:
1652 						in6p = sotoin6pcb(so);
1653 						tp = in6totcpcb(in6p);
1654 						break;
1655 #endif
1656 					}
1657 					if (tp == NULL)
1658 						goto badsyn;	/*XXX*/
1659 					tiwin <<= tp->snd_scale;
1660 					goto after_listen;
1661 				}
1662 			} else {
1663 				/*
1664 				 * None of RST, SYN or ACK was set.
1665 				 * This is an invalid packet for a
1666 				 * TCB in LISTEN state.  Send a RST.
1667 				 */
1668 				goto badsyn;
1669 			}
1670 		} else {
1671 			/*
1672 			 * Received a SYN.
1673 			 */
1674 
1675 #ifdef INET6
1676 			/*
1677 			 * If deprecated address is forbidden, we do
1678 			 * not accept SYN to deprecated interface
1679 			 * address to prevent any new inbound
1680 			 * connection from getting established.
1681 			 * When we do not accept SYN, we send a TCP
1682 			 * RST, with deprecated source address (instead
1683 			 * of dropping it).  We compromise it as it is
1684 			 * much better for peer to send a RST, and
1685 			 * RST will be the final packet for the
1686 			 * exchange.
1687 			 *
1688 			 * If we do not forbid deprecated addresses, we
1689 			 * accept the SYN packet.  RFC2462 does not
1690 			 * suggest dropping SYN in this case.
1691 			 * If we decipher RFC2462 5.5.4, it says like
1692 			 * this:
1693 			 * 1. use of deprecated addr with existing
1694 			 *    communication is okay - "SHOULD continue
1695 			 *    to be used"
1696 			 * 2. use of it with new communication:
1697 			 *   (2a) "SHOULD NOT be used if alternate
1698 			 *        address with sufficient scope is
1699 			 *        available"
1700 			 *   (2b) nothing mentioned otherwise.
1701 			 * Here we fall into (2b) case as we have no
1702 			 * choice in our source address selection - we
1703 			 * must obey the peer.
1704 			 *
1705 			 * The wording in RFC2462 is confusing, and
1706 			 * there are multiple description text for
1707 			 * deprecated address handling - worse, they
1708 			 * are not exactly the same.  I believe 5.5.4
1709 			 * is the best one, so we follow 5.5.4.
1710 			 */
1711 			if (af == AF_INET6 && !ip6_use_deprecated) {
1712 				struct in6_ifaddr *ia6;
1713 				int s;
1714 				struct ifnet *rcvif = m_get_rcvif(m, &s);
1715 				if (rcvif == NULL)
1716 					goto dropwithreset; /* XXX */
1717 				if ((ia6 = in6ifa_ifpwithaddr(rcvif,
1718 				    &ip6->ip6_dst)) &&
1719 				    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1720 					tp = NULL;
1721 					m_put_rcvif(rcvif, &s);
1722 					goto dropwithreset;
1723 				}
1724 				m_put_rcvif(rcvif, &s);
1725 			}
1726 #endif
1727 
1728 			/*
1729 			 * LISTEN socket received a SYN from itself? This
1730 			 * can't possibly be valid; drop the packet.
1731 			 */
1732 			if (th->th_sport == th->th_dport) {
1733 				int eq = 0;
1734 
1735 				switch (af) {
1736 				case AF_INET:
1737 					eq = in_hosteq(ip->ip_src, ip->ip_dst);
1738 					break;
1739 #ifdef INET6
1740 				case AF_INET6:
1741 					eq = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
1742 					    &ip6->ip6_dst);
1743 					break;
1744 #endif
1745 				}
1746 				if (eq) {
1747 					TCP_STATINC(TCP_STAT_BADSYN);
1748 					goto drop;
1749 				}
1750 			}
1751 
1752 			/*
1753 			 * SYN looks ok; create compressed TCP
1754 			 * state for it.
1755 			 */
1756 			if (so->so_qlen <= so->so_qlimit &&
1757 			    syn_cache_add(&src.sa, &dst.sa, th, toff,
1758 			    so, m, optp, optlen, &opti))
1759 				m = NULL;
1760 		}
1761 
1762 		goto drop;
1763 	}
1764 
1765 after_listen:
1766 	/*
1767 	 * From here on, we're dealing with !LISTEN.
1768 	 */
1769 	KASSERT(tp->t_state != TCPS_LISTEN);
1770 
1771 	/*
1772 	 * Segment received on connection.
1773 	 * Reset idle time and keep-alive timer.
1774 	 */
1775 	tp->t_rcvtime = tcp_now;
1776 	if (TCPS_HAVEESTABLISHED(tp->t_state))
1777 		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
1778 
1779 	/*
1780 	 * Process options.
1781 	 */
1782 #ifdef TCP_SIGNATURE
1783 	if (optp || (tp->t_flags & TF_SIGNATURE))
1784 #else
1785 	if (optp)
1786 #endif
1787 		if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
1788 			goto drop;
1789 
1790 	if (TCP_SACK_ENABLED(tp)) {
1791 		tcp_del_sackholes(tp, th);
1792 	}
1793 
1794 	if (TCP_ECN_ALLOWED(tp)) {
1795 		if (tiflags & TH_CWR) {
1796 			tp->t_flags &= ~TF_ECN_SND_ECE;
1797 		}
1798 		switch (iptos & IPTOS_ECN_MASK) {
1799 		case IPTOS_ECN_CE:
1800 			tp->t_flags |= TF_ECN_SND_ECE;
1801 			TCP_STATINC(TCP_STAT_ECN_CE);
1802 			break;
1803 		case IPTOS_ECN_ECT0:
1804 			TCP_STATINC(TCP_STAT_ECN_ECT);
1805 			break;
1806 		case IPTOS_ECN_ECT1:
1807 			/* XXX: ignore for now -- rpaulo */
1808 			break;
1809 		}
1810 		/*
1811 		 * Congestion experienced.
1812 		 * Ignore if we are already trying to recover.
1813 		 */
1814 		if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
1815 			tp->t_congctl->cong_exp(tp);
1816 	}
1817 
1818 	if (opti.ts_present && opti.ts_ecr) {
1819 		/*
1820 		 * Calculate the RTT from the returned time stamp and the
1821 		 * connection's time base.  If the time stamp is later than
1822 		 * the current time, or is extremely old, fall back to non-1323
1823 		 * RTT calculation.  Since ts_rtt is unsigned, we can test both
1824 		 * at the same time.
1825 		 *
1826 		 * Note that ts_rtt is in units of slow ticks (500
1827 		 * ms).  Since most earthbound RTTs are < 500 ms,
1828 		 * observed values will have large quantization noise.
1829 		 * Our smoothed RTT is then the fraction of observed
1830 		 * samples that are 1 tick instead of 0 (times 500
1831 		 * ms).
1832 		 *
1833 		 * ts_rtt is increased by 1 to denote a valid sample,
1834 		 * with 0 indicating an invalid measurement.  This
1835 		 * extra 1 must be removed when ts_rtt is used, or
1836 		 * else an an erroneous extra 500 ms will result.
1837 		 */
1838 		ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
1839 		if (ts_rtt > TCP_PAWS_IDLE)
1840 			ts_rtt = 0;
1841 	} else {
1842 		ts_rtt = 0;
1843 	}
1844 
1845 	/*
1846 	 * Fast path: check for the two common cases of a uni-directional
1847 	 * data transfer. If:
1848 	 *    o We are in the ESTABLISHED state, and
1849 	 *    o The packet has no control flags, and
1850 	 *    o The packet is in-sequence, and
1851 	 *    o The window didn't change, and
1852 	 *    o We are not retransmitting
1853 	 * It's a candidate.
1854 	 *
1855 	 * If the length (tlen) is zero and the ack moved forward, we're
1856 	 * the sender side of the transfer. Just free the data acked and
1857 	 * wake any higher level process that was blocked waiting for
1858 	 * space.
1859 	 *
1860 	 * If the length is non-zero and the ack didn't move, we're the
1861 	 * receiver side. If we're getting packets in-order (the reassembly
1862 	 * queue is empty), add the data to the socket buffer and note
1863 	 * that we need a delayed ack.
1864 	 */
1865 	if (tp->t_state == TCPS_ESTABLISHED &&
1866 	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK))
1867 	        == TH_ACK &&
1868 	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1869 	    th->th_seq == tp->rcv_nxt &&
1870 	    tiwin && tiwin == tp->snd_wnd &&
1871 	    tp->snd_nxt == tp->snd_max) {
1872 
1873 		/*
1874 		 * If last ACK falls within this segment's sequence numbers,
1875 		 * record the timestamp.
1876 		 * NOTE that the test is modified according to the latest
1877 		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1878 		 *
1879 		 * note that we already know
1880 		 *	TSTMP_GEQ(opti.ts_val, tp->ts_recent)
1881 		 */
1882 		if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1883 			tp->ts_recent_age = tcp_now;
1884 			tp->ts_recent = opti.ts_val;
1885 		}
1886 
1887 		if (tlen == 0) {
1888 			/* Ack prediction. */
1889 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
1890 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
1891 			    tp->snd_cwnd >= tp->snd_wnd &&
1892 			    tp->t_partialacks < 0) {
1893 				/*
1894 				 * this is a pure ack for outstanding data.
1895 				 */
1896 				if (ts_rtt)
1897 					tcp_xmit_timer(tp, ts_rtt - 1);
1898 				else if (tp->t_rtttime &&
1899 				    SEQ_GT(th->th_ack, tp->t_rtseq))
1900 					tcp_xmit_timer(tp,
1901 					  tcp_now - tp->t_rtttime);
1902 				acked = th->th_ack - tp->snd_una;
1903 				tcps = TCP_STAT_GETREF();
1904 				tcps[TCP_STAT_PREDACK]++;
1905 				tcps[TCP_STAT_RCVACKPACK]++;
1906 				tcps[TCP_STAT_RCVACKBYTE] += acked;
1907 				TCP_STAT_PUTREF();
1908 				nd6_hint(tp);
1909 
1910 				if (acked > (tp->t_lastoff - tp->t_inoff))
1911 					tp->t_lastm = NULL;
1912 				sbdrop(&so->so_snd, acked);
1913 				tp->t_lastoff -= acked;
1914 
1915 				icmp_check(tp, th, acked);
1916 
1917 				tp->snd_una = th->th_ack;
1918 				tp->snd_fack = tp->snd_una;
1919 				if (SEQ_LT(tp->snd_high, tp->snd_una))
1920 					tp->snd_high = tp->snd_una;
1921 				m_freem(m);
1922 
1923 				/*
1924 				 * If all outstanding data are acked, stop
1925 				 * retransmit timer, otherwise restart timer
1926 				 * using current (possibly backed-off) value.
1927 				 * If process is waiting for space,
1928 				 * wakeup/selnotify/signal.  If data
1929 				 * are ready to send, let tcp_output
1930 				 * decide between more output or persist.
1931 				 */
1932 				if (tp->snd_una == tp->snd_max)
1933 					TCP_TIMER_DISARM(tp, TCPT_REXMT);
1934 				else if (TCP_TIMER_ISARMED(tp,
1935 				    TCPT_PERSIST) == 0)
1936 					TCP_TIMER_ARM(tp, TCPT_REXMT,
1937 					    tp->t_rxtcur);
1938 
1939 				sowwakeup(so);
1940 				if (so->so_snd.sb_cc) {
1941 					KERNEL_LOCK(1, NULL);
1942 					(void)tcp_output(tp);
1943 					KERNEL_UNLOCK_ONE(NULL);
1944 				}
1945 				if (tcp_saveti)
1946 					m_freem(tcp_saveti);
1947 				return;
1948 			}
1949 		} else if (th->th_ack == tp->snd_una &&
1950 		    TAILQ_FIRST(&tp->segq) == NULL &&
1951 		    tlen <= sbspace(&so->so_rcv)) {
1952 			int newsize = 0;
1953 
1954 			/*
1955 			 * this is a pure, in-sequence data packet
1956 			 * with nothing on the reassembly queue and
1957 			 * we have enough buffer space to take it.
1958 			 */
1959 			tp->rcv_nxt += tlen;
1960 			tcps = TCP_STAT_GETREF();
1961 			tcps[TCP_STAT_PREDDAT]++;
1962 			tcps[TCP_STAT_RCVPACK]++;
1963 			tcps[TCP_STAT_RCVBYTE] += tlen;
1964 			TCP_STAT_PUTREF();
1965 			nd6_hint(tp);
1966 
1967 		/*
1968 		 * Automatic sizing enables the performance of large buffers
1969 		 * and most of the efficiency of small ones by only allocating
1970 		 * space when it is needed.
1971 		 *
1972 		 * On the receive side the socket buffer memory is only rarely
1973 		 * used to any significant extent.  This allows us to be much
1974 		 * more aggressive in scaling the receive socket buffer.  For
1975 		 * the case that the buffer space is actually used to a large
1976 		 * extent and we run out of kernel memory we can simply drop
1977 		 * the new segments; TCP on the sender will just retransmit it
1978 		 * later.  Setting the buffer size too big may only consume too
1979 		 * much kernel memory if the application doesn't read() from
1980 		 * the socket or packet loss or reordering makes use of the
1981 		 * reassembly queue.
1982 		 *
1983 		 * The criteria to step up the receive buffer one notch are:
1984 		 *  1. the number of bytes received during the time it takes
1985 		 *     one timestamp to be reflected back to us (the RTT);
1986 		 *  2. received bytes per RTT is within seven eighth of the
1987 		 *     current socket buffer size;
1988 		 *  3. receive buffer size has not hit maximal automatic size;
1989 		 *
1990 		 * This algorithm does one step per RTT at most and only if
1991 		 * we receive a bulk stream w/o packet losses or reorderings.
1992 		 * Shrinking the buffer during idle times is not necessary as
1993 		 * it doesn't consume any memory when idle.
1994 		 *
1995 		 * TODO: Only step up if the application is actually serving
1996 		 * the buffer to better manage the socket buffer resources.
1997 		 */
1998 			if (tcp_do_autorcvbuf &&
1999 			    opti.ts_ecr &&
2000 			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
2001 				if (opti.ts_ecr > tp->rfbuf_ts &&
2002 				    opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) {
2003 					if (tp->rfbuf_cnt >
2004 					    (so->so_rcv.sb_hiwat / 8 * 7) &&
2005 					    so->so_rcv.sb_hiwat <
2006 					    tcp_autorcvbuf_max) {
2007 						newsize =
2008 						    min(so->so_rcv.sb_hiwat +
2009 						    tcp_autorcvbuf_inc,
2010 						    tcp_autorcvbuf_max);
2011 					}
2012 					/* Start over with next RTT. */
2013 					tp->rfbuf_ts = 0;
2014 					tp->rfbuf_cnt = 0;
2015 				} else
2016 					tp->rfbuf_cnt += tlen;	/* add up */
2017 			}
2018 
2019 			/*
2020 			 * Drop TCP, IP headers and TCP options then add data
2021 			 * to socket buffer.
2022 			 */
2023 			if (so->so_state & SS_CANTRCVMORE) {
2024 				m_freem(m);
2025 			} else {
2026 				/*
2027 				 * Set new socket buffer size.
2028 				 * Give up when limit is reached.
2029 				 */
2030 				if (newsize)
2031 					if (!sbreserve(&so->so_rcv,
2032 					    newsize, so))
2033 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
2034 				m_adj(m, toff + off);
2035 				sbappendstream(&so->so_rcv, m);
2036 			}
2037 			sorwakeup(so);
2038 			tcp_setup_ack(tp, th);
2039 			if (tp->t_flags & TF_ACKNOW) {
2040 				KERNEL_LOCK(1, NULL);
2041 				(void)tcp_output(tp);
2042 				KERNEL_UNLOCK_ONE(NULL);
2043 			}
2044 			if (tcp_saveti)
2045 				m_freem(tcp_saveti);
2046 			return;
2047 		}
2048 	}
2049 
2050 	/*
2051 	 * Compute mbuf offset to TCP data segment.
2052 	 */
2053 	hdroptlen = toff + off;
2054 
2055 	/*
2056 	 * Calculate amount of space in receive window. Receive window is
2057 	 * amount of space in rcv queue, but not less than advertised
2058 	 * window.
2059 	 */
2060 	{
2061 		int win;
2062 		win = sbspace(&so->so_rcv);
2063 		if (win < 0)
2064 			win = 0;
2065 		tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2066 	}
2067 
2068 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
2069 	tp->rfbuf_ts = 0;
2070 	tp->rfbuf_cnt = 0;
2071 
2072 	switch (tp->t_state) {
2073 	/*
2074 	 * If the state is SYN_SENT:
2075 	 *	if seg contains an ACK, but not for our SYN, drop the input.
2076 	 *	if seg contains a RST, then drop the connection.
2077 	 *	if seg does not contain SYN, then drop it.
2078 	 * Otherwise this is an acceptable SYN segment
2079 	 *	initialize tp->rcv_nxt and tp->irs
2080 	 *	if seg contains ack then advance tp->snd_una
2081 	 *	if seg contains a ECE and ECN support is enabled, the stream
2082 	 *	    is ECN capable.
2083 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2084 	 *	arrange for segment to be acked (eventually)
2085 	 *	continue processing rest of data/controls, beginning with URG
2086 	 */
2087 	case TCPS_SYN_SENT:
2088 		if ((tiflags & TH_ACK) &&
2089 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
2090 		     SEQ_GT(th->th_ack, tp->snd_max)))
2091 			goto dropwithreset;
2092 		if (tiflags & TH_RST) {
2093 			if (tiflags & TH_ACK)
2094 				tp = tcp_drop(tp, ECONNREFUSED);
2095 			goto drop;
2096 		}
2097 		if ((tiflags & TH_SYN) == 0)
2098 			goto drop;
2099 		if (tiflags & TH_ACK) {
2100 			tp->snd_una = th->th_ack;
2101 			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2102 				tp->snd_nxt = tp->snd_una;
2103 			if (SEQ_LT(tp->snd_high, tp->snd_una))
2104 				tp->snd_high = tp->snd_una;
2105 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
2106 
2107 			if ((tiflags & TH_ECE) && tcp_do_ecn) {
2108 				tp->t_flags |= TF_ECN_PERMIT;
2109 				TCP_STATINC(TCP_STAT_ECN_SHS);
2110 			}
2111 		}
2112 		tp->irs = th->th_seq;
2113 		tcp_rcvseqinit(tp);
2114 		tp->t_flags |= TF_ACKNOW;
2115 		tcp_mss_from_peer(tp, opti.maxseg);
2116 
2117 		/*
2118 		 * Initialize the initial congestion window.  If we
2119 		 * had to retransmit the SYN, we must initialize cwnd
2120 		 * to 1 segment (i.e. the Loss Window).
2121 		 */
2122 		if (tp->t_flags & TF_SYN_REXMT)
2123 			tp->snd_cwnd = tp->t_peermss;
2124 		else {
2125 			int ss = tcp_init_win;
2126 			if (inp != NULL && in_localaddr(inp->inp_faddr))
2127 				ss = tcp_init_win_local;
2128 #ifdef INET6
2129 			if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
2130 				ss = tcp_init_win_local;
2131 #endif
2132 			tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
2133 		}
2134 
2135 		tcp_rmx_rtt(tp);
2136 		if (tiflags & TH_ACK) {
2137 			TCP_STATINC(TCP_STAT_CONNECTS);
2138 			/*
2139 			 * move tcp_established before soisconnected
2140 			 * because upcall handler can drive tcp_output
2141 			 * functionality.
2142 			 * XXX we might call soisconnected at the end of
2143 			 * all processing
2144 			 */
2145 			tcp_established(tp);
2146 			soisconnected(so);
2147 			/* Do window scaling on this connection? */
2148 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2149 			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2150 				tp->snd_scale = tp->requested_s_scale;
2151 				tp->rcv_scale = tp->request_r_scale;
2152 			}
2153 			TCP_REASS_LOCK(tp);
2154 			(void)tcp_reass(tp, NULL, NULL, tlen);
2155 			/*
2156 			 * if we didn't have to retransmit the SYN,
2157 			 * use its rtt as our initial srtt & rtt var.
2158 			 */
2159 			if (tp->t_rtttime)
2160 				tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2161 		} else {
2162 			tp->t_state = TCPS_SYN_RECEIVED;
2163 		}
2164 
2165 		/*
2166 		 * Advance th->th_seq to correspond to first data byte.
2167 		 * If data, trim to stay within window,
2168 		 * dropping FIN if necessary.
2169 		 */
2170 		th->th_seq++;
2171 		if (tlen > tp->rcv_wnd) {
2172 			todrop = tlen - tp->rcv_wnd;
2173 			m_adj(m, -todrop);
2174 			tlen = tp->rcv_wnd;
2175 			tiflags &= ~TH_FIN;
2176 			tcps = TCP_STAT_GETREF();
2177 			tcps[TCP_STAT_RCVPACKAFTERWIN]++;
2178 			tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop;
2179 			TCP_STAT_PUTREF();
2180 		}
2181 		tp->snd_wl1 = th->th_seq - 1;
2182 		tp->rcv_up = th->th_seq;
2183 		goto step6;
2184 
2185 	/*
2186 	 * If the state is SYN_RECEIVED:
2187 	 *	If seg contains an ACK, but not for our SYN, drop the input
2188 	 *	and generate an RST.  See page 36, rfc793
2189 	 */
2190 	case TCPS_SYN_RECEIVED:
2191 		if ((tiflags & TH_ACK) &&
2192 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
2193 		     SEQ_GT(th->th_ack, tp->snd_max)))
2194 			goto dropwithreset;
2195 		break;
2196 	}
2197 
2198 	/*
2199 	 * From here on, we're dealing with !LISTEN and !SYN_SENT.
2200 	 */
2201 	KASSERT(tp->t_state != TCPS_LISTEN &&
2202 	    tp->t_state != TCPS_SYN_SENT);
2203 
2204 	/*
2205 	 * RFC1323 PAWS: if we have a timestamp reply on this segment and
2206 	 * it's less than ts_recent, drop it.
2207 	 */
2208 	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
2209 	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {
2210 		/* Check to see if ts_recent is over 24 days old.  */
2211 		if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
2212 			/*
2213 			 * Invalidate ts_recent.  If this segment updates
2214 			 * ts_recent, the age will be reset later and ts_recent
2215 			 * will get a valid value.  If it does not, setting
2216 			 * ts_recent to zero will at least satisfy the
2217 			 * requirement that zero be placed in the timestamp
2218 			 * echo reply when ts_recent isn't valid.  The
2219 			 * age isn't reset until we get a valid ts_recent
2220 			 * because we don't want out-of-order segments to be
2221 			 * dropped when ts_recent is old.
2222 			 */
2223 			tp->ts_recent = 0;
2224 		} else {
2225 			tcps = TCP_STAT_GETREF();
2226 			tcps[TCP_STAT_RCVDUPPACK]++;
2227 			tcps[TCP_STAT_RCVDUPBYTE] += tlen;
2228 			tcps[TCP_STAT_PAWSDROP]++;
2229 			TCP_STAT_PUTREF();
2230 			tcp_new_dsack(tp, th->th_seq, tlen);
2231 			goto dropafterack;
2232 		}
2233 	}
2234 
2235 	/*
2236 	 * Check that at least some bytes of the segment are within the
2237 	 * receive window. If segment begins before rcv_nxt, drop leading
2238 	 * data (and SYN); if nothing left, just ack.
2239 	 */
2240 	todrop = tp->rcv_nxt - th->th_seq;
2241 	dupseg = false;
2242 	if (todrop > 0) {
2243 		if (tiflags & TH_SYN) {
2244 			tiflags &= ~TH_SYN;
2245 			th->th_seq++;
2246 			tcp_urp_drop(th, 1, &tiflags);
2247 			todrop--;
2248 		}
2249 		if (todrop > tlen ||
2250 		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
2251 			/*
2252 			 * Any valid FIN or RST must be to the left of the
2253 			 * window.  At this point the FIN or RST must be a
2254 			 * duplicate or out of sequence; drop it.
2255 			 */
2256 			if (tiflags & TH_RST)
2257 				goto drop;
2258 			tiflags &= ~(TH_FIN|TH_RST);
2259 
2260 			/*
2261 			 * Send an ACK to resynchronize and drop any data.
2262 			 * But keep on processing for RST or ACK.
2263 			 */
2264 			tp->t_flags |= TF_ACKNOW;
2265 			todrop = tlen;
2266 			dupseg = true;
2267 			tcps = TCP_STAT_GETREF();
2268 			tcps[TCP_STAT_RCVDUPPACK]++;
2269 			tcps[TCP_STAT_RCVDUPBYTE] += todrop;
2270 			TCP_STAT_PUTREF();
2271 		} else if ((tiflags & TH_RST) && th->th_seq != tp->rcv_nxt) {
2272 			/*
2273 			 * Test for reset before adjusting the sequence
2274 			 * number for overlapping data.
2275 			 */
2276 			goto dropafterack_ratelim;
2277 		} else {
2278 			tcps = TCP_STAT_GETREF();
2279 			tcps[TCP_STAT_RCVPARTDUPPACK]++;
2280 			tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop;
2281 			TCP_STAT_PUTREF();
2282 		}
2283 		tcp_new_dsack(tp, th->th_seq, todrop);
2284 		hdroptlen += todrop;	/* drop from head afterwards (m_adj) */
2285 		th->th_seq += todrop;
2286 		tlen -= todrop;
2287 		tcp_urp_drop(th, todrop, &tiflags);
2288 	}
2289 
2290 	/*
2291 	 * If new data is received on a connection after the user processes
2292 	 * are gone, then RST the other end.
2293 	 */
2294 	if ((so->so_state & SS_NOFDREF) &&
2295 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2296 		tp = tcp_close(tp);
2297 		TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
2298 		goto dropwithreset;
2299 	}
2300 
2301 	/*
2302 	 * If the segment ends after the window, drop trailing data (and
2303 	 * PUSH and FIN); if nothing left, just ACK.
2304 	 */
2305 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
2306 	if (todrop > 0) {
2307 		TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
2308 		if (todrop >= tlen) {
2309 			/*
2310 			 * The segment actually starts after the window.
2311 			 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
2312 			 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
2313 			 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
2314 			 */
2315 			TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);
2316 
2317 			/*
2318 			 * If a new connection request is received while in
2319 			 * TIME_WAIT, drop the old connection and start over
2320 			 * if the sequence numbers are above the previous
2321 			 * ones.
2322 			 *
2323 			 * NOTE: We need to put the header fields back into
2324 			 * network order.
2325 			 */
2326 			if ((tiflags & TH_SYN) &&
2327 			    tp->t_state == TCPS_TIME_WAIT &&
2328 			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2329 				tp = tcp_close(tp);
2330 				tcp_fields_to_net(th);
2331 				m_freem(tcp_saveti);
2332 				tcp_saveti = NULL;
2333 				goto findpcb;
2334 			}
2335 
2336 			/*
2337 			 * If window is closed can only take segments at
2338 			 * window edge, and have to drop data and PUSH from
2339 			 * incoming segments.  Continue processing, but
2340 			 * remember to ack.  Otherwise, drop segment
2341 			 * and (if not RST) ack.
2342 			 */
2343 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2344 				KASSERT(todrop == tlen);
2345 				tp->t_flags |= TF_ACKNOW;
2346 				TCP_STATINC(TCP_STAT_RCVWINPROBE);
2347 			} else {
2348 				goto dropafterack;
2349 			}
2350 		} else {
2351 			TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop);
2352 		}
2353 		m_adj(m, -todrop);
2354 		tlen -= todrop;
2355 		tiflags &= ~(TH_PUSH|TH_FIN);
2356 	}
2357 
2358 	/*
2359 	 * If last ACK falls within this segment's sequence numbers,
2360 	 *  record the timestamp.
2361 	 * NOTE:
2362 	 * 1) That the test incorporates suggestions from the latest
2363 	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
2364 	 * 2) That updating only on newer timestamps interferes with
2365 	 *    our earlier PAWS tests, so this check should be solely
2366 	 *    predicated on the sequence space of this segment.
2367 	 * 3) That we modify the segment boundary check to be
2368 	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
2369 	 *    instead of RFC1323's
2370 	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
2371 	 *    This modified check allows us to overcome RFC1323's
2372 	 *    limitations as described in Stevens TCP/IP Illustrated
2373 	 *    Vol. 2 p.869. In such cases, we can still calculate the
2374 	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
2375 	 */
2376 	if (opti.ts_present &&
2377 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2378 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2379 	         ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
2380 		tp->ts_recent_age = tcp_now;
2381 		tp->ts_recent = opti.ts_val;
2382 	}
2383 
2384 	/*
2385 	 * If the RST bit is set examine the state:
2386 	 *    RECEIVED state:
2387 	 *        If passive open, return to LISTEN state.
2388 	 *        If active open, inform user that connection was refused.
2389 	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT states:
2390 	 *        Inform user that connection was reset, and close tcb.
2391 	 *    CLOSING, LAST_ACK, TIME_WAIT states:
2392 	 *        Close the tcb.
2393 	 */
2394 	if (tiflags & TH_RST) {
2395 		if (th->th_seq != tp->rcv_nxt)
2396 			goto dropafterack_ratelim;
2397 
2398 		switch (tp->t_state) {
2399 		case TCPS_SYN_RECEIVED:
2400 			so->so_error = ECONNREFUSED;
2401 			goto close;
2402 
2403 		case TCPS_ESTABLISHED:
2404 		case TCPS_FIN_WAIT_1:
2405 		case TCPS_FIN_WAIT_2:
2406 		case TCPS_CLOSE_WAIT:
2407 			so->so_error = ECONNRESET;
2408 		close:
2409 			tp->t_state = TCPS_CLOSED;
2410 			TCP_STATINC(TCP_STAT_DROPS);
2411 			tp = tcp_close(tp);
2412 			goto drop;
2413 
2414 		case TCPS_CLOSING:
2415 		case TCPS_LAST_ACK:
2416 		case TCPS_TIME_WAIT:
2417 			tp = tcp_close(tp);
2418 			goto drop;
2419 		}
2420 	}
2421 
	/*
	 * Since we've covered the SYN-SENT and SYN-RECEIVED states above
	 * we must be in a synchronized state.  RFC793 states (under RST
	 * generation) that any unacceptable segment (an out-of-order SYN
	 * qualifies) received in a synchronized state must elicit only an
	 * empty acknowledgment segment ... and the connection remains in
	 * the same state.
	 */
2430 	if (tiflags & TH_SYN) {
2431 		if (tp->rcv_nxt == th->th_seq) {
2432 			tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
2433 			    TH_ACK);
2434 			if (tcp_saveti)
2435 				m_freem(tcp_saveti);
2436 			return;
2437 		}
2438 
2439 		goto dropafterack_ratelim;
2440 	}
2441 
2442 	/*
2443 	 * If the ACK bit is off we drop the segment and return.
2444 	 */
2445 	if ((tiflags & TH_ACK) == 0) {
2446 		if (tp->t_flags & TF_ACKNOW)
2447 			goto dropafterack;
2448 		goto drop;
2449 	}
2450 
2451 	/*
2452 	 * From here on, we're doing ACK processing.
2453 	 */
2454 
2455 	switch (tp->t_state) {
2456 	/*
2457 	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
2458 	 * ESTABLISHED state and continue processing, otherwise
2459 	 * send an RST.
2460 	 */
2461 	case TCPS_SYN_RECEIVED:
2462 		if (SEQ_GT(tp->snd_una, th->th_ack) ||
2463 		    SEQ_GT(th->th_ack, tp->snd_max))
2464 			goto dropwithreset;
2465 		TCP_STATINC(TCP_STAT_CONNECTS);
2466 		soisconnected(so);
2467 		tcp_established(tp);
2468 		/* Do window scaling? */
2469 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2470 		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2471 			tp->snd_scale = tp->requested_s_scale;
2472 			tp->rcv_scale = tp->request_r_scale;
2473 		}
2474 		TCP_REASS_LOCK(tp);
2475 		(void)tcp_reass(tp, NULL, NULL, tlen);
2476 		tp->snd_wl1 = th->th_seq - 1;
2477 		/* FALLTHROUGH */
2478 
2479 	/*
2480 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2481 	 * ACKs.  If the ack is in the range
2482 	 *	tp->snd_una < th->th_ack <= tp->snd_max
2483 	 * then advance tp->snd_una to th->th_ack and drop
2484 	 * data from the retransmission queue.  If this ACK reflects
2485 	 * more up to date window information we update our window information.
2486 	 */
2487 	case TCPS_ESTABLISHED:
2488 	case TCPS_FIN_WAIT_1:
2489 	case TCPS_FIN_WAIT_2:
2490 	case TCPS_CLOSE_WAIT:
2491 	case TCPS_CLOSING:
2492 	case TCPS_LAST_ACK:
2493 	case TCPS_TIME_WAIT:
2494 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2495 			if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) {
2496 				TCP_STATINC(TCP_STAT_RCVDUPACK);
2497 				/*
2498 				 * If we have outstanding data (other than
2499 				 * a window probe), this is a completely
2500 				 * duplicate ack (ie, window info didn't
2501 				 * change), the ack is the biggest we've
2502 				 * seen and we've seen exactly our rexmt
2503 				 * threshhold of them, assume a packet
2504 				 * has been dropped and retransmit it.
2505 				 * Kludge snd_nxt & the congestion
2506 				 * window so we send only this one
2507 				 * packet.
2508 				 */
2509 				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
2510 				    th->th_ack != tp->snd_una)
2511 					tp->t_dupacks = 0;
2512 				else if (tp->t_partialacks < 0 &&
2513 				    (++tp->t_dupacks == tcprexmtthresh ||
2514 				     TCP_FACK_FASTRECOV(tp))) {
2515 					/*
2516 					 * Do the fast retransmit, and adjust
2517 					 * congestion control paramenters.
2518 					 */
2519 					if (tp->t_congctl->fast_retransmit(tp, th)) {
2520 						/* False fast retransmit */
2521 						break;
2522 					}
2523 					goto drop;
2524 				} else if (tp->t_dupacks > tcprexmtthresh) {
2525 					tp->snd_cwnd += tp->t_segsz;
2526 					KERNEL_LOCK(1, NULL);
2527 					(void)tcp_output(tp);
2528 					KERNEL_UNLOCK_ONE(NULL);
2529 					goto drop;
2530 				}
2531 			} else {
2532 				/*
2533 				 * If the ack appears to be very old, only
2534 				 * allow data that is in-sequence.  This
2535 				 * makes it somewhat more difficult to insert
2536 				 * forged data by guessing sequence numbers.
				 * Send an ack to try to update the send
2538 				 * sequence number on the other side.
2539 				 */
2540 				if (tlen && th->th_seq != tp->rcv_nxt &&
2541 				    SEQ_LT(th->th_ack,
2542 				    tp->snd_una - tp->max_sndwnd))
2543 					goto dropafterack;
2544 			}
2545 			break;
2546 		}
2547 		/*
2548 		 * If the congestion window was inflated to account
2549 		 * for the other side's cached packets, retract it.
2550 		 */
2551 		tp->t_congctl->fast_retransmit_newack(tp, th);
2552 
2553 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
2554 			TCP_STATINC(TCP_STAT_RCVACKTOOMUCH);
2555 			goto dropafterack;
2556 		}
2557 		acked = th->th_ack - tp->snd_una;
2558 		tcps = TCP_STAT_GETREF();
2559 		tcps[TCP_STAT_RCVACKPACK]++;
2560 		tcps[TCP_STAT_RCVACKBYTE] += acked;
2561 		TCP_STAT_PUTREF();
2562 
2563 		/*
2564 		 * If we have a timestamp reply, update smoothed
2565 		 * round trip time.  If no timestamp is present but
2566 		 * transmit timer is running and timed sequence
2567 		 * number was acked, update smoothed round trip time.
2568 		 * Since we now have an rtt measurement, cancel the
2569 		 * timer backoff (cf., Phil Karn's retransmit alg.).
2570 		 * Recompute the initial retransmit timer.
2571 		 */
2572 		if (ts_rtt)
2573 			tcp_xmit_timer(tp, ts_rtt - 1);
2574 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2575 			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2576 
2577 		/*
2578 		 * If all outstanding data is acked, stop retransmit
2579 		 * timer and remember to restart (more output or persist).
2580 		 * If there is more data to be acked, restart retransmit
2581 		 * timer, using current (possibly backed-off) value.
2582 		 */
2583 		if (th->th_ack == tp->snd_max) {
2584 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
2585 			needoutput = 1;
2586 		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
2587 			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2588 
2589 		/*
2590 		 * New data has been acked, adjust the congestion window.
2591 		 */
2592 		tp->t_congctl->newack(tp, th);
2593 
2594 		nd6_hint(tp);
2595 		if (acked > so->so_snd.sb_cc) {
2596 			tp->snd_wnd -= so->so_snd.sb_cc;
2597 			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2598 			ourfinisacked = 1;
2599 		} else {
2600 			if (acked > (tp->t_lastoff - tp->t_inoff))
2601 				tp->t_lastm = NULL;
2602 			sbdrop(&so->so_snd, acked);
2603 			tp->t_lastoff -= acked;
2604 			if (tp->snd_wnd > acked)
2605 				tp->snd_wnd -= acked;
2606 			else
2607 				tp->snd_wnd = 0;
2608 			ourfinisacked = 0;
2609 		}
2610 		sowwakeup(so);
2611 
2612 		icmp_check(tp, th, acked);
2613 
2614 		tp->snd_una = th->th_ack;
2615 		if (SEQ_GT(tp->snd_una, tp->snd_fack))
2616 			tp->snd_fack = tp->snd_una;
2617 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2618 			tp->snd_nxt = tp->snd_una;
2619 		if (SEQ_LT(tp->snd_high, tp->snd_una))
2620 			tp->snd_high = tp->snd_una;
2621 
2622 		switch (tp->t_state) {
2623 
2624 		/*
2625 		 * In FIN_WAIT_1 STATE in addition to the processing
2626 		 * for the ESTABLISHED state if our FIN is now acknowledged
2627 		 * then enter FIN_WAIT_2.
2628 		 */
2629 		case TCPS_FIN_WAIT_1:
2630 			if (ourfinisacked) {
2631 				/*
2632 				 * If we can't receive any more
2633 				 * data, then closing user can proceed.
2634 				 * Starting the timer is contrary to the
2635 				 * specification, but if we don't get a FIN
2636 				 * we'll hang forever.
2637 				 */
2638 				if (so->so_state & SS_CANTRCVMORE) {
2639 					soisdisconnected(so);
2640 					if (tp->t_maxidle > 0)
2641 						TCP_TIMER_ARM(tp, TCPT_2MSL,
2642 						    tp->t_maxidle);
2643 				}
2644 				tp->t_state = TCPS_FIN_WAIT_2;
2645 			}
2646 			break;
2647 
2648 	 	/*
2649 		 * In CLOSING STATE in addition to the processing for
2650 		 * the ESTABLISHED state if the ACK acknowledges our FIN
2651 		 * then enter the TIME-WAIT state, otherwise ignore
2652 		 * the segment.
2653 		 */
2654 		case TCPS_CLOSING:
2655 			if (ourfinisacked) {
2656 				tp->t_state = TCPS_TIME_WAIT;
2657 				tcp_canceltimers(tp);
2658 				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl);
2659 				soisdisconnected(so);
2660 			}
2661 			break;
2662 
2663 		/*
2664 		 * In LAST_ACK, we may still be waiting for data to drain
2665 		 * and/or to be acked, as well as for the ack of our FIN.
2666 		 * If our FIN is now acknowledged, delete the TCB,
2667 		 * enter the closed state and return.
2668 		 */
2669 		case TCPS_LAST_ACK:
2670 			if (ourfinisacked) {
2671 				tp = tcp_close(tp);
2672 				goto drop;
2673 			}
2674 			break;
2675 
2676 		/*
2677 		 * In TIME_WAIT state the only thing that should arrive
2678 		 * is a retransmission of the remote FIN.  Acknowledge
2679 		 * it and restart the finack timer.
2680 		 */
2681 		case TCPS_TIME_WAIT:
2682 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl);
2683 			goto dropafterack;
2684 		}
2685 	}
2686 
2687 step6:
2688 	/*
2689 	 * Update window information.
2690 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2691 	 */
2692 	if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2693 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2694 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2695 		/* keep track of pure window updates */
2696 		if (tlen == 0 &&
2697 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2698 			TCP_STATINC(TCP_STAT_RCVWINUPD);
2699 		tp->snd_wnd = tiwin;
2700 		tp->snd_wl1 = th->th_seq;
2701 		tp->snd_wl2 = th->th_ack;
2702 		if (tp->snd_wnd > tp->max_sndwnd)
2703 			tp->max_sndwnd = tp->snd_wnd;
2704 		needoutput = 1;
2705 	}
2706 
2707 	/*
2708 	 * Process segments with URG.
2709 	 */
2710 	if ((tiflags & TH_URG) && th->th_urp &&
2711 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2712 		/*
2713 		 * This is a kludge, but if we receive and accept
2714 		 * random urgent pointers, we'll crash in
2715 		 * soreceive.  It's hard to imagine someone
2716 		 * actually wanting to send this much urgent data.
2717 		 */
2718 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2719 			th->th_urp = 0;			/* XXX */
2720 			tiflags &= ~TH_URG;		/* XXX */
2721 			goto dodata;			/* XXX */
2722 		}
2723 
2724 		/*
2725 		 * If this segment advances the known urgent pointer,
2726 		 * then mark the data stream.  This should not happen
2727 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2728 		 * a FIN has been received from the remote side.
2729 		 * In these states we ignore the URG.
2730 		 *
2731 		 * According to RFC961 (Assigned Protocols),
2732 		 * the urgent pointer points to the last octet
2733 		 * of urgent data.  We continue, however,
2734 		 * to consider it to indicate the first octet
2735 		 * of data past the urgent section as the original
2736 		 * spec states (in one of two places).
2737 		 */
2738 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2739 			tp->rcv_up = th->th_seq + th->th_urp;
2740 			so->so_oobmark = so->so_rcv.sb_cc +
2741 			    (tp->rcv_up - tp->rcv_nxt) - 1;
2742 			if (so->so_oobmark == 0)
2743 				so->so_state |= SS_RCVATMARK;
2744 			sohasoutofband(so);
2745 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2746 		}
2747 
2748 		/*
2749 		 * Remove out of band data so doesn't get presented to user.
2750 		 * This can happen independent of advancing the URG pointer,
2751 		 * but if two URG's are pending at once, some out-of-band
2752 		 * data may creep in... ick.
2753 		 */
2754 		if (th->th_urp <= (u_int16_t)tlen &&
2755 		    (so->so_options & SO_OOBINLINE) == 0)
2756 			tcp_pulloutofband(so, th, m, hdroptlen);
2757 	} else {
2758 		/*
2759 		 * If no out of band data is expected,
2760 		 * pull receive urgent pointer along
2761 		 * with the receive window.
2762 		 */
2763 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2764 			tp->rcv_up = tp->rcv_nxt;
2765 	}
2766 dodata:
2767 
2768 	/*
2769 	 * Process the segment text, merging it into the TCP sequencing queue,
2770 	 * and arranging for acknowledgement of receipt if necessary.
2771 	 * This process logically involves adjusting tp->rcv_wnd as data
2772 	 * is presented to the user (this happens in tcp_usrreq.c,
2773 	 * tcp_rcvd()).  If a FIN has already been received on this
2774 	 * connection then we just ignore the text.
2775 	 */
2776 	if ((tlen || (tiflags & TH_FIN)) &&
2777 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2778 		/*
2779 		 * Handle the common case:
2780 		 *  o Segment is the next to be received, and
2781 		 *  o The queue is empty, and
2782 		 *  o The connection is established
2783 		 * In this case, we avoid calling tcp_reass.
2784 		 *
2785 		 * tcp_setup_ack: set DELACK for segments received in order,
2786 		 * but ack immediately when segments are out of order (so that
2787 		 * fast retransmit can work).
2788 		 */
2789 		TCP_REASS_LOCK(tp);
2790 		if (th->th_seq == tp->rcv_nxt &&
2791 		    TAILQ_FIRST(&tp->segq) == NULL &&
2792 		    tp->t_state == TCPS_ESTABLISHED) {
2793 			tcp_setup_ack(tp, th);
2794 			tp->rcv_nxt += tlen;
2795 			tiflags = th->th_flags & TH_FIN;
2796 			tcps = TCP_STAT_GETREF();
2797 			tcps[TCP_STAT_RCVPACK]++;
2798 			tcps[TCP_STAT_RCVBYTE] += tlen;
2799 			TCP_STAT_PUTREF();
2800 			nd6_hint(tp);
2801 			if (so->so_state & SS_CANTRCVMORE) {
2802 				m_freem(m);
2803 			} else {
2804 				m_adj(m, hdroptlen);
2805 				sbappendstream(&(so)->so_rcv, m);
2806 			}
2807 			TCP_REASS_UNLOCK(tp);
2808 			sorwakeup(so);
2809 		} else {
2810 			m_adj(m, hdroptlen);
2811 			tiflags = tcp_reass(tp, th, m, tlen);
2812 			tp->t_flags |= TF_ACKNOW;
2813 		}
2814 
2815 		/*
2816 		 * Note the amount of data that peer has sent into
2817 		 * our window, in order to estimate the sender's
2818 		 * buffer size.
2819 		 */
2820 		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2821 	} else {
2822 		m_freem(m);
2823 		m = NULL;
2824 		tiflags &= ~TH_FIN;
2825 	}
2826 
2827 	/*
2828 	 * If FIN is received ACK the FIN and let the user know
2829 	 * that the connection is closing.  Ignore a FIN received before
2830 	 * the connection is fully established.
2831 	 */
2832 	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2833 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2834 			socantrcvmore(so);
2835 			tp->t_flags |= TF_ACKNOW;
2836 			tp->rcv_nxt++;
2837 		}
2838 		switch (tp->t_state) {
2839 
2840 	 	/*
2841 		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2842 		 */
2843 		case TCPS_ESTABLISHED:
2844 			tp->t_state = TCPS_CLOSE_WAIT;
2845 			break;
2846 
2847 	 	/*
2848 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2849 		 * enter the CLOSING state.
2850 		 */
2851 		case TCPS_FIN_WAIT_1:
2852 			tp->t_state = TCPS_CLOSING;
2853 			break;
2854 
2855 	 	/*
2856 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2857 		 * starting the time-wait timer, turning off the other
2858 		 * standard timers.
2859 		 */
2860 		case TCPS_FIN_WAIT_2:
2861 			tp->t_state = TCPS_TIME_WAIT;
2862 			tcp_canceltimers(tp);
2863 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl);
2864 			soisdisconnected(so);
2865 			break;
2866 
2867 		/*
2868 		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2869 		 */
2870 		case TCPS_TIME_WAIT:
2871 			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl);
2872 			break;
2873 		}
2874 	}
2875 #ifdef TCP_DEBUG
2876 	if (so->so_options & SO_DEBUG)
2877 		tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
2878 #endif
2879 
2880 	/*
2881 	 * Return any desired output.
2882 	 */
2883 	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2884 		KERNEL_LOCK(1, NULL);
2885 		(void)tcp_output(tp);
2886 		KERNEL_UNLOCK_ONE(NULL);
2887 	}
2888 	if (tcp_saveti)
2889 		m_freem(tcp_saveti);
2890 
2891 	if (tp->t_state == TCPS_TIME_WAIT
2892 	    && (so->so_state & SS_NOFDREF)
2893 	    && (tp->t_inpcb || af != AF_INET)
2894 	    && (tp->t_in6pcb || af != AF_INET6)
2895 	    && ((af == AF_INET ? tcp4_vtw_enable : tcp6_vtw_enable) & 1) != 0
2896 	    && TAILQ_EMPTY(&tp->segq)
2897 	    && vtw_add(af, tp)) {
2898 		;
2899 	}
2900 	return;
2901 
2902 badsyn:
2903 	/*
2904 	 * Received a bad SYN.  Increment counters and dropwithreset.
2905 	 */
2906 	TCP_STATINC(TCP_STAT_BADSYN);
2907 	tp = NULL;
2908 	goto dropwithreset;
2909 
2910 dropafterack:
2911 	/*
2912 	 * Generate an ACK dropping incoming segment if it occupies
2913 	 * sequence space, where the ACK reflects our state.
2914 	 */
2915 	if (tiflags & TH_RST)
2916 		goto drop;
2917 	goto dropafterack2;
2918 
2919 dropafterack_ratelim:
2920 	/*
2921 	 * We may want to rate-limit ACKs against SYN/RST attack.
2922 	 */
2923 	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2924 	    tcp_ackdrop_ppslim) == 0) {
2925 		/* XXX stat */
2926 		goto drop;
2927 	}
2928 
2929 dropafterack2:
2930 	m_freem(m);
2931 	tp->t_flags |= TF_ACKNOW;
2932 	KERNEL_LOCK(1, NULL);
2933 	(void)tcp_output(tp);
2934 	KERNEL_UNLOCK_ONE(NULL);
2935 	if (tcp_saveti)
2936 		m_freem(tcp_saveti);
2937 	return;
2938 
2939 dropwithreset_ratelim:
2940 	/*
2941 	 * We may want to rate-limit RSTs in certain situations,
2942 	 * particularly if we are sending an RST in response to
2943 	 * an attempt to connect to or otherwise communicate with
2944 	 * a port for which we have no socket.
2945 	 */
2946 	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2947 	    tcp_rst_ppslim) == 0) {
2948 		/* XXX stat */
2949 		goto drop;
2950 	}
2951 
2952 dropwithreset:
2953 	/*
2954 	 * Generate a RST, dropping incoming segment.
2955 	 * Make ACK acceptable to originator of segment.
2956 	 */
2957 	if (tiflags & TH_RST)
2958 		goto drop;
2959 	if (tiflags & TH_ACK) {
2960 		(void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
2961 	} else {
2962 		if (tiflags & TH_SYN)
2963 			tlen++;
2964 		(void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
2965 		    TH_RST|TH_ACK);
2966 	}
2967 	if (tcp_saveti)
2968 		m_freem(tcp_saveti);
2969 	return;
2970 
2971 badcsum:
2972 drop:
2973 	/*
2974 	 * Drop space held by incoming segment and return.
2975 	 */
2976 	if (tp) {
2977 		if (tp->t_inpcb)
2978 			so = tp->t_inpcb->inp_socket;
2979 #ifdef INET6
2980 		else if (tp->t_in6pcb)
2981 			so = tp->t_in6pcb->in6p_socket;
2982 #endif
2983 		else
2984 			so = NULL;
2985 #ifdef TCP_DEBUG
2986 		if (so && (so->so_options & SO_DEBUG) != 0)
2987 			tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
2988 #endif
2989 	}
2990 	if (tcp_saveti)
2991 		m_freem(tcp_saveti);
2992 	m_freem(m);
2993 	return;
2994 }
2995 
2996 #ifdef TCP_SIGNATURE
2997 int
2998 tcp_signature_apply(void *fstate, void *data, u_int len)
2999 {
3000 
3001 	MD5Update(fstate, (u_char *)data, len);
3002 	return (0);
3003 }
3004 
3005 struct secasvar *
3006 tcp_signature_getsav(struct mbuf *m)
3007 {
3008 	struct ip *ip;
3009 	struct ip6_hdr *ip6;
3010 
3011 	ip = mtod(m, struct ip *);
3012 	switch (ip->ip_v) {
3013 	case 4:
3014 		ip = mtod(m, struct ip *);
3015 		ip6 = NULL;
3016 		break;
3017 	case 6:
3018 		ip = NULL;
3019 		ip6 = mtod(m, struct ip6_hdr *);
3020 		break;
3021 	default:
3022 		return (NULL);
3023 	}
3024 
3025 #ifdef IPSEC
3026 	union sockaddr_union dst;
3027 
3028 	/* Extract the destination from the IP header in the mbuf. */
3029 	memset(&dst, 0, sizeof(union sockaddr_union));
3030 	if (ip != NULL) {
3031 		dst.sa.sa_len = sizeof(struct sockaddr_in);
3032 		dst.sa.sa_family = AF_INET;
3033 		dst.sin.sin_addr = ip->ip_dst;
3034 	} else {
3035 		dst.sa.sa_len = sizeof(struct sockaddr_in6);
3036 		dst.sa.sa_family = AF_INET6;
3037 		dst.sin6.sin6_addr = ip6->ip6_dst;
3038 	}
3039 
3040 	/*
3041 	 * Look up an SADB entry which matches the address of the peer.
3042 	 */
3043 	return KEY_LOOKUP_SA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI), 0, 0);
3044 #else
3045 	return NULL;
3046 #endif
3047 }
3048 
3049 int
3050 tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff,
3051     struct secasvar *sav, char *sig)
3052 {
3053 	MD5_CTX ctx;
3054 	struct ip *ip;
3055 	struct ipovly *ipovly;
3056 #ifdef INET6
3057 	struct ip6_hdr *ip6;
3058 	struct ip6_hdr_pseudo ip6pseudo;
3059 #endif
3060 	struct ippseudo ippseudo;
3061 	struct tcphdr th0;
3062 	int l, tcphdrlen;
3063 
3064 	if (sav == NULL)
3065 		return (-1);
3066 
3067 	tcphdrlen = th->th_off * 4;
3068 
3069 	switch (mtod(m, struct ip *)->ip_v) {
3070 	case 4:
3071 		MD5Init(&ctx);
3072 		ip = mtod(m, struct ip *);
3073 		memset(&ippseudo, 0, sizeof(ippseudo));
3074 		ipovly = (struct ipovly *)ip;
3075 		ippseudo.ippseudo_src = ipovly->ih_src;
3076 		ippseudo.ippseudo_dst = ipovly->ih_dst;
3077 		ippseudo.ippseudo_pad = 0;
3078 		ippseudo.ippseudo_p = IPPROTO_TCP;
3079 		ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
3080 		MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo));
3081 		break;
3082 #if INET6
3083 	case 6:
3084 		MD5Init(&ctx);
3085 		ip6 = mtod(m, struct ip6_hdr *);
3086 		memset(&ip6pseudo, 0, sizeof(ip6pseudo));
3087 		ip6pseudo.ip6ph_src = ip6->ip6_src;
3088 		in6_clearscope(&ip6pseudo.ip6ph_src);
3089 		ip6pseudo.ip6ph_dst = ip6->ip6_dst;
3090 		in6_clearscope(&ip6pseudo.ip6ph_dst);
3091 		ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
3092 		ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
3093 		MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo));
3094 		break;
3095 #endif
3096 	default:
3097 		return (-1);
3098 	}
3099 
3100 	th0 = *th;
3101 	th0.th_sum = 0;
3102 	MD5Update(&ctx, (char *)&th0, sizeof(th0));
3103 
3104 	l = m->m_pkthdr.len - thoff - tcphdrlen;
3105 	if (l > 0)
3106 		m_apply(m, thoff + tcphdrlen,
3107 		    m->m_pkthdr.len - thoff - tcphdrlen,
3108 		    tcp_signature_apply, &ctx);
3109 
3110 	MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
3111 	MD5Final(sig, &ctx);
3112 
3113 	return (0);
3114 }
3115 #endif
3116 
3117 /*
3118  * Parse and process tcp options.
3119  *
3120  * Returns -1 if this segment should be dropped.  (eg. wrong signature)
3121  * Otherwise returns 0.
3122  */
static int
tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt, struct tcphdr *th,
    struct mbuf *m, int toff, struct tcp_opt_info *oi)
{
	u_int16_t mss;
	int opt, optlen = 0;
#ifdef TCP_SIGNATURE
	void *sigp = NULL;		/* signature carried in the segment */
	char sigbuf[TCP_SIGLEN];
	struct secasvar *sav = NULL;	/* SA providing the expected key */
#endif

	/*
	 * Walk the option list.  cp[0] is the option kind and, for
	 * everything but EOL/NOP, cp[1] is its total length.  A
	 * malformed length terminates the scan rather than dropping
	 * the segment.
	 */
	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			/* Only honored on a SYN, before a SYN was received. */
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			memcpy(&mss, cp + 2, sizeof(mss));
			oi->maxseg = ntohs(mss);
			break;

		case TCPOPT_WINDOW:
			/* Only honored on a SYN, before a SYN was received. */
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = cp[2];
			if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
				/* Log the bogus shift count, then clamp it. */
				char buf[INET6_ADDRSTRLEN];
				struct ip *ip = mtod(m, struct ip *);
#ifdef INET6
				struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
#endif

				switch (ip->ip_v) {
				case 4:
					in_print(buf, sizeof(buf),
					    &ip->ip_src);
					break;
#ifdef INET6
				case 6:
					in6_print(buf, sizeof(buf),
					    &ip6->ip6_src);
					break;
#endif
				default:
					strlcpy(buf, "(unknown)", sizeof(buf));
					break;
				}

				log(LOG_ERR, "TCP: invalid wscale %d from %s, "
				    "assuming %d\n",
				    tp->requested_s_scale, buf,
				    TCP_MAX_WINSHIFT);
				tp->requested_s_scale = TCP_MAX_WINSHIFT;
			}
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			oi->ts_present = 1;
			memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = oi->ts_val;
			tp->ts_recent_age = tcp_now;
                        break;

		case TCPOPT_SACK_PERMITTED:
			/* Only honored on a SYN, before a SYN was received. */
			if (optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			if (tcp_do_sack) {
				tp->t_flags |= TF_SACK_PERMIT;
				tp->t_flags |= TF_WILL_SACK;
			}
			break;

		case TCPOPT_SACK:
			tcp_sack_option(tp, th, cp, optlen);
			break;
#ifdef TCP_SIGNATURE
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;
			/* Duplicate signature options must agree. */
			if (sigp &&
			    !consttime_memequal(sigp, cp + 2, TCP_SIGLEN))
				return (-1);

			sigp = sigbuf;
			memcpy(sigbuf, cp + 2, TCP_SIGLEN);
			tp->t_flags |= TF_SIGNATURE;
			break;
#endif
		}
	}

#ifndef TCP_SIGNATURE
	return 0;
#else
	if (tp->t_flags & TF_SIGNATURE) {
		sav = tcp_signature_getsav(m);
		if (sav == NULL && tp->t_state == TCPS_LISTEN)
			return (-1);
	}

	/* Drop when exactly one side (segment vs. pcb) wants signatures. */
	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE))
		goto out;

	if (sigp) {
		char sig[TCP_SIGLEN];

		/* Compute the digest over the header in network order. */
		tcp_fields_to_net(th);
		if (tcp_signature(m, th, toff, sav, sig) < 0) {
			tcp_fields_to_host(th);
			goto out;
		}
		tcp_fields_to_host(th);

		if (!consttime_memequal(sig, sigp, TCP_SIGLEN)) {
			TCP_STATINC(TCP_STAT_BADSIG);
			goto out;
		} else
			TCP_STATINC(TCP_STAT_GOODSIG);

		key_sa_recordxfer(sav, m);
		KEY_SA_UNREF(&sav);
	}
	return 0;
out:
	/* Signature mismatch or failure: release the SA and drop. */
	if (sav != NULL)
		KEY_SA_UNREF(&sav);
	return -1;
#endif
}
3296 
3297 /*
3298  * Pull out of band byte out of a segment so
3299  * it doesn't appear in the user's data queue.
3300  * It is still reflected in the segment length for
3301  * sequencing purposes.
3302  */
3303 void
3304 tcp_pulloutofband(struct socket *so, struct tcphdr *th,
3305     struct mbuf *m, int off)
3306 {
3307 	int cnt = off + th->th_urp - 1;
3308 
3309 	while (cnt >= 0) {
3310 		if (m->m_len > cnt) {
3311 			char *cp = mtod(m, char *) + cnt;
3312 			struct tcpcb *tp = sototcpcb(so);
3313 
3314 			tp->t_iobc = *cp;
3315 			tp->t_oobflags |= TCPOOB_HAVEDATA;
3316 			memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1));
3317 			m->m_len--;
3318 			return;
3319 		}
3320 		cnt -= m->m_len;
3321 		m = m->m_next;
3322 		if (m == NULL)
3323 			break;
3324 	}
3325 	panic("tcp_pulloutofband");
3326 }
3327 
3328 /*
3329  * Collect new round-trip time estimate
3330  * and update averages and current timeout.
3331  *
3332  * rtt is in units of slow ticks (typically 500 ms) -- essentially the
3333  * difference of two timestamps.
3334  */
void
tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
{
	int32_t delta;

	TCP_STATINC(TCP_STAT_RTTUPDATED);
	if (tp->t_srtt != 0) {
		/*
		 * Compute the amount to add to srtt for smoothing,
		 * *alpha, or 2^(-TCP_RTT_SHIFT).  Because
		 * srtt is stored in 1/32 slow ticks, we conceptually
		 * shift left 5 bits, subtract srtt to get the
		 * diference, and then shift right by TCP_RTT_SHIFT
		 * (3) to obtain 1/8 of the difference.
		 */
		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
		/*
		 * This can never happen, because delta's lowest
		 * possible value is 1/8 of t_srtt.  But if it does,
		 * set srtt to some reasonable value, here chosen
		 * as 1/8 tick.
		 */
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << 2;
		/*
		 * RFC2988 requires that rttvar be updated first.
		 * This code is compliant because "delta" is the old
		 * srtt minus the new observation (scaled).
		 *
		 * RFC2988 says:
		 *   rttvar = (1-beta) * rttvar + beta * |srtt-observed|
		 *
		 * delta is in units of 1/32 ticks, and has then been
		 * divided by 8.  This is equivalent to being in 1/16s
		 * units and divided by 4.  Subtract from it 1/4 of
		 * the existing rttvar to form the (signed) amount to
		 * adjust.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		/*
		 * As with srtt, this should never happen.  There is
		 * no support in RFC2988 for this operation.  But 1/4s
		 * as rttvar when faced with something arguably wrong
		 * is ok.
		 */
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << 2;

		/*
		 * If srtt exceeds .01 second, ensure we use the 'remote' MSL
		 * Problem is: it doesn't work.  Disabled by defaulting
		 * tcp_rttlocal to 0; see corresponding code in
		 * tcp_subr that selects local vs remote in a different way.
		 *
		 * The static branch prediction hint here should be removed
		 * when the rtt estimator is fixed and the rtt_enable code
		 * is turned back on.
		 */
		if (__predict_false(tcp_rttlocal) && tcp_msl_enable
		    && tp->t_srtt > tcp_msl_remote_threshold
		    && tp->t_msl  < tcp_msl_remote) {
			tp->t_msl = tcp_msl_remote;
		}
	} else {
		/*
		 * This is the first measurement.  Per RFC2988, 2.2,
		 * set rtt=R and srtt=R/2.
		 * For srtt, storage representation is 1/32 ticks,
		 * so shift left by 5.
		 * For rttvar, storage representation is 1/16 ticks,
		 * So shift left by 4, but then right by 1 to halve.
		 */
		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
	}
	/* A fresh sample was folded in: restart the rtt timer and backoff. */
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	    max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}
3438 
3439 
3440 /*
3441  * TCP compressed state engine.  Currently used to hold compressed
3442  * state for SYN_RECEIVED.
3443  */
3444 
/* Number of entries currently in the syn cache, across all buckets. */
u_long	syn_cache_count;
/* Random hash secrets; regenerated in syn_cache_insert() when empty. */
u_int32_t syn_hash1, syn_hash2;

/* Hash an IPv4 source address and the port pair under the secret keys. */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
		((const struct sockaddr_in *)(src))->sin_port,		\
		((const struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/* IPv6 variant: folds the first and last 32-bit address words together. */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/* Dispatch on the source address family; unknown families hash to 0. */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
			((const struct sockaddr_in *)(src))->sin_port,	\
			((const struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
			((const struct sockaddr_in6 *)(src))->sin6_port,	\
			((const struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/* Backing pool for struct syn_cache allocations. */
static struct pool syn_cache_pool;
3484 
3485 /*
3486  * We don't estimate RTT with SYNs, so each packet starts with the default
3487  * RTT and each timer step has a fixed timeout value.
3488  */
3489 static inline void
3490 syn_cache_timer_arm(struct syn_cache *sc)
3491 {
3492 
3493 	TCPT_RANGESET(sc->sc_rxtcur,
3494 	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
3495 	    TCPTV_REXMTMAX);
3496 	callout_reset(&sc->sc_timer,
3497 	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
3498 }
3499 
/* Ticks of tcp_now elapsed since the entry's sc_timebase was recorded. */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
3501 
/*
 * Unlink a syn cache entry from its hash bucket and from the owning
 * tcpcb's list, and stop its timer.  The entry itself is not freed
 * here; callers follow up with syn_cache_put().
 */
static inline void
syn_cache_rm(struct syn_cache *sc)
{
	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
	    sc, sc_bucketq);
	sc->sc_tp = NULL;
	LIST_REMOVE(sc, sc_tpq);
	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
	callout_stop(&sc->sc_timer);
	syn_cache_count--;
}
3513 
3514 static inline void
3515 syn_cache_put(struct syn_cache *sc)
3516 {
3517 	if (sc->sc_ipopts)
3518 		(void) m_free(sc->sc_ipopts);
3519 	rtcache_free(&sc->sc_route);
3520 	sc->sc_flags |= SCF_DEAD;
3521 	if (!callout_invoking(&sc->sc_timer))
3522 		callout_schedule(&(sc)->sc_timer, 1);
3523 }
3524 
3525 void
3526 syn_cache_init(void)
3527 {
3528 	int i;
3529 
3530 	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3531 	    "synpl", NULL, IPL_SOFTNET);
3532 
3533 	/* Initialize the hash buckets. */
3534 	for (i = 0; i < tcp_syn_cache_size; i++)
3535 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3536 }
3537 
/*
 * Insert a new entry into the syn cache.  If the target bucket or the
 * whole cache is full, evict the oldest entry first; then arm the new
 * entry's retransmit timer and link it into its bucket and into the
 * owning tcpcb's list.
 */
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = cprng_fast32();
		syn_hash2 = cprng_fast32();
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	syn_cache_timer_arm(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	TCP_STATINC(TCP_STAT_SC_ADDED);
	splx(s);
}
3634 
3635 /*
3636  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3637  * If we have retransmitted an entry the maximum number of times, expire
3638  * that entry.
3639  */
static void
syn_cache_timer(void *arg)
{
	struct syn_cache *sc = arg;

	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	callout_ack(&sc->sc_timer);

	/*
	 * syn_cache_put() marked the entry SCF_DEAD while the callout
	 * was already firing; perform the deferred free here.
	 */
	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
		goto free;
	}

	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
		/* Drop it -- too many retransmissions. */
		goto dropit;
	}

	/*
	 * Compute the total amount of time this entry has
	 * been on a queue.  If this entry has been on longer
	 * than the keep alive timer would allow, expire it.
	 */
	sc->sc_rxttot += sc->sc_rxtcur;
	if (sc->sc_rxttot >= tcp_keepinit)
		goto dropit;

	/* Retransmit the SYN,ACK and back the timer off. */
	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
	(void)syn_cache_respond(sc);

	/* Advance the timer back-off. */
	sc->sc_rxtshift++;
	syn_cache_timer_arm(sc);

	goto out;

 dropit:
	/* Entry expired: unlink it and release its attached resources. */
	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
	syn_cache_rm(sc);
	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);
	rtcache_free(&sc->sc_route);

 free:
	callout_destroy(&sc->sc_timer);
	pool_put(&syn_cache_pool, sc);

 out:
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
3693 
3694 /*
3695  * Remove syn cache created by the specified tcb entry,
3696  * because this does not make sense to keep them
3697  * (if there's no tcb entry, syn cache entry will never be used)
3698  */
3699 void
3700 syn_cache_cleanup(struct tcpcb *tp)
3701 {
3702 	struct syn_cache *sc, *nsc;
3703 	int s;
3704 
3705 	s = splsoftnet();
3706 
3707 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3708 		nsc = LIST_NEXT(sc, sc_tpq);
3709 
3710 #ifdef DIAGNOSTIC
3711 		if (sc->sc_tp != tp)
3712 			panic("invalid sc_tp in syn_cache_cleanup");
3713 #endif
3714 		syn_cache_rm(sc);
3715 		syn_cache_put(sc);	/* calls pool_put but see spl above */
3716 	}
3717 	/* just for safety */
3718 	LIST_INIT(&tp->t_sc);
3719 
3720 	splx(s);
3721 }
3722 
3723 /*
3724  * Find an entry in the syn cache.
3725  */
3726 struct syn_cache *
3727 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
3728     struct syn_cache_head **headp)
3729 {
3730 	struct syn_cache *sc;
3731 	struct syn_cache_head *scp;
3732 	u_int32_t hash;
3733 	int s;
3734 
3735 	SYN_HASHALL(hash, src, dst);
3736 
3737 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3738 	*headp = scp;
3739 	s = splsoftnet();
3740 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3741 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
3742 		if (sc->sc_hash != hash)
3743 			continue;
3744 		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
3745 		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
3746 			splx(s);
3747 			return (sc);
3748 		}
3749 	}
3750 	splx(s);
3751 	return (NULL);
3752 }
3753 
3754 /*
3755  * This function gets called when we receive an ACK for a socket in the
3756  * LISTEN state. We look up the connection in the syn cache, and if it's
3757  * there, we pull it out of the cache and turn it into a full-blown
3758  * connection in the SYN-RECEIVED state.
3759  *
3760  * The return values may not be immediately obvious, and their effects
3761  * can be subtle, so here they are:
3762  *
3763  *	NULL	SYN was not found in cache; caller should drop the
3764  *		packet and send an RST.
3765  *
3766  *	-1	We were unable to create the new connection, and are
3767  *		aborting it.  An ACK,RST is being sent to the peer
3768  *		(unless we got screwey sequence numbers; see below),
3769  *		because the 3-way handshake has been completed.  Caller
3770  *		should not free the mbuf, since we may be using it.  If
3771  *		we are not, we will free it.
3772  *
3773  *	Otherwise, the return value is a pointer to the new socket
3774  *	associated with the connection.
3775  */
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *so, struct mbuf *m)
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
#ifdef INET6
	struct in6pcb *in6p = NULL;
#endif
	struct tcpcb *tp;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return NULL;
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		/* Out-of-window ACK: re-send the SYN,ACK, keep the entry. */
		m_freem(m);
		(void)syn_cache_respond(sc);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	syn_cache_rm(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
	oso = so;
	so = sonewconn(so, true);
	if (so == NULL)
		goto resetandabort;

	switch (so->so_proto->pr_domain->dom_family) {
	case AF_INET:
		inp = sotoinpcb(so);
		break;
#ifdef INET6
	case AF_INET6:
		in6p = sotoin6pcb(so);
		break;
#endif
	}

	/* Bind the new pcb to the local address/port the SYN was sent to. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp) {
			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute(m);
			in_pcbstate(inp, INP_BOUND);
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (in6p) {
			/* IPv4 packet to AF_INET6 socket */
			memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
				&in6p->in6p_laddr.s6_addr32[3],
				sizeof(((struct sockaddr_in *)dst)->sin_addr));
			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
			in6totcpcb(in6p)->t_family = AF_INET;
			if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
				in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
			else
				in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (in6p) {
			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
			in6_pcbstate(in6p, IN6P_BOUND);
		}
		break;
#endif
	}

#ifdef INET6
	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct in6pcb *oin6p = sotoin6pcb(oso);
		/* inherit socket options from the listening socket */
		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = NULL;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options,
		    mtod(m, struct ip6_hdr *), m);
	}
#endif

	/*
	 * Give the new socket our cached route reference.
	 */
	if (inp) {
		rtcache_copy(&inp->inp_route, &sc->sc_route);
		rtcache_free(&sc->sc_route);
	}
#ifdef INET6
	else {
		rtcache_copy(&in6p->in6p_route, &sc->sc_route);
		rtcache_free(&sc->sc_route);
	}
#endif

	/* Connect the new pcb to the peer; failure resets the connection. */
	if (inp) {
		struct sockaddr_in sin;
		memcpy(&sin, src, src->sa_len);
		if (in_pcbconnect(inp, &sin, &lwp0)) {
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (in6p) {
		struct sockaddr_in6 sin6;
		memcpy(&sin6, src, src->sa_len);
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
		}
		if (in6_pcbconnect(in6p, &sin6, NULL)) {
			goto resetandabort;
		}
	}
#endif
	else {
		goto resetandabort;
	}

	if (inp)
		tp = intotcpcb(inp);
#ifdef INET6
	else if (in6p)
		tp = in6totcpcb(in6p);
#endif
	else
		tp = NULL;

	/*
	 * tp cannot be NULL here: we jumped to resetandabort above
	 * unless one of inp/in6p was set.
	 */
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/*
	 * NOTE(review): 15 appears to be the "no window scaling requested"
	 * sentinel stored when the entry was created -- confirm in
	 * syn_cache_add().
	 */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
	tp->ts_timebase = sc->sc_timebase;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
	TCP_STATINC(TCP_STAT_ACCEPTS);

	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
		tp->t_flags |= TF_WILL_SACK;

	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
		tp->t_flags |= TF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else {
		int ss = tcp_init_win;
		if (inp != NULL && in_localaddr(inp->inp_faddr))
			ss = tcp_init_win_local;
#ifdef INET6
		if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
			ss = tcp_init_win_local;
#endif
		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
	}

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what whould have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;

	TCP_STATINC(TCP_STAT_SC_COMPLETED);
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	return so;

resetandabort:
	/* Handshake completed but we can't keep the connection: RST peer. */
	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	if (so != NULL) {
		(void) soqremque(so, 1);
		(void) soabort(so);
		mutex_enter(softnet_lock);
	}
	s = splsoftnet();
	syn_cache_put(sc);
	splx(s);
	TCP_STATINC(TCP_STAT_SC_ABORTED);
	return ((struct socket *)(-1));
}
4044 
4045 /*
4046  * This function is called when we get a RST for a
4047  * non-existent connection, so that we can see if the
4048  * connection is in the syn cache.  If it is, zap it.
4049  */
4050 
4051 void
4052 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
4053 {
4054 	struct syn_cache *sc;
4055 	struct syn_cache_head *scp;
4056 	int s = splsoftnet();
4057 
4058 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
4059 		splx(s);
4060 		return;
4061 	}
4062 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
4063 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
4064 		splx(s);
4065 		return;
4066 	}
4067 	syn_cache_rm(sc);
4068 	TCP_STATINC(TCP_STAT_SC_RESET);
4069 	syn_cache_put(sc);	/* calls pool_put but see spl above */
4070 	splx(s);
4071 }
4072 
4073 void
4074 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
4075     struct tcphdr *th)
4076 {
4077 	struct syn_cache *sc;
4078 	struct syn_cache_head *scp;
4079 	int s;
4080 
4081 	s = splsoftnet();
4082 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
4083 		splx(s);
4084 		return;
4085 	}
4086 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
4087 	if (ntohl(th->th_seq) != sc->sc_iss) {
4088 		splx(s);
4089 		return;
4090 	}
4091 
4092 	/*
4093 	 * If we've retransmitted 3 times and this is our second error,
4094 	 * we remove the entry.  Otherwise, we allow it to continue on.
4095 	 * This prevents us from incorrectly nuking an entry during a
4096 	 * spurious network outage.
4097 	 *
4098 	 * See tcp_notify().
4099 	 */
4100 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
4101 		sc->sc_flags |= SCF_UNREACH;
4102 		splx(s);
4103 		return;
4104 	}
4105 
4106 	syn_cache_rm(sc);
4107 	TCP_STATINC(TCP_STAT_SC_UNREACH);
4108 	syn_cache_put(sc);	/* calls pool_put but see spl above */
4109 	splx(s);
4110 }
4111 
4112 /*
4113  * Given a LISTEN socket and an inbound SYN request, add this to the syn
4114  * cache, and send back a segment:
4115  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
4116  * to the source.
4117  *
4118  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
4119  * Doing so would require that we hold onto the data and deliver it
4120  * to the application.  However, if we are the target of a SYN-flood
4121  * DoS attack, an attacker could send data which would eventually
4122  * consume all available buffer space if it were ACKed.  By not ACKing
4123  * the data, we avoid this DoS scenario.
4124  */
4125 int
4126 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
4127     unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
4128     int optlen, struct tcp_opt_info *oi)
4129 {
4130 	struct tcpcb tb, *tp;
4131 	long win;
4132 	struct syn_cache *sc;
4133 	struct syn_cache_head *scp;
4134 	struct mbuf *ipopts;
4135 	int s;
4136 
4137 	tp = sototcpcb(so);
4138 
4139 	/*
4140 	 * Initialize some local state.
4141 	 */
4142 	win = sbspace(&so->so_rcv);
4143 	if (win > TCP_MAXWIN)
4144 		win = TCP_MAXWIN;
4145 
4146 #ifdef TCP_SIGNATURE
4147 	if (optp || (tp->t_flags & TF_SIGNATURE))
4148 #else
4149 	if (optp)
4150 #endif
4151 	{
4152 		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
4153 #ifdef TCP_SIGNATURE
4154 		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
4155 #endif
4156 		tb.t_state = TCPS_LISTEN;
4157 		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
4158 			return 0;
4159 	} else
4160 		tb.t_flags = 0;
4161 
4162 	switch (src->sa_family) {
4163 	case AF_INET:
4164 		/* Remember the IP options, if any. */
4165 		ipopts = ip_srcroute(m);
4166 		break;
4167 	default:
4168 		ipopts = NULL;
4169 	}
4170 
4171 	/*
4172 	 * See if we already have an entry for this connection.
4173 	 * If we do, resend the SYN,ACK.  We do not count this
4174 	 * as a retransmission (XXX though maybe we should).
4175 	 */
4176 	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
4177 		TCP_STATINC(TCP_STAT_SC_DUPESYN);
4178 		if (ipopts) {
4179 			/*
4180 			 * If we were remembering a previous source route,
4181 			 * forget it and use the new one we've been given.
4182 			 */
4183 			if (sc->sc_ipopts)
4184 				(void)m_free(sc->sc_ipopts);
4185 			sc->sc_ipopts = ipopts;
4186 		}
4187 		sc->sc_timestamp = tb.ts_recent;
4188 		m_freem(m);
4189 		if (syn_cache_respond(sc) == 0) {
4190 			uint64_t *tcps = TCP_STAT_GETREF();
4191 			tcps[TCP_STAT_SNDACKS]++;
4192 			tcps[TCP_STAT_SNDTOTAL]++;
4193 			TCP_STAT_PUTREF();
4194 		}
4195 		return 1;
4196 	}
4197 
4198 	s = splsoftnet();
4199 	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
4200 	splx(s);
4201 	if (sc == NULL) {
4202 		if (ipopts)
4203 			(void)m_free(ipopts);
4204 		return 0;
4205 	}
4206 
4207 	/*
4208 	 * Fill in the cache, and put the necessary IP and TCP
4209 	 * options into the reply.
4210 	 */
4211 	memset(sc, 0, sizeof(struct syn_cache));
4212 	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
4213 	memcpy(&sc->sc_src, src, src->sa_len);
4214 	memcpy(&sc->sc_dst, dst, dst->sa_len);
4215 	sc->sc_flags = 0;
4216 	sc->sc_ipopts = ipopts;
4217 	sc->sc_irs = th->th_seq;
4218 	switch (src->sa_family) {
4219 	case AF_INET:
4220 	    {
4221 		struct sockaddr_in *srcin = (void *)src;
4222 		struct sockaddr_in *dstin = (void *)dst;
4223 
4224 		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
4225 		    &srcin->sin_addr, dstin->sin_port,
4226 		    srcin->sin_port, sizeof(dstin->sin_addr), 0);
4227 		break;
4228 	    }
4229 #ifdef INET6
4230 	case AF_INET6:
4231 	    {
4232 		struct sockaddr_in6 *srcin6 = (void *)src;
4233 		struct sockaddr_in6 *dstin6 = (void *)dst;
4234 
4235 		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
4236 		    &srcin6->sin6_addr, dstin6->sin6_port,
4237 		    srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0);
4238 		break;
4239 	    }
4240 #endif
4241 	}
4242 	sc->sc_peermaxseg = oi->maxseg;
4243 	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
4244 	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
4245 	sc->sc_win = win;
4246 	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
4247 	sc->sc_timestamp = tb.ts_recent;
4248 	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
4249 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
4250 		sc->sc_flags |= SCF_TIMESTAMP;
4251 	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
4252 	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
4253 		sc->sc_requested_s_scale = tb.requested_s_scale;
4254 		sc->sc_request_r_scale = 0;
4255 		/*
4256 		 * Pick the smallest possible scaling factor that
4257 		 * will still allow us to scale up to sb_max.
4258 		 *
4259 		 * We do this because there are broken firewalls that
4260 		 * will corrupt the window scale option, leading to
4261 		 * the other endpoint believing that our advertised
4262 		 * window is unscaled.  At scale factors larger than
4263 		 * 5 the unscaled window will drop below 1500 bytes,
4264 		 * leading to serious problems when traversing these
4265 		 * broken firewalls.
4266 		 *
4267 		 * With the default sbmax of 256K, a scale factor
4268 		 * of 3 will be chosen by this algorithm.  Those who
4269 		 * choose a larger sbmax should watch out
4270 		 * for the compatiblity problems mentioned above.
4271 		 *
4272 		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
4273 		 * or <SYN,ACK>) segment itself is never scaled.
4274 		 */
4275 		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
4276 		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
4277 			sc->sc_request_r_scale++;
4278 	} else {
4279 		sc->sc_requested_s_scale = 15;
4280 		sc->sc_request_r_scale = 15;
4281 	}
4282 	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
4283 		sc->sc_flags |= SCF_SACK_PERMIT;
4284 
4285 	/*
4286 	 * ECN setup packet received.
4287 	 */
4288 	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
4289 		sc->sc_flags |= SCF_ECN_PERMIT;
4290 
4291 #ifdef TCP_SIGNATURE
4292 	if (tb.t_flags & TF_SIGNATURE)
4293 		sc->sc_flags |= SCF_SIGNATURE;
4294 #endif
4295 	sc->sc_tp = tp;
4296 	m_freem(m);
4297 	if (syn_cache_respond(sc) == 0) {
4298 		uint64_t *tcps = TCP_STAT_GETREF();
4299 		tcps[TCP_STAT_SNDACKS]++;
4300 		tcps[TCP_STAT_SNDTOTAL]++;
4301 		TCP_STAT_PUTREF();
4302 		syn_cache_insert(sc, tp);
4303 	} else {
4304 		s = splsoftnet();
4305 		/*
4306 		 * syn_cache_put() will try to schedule the timer, so
4307 		 * we need to initialize it
4308 		 */
4309 		syn_cache_timer_arm(sc);
4310 		syn_cache_put(sc);
4311 		splx(s);
4312 		TCP_STATINC(TCP_STAT_SC_DROPPED);
4313 	}
4314 	return 1;
4315 }
4316 
4317 /*
4318  * syn_cache_respond: (re)send SYN+ACK.
4319  *
4320  * Returns 0 on success.
4321  */
4322 
int
syn_cache_respond(struct syn_cache *sc)
{
#ifdef INET6
	struct rtentry *rt = NULL;
#endif
	struct route *ro;
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen;
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	u_int hlen;
#ifdef TCP_SIGNATURE
	struct secasvar *sav = NULL;
	u_int8_t *sigp = NULL;
#endif

	/* Pick the IP header length for the entry's address family. */
	ro = &sc->sc_route;
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		return EAFNOSUPPORT;
	}

	/* Worst case scenario, since we don't know the option size yet. */
	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
	KASSERT(max_linkhdr + tlen <= MCLBYTES);

	/*
	 * Create the IP+TCP header from scratch.
	 */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && (max_linkhdr + tlen) > MHLEN) {
		/* Won't fit in a plain mbuf: attach a cluster. */
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return ENOBUFS;
	MCLAIM(m, &tcp_tx_mowner);

	tp = sc->sc_tp;

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;
	m_reset_rcvif(m);
	/* Zero the whole header area; several fields rely on this below. */
	memset(mtod(m, void *), 0, tlen);

	/*
	 * Fill in the IP (or IPv6) header and the TCP ports.  Note the
	 * reply goes from our cached dst back to the cached src.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_v = 4;
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		panic("%s: impossible (1)", __func__);
	}

	/* SYN,ACK: our ISS, ACKing the peer's SYN (sc_irs + 1). */
	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_flags = TH_SYN|TH_ACK;
	th->th_win = htons(sc->sc_win);
	/* th_x2, th_sum, th_urp already 0 from memset */

	/* Tack on the TCP options. */
	optp = (u_int8_t *)(th + 1);
	optlen = 0;
	/* MSS option always goes first. */
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = TCPOLEN_MAXSEG;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;
	optlen += TCPOLEN_MAXSEG;

	/* 15 marks "no window scaling negotiated" (see syn_cache_add). */
	if (sc->sc_request_r_scale != 15) {
		/* NOP + window scale, written as one aligned 32-bit word. */
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
	}

	if (sc->sc_flags & SCF_SACK_PERMIT) {
		/* Let the peer know that we will SACK. */
		*optp++ = TCPOPT_SACK_PERMITTED;
		*optp++ = TCPOLEN_SACK_PERMITTED;
		optlen += TCPOLEN_SACK_PERMITTED;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		/*
		 * Pad so the two 32-bit timestamps land on 4-byte
		 * boundaries (kind+len occupy the remaining 2 bytes).
		 */
		while (optlen % 4 != 2) {
			optlen += TCPOLEN_NOP;
			*optp++ = TCPOPT_NOP;
		}
		*optp++ = TCPOPT_TIMESTAMP;
		*optp++ = TCPOLEN_TIMESTAMP;
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);
		optp += TCPOLEN_TIMESTAMP - 2;
		optlen += TCPOLEN_TIMESTAMP;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		sav = tcp_signature_getsav(m);
		if (sav == NULL) {
			m_freem(m);
			return EPERM;
		}

		/* Reserve zeroed space; the MD5 digest is filled in below. */
		*optp++ = TCPOPT_SIGNATURE;
		*optp++ = TCPOLEN_SIGNATURE;
		sigp = optp;
		memset(optp, 0, TCP_SIGLEN);
		optp += TCP_SIGLEN;
		optlen += TCPOLEN_SIGNATURE;
	}
#endif

	/*
	 * Terminate and pad TCP options to a 4 byte boundary.
	 *
	 * According to RFC793: "The content of the header beyond the
	 * End-of-Option option must be header padding (i.e., zero)."
	 * And later: "The padding is composed of zeros."
	 */
	if (optlen % 4) {
		optlen += TCPOLEN_EOL;
		*optp++ = TCPOPT_EOL;
	}
	while (optlen % 4) {
		optlen += TCPOLEN_PAD;
		*optp++ = TCPOPT_PAD;
	}

	/* Compute the actual values now that we've added the options. */
	tlen = hlen + sizeof(struct tcphdr) + optlen;
	m->m_len = m->m_pkthdr.len = tlen;
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;

#ifdef TCP_SIGNATURE
	if (sav) {
		(void)tcp_signature(m, th, hlen, sav, sigp);
		key_sa_recordxfer(sav, m);
		KEY_SA_UNREF(&sav);
	}
#endif

	/*
	 * Send ECN SYN-ACK setup packet.
	 * Routes can be asymmetric, so, even if we receive a packet
	 * with ECE and CWR set, we must not assume no one will block
	 * the ECE packet we are about to send.
	 */
	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
		th->th_flags |= TH_ECE;
		TCP_STATINC(TCP_STAT_ECN_SHS);

		/*
		 * draft-ietf-tcpm-ecnsyn-00.txt
		 *
		 * "[...] a TCP node MAY respond to an ECN-setup
		 * SYN packet by setting ECT in the responding
		 * ECN-setup SYN/ACK packet, indicating to routers
		 * that the SYN/ACK packet is ECN-Capable.
		 * This allows a congested router along the path
		 * to mark the packet instead of dropping the
		 * packet as an indication of congestion."
		 *
		 * "[...] There can be a great benefit in setting
		 * an ECN-capable codepoint in SYN/ACK packets [...]
		 * Congestion is  most likely to occur in
		 * the server-to-client direction.  As a result,
		 * setting an ECN-capable codepoint in SYN/ACK
		 * packets can reduce the occurence of three-second
		 * retransmit timeouts resulting from the drop
		 * of SYN/ACK packets."
		 *
		 * Page 4 and 6, January 2006.
		 */

		switch (sc->sc_src.sa.sa_family) {
		case AF_INET:
			ip->ip_tos |= IPTOS_ECN_ECT0;
			break;
#ifdef INET6
		case AF_INET6:
			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
			break;
#endif
		}
		TCP_STATINC(TCP_STAT_ECN_ECT);
	}


	/*
	 * Compute the packet's checksum.
	 *
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		/*
		 * ip_len is temporarily set to the TCP segment length
		 * for the pseudo-header checksum, then to the full
		 * packet length for ip_output().
		 */
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip->ip_len = htons(tlen);
		ip->ip_ttl = ip_defttl;
		/* XXX tos? */
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* XXX flowlabel? */
		break;
#endif
	}

	/* XXX use IPsec policy on listening socket, on SYN ACK */
	tp = sc->sc_tp;

	/* Hand the packet to the appropriate output routine. */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    NULL, tp ? tp->t_inpcb : NULL);
		break;
#ifdef INET6
	case AF_INET6:
		/* Select the hop limit from the cached route, if any. */
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
		rtcache_unref(rt, ro);

		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
		    tp ? tp->t_in6pcb : NULL, NULL);
		break;
#endif
	default:
		panic("%s: impossible (2)", __func__);
	}

	return error;
}
4607