1 /*	$OpenBSD: tcp_usrreq.c,v 1.217 2023/03/14 00:24:05 yasuoka Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/kernel.h>
81 #include <sys/pool.h>
82 #include <sys/proc.h>
83 
84 #include <net/if.h>
85 #include <net/if_var.h>
86 #include <net/route.h>
87 
88 #include <netinet/in.h>
89 #include <netinet/in_var.h>
90 #include <netinet/ip.h>
91 #include <netinet/in_pcb.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_fsm.h>
95 #include <netinet/tcp_seq.h>
96 #include <netinet/tcp_timer.h>
97 #include <netinet/tcp_var.h>
98 #include <netinet/tcp_debug.h>
99 
100 #ifdef INET6
101 #include <netinet6/in6_var.h>
102 #endif
103 
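/*
 * Default socket buffer reservations (16 KB each).  tcp_attach() reserves
 * them with soreserve(); tcp_update_sndspace() and tcp_update_rcvspace()
 * below may rescale the buffers at run time, growing the receive buffer
 * in tcp_autorcvbuf_inc steps.
 */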
104 #ifndef TCP_SENDSPACE
105 #define	TCP_SENDSPACE	1024*16
106 #endif
107 u_int	tcp_sendspace = TCP_SENDSPACE;
108 #ifndef TCP_RECVSPACE
109 #define	TCP_RECVSPACE	1024*16
110 #endif
111 u_int	tcp_recvspace = TCP_RECVSPACE;
112 u_int	tcp_autorcvbuf_inc = 16 * 1024;
113 
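/*
 * User-request dispatch tables: the socket layer invokes these handlers
 * for the corresponding PRU_* requests on IPv4 and IPv6 TCP sockets.
 */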
114 const struct pr_usrreqs tcp_usrreqs = {
115 	.pru_attach	= tcp_attach,
116 	.pru_detach	= tcp_detach,
117 	.pru_bind	= tcp_bind,
118 	.pru_listen	= tcp_listen,
119 	.pru_connect	= tcp_connect,
120 	.pru_accept	= tcp_accept,
121 	.pru_disconnect	= tcp_disconnect,
122 	.pru_shutdown	= tcp_shutdown,
123 	.pru_rcvd	= tcp_rcvd,
124 	.pru_send	= tcp_send,
125 	.pru_abort	= tcp_abort,
126 	.pru_sense	= tcp_sense,
127 	.pru_rcvoob	= tcp_rcvoob,
128 	.pru_sendoob	= tcp_sendoob,
129 	.pru_control	= in_control,
130 	.pru_sockaddr	= tcp_sockaddr,
131 	.pru_peeraddr	= tcp_peeraddr,
132 };
133 
134 #ifdef INET6
135 const struct pr_usrreqs tcp6_usrreqs = {
136 	.pru_attach	= tcp_attach,
137 	.pru_detach	= tcp_detach,
138 	.pru_bind	= tcp_bind,
139 	.pru_listen	= tcp_listen,
140 	.pru_connect	= tcp_connect,
141 	.pru_accept	= tcp_accept,
142 	.pru_disconnect	= tcp_disconnect,
143 	.pru_shutdown	= tcp_shutdown,
144 	.pru_rcvd	= tcp_rcvd,
145 	.pru_send	= tcp_send,
146 	.pru_abort	= tcp_abort,
147 	.pru_sense	= tcp_sense,
148 	.pru_rcvoob	= tcp_rcvoob,
149 	.pru_sendoob	= tcp_sendoob,
150 	.pru_control	= in6_control,
151 	.pru_sockaddr	= tcp_sockaddr,
152 	.pru_peeraddr	= tcp_peeraddr,
153 };
154 #endif
155 
156 const struct sysctl_bounded_args tcpctl_vars[] = {
157 	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
158 	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
159 	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
160 	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
161 	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
162 #ifdef TCP_ECN
163 	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
164 #endif
165 	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
166 	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
167 	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
168 	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
169 };
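/*
 * Bounded integer sysctl variables.  Any name not handled explicitly in
 * tcp_sysctl() is passed to sysctl_bounded_arr() with this table, so e.g.
 * "sysctl net.inet.tcp.sack" toggles tcp_do_sack within the range [0, 1].
 */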
170 
171 struct	inpcbtable tcbtable;
172 
173 int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
174 int	tcp_ident(void *, size_t *, void *, size_t, int);
175 
176 static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
177                       struct tcpcb **);
178 
179 static inline int
180 tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
181 {
182 	struct inpcb *inp;
183 	struct tcpcb *tp;
184 
185 	/*
186 	 * When a TCP is attached to a socket, then there will be
187 	 * a (struct inpcb) pointed at by the socket, and this
188 	 * structure will point at a subsidiary (struct tcpcb).
189 	 */
190 	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
191 		if (so->so_error)
192 			return so->so_error;
193 		return EINVAL;
194 	}
195 
196 	*rinp = inp;
197 	*rtp = tp;
198 
199 	return 0;
200 }
201 
202 /*
203  * Export internal TCP state information via a struct tcp_info without
204  * leaking any sensitive information. Sequence numbers are reported
205  * relative to the initial sequence number.
206  */
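/*
 * Illustrative userland access (not part of this file): the structure is
 * obtained with getsockopt(2) at level IPPROTO_TCP, option TCP_INFO, which
 * tcp_ctloutput() below routes to this function, e.g.
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt: %u us\n", ti.tcpi_rtt);
 */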
207 int
208 tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
209 {
210 	struct proc *p = curproc;
211 	struct tcp_info *ti;
212 	u_int t = 1000;		/* msec => usec */
213 	uint32_t now;
214 
215 	if (sizeof(*ti) > MLEN) {
216 		MCLGETL(m, M_WAITOK, sizeof(*ti));
217 		if (!ISSET(m->m_flags, M_EXT))
218 			return ENOMEM;
219 	}
220 	ti = mtod(m, struct tcp_info *);
221 	m->m_len = sizeof(*ti);
222 	memset(ti, 0, sizeof(*ti));
223 	now = tcp_now();
224 
225 	ti->tcpi_state = tp->t_state;
226 	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
227 		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
228 	if (tp->t_flags & TF_SACK_PERMIT)
229 		ti->tcpi_options |= TCPI_OPT_SACK;
230 	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
231 		ti->tcpi_options |= TCPI_OPT_WSCALE;
232 		ti->tcpi_snd_wscale = tp->snd_scale;
233 		ti->tcpi_rcv_wscale = tp->rcv_scale;
234 	}
235 #ifdef TCP_ECN
236 	if (tp->t_flags & TF_ECN_PERMIT)
237 		ti->tcpi_options |= TCPI_OPT_ECN;
238 #endif
239 
240 	ti->tcpi_rto = tp->t_rxtcur * t;
241 	ti->tcpi_snd_mss = tp->t_maxseg;
242 	ti->tcpi_rcv_mss = tp->t_peermss;
243 
244 	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
245 	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
246 	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
247 	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;
248 
249 	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
250 	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
251 	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
252 	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
253 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
254 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
255 
256 	ti->tcpi_rcv_space = tp->rcv_wnd;
257 
258 	/*
259 	 * Provide only minimal information for unprivileged processes.
260 	 */
261 	if (suser(p) != 0)
262 		return 0;
263 
264 	/* FreeBSD-specific extension fields for tcp_info.  */
265 	ti->tcpi_snd_wnd = tp->snd_wnd;
266 	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
267 	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
268 	/* missing tcpi_toe_tid */
269 	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
270 	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
271 	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
272 
273 	/* OpenBSD extensions */
274 	ti->tcpi_rttmin = tp->t_rttmin * t;
275 	ti->tcpi_max_sndwnd = tp->max_sndwnd;
276 	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
277 	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
278 	ti->tcpi_snd_una = tp->snd_una - tp->iss;
279 	ti->tcpi_snd_up = tp->snd_up - tp->iss;
280 	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
281 	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
282 	ti->tcpi_snd_max = tp->snd_max - tp->iss;
283 
284 	ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
285 	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
286 	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
287 	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;
288 
289 	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
290 	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
291 	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
292 	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
293 	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
294 	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
295 	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
296 	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
297 
298 	return 0;
299 }
300 
301 int
302 tcp_ctloutput(int op, struct socket *so, int level, int optname,
303     struct mbuf *m)
304 {
305 	int error = 0;
306 	struct inpcb *inp;
307 	struct tcpcb *tp;
308 	int i;
309 
310 	inp = sotoinpcb(so);
311 	if (inp == NULL)
312 		return (ECONNRESET);
313 	if (level != IPPROTO_TCP) {
314 		switch (so->so_proto->pr_domain->dom_family) {
315 #ifdef INET6
316 		case PF_INET6:
317 			error = ip6_ctloutput(op, so, level, optname, m);
318 			break;
319 #endif /* INET6 */
320 		case PF_INET:
321 			error = ip_ctloutput(op, so, level, optname, m);
322 			break;
323 		default:
324 			error = EAFNOSUPPORT;	/*?*/
325 			break;
326 		}
327 		return (error);
328 	}
329 	tp = intotcpcb(inp);
330 
331 	switch (op) {
332 
333 	case PRCO_SETOPT:
334 		switch (optname) {
335 
336 		case TCP_NODELAY:
337 			if (m == NULL || m->m_len < sizeof (int))
338 				error = EINVAL;
339 			else if (*mtod(m, int *))
340 				tp->t_flags |= TF_NODELAY;
341 			else
342 				tp->t_flags &= ~TF_NODELAY;
343 			break;
344 
345 		case TCP_NOPUSH:
346 			if (m == NULL || m->m_len < sizeof (int))
347 				error = EINVAL;
348 			else if (*mtod(m, int *))
349 				tp->t_flags |= TF_NOPUSH;
350 			else if (tp->t_flags & TF_NOPUSH) {
351 				tp->t_flags &= ~TF_NOPUSH;
352 				if (TCPS_HAVEESTABLISHED(tp->t_state))
353 					error = tcp_output(tp);
354 			}
355 			break;
356 
357 		case TCP_MAXSEG:
358 			if (m == NULL || m->m_len < sizeof (int)) {
359 				error = EINVAL;
360 				break;
361 			}
362 
363 			i = *mtod(m, int *);
364 			if (i > 0 && i <= tp->t_maxseg)
365 				tp->t_maxseg = i;
366 			else
367 				error = EINVAL;
368 			break;
369 
370 		case TCP_SACK_ENABLE:
371 			if (m == NULL || m->m_len < sizeof (int)) {
372 				error = EINVAL;
373 				break;
374 			}
375 
376 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
377 				error = EPERM;
378 				break;
379 			}
380 
381 			if (tp->t_flags & TF_SIGNATURE) {
382 				error = EPERM;
383 				break;
384 			}
385 
386 			if (*mtod(m, int *))
387 				tp->sack_enable = 1;
388 			else
389 				tp->sack_enable = 0;
390 			break;
391 #ifdef TCP_SIGNATURE
392 		case TCP_MD5SIG:
393 			if (m == NULL || m->m_len < sizeof (int)) {
394 				error = EINVAL;
395 				break;
396 			}
397 
398 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
399 				error = EPERM;
400 				break;
401 			}
402 
403 			if (*mtod(m, int *)) {
404 				tp->t_flags |= TF_SIGNATURE;
405 				tp->sack_enable = 0;
406 			} else
407 				tp->t_flags &= ~TF_SIGNATURE;
408 			break;
409 #endif /* TCP_SIGNATURE */
410 		default:
411 			error = ENOPROTOOPT;
412 			break;
413 		}
414 		break;
415 
416 	case PRCO_GETOPT:
417 		switch (optname) {
418 		case TCP_NODELAY:
419 			m->m_len = sizeof(int);
420 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
421 			break;
422 		case TCP_NOPUSH:
423 			m->m_len = sizeof(int);
424 			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
425 			break;
426 		case TCP_MAXSEG:
427 			m->m_len = sizeof(int);
428 			*mtod(m, int *) = tp->t_maxseg;
429 			break;
430 		case TCP_SACK_ENABLE:
431 			m->m_len = sizeof(int);
432 			*mtod(m, int *) = tp->sack_enable;
433 			break;
434 		case TCP_INFO:
435 			error = tcp_fill_info(tp, so, m);
436 			break;
437 #ifdef TCP_SIGNATURE
438 		case TCP_MD5SIG:
439 			m->m_len = sizeof(int);
440 			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
441 			break;
442 #endif
443 		default:
444 			error = ENOPROTOOPT;
445 			break;
446 		}
447 		break;
448 	}
449 	return (error);
450 }
451 
452 /*
453  * Attach TCP protocol to socket, allocating
454  * internet protocol control block, tcp control block, and
455  * buffer space.  The new control block starts in the CLOSED state.
456  */
457 int
458 tcp_attach(struct socket *so, int proto, int wait)
459 {
460 	struct tcpcb *tp;
461 	struct inpcb *inp;
462 	int error;
463 
464 	if (so->so_pcb)
465 		return EISCONN;
466 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
467 	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
468 	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
469 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
470 		if (error)
471 			return (error);
472 	}
473 
474 	NET_ASSERT_LOCKED();
475 	error = in_pcballoc(so, &tcbtable, wait);
476 	if (error)
477 		return (error);
478 	inp = sotoinpcb(so);
479 	tp = tcp_newtcpcb(inp, wait);
480 	if (tp == NULL) {
481 		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */
482 
483 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
484 		in_pcbdetach(inp);
485 		so->so_state |= nofd;
486 		return (ENOBUFS);
487 	}
488 	tp->t_state = TCPS_CLOSED;
489 #ifdef INET6
490 	/* we disallow IPv4 mapped address completely. */
491 	if (inp->inp_flags & INP_IPV6)
492 		tp->pf = PF_INET6;
493 	else
494 		tp->pf = PF_INET;
495 #else
496 	tp->pf = PF_INET;
497 #endif
498 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
499 		so->so_linger = TCP_LINGERTIME;
500 
501 	if (so->so_options & SO_DEBUG)
502 		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
503 	return (0);
504 }
505 
506 int
507 tcp_detach(struct socket *so)
508 {
509 	struct inpcb *inp;
510 	struct tcpcb *otp = NULL, *tp;
511 	int error = 0;
512 	short ostate;
513 
514 	soassertlocked(so);
515 
516 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
517 		return (error);
518 
519 	if (so->so_options & SO_DEBUG) {
520 		otp = tp;
521 		ostate = tp->t_state;
522 	}
523 
524 	/*
525 	 * Detach the TCP protocol from the socket.
526 	 * If the protocol state is non-embryonic, then can't
527 	 * do this directly: have to initiate a PRU_DISCONNECT,
528 	 * which may finish later; embryonic TCB's can just
529 	 * be discarded here.
530 	 */
531 	tp = tcp_dodisconnect(tp);
532 
533 	if (otp)
534 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
535 	return (error);
536 }
537 
538 /*
539  * Give the socket an address.
540  */
541 int
542 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
543 {
544 	struct inpcb *inp;
545 	struct tcpcb *tp;
546 	int error;
547 	short ostate;
548 
549 	soassertlocked(so);
550 
551 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
552 		return (error);
553 
554 	if (so->so_options & SO_DEBUG)
555 		ostate = tp->t_state;
556 
557 	error = in_pcbbind(inp, nam, p);
558 
559 	if (so->so_options & SO_DEBUG)
560 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
561 	return (error);
562 }
563 
564 /*
565  * Prepare to accept connections.
566  */
567 int
568 tcp_listen(struct socket *so)
569 {
570 	struct inpcb *inp;
571 	struct tcpcb *tp, *otp = NULL;
572 	int error;
573 	short ostate;
574 
575 	soassertlocked(so);
576 
577 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
578 		return (error);
579 
580 	if (so->so_options & SO_DEBUG) {
581 		otp = tp;
582 		ostate = tp->t_state;
583 	}
584 
585 	if (inp->inp_lport == 0)
586 		if ((error = in_pcbbind(inp, NULL, curproc)))
587 			goto out;
588 
589 	/*
590 	 * If the in_pcbbind() above is called, the tp->pf
591 	 * should still be whatever it was before.
592 	 */
593 	tp->t_state = TCPS_LISTEN;
594 
595 out:
596 	if (otp)
597 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
598 	return (error);
599 }
600 
601 /*
602  * Initiate connection to peer.
603  * Create a template for use in transmissions on this connection.
604  * Enter SYN_SENT state, and mark socket as connecting.
605  * Start keep-alive timer, and seed output sequence space.
606  * Send initial segment on connection.
607  */
608 int
609 tcp_connect(struct socket *so, struct mbuf *nam)
610 {
611 	struct inpcb *inp;
612 	struct tcpcb *tp, *otp = NULL;
613 	int error;
614 	short ostate;
615 
616 	soassertlocked(so);
617 
618 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
619 		return (error);
620 
621 	if (so->so_options & SO_DEBUG) {
622 		otp = tp;
623 		ostate = tp->t_state;
624 	}
625 
626 #ifdef INET6
627 	if (inp->inp_flags & INP_IPV6) {
628 		struct sockaddr_in6 *sin6;
629 
630 		if ((error = in6_nam2sin6(nam, &sin6)))
631 			goto out;
632 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
633 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
634 			error = EINVAL;
635 			goto out;
636 		}
637 		error = in6_pcbconnect(inp, nam);
638 	} else
639 #endif /* INET6 */
640 	{
641 		struct sockaddr_in *sin;
642 
643 		if ((error = in_nam2sin(nam, &sin)))
644 			goto out;
645 		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
646 		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
647 		    IN_MULTICAST(sin->sin_addr.s_addr) ||
648 		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
649 			error = EINVAL;
650 			goto out;
651 		}
652 		error = in_pcbconnect(inp, nam);
653 	}
654 	if (error)
655 		goto out;
656 
657 	tp->t_template = tcp_template(tp);
658 	if (tp->t_template == 0) {
659 		in_pcbdisconnect(inp);
660 		error = ENOBUFS;
661 		goto out;
662 	}
663 
664 	so->so_state |= SS_CONNECTOUT;
665 
666 	/* Compute window scaling to request.  */
667 	tcp_rscale(tp, sb_max);
668 
669 	soisconnecting(so);
670 	tcpstat_inc(tcps_connattempt);
671 	tp->t_state = TCPS_SYN_SENT;
672 	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
673 	tcp_set_iss_tsm(tp);
674 	tcp_sendseqinit(tp);
675 	tp->snd_last = tp->snd_una;
676 	error = tcp_output(tp);
677 
678 out:
679 	if (otp)
680 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
681 	return (error);
682 }
683 
684 /*
685  * Accept a connection.  Essentially all the work is done at higher
686  * levels; just return the address of the peer, storing through addr.
687  */
688 int
689 tcp_accept(struct socket *so, struct mbuf *nam)
690 {
691 	struct inpcb *inp;
692 	struct tcpcb *tp;
693 	int error;
694 	short ostate;
695 
696 	soassertlocked(so);
697 
698 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
699 		return (error);
700 
701 	if (so->so_options & SO_DEBUG)
702 		ostate = tp->t_state;
703 
704 #ifdef INET6
705 	if (inp->inp_flags & INP_IPV6)
706 		in6_setpeeraddr(inp, nam);
707 	else
708 #endif
709 		in_setpeeraddr(inp, nam);
710 
711 	if (so->so_options & SO_DEBUG)
712 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
713 	return (error);
714 }
715 
716 /*
717  * Initiate disconnect from peer.
718  * If connection never passed embryonic stage, just drop;
719  * else if we don't need to let data drain, just drop anyway;
720  * else have to begin TCP shutdown process: mark socket disconnecting,
721  * drain unread data, state switch to reflect user close, and
722  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
723  * when peer sends FIN and acks ours.
724  *
725  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
726  */
727 int
728 tcp_disconnect(struct socket *so)
729 {
730 	struct inpcb *inp;
731 	struct tcpcb *tp, *otp = NULL;
732 	int error;
733 	short ostate;
734 
735 	soassertlocked(so);
736 
737 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
738 		return (error);
739 
740 	if (so->so_options & SO_DEBUG) {
741 		otp = tp;
742 		ostate = tp->t_state;
743 	}
744 
745 	tp = tcp_dodisconnect(tp);
746 
747 	if (otp)
748 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
749 	return (0);
750 }
751 
752 /*
753  * Mark the connection as being incapable of further output.
754  */
755 int
756 tcp_shutdown(struct socket *so)
757 {
758 	struct inpcb *inp;
759 	struct tcpcb *tp, *otp = NULL;
760 	int error;
761 	short ostate;
762 
763 	soassertlocked(so);
764 
765 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
766 		return (error);
767 
768 	if (so->so_options & SO_DEBUG) {
769 		otp = tp;
770 		ostate = tp->t_state;
771 	}
772 
773 	if (so->so_snd.sb_state & SS_CANTSENDMORE)
774 		goto out;
775 
776 	socantsendmore(so);
777 	tp = tcp_usrclosed(tp);
778 	if (tp)
779 		error = tcp_output(tp);
780 
781 out:
782 	if (otp)
783 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
784 	return (error);
785 }
786 
787 /*
788  * After a receive, possibly send window update to peer.
789  */
790 void
791 tcp_rcvd(struct socket *so)
792 {
793 	struct inpcb *inp;
794 	struct tcpcb *tp;
795 	short ostate;
796 
797 	soassertlocked(so);
798 
799 	if (tcp_sogetpcb(so, &inp, &tp))
800 		return;
801 
802 	if (so->so_options & SO_DEBUG)
803 		ostate = tp->t_state;
804 
805 	/*
806 	 * soreceive() calls this function when a user receives
807 	 * ancillary data on a listening socket. We don't call
808 	 * tcp_output in such a case, since there is no header
809 	 * template for a listening socket and hence the kernel
810 	 * will panic.
811 	 */
812 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
813 		(void) tcp_output(tp);
814 
815 	if (so->so_options & SO_DEBUG)
816 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
817 }
818 
819 /*
820  * Do a send by putting data in output queue and updating urgent
821  * marker if URG set.  Possibly send more data.
822  */
823 int
824 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
825     struct mbuf *control)
826 {
827 	struct inpcb *inp;
828 	struct tcpcb *tp;
829 	int error;
830 	short ostate;
831 
832 	soassertlocked(so);
833 
834 	if (control && control->m_len) {
835 		error = EINVAL;
836 		goto out;
837 	}
838 
839 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
840 		goto out;
841 
842 	if (so->so_options & SO_DEBUG)
843 		ostate = tp->t_state;
844 
845 	sbappendstream(so, &so->so_snd, m);
846 	m = NULL;
847 
848 	error = tcp_output(tp);
849 
850 	if (so->so_options & SO_DEBUG)
851 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);
852 
853 out:
854 	m_freem(control);
855 	m_freem(m);
856 
857 	return (error);
858 }
859 
860 /*
861  * Abort the TCP.
862  */
863 void
864 tcp_abort(struct socket *so)
865 {
866 	struct inpcb *inp;
867 	struct tcpcb *tp, *otp = NULL;
868 	short ostate;
869 
870 	soassertlocked(so);
871 
872 	if (tcp_sogetpcb(so, &inp, &tp))
873 		return;
874 
875 	if (so->so_options & SO_DEBUG) {
876 		otp = tp;
877 		ostate = tp->t_state;
878 	}
879 
880 	tp = tcp_drop(tp, ECONNABORTED);
881 
882 	if (otp)
883 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
884 }
885 
886 int
887 tcp_sense(struct socket *so, struct stat *ub)
888 {
889 	struct inpcb *inp;
890 	struct tcpcb *tp;
891 	int error;
892 
893 	soassertlocked(so);
894 
895 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
896 		return (error);
897 
898 	ub->st_blksize = so->so_snd.sb_hiwat;
899 
900 	if (so->so_options & SO_DEBUG)
901 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
902 	return (0);
903 }
904 
905 int
906 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
907 {
908 	struct inpcb *inp;
909 	struct tcpcb *tp;
910 	int error;
911 
912 	soassertlocked(so);
913 
914 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
915 		return (error);
916 
917 	if ((so->so_oobmark == 0 &&
918 	    (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
919 	    so->so_options & SO_OOBINLINE ||
920 	    tp->t_oobflags & TCPOOB_HADDATA) {
921 		error = EINVAL;
922 		goto out;
923 	}
924 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
925 		error = EWOULDBLOCK;
926 		goto out;
927 	}
928 	m->m_len = 1;
929 	*mtod(m, caddr_t) = tp->t_iobc;
930 	if ((flags & MSG_PEEK) == 0)
931 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
932 out:
933 	if (so->so_options & SO_DEBUG)
934 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
935 	return (error);
936 }
937 
938 int
939 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
940     struct mbuf *control)
941 {
942 	struct inpcb *inp;
943 	struct tcpcb *tp;
944 	int error;
945 	short ostate;
946 
947 	soassertlocked(so);
948 
949 	if (control && control->m_len) {
950 		error = EINVAL;
951 		goto release;
952 	}
953 
954 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
955 		goto release;
956 
957 	if (so->so_options & SO_DEBUG)
958 		ostate = tp->t_state;
959 
960 	if (sbspace(so, &so->so_snd) < -512) {
961 		error = ENOBUFS;
962 		goto out;
963 	}
964 
965 	/*
966 	 * According to RFC961 (Assigned Protocols),
967 	 * the urgent pointer points to the last octet
968 	 * of urgent data.  We continue, however,
969 	 * to consider it to indicate the first octet
970 	 * of data past the urgent section.
971 	 * Otherwise, snd_up should be one lower.
972 	 */
973 	sbappendstream(so, &so->so_snd, m);
974 	m = NULL;
975 	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
976 	tp->t_force = 1;
977 	error = tcp_output(tp);
978 	tp->t_force = 0;
979 
980 out:
981 	if (so->so_options & SO_DEBUG)
982 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);
983 
984 release:
985 	m_freem(control);
986 	m_freem(m);
987 
988 	return (error);
989 }
990 
991 int
992 tcp_sockaddr(struct socket *so, struct mbuf *nam)
993 {
994 	struct inpcb *inp;
995 	struct tcpcb *tp;
996 	int error;
997 
998 	soassertlocked(so);
999 
1000 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
1001 		return (error);
1002 
1003 #ifdef INET6
1004 	if (inp->inp_flags & INP_IPV6)
1005 		in6_setsockaddr(inp, nam);
1006 	else
1007 #endif
1008 		in_setsockaddr(inp, nam);
1009 
1010 	if (so->so_options & SO_DEBUG)
1011 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1012 		    PRU_SOCKADDR, 0);
1013 	return (0);
1014 }
1015 
1016 int
1017 tcp_peeraddr(struct socket *so, struct mbuf *nam)
1018 {
1019 	struct inpcb *inp;
1020 	struct tcpcb *tp;
1021 	int error;
1022 
1023 	soassertlocked(so);
1024 
1025 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
1026 		return (error);
1027 
1028 #ifdef INET6
1029 	if (inp->inp_flags & INP_IPV6)
1030 		in6_setpeeraddr(inp, nam);
1031 	else
1032 #endif
1033 		in_setpeeraddr(inp, nam);
1034 
1035 	if (so->so_options & SO_DEBUG)
1036 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1037 		    PRU_PEERADDR, 0);
1038 	return (0);
1039 }
1040 
1041 /*
1042  * Initiate (or continue) disconnect.
1043  * If embryonic state, just send reset (once).
1044  * If in ``let data drain'' option and linger null, just drop.
1045  * Otherwise (hard), mark socket disconnecting and drop
1046  * current input data; switch states based on user close, and
1047  * send segment to peer (with FIN).
1048  */
1049 struct tcpcb *
1050 tcp_dodisconnect(struct tcpcb *tp)
1051 {
1052 	struct socket *so = tp->t_inpcb->inp_socket;
1053 
1054 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
1055 		tp = tcp_close(tp);
1056 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1057 		tp = tcp_drop(tp, 0);
1058 	else {
1059 		soisdisconnecting(so);
1060 		sbflush(so, &so->so_rcv);
1061 		tp = tcp_usrclosed(tp);
1062 		if (tp)
1063 			(void) tcp_output(tp);
1064 	}
1065 	return (tp);
1066 }
1067 
1068 /*
1069  * User issued close, and wish to trail through shutdown states:
1070  * if never received SYN, just forget it.  If got a SYN from peer,
1071  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1072  * If already got a FIN from peer, then almost done; go to LAST_ACK
1073  * state.  In all other cases, have already sent FIN to peer (e.g.
1074  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1075  * for peer to send FIN or not respond to keep-alives, etc.
1076  * We can let the user exit from the close as soon as the FIN is acked.
1077  */
1078 struct tcpcb *
1079 tcp_usrclosed(struct tcpcb *tp)
1080 {
1081 
1082 	switch (tp->t_state) {
1083 
1084 	case TCPS_CLOSED:
1085 	case TCPS_LISTEN:
1086 	case TCPS_SYN_SENT:
1087 		tp->t_state = TCPS_CLOSED;
1088 		tp = tcp_close(tp);
1089 		break;
1090 
1091 	case TCPS_SYN_RECEIVED:
1092 	case TCPS_ESTABLISHED:
1093 		tp->t_state = TCPS_FIN_WAIT_1;
1094 		break;
1095 
1096 	case TCPS_CLOSE_WAIT:
1097 		tp->t_state = TCPS_LAST_ACK;
1098 		break;
1099 	}
1100 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1101 		soisdisconnected(tp->t_inpcb->inp_socket);
1102 		/*
1103 		 * If we are in FIN_WAIT_2, we arrived here because the
1104 		 * application did a shutdown of the send side.  Like the
1105 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
1106 		 * a full close, we start a timer to make sure sockets are
1107 		 * not left in FIN_WAIT_2 forever.
1108 		 */
1109 		if (tp->t_state == TCPS_FIN_WAIT_2)
1110 			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
1111 	}
1112 	return (tp);
1113 }
1114 
1115 /*
1116  * Look up a socket for ident or tcpdrop, ...
1117  */
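/*
 * This backs the TCPCTL_IDENT and TCPCTL_DROP sysctls handled in
 * tcp_sysctl() below; identd(8) and tcpdrop(8) are the usual callers.
 */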
1118 int
1119 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
1120 {
1121 	int error = 0;
1122 	struct tcp_ident_mapping tir;
1123 	struct inpcb *inp;
1124 	struct tcpcb *tp = NULL;
1125 	struct sockaddr_in *fin, *lin;
1126 #ifdef INET6
1127 	struct sockaddr_in6 *fin6, *lin6;
1128 	struct in6_addr f6, l6;
1129 #endif
1130 
1131 	NET_ASSERT_LOCKED();
1132 
1133 	if (dodrop) {
1134 		if (oldp != NULL || *oldlenp != 0)
1135 			return (EINVAL);
1136 		if (newp == NULL)
1137 			return (EPERM);
1138 		if (newlen < sizeof(tir))
1139 			return (ENOMEM);
1140 		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
1141 			return (error);
1142 	} else {
1143 		if (oldp == NULL)
1144 			return (EINVAL);
1145 		if (*oldlenp < sizeof(tir))
1146 			return (ENOMEM);
1147 		if (newp != NULL || newlen != 0)
1148 			return (EINVAL);
1149 		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
1150 			return (error);
1151 	}
1152 	switch (tir.faddr.ss_family) {
1153 #ifdef INET6
1154 	case AF_INET6:
1155 		fin6 = (struct sockaddr_in6 *)&tir.faddr;
1156 		error = in6_embedscope(&f6, fin6, NULL);
1157 		if (error)
1158 			return EINVAL;	/*?*/
1159 		lin6 = (struct sockaddr_in6 *)&tir.laddr;
1160 		error = in6_embedscope(&l6, lin6, NULL);
1161 		if (error)
1162 			return EINVAL;	/*?*/
1163 		break;
1164 #endif
1165 	case AF_INET:
1166 		fin = (struct sockaddr_in *)&tir.faddr;
1167 		lin = (struct sockaddr_in *)&tir.laddr;
1168 		break;
1169 	default:
1170 		return (EINVAL);
1171 	}
1172 
1173 	switch (tir.faddr.ss_family) {
1174 #ifdef INET6
1175 	case AF_INET6:
1176 		inp = in6_pcblookup(&tcbtable, &f6,
1177 		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
1178 		break;
1179 #endif
1180 	case AF_INET:
1181 		inp = in_pcblookup(&tcbtable, fin->sin_addr,
1182 		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
1183 		break;
1184 	default:
1185 		unhandled_af(tir.faddr.ss_family);
1186 	}
1187 
1188 	if (dodrop) {
1189 		if (inp && (tp = intotcpcb(inp)) &&
1190 		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
1191 			tp = tcp_drop(tp, ECONNABORTED);
1192 		else
1193 			error = ESRCH;
1194 		in_pcbunref(inp);
1195 		return (error);
1196 	}
1197 
1198 	if (inp == NULL) {
1199 		tcpstat_inc(tcps_pcbhashmiss);
1200 		switch (tir.faddr.ss_family) {
1201 #ifdef INET6
1202 		case AF_INET6:
1203 			inp = in6_pcblookup_listen(&tcbtable,
1204 			    &l6, lin6->sin6_port, NULL, tir.rdomain);
1205 			break;
1206 #endif
1207 		case AF_INET:
1208 			inp = in_pcblookup_listen(&tcbtable,
1209 			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
1210 			break;
1211 		}
1212 	}
1213 
1214 	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
1215 		tir.ruid = inp->inp_socket->so_ruid;
1216 		tir.euid = inp->inp_socket->so_euid;
1217 	} else {
1218 		tir.ruid = -1;
1219 		tir.euid = -1;
1220 	}
1221 
1222 	*oldlenp = sizeof (tir);
1223 	error = copyout((void *)&tir, oldp, sizeof (tir));
1224 	in_pcbunref(inp);
1225 	return (error);
1226 }
1227 
1228 int
1229 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
1230 {
1231 	uint64_t counters[tcps_ncounters];
1232 	struct tcpstat tcpstat;
1233 	struct syn_cache_set *set;
1234 	int i = 0;
1235 
1236 #define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)
1237 
1238 	memset(&tcpstat, 0, sizeof tcpstat);
1239 	counters_read(tcpcounters, counters, nitems(counters));
1240 	ASSIGN(tcps_connattempt);
1241 	ASSIGN(tcps_accepts);
1242 	ASSIGN(tcps_connects);
1243 	ASSIGN(tcps_drops);
1244 	ASSIGN(tcps_conndrops);
1245 	ASSIGN(tcps_closed);
1246 	ASSIGN(tcps_segstimed);
1247 	ASSIGN(tcps_rttupdated);
1248 	ASSIGN(tcps_delack);
1249 	ASSIGN(tcps_timeoutdrop);
1250 	ASSIGN(tcps_rexmttimeo);
1251 	ASSIGN(tcps_persisttimeo);
1252 	ASSIGN(tcps_persistdrop);
1253 	ASSIGN(tcps_keeptimeo);
1254 	ASSIGN(tcps_keepprobe);
1255 	ASSIGN(tcps_keepdrops);
1256 	ASSIGN(tcps_sndtotal);
1257 	ASSIGN(tcps_sndpack);
1258 	ASSIGN(tcps_sndbyte);
1259 	ASSIGN(tcps_sndrexmitpack);
1260 	ASSIGN(tcps_sndrexmitbyte);
1261 	ASSIGN(tcps_sndrexmitfast);
1262 	ASSIGN(tcps_sndacks);
1263 	ASSIGN(tcps_sndprobe);
1264 	ASSIGN(tcps_sndurg);
1265 	ASSIGN(tcps_sndwinup);
1266 	ASSIGN(tcps_sndctrl);
1267 	ASSIGN(tcps_rcvtotal);
1268 	ASSIGN(tcps_rcvpack);
1269 	ASSIGN(tcps_rcvbyte);
1270 	ASSIGN(tcps_rcvbadsum);
1271 	ASSIGN(tcps_rcvbadoff);
1272 	ASSIGN(tcps_rcvmemdrop);
1273 	ASSIGN(tcps_rcvnosec);
1274 	ASSIGN(tcps_rcvshort);
1275 	ASSIGN(tcps_rcvduppack);
1276 	ASSIGN(tcps_rcvdupbyte);
1277 	ASSIGN(tcps_rcvpartduppack);
1278 	ASSIGN(tcps_rcvpartdupbyte);
1279 	ASSIGN(tcps_rcvoopack);
1280 	ASSIGN(tcps_rcvoobyte);
1281 	ASSIGN(tcps_rcvpackafterwin);
1282 	ASSIGN(tcps_rcvbyteafterwin);
1283 	ASSIGN(tcps_rcvafterclose);
1284 	ASSIGN(tcps_rcvwinprobe);
1285 	ASSIGN(tcps_rcvdupack);
1286 	ASSIGN(tcps_rcvacktoomuch);
1287 	ASSIGN(tcps_rcvacktooold);
1288 	ASSIGN(tcps_rcvackpack);
1289 	ASSIGN(tcps_rcvackbyte);
1290 	ASSIGN(tcps_rcvwinupd);
1291 	ASSIGN(tcps_pawsdrop);
1292 	ASSIGN(tcps_predack);
1293 	ASSIGN(tcps_preddat);
1294 	ASSIGN(tcps_pcbhashmiss);
1295 	ASSIGN(tcps_noport);
1296 	ASSIGN(tcps_badsyn);
1297 	ASSIGN(tcps_dropsyn);
1298 	ASSIGN(tcps_rcvbadsig);
1299 	ASSIGN(tcps_rcvgoodsig);
1300 	ASSIGN(tcps_inswcsum);
1301 	ASSIGN(tcps_outswcsum);
1302 	ASSIGN(tcps_ecn_accepts);
1303 	ASSIGN(tcps_ecn_rcvece);
1304 	ASSIGN(tcps_ecn_rcvcwr);
1305 	ASSIGN(tcps_ecn_rcvce);
1306 	ASSIGN(tcps_ecn_sndect);
1307 	ASSIGN(tcps_ecn_sndece);
1308 	ASSIGN(tcps_ecn_sndcwr);
1309 	ASSIGN(tcps_cwr_ecn);
1310 	ASSIGN(tcps_cwr_frecovery);
1311 	ASSIGN(tcps_cwr_timeout);
1312 	ASSIGN(tcps_sc_added);
1313 	ASSIGN(tcps_sc_completed);
1314 	ASSIGN(tcps_sc_timed_out);
1315 	ASSIGN(tcps_sc_overflowed);
1316 	ASSIGN(tcps_sc_reset);
1317 	ASSIGN(tcps_sc_unreach);
1318 	ASSIGN(tcps_sc_bucketoverflow);
1319 	ASSIGN(tcps_sc_aborted);
1320 	ASSIGN(tcps_sc_dupesyn);
1321 	ASSIGN(tcps_sc_dropped);
1322 	ASSIGN(tcps_sc_collisions);
1323 	ASSIGN(tcps_sc_retransmitted);
1324 	ASSIGN(tcps_sc_seedrandom);
1325 	ASSIGN(tcps_sc_hash_size);
1326 	ASSIGN(tcps_sc_entry_count);
1327 	ASSIGN(tcps_sc_entry_limit);
1328 	ASSIGN(tcps_sc_bucket_maxlen);
1329 	ASSIGN(tcps_sc_bucket_limit);
1330 	ASSIGN(tcps_sc_uses_left);
1331 	ASSIGN(tcps_conndrained);
1332 	ASSIGN(tcps_sack_recovery_episode);
1333 	ASSIGN(tcps_sack_rexmits);
1334 	ASSIGN(tcps_sack_rexmit_bytes);
1335 	ASSIGN(tcps_sack_rcv_opts);
1336 	ASSIGN(tcps_sack_snd_opts);
1337 	ASSIGN(tcps_sack_drop_opts);
1338 
1339 #undef ASSIGN
1340 
1341 	set = &tcp_syn_cache[tcp_syn_cache_active];
1342 	tcpstat.tcps_sc_hash_size = set->scs_size;
1343 	tcpstat.tcps_sc_entry_count = set->scs_count;
1344 	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
1345 	tcpstat.tcps_sc_bucket_maxlen = 0;
1346 	for (i = 0; i < set->scs_size; i++) {
1347 		if (tcpstat.tcps_sc_bucket_maxlen <
1348 		    set->scs_buckethead[i].sch_length)
1349 			tcpstat.tcps_sc_bucket_maxlen =
1350 				set->scs_buckethead[i].sch_length;
1351 	}
1352 	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
1353 	tcpstat.tcps_sc_uses_left = set->scs_use;
1354 
1355 	return (sysctl_rdstruct(oldp, oldlenp, newp,
1356 	    &tcpstat, sizeof(tcpstat)));
1357 }
1358 
1359 /*
1360  * Sysctl for tcp variables.
1361  */
1362 int
1363 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1364     size_t newlen)
1365 {
1366 	int error, nval;
1367 
1368 	/* All sysctl names at this level are terminal. */
1369 	if (namelen != 1)
1370 		return (ENOTDIR);
1371 
1372 	switch (name[0]) {
1373 	case TCPCTL_KEEPINITTIME:
1374 		NET_LOCK();
1375 		nval = tcptv_keep_init / TCP_TIME(1);
1376 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
1377 		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
1378 		if (!error)
1379 			tcptv_keep_init = TCP_TIME(nval);
1380 		NET_UNLOCK();
1381 		return (error);
1382 
1383 	case TCPCTL_KEEPIDLE:
1384 		NET_LOCK();
1385 		nval = tcp_keepidle / TCP_TIME(1);
1386 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
1387 		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
1388 		if (!error)
1389 			tcp_keepidle = TCP_TIME(nval);
1390 		NET_UNLOCK();
1391 		return (error);
1392 
1393 	case TCPCTL_KEEPINTVL:
1394 		NET_LOCK();
1395 		nval = tcp_keepintvl / TCP_TIME(1);
1396 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
1397 		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
1398 		if (!error)
1399 			tcp_keepintvl = TCP_TIME(nval);
1400 		NET_UNLOCK();
1401 		return (error);
1402 
1403 	case TCPCTL_BADDYNAMIC:
1404 		NET_LOCK();
1405 		error = sysctl_struct(oldp, oldlenp, newp, newlen,
1406 		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
1407 		NET_UNLOCK();
1408 		return (error);
1409 
1410 	case TCPCTL_ROOTONLY:
1411 		if (newp && securelevel > 0)
1412 			return (EPERM);
1413 		NET_LOCK();
1414 		error = sysctl_struct(oldp, oldlenp, newp, newlen,
1415 		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
1416 		NET_UNLOCK();
1417 		return (error);
1418 
1419 	case TCPCTL_IDENT:
1420 		NET_LOCK();
1421 		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
1422 		NET_UNLOCK();
1423 		return (error);
1424 
1425 	case TCPCTL_DROP:
1426 		NET_LOCK();
1427 		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
1428 		NET_UNLOCK();
1429 		return (error);
1430 
1431 	case TCPCTL_REASS_LIMIT:
1432 		NET_LOCK();
1433 		nval = tcp_reass_limit;
1434 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1435 		if (!error && nval != tcp_reass_limit) {
1436 			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
1437 			if (!error)
1438 				tcp_reass_limit = nval;
1439 		}
1440 		NET_UNLOCK();
1441 		return (error);
1442 
1443 	case TCPCTL_SACKHOLE_LIMIT:
1444 		NET_LOCK();
1445 		nval = tcp_sackhole_limit;
1446 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1447 		if (!error && nval != tcp_sackhole_limit) {
1448 			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
1449 			if (!error)
1450 				tcp_sackhole_limit = nval;
1451 		}
1452 		NET_UNLOCK();
1453 		return (error);
1454 
1455 	case TCPCTL_STATS:
1456 		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));
1457 
1458 	case TCPCTL_SYN_USE_LIMIT:
1459 		NET_LOCK();
1460 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1461 		    &tcp_syn_use_limit, 0, INT_MAX);
1462 		if (!error && newp != NULL) {
1463 			/*
1464 			 * Global tcp_syn_use_limit is used when reseeding a
1465 			 * new cache.  Also update the value in active cache.
1466 			 */
1467 			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
1468 				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
1469 			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
1470 				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
1471 		}
1472 		NET_UNLOCK();
1473 		return (error);
1474 
1475 	case TCPCTL_SYN_HASH_SIZE:
1476 		NET_LOCK();
1477 		nval = tcp_syn_hash_size;
1478 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1479 		    &nval, 1, 100000);
1480 		if (!error && nval != tcp_syn_hash_size) {
1481 			/*
1482 			 * If global hash size has been changed,
1483 			 * switch sets as soon as possible.  Then
1484 			 * the actual hash array will be reallocated.
1485 			 */
1486 			if (tcp_syn_cache[0].scs_size != nval)
1487 				tcp_syn_cache[0].scs_use = 0;
1488 			if (tcp_syn_cache[1].scs_size != nval)
1489 				tcp_syn_cache[1].scs_use = 0;
1490 			tcp_syn_hash_size = nval;
1491 		}
1492 		NET_UNLOCK();
1493 		return (error);
1494 
1495 	default:
1496 		NET_LOCK();
1497 		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name,
1498 		     namelen, oldp, oldlenp, newp, newlen);
1499 		NET_UNLOCK();
1500 		return (error);
1501 	}
1502 	/* NOTREACHED */
1503 }
1504 
1505 /*
1506  * Scale the send buffer so that inflight data is not accounted against
1507  * the limit.  The buffer will scale with the congestion window: if the
1508  * receiver stops acking data, the window will shrink and therefore
1509  * the buffer size will shrink as well.
1510  * In low memory situations try to shrink the buffer to the initial size,
1511  * disabling the send buffer scaling as long as the situation persists.
1512  */
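/*
 * For illustration, assuming the 16 KB tcp_sendspace default and a larger
 * sb_max: with 32 KB outstanding (snd_max - snd_una) the buffer is grown
 * to 16 KB + 32 KB = 48 KB, rounded up to a multiple of t_maxseg.
 */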
1513 void
1514 tcp_update_sndspace(struct tcpcb *tp)
1515 {
1516 	struct socket *so = tp->t_inpcb->inp_socket;
1517 	u_long nmax = so->so_snd.sb_hiwat;
1518 
1519 	if (sbchecklowmem()) {
1520 		/* low on memory try to get rid of some */
1521 		if (tcp_sendspace < nmax)
1522 			nmax = tcp_sendspace;
1523 	} else if (so->so_snd.sb_wat != tcp_sendspace)
1524 		/* user requested buffer size, auto-scaling disabled */
1525 		nmax = so->so_snd.sb_wat;
1526 	else
1527 		/* automatic buffer scaling */
1528 		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
1529 		    tp->snd_una);
1530 
1531 	/* a writable socket must be preserved because of poll(2) semantics */
1532 	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
1533 		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
1534 			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
1535 		/* keep in sync with sbreserve() calculation */
1536 		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
1537 			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
1538 	}
1539 
1540 	/* round to MSS boundary */
1541 	nmax = roundup(nmax, tp->t_maxseg);
1542 
1543 	if (nmax != so->so_snd.sb_hiwat)
1544 		sbreserve(so, &so->so_snd, nmax);
1545 }
1546 
1547 /*
1548  * Scale the recv buffer by looking at how much data was transferred in
1549  * one approximated RTT.  If a large share (more than 7/8) of the recv
1550  * buffer was transferred during that time, increase the buffer by a constant.
1551  * In low memory situations try to shrink the buffer to the initial size.
1552  */
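/*
 * For illustration, with the auto-scaled buffer at 64 KB: once more than
 * 56 KB (7/8 of sb_hiwat) arrives within one measured RTT, the buffer is
 * bumped by tcp_autorcvbuf_inc (16 KB by default), bounded by sb_max.
 */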
1553 void
1554 tcp_update_rcvspace(struct tcpcb *tp)
1555 {
1556 	struct socket *so = tp->t_inpcb->inp_socket;
1557 	u_long nmax = so->so_rcv.sb_hiwat;
1558 
1559 	if (sbchecklowmem()) {
1560 		/* low on memory try to get rid of some */
1561 		if (tcp_recvspace < nmax)
1562 			nmax = tcp_recvspace;
1563 	} else if (so->so_rcv.sb_wat != tcp_recvspace)
1564 		/* user requested buffer size, auto-scaling disabled */
1565 		nmax = so->so_rcv.sb_wat;
1566 	else {
1567 		/* automatic buffer scaling */
1568 		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1569 			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1570 			    tcp_autorcvbuf_inc);
1571 	}
1572 
1573 	/* a readable socket must be preserved because of poll(2) semantics */
1574 	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
1575 	    nmax < so->so_snd.sb_lowat)
1576 		nmax = so->so_snd.sb_lowat;
1577 
1578 	if (nmax == so->so_rcv.sb_hiwat)
1579 		return;
1580 
1581 	/* round to MSS boundary */
1582 	nmax = roundup(nmax, tp->t_maxseg);
1583 	sbreserve(so, &so->so_rcv, nmax);
1584 }
1585