xref: /openbsd-src/sys/netinet/tcp_usrreq.c (revision 3374c67d44f9b75b98444cbf63020f777792342e)
1 /*	$OpenBSD: tcp_usrreq.c,v 1.214 2022/12/12 08:30:22 tb Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/kernel.h>
81 #include <sys/pool.h>
82 #include <sys/proc.h>
83 
84 #include <net/if.h>
85 #include <net/if_var.h>
86 #include <net/route.h>
87 
88 #include <netinet/in.h>
89 #include <netinet/in_var.h>
90 #include <netinet/ip.h>
91 #include <netinet/in_pcb.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_fsm.h>
95 #include <netinet/tcp_seq.h>
96 #include <netinet/tcp_timer.h>
97 #include <netinet/tcp_var.h>
98 #include <netinet/tcp_debug.h>
99 
100 #ifdef INET6
101 #include <netinet6/in6_var.h>
102 #endif
103 
104 #ifndef TCP_SENDSPACE
105 #define	TCP_SENDSPACE	1024*16
106 #endif
107 u_int	tcp_sendspace = TCP_SENDSPACE;
108 #ifndef TCP_RECVSPACE
109 #define	TCP_RECVSPACE	1024*16
110 #endif
111 u_int	tcp_recvspace = TCP_RECVSPACE;
112 u_int	tcp_autorcvbuf_inc = 16 * 1024;
113 
114 const struct pr_usrreqs tcp_usrreqs = {
115 	.pru_attach	= tcp_attach,
116 	.pru_detach	= tcp_detach,
117 	.pru_bind	= tcp_bind,
118 	.pru_listen	= tcp_listen,
119 	.pru_connect	= tcp_connect,
120 	.pru_accept	= tcp_accept,
121 	.pru_disconnect	= tcp_disconnect,
122 	.pru_shutdown	= tcp_shutdown,
123 	.pru_rcvd	= tcp_rcvd,
124 	.pru_send	= tcp_send,
125 	.pru_abort	= tcp_abort,
126 	.pru_sense	= tcp_sense,
127 	.pru_rcvoob	= tcp_rcvoob,
128 	.pru_sendoob	= tcp_sendoob,
129 	.pru_control	= in_control,
130 	.pru_sockaddr	= tcp_sockaddr,
131 	.pru_peeraddr	= tcp_peeraddr,
132 };
133 
#ifdef INET6
/*
 * User-request dispatch table for TCP when INET6 is compiled in.  It
 * names the same tcp_*() handlers as tcp_usrreqs; the only difference is
 * that pru_control is served by in6_control().  Designated initializers
 * are used, so the grouping below is purely for readability.
 */
const struct pr_usrreqs tcp6_usrreqs = {
	/* Protocol control block lifetime. */
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_abort	= tcp_abort,

	/* Connection set-up and tear-down. */
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,

	/* Data transfer, including out-of-band data. */
	.pru_send	= tcp_send,
	.pru_rcvd	= tcp_rcvd,
	.pru_sendoob	= tcp_sendoob,
	.pru_rcvoob	= tcp_rcvoob,

	/* Queries and control requests. */
	.pru_sense	= tcp_sense,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
	.pru_control	= in6_control,
};
#endif
155 
/*
 * Integer TCP sysctl variables together with two bounds each.
 * NOTE(review): struct sysctl_bounded_args is declared elsewhere; each
 * entry presumably reads { sysctl id, variable, minimum, maximum } --
 * confirm against its definition.
 */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	/* The timer entries are capped at a multiple of their TCPTV_* default. */
	{ TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT },
	{ TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE },
	{ TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	/* Only present when the kernel is built with TCP_ECN. */
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
};
173 
/*
 * Table of TCP protocol control blocks: tcp_attach() allocates new pcbs
 * in it via in_pcballoc(), and tcp_ident() looks connections up in it.
 */
struct	inpcbtable tcbtable;

/* Routines defined later in this file. */
int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int	tcp_ident(void *, size_t *, void *, size_t, int);

/* Fetches the inpcb and tcpcb attached to a socket, or an errno. */
static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
                      struct tcpcb **);
181 
182 static inline int
183 tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
184 {
185 	struct inpcb *inp;
186 	struct tcpcb *tp;
187 
188 	/*
189 	 * When a TCP is attached to a socket, then there will be
190 	 * a (struct inpcb) pointed at by the socket, and this
191 	 * structure will point at a subsidiary (struct tcpcb).
192 	 */
193 	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
194 		if (so->so_error)
195 			return so->so_error;
196 		return EINVAL;
197 	}
198 
199 	*rinp = inp;
200 	*rtp = tp;
201 
202 	return 0;
203 }
204 
205 /*
206  * Export internal TCP state information via a struct tcp_info without
207  * leaking any sensitive information. Sequence numbers are reported
208  * relative to the initial sequence number.
209  */
210 int
211 tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
212 {
213 	struct proc *p = curproc;
214 	struct tcp_info *ti;
215 	u_int t = 1000;		/* msec => usec */
216 	uint32_t now;
217 
218 	if (sizeof(*ti) > MLEN) {
219 		MCLGETL(m, M_WAITOK, sizeof(*ti));
220 		if (!ISSET(m->m_flags, M_EXT))
221 			return ENOMEM;
222 	}
223 	ti = mtod(m, struct tcp_info *);
224 	m->m_len = sizeof(*ti);
225 	memset(ti, 0, sizeof(*ti));
226 	now = tcp_now();
227 
228 	ti->tcpi_state = tp->t_state;
229 	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
230 		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
231 	if (tp->t_flags & TF_SACK_PERMIT)
232 		ti->tcpi_options |= TCPI_OPT_SACK;
233 	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
234 		ti->tcpi_options |= TCPI_OPT_WSCALE;
235 		ti->tcpi_snd_wscale = tp->snd_scale;
236 		ti->tcpi_rcv_wscale = tp->rcv_scale;
237 	}
238 #ifdef TCP_ECN
239 	if (tp->t_flags & TF_ECN_PERMIT)
240 		ti->tcpi_options |= TCPI_OPT_ECN;
241 #endif
242 
243 	ti->tcpi_rto = tp->t_rxtcur * t;
244 	ti->tcpi_snd_mss = tp->t_maxseg;
245 	ti->tcpi_rcv_mss = tp->t_peermss;
246 
247 	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
248 	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
249 	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
250 	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;
251 
252 	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
253 	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
254 	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
255 	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
256 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
257 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
258 
259 	ti->tcpi_rcv_space = tp->rcv_wnd;
260 
261 	/*
262 	 * Provide only minimal information for unprivileged processes.
263 	 */
264 	if (suser(p) != 0)
265 		return 0;
266 
267 	/* FreeBSD-specific extension fields for tcp_info.  */
268 	ti->tcpi_snd_wnd = tp->snd_wnd;
269 	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
270 	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
271 	/* missing tcpi_toe_tid */
272 	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
273 	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
274 	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
275 
276 	/* OpenBSD extensions */
277 	ti->tcpi_rttmin = tp->t_rttmin * t;
278 	ti->tcpi_max_sndwnd = tp->max_sndwnd;
279 	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
280 	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
281 	ti->tcpi_snd_una = tp->snd_una - tp->iss;
282 	ti->tcpi_snd_up = tp->snd_up - tp->iss;
283 	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
284 	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
285 	ti->tcpi_snd_max = tp->snd_max - tp->iss;
286 
287 	ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
288 	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
289 	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
290 	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;
291 
292 	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
293 	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
294 	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
295 	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
296 	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
297 	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
298 	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
299 	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
300 
301 	return 0;
302 }
303 
/*
 * Get or set a socket option on a TCP socket.
 *
 * `op' is PRCO_SETOPT or PRCO_GETOPT, `level'/`optname' identify the
 * option and `m' carries the value (input for set, output for get).
 *
 * Returns ECONNRESET when the socket has no inpcb.  Options whose level
 * is not IPPROTO_TCP are forwarded to ip6_ctloutput() or ip_ctloutput()
 * depending on the socket's protocol family (EAFNOSUPPORT for any other
 * family).  Unknown TCP options yield ENOPROTOOPT; an `op' other than
 * the two handled here falls through and returns 0.
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		/* Not ours: let the IP layer of the socket's family handle it. */
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		/* Every option below takes an int; shorter input is EINVAL. */
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				/*
				 * Turning TF_NOPUSH off on an established
				 * connection calls tcp_output() right away.
				 */
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/*
			 * The segment size can only be lowered: the value
			 * must be positive and no larger than the current
			 * t_maxseg.
			 */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* Too late once the connection is established. */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* Refused while TF_SIGNATURE is set on the connection. */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* Too late once the connection is established. */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* Enabling signatures also switches SACK off. */
			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		/*
		 * The flag-style options report the masked t_flags bit
		 * itself, i.e. zero or the flag's value, not a normalized 1.
		 */
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			/* tcp_fill_info() sets m_len and may attach a cluster. */
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
454 
455 /*
456  * Attach TCP protocol to socket, allocating
457  * internet protocol control block, tcp control block,
458  * buffer space, and entering LISTEN state to accept connections.
459  */
460 int
461 tcp_attach(struct socket *so, int proto, int wait)
462 {
463 	struct tcpcb *tp;
464 	struct inpcb *inp;
465 	int error;
466 
467 	if (so->so_pcb)
468 		return EISCONN;
469 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
470 	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
471 	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
472 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
473 		if (error)
474 			return (error);
475 	}
476 
477 	NET_ASSERT_LOCKED();
478 	error = in_pcballoc(so, &tcbtable, wait);
479 	if (error)
480 		return (error);
481 	inp = sotoinpcb(so);
482 	tp = tcp_newtcpcb(inp, wait);
483 	if (tp == NULL) {
484 		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */
485 
486 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
487 		in_pcbdetach(inp);
488 		so->so_state |= nofd;
489 		return (ENOBUFS);
490 	}
491 	tp->t_state = TCPS_CLOSED;
492 #ifdef INET6
493 	/* we disallow IPv4 mapped address completely. */
494 	if (inp->inp_flags & INP_IPV6)
495 		tp->pf = PF_INET6;
496 	else
497 		tp->pf = PF_INET;
498 #else
499 	tp->pf = PF_INET;
500 #endif
501 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
502 		so->so_linger = TCP_LINGERTIME;
503 
504 	if (so->so_options & SO_DEBUG)
505 		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
506 	return (0);
507 }
508 
509 int
510 tcp_detach(struct socket *so)
511 {
512 	struct inpcb *inp;
513 	struct tcpcb *otp = NULL, *tp;
514 	int error = 0;
515 	short ostate;
516 
517 	soassertlocked(so);
518 
519 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
520 		return (error);
521 
522 	if (so->so_options & SO_DEBUG) {
523 		otp = tp;
524 		ostate = tp->t_state;
525 	}
526 
527 	/*
528 	 * Detach the TCP protocol from the socket.
529 	 * If the protocol state is non-embryonic, then can't
530 	 * do this directly: have to initiate a PRU_DISCONNECT,
531 	 * which may finish later; embryonic TCB's can just
532 	 * be discarded here.
533 	 */
534 	tp = tcp_dodisconnect(tp);
535 
536 	if (otp)
537 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
538 	return (error);
539 }
540 
541 /*
542  * Give the socket an address.
543  */
544 int
545 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
546 {
547 	struct inpcb *inp;
548 	struct tcpcb *tp;
549 	int error;
550 	short ostate;
551 
552 	soassertlocked(so);
553 
554 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
555 		return (error);
556 
557 	if (so->so_options & SO_DEBUG)
558 		ostate = tp->t_state;
559 
560 	error = in_pcbbind(inp, nam, p);
561 
562 	if (so->so_options & SO_DEBUG)
563 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
564 	return (error);
565 }
566 
567 /*
568  * Prepare to accept connections.
569  */
570 int
571 tcp_listen(struct socket *so)
572 {
573 	struct inpcb *inp;
574 	struct tcpcb *tp, *otp = NULL;
575 	int error;
576 	short ostate;
577 
578 	soassertlocked(so);
579 
580 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
581 		return (error);
582 
583 	if (so->so_options & SO_DEBUG) {
584 		otp = tp;
585 		ostate = tp->t_state;
586 	}
587 
588 	if (inp->inp_lport == 0)
589 		if ((error = in_pcbbind(inp, NULL, curproc)))
590 			goto out;
591 
592 	/*
593 	 * If the in_pcbbind() above is called, the tp->pf
594 	 * should still be whatever it was before.
595 	 */
596 	tp->t_state = TCPS_LISTEN;
597 
598 out:
599 	if (otp)
600 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
601 	return (error);
602 }
603 
604 /*
605  * Initiate connection to peer.
606  * Create a template for use in transmissions on this connection.
607  * Enter SYN_SENT state, and mark socket as connecting.
608  * Start keep-alive timer, and seed output sequence space.
609  * Send initial segment on connection.
610  */
611 int
612 tcp_connect(struct socket *so, struct mbuf *nam)
613 {
614 	struct inpcb *inp;
615 	struct tcpcb *tp, *otp = NULL;
616 	int error;
617 	short ostate;
618 
619 	soassertlocked(so);
620 
621 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
622 		return (error);
623 
624 	if (so->so_options & SO_DEBUG) {
625 		otp = tp;
626 		ostate = tp->t_state;
627 	}
628 
629 #ifdef INET6
630 	if (inp->inp_flags & INP_IPV6) {
631 		struct sockaddr_in6 *sin6;
632 
633 		if ((error = in6_nam2sin6(nam, &sin6)))
634 			goto out;
635 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
636 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
637 			error = EINVAL;
638 			goto out;
639 		}
640 		error = in6_pcbconnect(inp, nam);
641 	} else
642 #endif /* INET6 */
643 	{
644 		struct sockaddr_in *sin;
645 
646 		if ((error = in_nam2sin(nam, &sin)))
647 			goto out;
648 		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
649 		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
650 		    IN_MULTICAST(sin->sin_addr.s_addr) ||
651 		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
652 			error = EINVAL;
653 			goto out;
654 		}
655 		error = in_pcbconnect(inp, nam);
656 	}
657 	if (error)
658 		goto out;
659 
660 	tp->t_template = tcp_template(tp);
661 	if (tp->t_template == 0) {
662 		in_pcbdisconnect(inp);
663 		error = ENOBUFS;
664 		goto out;
665 	}
666 
667 	so->so_state |= SS_CONNECTOUT;
668 
669 	/* Compute window scaling to request.  */
670 	tcp_rscale(tp, sb_max);
671 
672 	soisconnecting(so);
673 	tcpstat_inc(tcps_connattempt);
674 	tp->t_state = TCPS_SYN_SENT;
675 	TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init));
676 	tcp_set_iss_tsm(tp);
677 	tcp_sendseqinit(tp);
678 	tp->snd_last = tp->snd_una;
679 	error = tcp_output(tp);
680 
681 out:
682 	if (otp)
683 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
684 	return (error);
685 }
686 
687 /*
688  * Accept a connection.  Essentially all the work is done at higher
689  * levels; just return the address of the peer, storing through addr.
690  */
691 int
692 tcp_accept(struct socket *so, struct mbuf *nam)
693 {
694 	struct inpcb *inp;
695 	struct tcpcb *tp;
696 	int error;
697 	short ostate;
698 
699 	soassertlocked(so);
700 
701 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
702 		return (error);
703 
704 	if (so->so_options & SO_DEBUG)
705 		ostate = tp->t_state;
706 
707 #ifdef INET6
708 	if (inp->inp_flags & INP_IPV6)
709 		in6_setpeeraddr(inp, nam);
710 	else
711 #endif
712 		in_setpeeraddr(inp, nam);
713 
714 	if (so->so_options & SO_DEBUG)
715 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
716 	return (error);
717 }
718 
719 /*
720  * Initiate disconnect from peer.
721  * If connection never passed embryonic stage, just drop;
722  * else if don't need to let data drain, then can just drop anyways,
723  * else have to begin TCP shutdown process: mark socket disconnecting,
724  * drain unread data, state switch to reflect user close, and
725  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
726  * when peer sends FIN and acks ours.
727  *
728  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
729  */
730 int
731 tcp_disconnect(struct socket *so)
732 {
733 	struct inpcb *inp;
734 	struct tcpcb *tp, *otp = NULL;
735 	int error;
736 	short ostate;
737 
738 	soassertlocked(so);
739 
740 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
741 		return (error);
742 
743 	if (so->so_options & SO_DEBUG) {
744 		otp = tp;
745 		ostate = tp->t_state;
746 	}
747 
748 	tp = tcp_dodisconnect(tp);
749 
750 	if (otp)
751 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
752 	return (0);
753 }
754 
755 /*
756  * Mark the connection as being incapable of further output.
757  */
758 int
759 tcp_shutdown(struct socket *so)
760 {
761 	struct inpcb *inp;
762 	struct tcpcb *tp, *otp = NULL;
763 	int error;
764 	short ostate;
765 
766 	soassertlocked(so);
767 
768 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
769 		return (error);
770 
771 	if (so->so_options & SO_DEBUG) {
772 		otp = tp;
773 		ostate = tp->t_state;
774 	}
775 
776 	if (so->so_state & SS_CANTSENDMORE)
777 		goto out;
778 
779 	socantsendmore(so);
780 	tp = tcp_usrclosed(tp);
781 	if (tp)
782 		error = tcp_output(tp);
783 
784 out:
785 	if (otp)
786 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
787 	return (error);
788 }
789 
790 /*
791  * After a receive, possibly send window update to peer.
792  */
793 void
794 tcp_rcvd(struct socket *so)
795 {
796 	struct inpcb *inp;
797 	struct tcpcb *tp;
798 	short ostate;
799 
800 	soassertlocked(so);
801 
802 	if (tcp_sogetpcb(so, &inp, &tp))
803 		return;
804 
805 	if (so->so_options & SO_DEBUG)
806 		ostate = tp->t_state;
807 
808 	/*
809 	 * soreceive() calls this function when a user receives
810 	 * ancillary data on a listening socket. We don't call
811 	 * tcp_output in such a case, since there is no header
812 	 * template for a listening socket and hence the kernel
813 	 * will panic.
814 	 */
815 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
816 		(void) tcp_output(tp);
817 
818 	if (so->so_options & SO_DEBUG)
819 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
820 }
821 
822 /*
823  * Do a send by putting data in output queue and updating urgent
824  * marker if URG set.  Possibly send more data.
825  */
826 int
827 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
828     struct mbuf *control)
829 {
830 	struct inpcb *inp;
831 	struct tcpcb *tp;
832 	int error;
833 	short ostate;
834 
835 	soassertlocked(so);
836 
837 	if (control && control->m_len) {
838 		error = EINVAL;
839 		goto out;
840 	}
841 
842 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
843 		goto out;
844 
845 	if (so->so_options & SO_DEBUG)
846 		ostate = tp->t_state;
847 
848 	sbappendstream(so, &so->so_snd, m);
849 	m = NULL;
850 
851 	error = tcp_output(tp);
852 
853 	if (so->so_options & SO_DEBUG)
854 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);
855 
856 out:
857 	m_freem(control);
858 	m_freem(m);
859 
860 	return (error);
861 }
862 
863 /*
864  * Abort the TCP.
865  */
866 void
867 tcp_abort(struct socket *so)
868 {
869 	struct inpcb *inp;
870 	struct tcpcb *tp, *otp = NULL;
871 	short ostate;
872 
873 	soassertlocked(so);
874 
875 	if (tcp_sogetpcb(so, &inp, &tp))
876 		return;
877 
878 	if (so->so_options & SO_DEBUG) {
879 		otp = tp;
880 		ostate = tp->t_state;
881 	}
882 
883 	tp = tcp_drop(tp, ECONNABORTED);
884 
885 	if (otp)
886 		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
887 }
888 
889 int
890 tcp_sense(struct socket *so, struct stat *ub)
891 {
892 	struct inpcb *inp;
893 	struct tcpcb *tp;
894 	int error;
895 
896 	soassertlocked(so);
897 
898 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
899 		return (error);
900 
901 	ub->st_blksize = so->so_snd.sb_hiwat;
902 
903 	if (so->so_options & SO_DEBUG)
904 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
905 	return (0);
906 }
907 
908 int
909 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
910 {
911 	struct inpcb *inp;
912 	struct tcpcb *tp;
913 	int error;
914 
915 	soassertlocked(so);
916 
917 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
918 		return (error);
919 
920 	if ((so->so_oobmark == 0 &&
921 	    (so->so_state & SS_RCVATMARK) == 0) ||
922 	    so->so_options & SO_OOBINLINE ||
923 	    tp->t_oobflags & TCPOOB_HADDATA) {
924 		error = EINVAL;
925 		goto out;
926 	}
927 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
928 		error = EWOULDBLOCK;
929 		goto out;
930 	}
931 	m->m_len = 1;
932 	*mtod(m, caddr_t) = tp->t_iobc;
933 	if ((flags & MSG_PEEK) == 0)
934 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
935 out:
936 	if (so->so_options & SO_DEBUG)
937 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
938 	return (error);
939 }
940 
941 int
942 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
943     struct mbuf *control)
944 {
945 	struct inpcb *inp;
946 	struct tcpcb *tp;
947 	int error;
948 	short ostate;
949 
950 	soassertlocked(so);
951 
952 	if (control && control->m_len) {
953 		error = EINVAL;
954 		goto release;
955 	}
956 
957 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
958 		goto release;
959 
960 	if (so->so_options & SO_DEBUG)
961 		ostate = tp->t_state;
962 
963 	if (sbspace(so, &so->so_snd) < -512) {
964 		error = ENOBUFS;
965 		goto out;
966 	}
967 
968 	/*
969 	 * According to RFC961 (Assigned Protocols),
970 	 * the urgent pointer points to the last octet
971 	 * of urgent data.  We continue, however,
972 	 * to consider it to indicate the first octet
973 	 * of data past the urgent section.
974 	 * Otherwise, snd_up should be one lower.
975 	 */
976 	sbappendstream(so, &so->so_snd, m);
977 	m = NULL;
978 	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
979 	tp->t_force = 1;
980 	error = tcp_output(tp);
981 	tp->t_force = 0;
982 
983 out:
984 	if (so->so_options & SO_DEBUG)
985 		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);
986 
987 release:
988 	m_freem(control);
989 	m_freem(m);
990 
991 	return (error);
992 }
993 
994 int
995 tcp_sockaddr(struct socket *so, struct mbuf *nam)
996 {
997 	struct inpcb *inp;
998 	struct tcpcb *tp;
999 	int error;
1000 
1001 	soassertlocked(so);
1002 
1003 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
1004 		return (error);
1005 
1006 #ifdef INET6
1007 	if (inp->inp_flags & INP_IPV6)
1008 		in6_setsockaddr(inp, nam);
1009 	else
1010 #endif
1011 		in_setsockaddr(inp, nam);
1012 
1013 	if (so->so_options & SO_DEBUG)
1014 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1015 		    PRU_SOCKADDR, 0);
1016 	return (0);
1017 }
1018 
1019 int
1020 tcp_peeraddr(struct socket *so, struct mbuf *nam)
1021 {
1022 	struct inpcb *inp;
1023 	struct tcpcb *tp;
1024 	int error;
1025 
1026 	soassertlocked(so);
1027 
1028 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
1029 		return (error);
1030 
1031 #ifdef INET6
1032 	if (inp->inp_flags & INP_IPV6)
1033 		in6_setpeeraddr(inp, nam);
1034 	else
1035 #endif
1036 		in_setpeeraddr(inp, nam);
1037 
1038 	if (so->so_options & SO_DEBUG)
1039 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1040 		    PRU_PEERADDR, 0);
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Initiate (or continue) disconnect.
1046  * If embryonic state, just send reset (once).
1047  * If in ``let data drain'' option and linger null, just drop.
1048  * Otherwise (hard), mark socket disconnecting and drop
1049  * current input data; switch states based on user close, and
1050  * send segment to peer (with FIN).
1051  */
1052 struct tcpcb *
1053 tcp_dodisconnect(struct tcpcb *tp)
1054 {
1055 	struct socket *so = tp->t_inpcb->inp_socket;
1056 
1057 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
1058 		tp = tcp_close(tp);
1059 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1060 		tp = tcp_drop(tp, 0);
1061 	else {
1062 		soisdisconnecting(so);
1063 		sbflush(so, &so->so_rcv);
1064 		tp = tcp_usrclosed(tp);
1065 		if (tp)
1066 			(void) tcp_output(tp);
1067 	}
1068 	return (tp);
1069 }
1070 
1071 /*
1072  * User issued close, and wish to trail through shutdown states:
1073  * if never received SYN, just forget it.  If got a SYN from peer,
1074  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1075  * If already got a FIN from peer, then almost done; go to LAST_ACK
1076  * state.  In all other cases, have already sent FIN to peer (e.g.
1077  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1078  * for peer to send FIN or not respond to keep-alives, etc.
1079  * We can let the user exit from the close as soon as the FIN is acked.
1080  */
1081 struct tcpcb *
1082 tcp_usrclosed(struct tcpcb *tp)
1083 {
1084 
1085 	switch (tp->t_state) {
1086 
1087 	case TCPS_CLOSED:
1088 	case TCPS_LISTEN:
1089 	case TCPS_SYN_SENT:
1090 		tp->t_state = TCPS_CLOSED;
1091 		tp = tcp_close(tp);
1092 		break;
1093 
1094 	case TCPS_SYN_RECEIVED:
1095 	case TCPS_ESTABLISHED:
1096 		tp->t_state = TCPS_FIN_WAIT_1;
1097 		break;
1098 
1099 	case TCPS_CLOSE_WAIT:
1100 		tp->t_state = TCPS_LAST_ACK;
1101 		break;
1102 	}
1103 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1104 		soisdisconnected(tp->t_inpcb->inp_socket);
1105 		/*
1106 		 * If we are in FIN_WAIT_2, we arrived here because the
1107 		 * application did a shutdown of the send side.  Like the
1108 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
1109 		 * a full close, we start a timer to make sure sockets are
1110 		 * not left in FIN_WAIT_2 forever.
1111 		 */
1112 		if (tp->t_state == TCPS_FIN_WAIT_2)
1113 			TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(tcp_maxidle));
1114 	}
1115 	return (tp);
1116 }
1117 
1118 /*
1119  * Look up a socket for ident or tcpdrop, ...
1120  */
1121 int
1122 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
1123 {
1124 	int error = 0;
1125 	struct tcp_ident_mapping tir;
1126 	struct inpcb *inp;
1127 	struct tcpcb *tp = NULL;
1128 	struct sockaddr_in *fin, *lin;
1129 #ifdef INET6
1130 	struct sockaddr_in6 *fin6, *lin6;
1131 	struct in6_addr f6, l6;
1132 #endif
1133 
1134 	NET_ASSERT_LOCKED();
1135 
1136 	if (dodrop) {
1137 		if (oldp != NULL || *oldlenp != 0)
1138 			return (EINVAL);
1139 		if (newp == NULL)
1140 			return (EPERM);
1141 		if (newlen < sizeof(tir))
1142 			return (ENOMEM);
1143 		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
1144 			return (error);
1145 	} else {
1146 		if (oldp == NULL)
1147 			return (EINVAL);
1148 		if (*oldlenp < sizeof(tir))
1149 			return (ENOMEM);
1150 		if (newp != NULL || newlen != 0)
1151 			return (EINVAL);
1152 		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
1153 			return (error);
1154 	}
1155 	switch (tir.faddr.ss_family) {
1156 #ifdef INET6
1157 	case AF_INET6:
1158 		fin6 = (struct sockaddr_in6 *)&tir.faddr;
1159 		error = in6_embedscope(&f6, fin6, NULL);
1160 		if (error)
1161 			return EINVAL;	/*?*/
1162 		lin6 = (struct sockaddr_in6 *)&tir.laddr;
1163 		error = in6_embedscope(&l6, lin6, NULL);
1164 		if (error)
1165 			return EINVAL;	/*?*/
1166 		break;
1167 #endif
1168 	case AF_INET:
1169 		fin = (struct sockaddr_in *)&tir.faddr;
1170 		lin = (struct sockaddr_in *)&tir.laddr;
1171 		break;
1172 	default:
1173 		return (EINVAL);
1174 	}
1175 
1176 	switch (tir.faddr.ss_family) {
1177 #ifdef INET6
1178 	case AF_INET6:
1179 		inp = in6_pcblookup(&tcbtable, &f6,
1180 		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
1181 		break;
1182 #endif
1183 	case AF_INET:
1184 		inp = in_pcblookup(&tcbtable, fin->sin_addr,
1185 		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
1186 		break;
1187 	default:
1188 		unhandled_af(tir.faddr.ss_family);
1189 	}
1190 
1191 	if (dodrop) {
1192 		if (inp && (tp = intotcpcb(inp)) &&
1193 		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
1194 			tp = tcp_drop(tp, ECONNABORTED);
1195 		else
1196 			error = ESRCH;
1197 		in_pcbunref(inp);
1198 		return (error);
1199 	}
1200 
1201 	if (inp == NULL) {
1202 		tcpstat_inc(tcps_pcbhashmiss);
1203 		switch (tir.faddr.ss_family) {
1204 #ifdef INET6
1205 		case AF_INET6:
1206 			inp = in6_pcblookup_listen(&tcbtable,
1207 			    &l6, lin6->sin6_port, NULL, tir.rdomain);
1208 			break;
1209 #endif
1210 		case AF_INET:
1211 			inp = in_pcblookup_listen(&tcbtable,
1212 			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
1213 			break;
1214 		}
1215 	}
1216 
1217 	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
1218 		tir.ruid = inp->inp_socket->so_ruid;
1219 		tir.euid = inp->inp_socket->so_euid;
1220 	} else {
1221 		tir.ruid = -1;
1222 		tir.euid = -1;
1223 	}
1224 
1225 	*oldlenp = sizeof (tir);
1226 	error = copyout((void *)&tir, oldp, sizeof (tir));
1227 	in_pcbunref(inp);
1228 	return (error);
1229 }
1230 
1231 int
1232 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
1233 {
1234 	uint64_t counters[tcps_ncounters];
1235 	struct tcpstat tcpstat;
1236 	struct syn_cache_set *set;
1237 	int i = 0;
1238 
1239 #define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)
1240 
1241 	memset(&tcpstat, 0, sizeof tcpstat);
1242 	counters_read(tcpcounters, counters, nitems(counters));
1243 	ASSIGN(tcps_connattempt);
1244 	ASSIGN(tcps_accepts);
1245 	ASSIGN(tcps_connects);
1246 	ASSIGN(tcps_drops);
1247 	ASSIGN(tcps_conndrops);
1248 	ASSIGN(tcps_closed);
1249 	ASSIGN(tcps_segstimed);
1250 	ASSIGN(tcps_rttupdated);
1251 	ASSIGN(tcps_delack);
1252 	ASSIGN(tcps_timeoutdrop);
1253 	ASSIGN(tcps_rexmttimeo);
1254 	ASSIGN(tcps_persisttimeo);
1255 	ASSIGN(tcps_persistdrop);
1256 	ASSIGN(tcps_keeptimeo);
1257 	ASSIGN(tcps_keepprobe);
1258 	ASSIGN(tcps_keepdrops);
1259 	ASSIGN(tcps_sndtotal);
1260 	ASSIGN(tcps_sndpack);
1261 	ASSIGN(tcps_sndbyte);
1262 	ASSIGN(tcps_sndrexmitpack);
1263 	ASSIGN(tcps_sndrexmitbyte);
1264 	ASSIGN(tcps_sndrexmitfast);
1265 	ASSIGN(tcps_sndacks);
1266 	ASSIGN(tcps_sndprobe);
1267 	ASSIGN(tcps_sndurg);
1268 	ASSIGN(tcps_sndwinup);
1269 	ASSIGN(tcps_sndctrl);
1270 	ASSIGN(tcps_rcvtotal);
1271 	ASSIGN(tcps_rcvpack);
1272 	ASSIGN(tcps_rcvbyte);
1273 	ASSIGN(tcps_rcvbadsum);
1274 	ASSIGN(tcps_rcvbadoff);
1275 	ASSIGN(tcps_rcvmemdrop);
1276 	ASSIGN(tcps_rcvnosec);
1277 	ASSIGN(tcps_rcvshort);
1278 	ASSIGN(tcps_rcvduppack);
1279 	ASSIGN(tcps_rcvdupbyte);
1280 	ASSIGN(tcps_rcvpartduppack);
1281 	ASSIGN(tcps_rcvpartdupbyte);
1282 	ASSIGN(tcps_rcvoopack);
1283 	ASSIGN(tcps_rcvoobyte);
1284 	ASSIGN(tcps_rcvpackafterwin);
1285 	ASSIGN(tcps_rcvbyteafterwin);
1286 	ASSIGN(tcps_rcvafterclose);
1287 	ASSIGN(tcps_rcvwinprobe);
1288 	ASSIGN(tcps_rcvdupack);
1289 	ASSIGN(tcps_rcvacktoomuch);
1290 	ASSIGN(tcps_rcvacktooold);
1291 	ASSIGN(tcps_rcvackpack);
1292 	ASSIGN(tcps_rcvackbyte);
1293 	ASSIGN(tcps_rcvwinupd);
1294 	ASSIGN(tcps_pawsdrop);
1295 	ASSIGN(tcps_predack);
1296 	ASSIGN(tcps_preddat);
1297 	ASSIGN(tcps_pcbhashmiss);
1298 	ASSIGN(tcps_noport);
1299 	ASSIGN(tcps_badsyn);
1300 	ASSIGN(tcps_dropsyn);
1301 	ASSIGN(tcps_rcvbadsig);
1302 	ASSIGN(tcps_rcvgoodsig);
1303 	ASSIGN(tcps_inswcsum);
1304 	ASSIGN(tcps_outswcsum);
1305 	ASSIGN(tcps_ecn_accepts);
1306 	ASSIGN(tcps_ecn_rcvece);
1307 	ASSIGN(tcps_ecn_rcvcwr);
1308 	ASSIGN(tcps_ecn_rcvce);
1309 	ASSIGN(tcps_ecn_sndect);
1310 	ASSIGN(tcps_ecn_sndece);
1311 	ASSIGN(tcps_ecn_sndcwr);
1312 	ASSIGN(tcps_cwr_ecn);
1313 	ASSIGN(tcps_cwr_frecovery);
1314 	ASSIGN(tcps_cwr_timeout);
1315 	ASSIGN(tcps_sc_added);
1316 	ASSIGN(tcps_sc_completed);
1317 	ASSIGN(tcps_sc_timed_out);
1318 	ASSIGN(tcps_sc_overflowed);
1319 	ASSIGN(tcps_sc_reset);
1320 	ASSIGN(tcps_sc_unreach);
1321 	ASSIGN(tcps_sc_bucketoverflow);
1322 	ASSIGN(tcps_sc_aborted);
1323 	ASSIGN(tcps_sc_dupesyn);
1324 	ASSIGN(tcps_sc_dropped);
1325 	ASSIGN(tcps_sc_collisions);
1326 	ASSIGN(tcps_sc_retransmitted);
1327 	ASSIGN(tcps_sc_seedrandom);
1328 	ASSIGN(tcps_sc_hash_size);
1329 	ASSIGN(tcps_sc_entry_count);
1330 	ASSIGN(tcps_sc_entry_limit);
1331 	ASSIGN(tcps_sc_bucket_maxlen);
1332 	ASSIGN(tcps_sc_bucket_limit);
1333 	ASSIGN(tcps_sc_uses_left);
1334 	ASSIGN(tcps_conndrained);
1335 	ASSIGN(tcps_sack_recovery_episode);
1336 	ASSIGN(tcps_sack_rexmits);
1337 	ASSIGN(tcps_sack_rexmit_bytes);
1338 	ASSIGN(tcps_sack_rcv_opts);
1339 	ASSIGN(tcps_sack_snd_opts);
1340 	ASSIGN(tcps_sack_drop_opts);
1341 
1342 #undef ASSIGN
1343 
1344 	set = &tcp_syn_cache[tcp_syn_cache_active];
1345 	tcpstat.tcps_sc_hash_size = set->scs_size;
1346 	tcpstat.tcps_sc_entry_count = set->scs_count;
1347 	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
1348 	tcpstat.tcps_sc_bucket_maxlen = 0;
1349 	for (i = 0; i < set->scs_size; i++) {
1350 		if (tcpstat.tcps_sc_bucket_maxlen <
1351 		    set->scs_buckethead[i].sch_length)
1352 			tcpstat.tcps_sc_bucket_maxlen =
1353 				set->scs_buckethead[i].sch_length;
1354 	}
1355 	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
1356 	tcpstat.tcps_sc_uses_left = set->scs_use;
1357 
1358 	return (sysctl_rdstruct(oldp, oldlenp, newp,
1359 	    &tcpstat, sizeof(tcpstat)));
1360 }
1361 
1362 /*
1363  * Sysctl for tcp variables.
1364  */
1365 int
1366 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1367     size_t newlen)
1368 {
1369 	int error, nval;
1370 
1371 	/* All sysctl names at this level are terminal. */
1372 	if (namelen != 1)
1373 		return (ENOTDIR);
1374 
1375 	switch (name[0]) {
1376 	case TCPCTL_BADDYNAMIC:
1377 		NET_LOCK();
1378 		error = sysctl_struct(oldp, oldlenp, newp, newlen,
1379 		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
1380 		NET_UNLOCK();
1381 		return (error);
1382 
1383 	case TCPCTL_ROOTONLY:
1384 		if (newp && securelevel > 0)
1385 			return (EPERM);
1386 		NET_LOCK();
1387 		error = sysctl_struct(oldp, oldlenp, newp, newlen,
1388 		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
1389 		NET_UNLOCK();
1390 		return (error);
1391 
1392 	case TCPCTL_IDENT:
1393 		NET_LOCK();
1394 		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
1395 		NET_UNLOCK();
1396 		return (error);
1397 
1398 	case TCPCTL_DROP:
1399 		NET_LOCK();
1400 		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
1401 		NET_UNLOCK();
1402 		return (error);
1403 
1404 	case TCPCTL_REASS_LIMIT:
1405 		NET_LOCK();
1406 		nval = tcp_reass_limit;
1407 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1408 		if (!error && nval != tcp_reass_limit) {
1409 			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
1410 			if (!error)
1411 				tcp_reass_limit = nval;
1412 		}
1413 		NET_UNLOCK();
1414 		return (error);
1415 
1416 	case TCPCTL_SACKHOLE_LIMIT:
1417 		NET_LOCK();
1418 		nval = tcp_sackhole_limit;
1419 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1420 		if (!error && nval != tcp_sackhole_limit) {
1421 			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
1422 			if (!error)
1423 				tcp_sackhole_limit = nval;
1424 		}
1425 		NET_UNLOCK();
1426 		return (error);
1427 
1428 	case TCPCTL_STATS:
1429 		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));
1430 
1431 	case TCPCTL_SYN_USE_LIMIT:
1432 		NET_LOCK();
1433 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1434 		    &tcp_syn_use_limit, 0, INT_MAX);
1435 		if (!error && newp != NULL) {
1436 			/*
1437 			 * Global tcp_syn_use_limit is used when reseeding a
1438 			 * new cache.  Also update the value in active cache.
1439 			 */
1440 			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
1441 				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
1442 			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
1443 				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
1444 		}
1445 		NET_UNLOCK();
1446 		return (error);
1447 
1448 	case TCPCTL_SYN_HASH_SIZE:
1449 		NET_LOCK();
1450 		nval = tcp_syn_hash_size;
1451 		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1452 		    &nval, 1, 100000);
1453 		if (!error && nval != tcp_syn_hash_size) {
1454 			/*
1455 			 * If global hash size has been changed,
1456 			 * switch sets as soon as possible.  Then
1457 			 * the actual hash array will be reallocated.
1458 			 */
1459 			if (tcp_syn_cache[0].scs_size != nval)
1460 				tcp_syn_cache[0].scs_use = 0;
1461 			if (tcp_syn_cache[1].scs_size != nval)
1462 				tcp_syn_cache[1].scs_use = 0;
1463 			tcp_syn_hash_size = nval;
1464 		}
1465 		NET_UNLOCK();
1466 		return (error);
1467 
1468 	default:
1469 		NET_LOCK();
1470 		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name,
1471 		     namelen, oldp, oldlenp, newp, newlen);
1472 		NET_UNLOCK();
1473 		return (error);
1474 	}
1475 	/* NOTREACHED */
1476 }
1477 
1478 /*
1479  * Scale the send buffer so that inflight data is not accounted against
1480  * the limit. The buffer will scale with the congestion window, if the
1481  * the receiver stops acking data the window will shrink and therefore
1482  * the buffer size will shrink as well.
1483  * In low memory situation try to shrink the buffer to the initial size
1484  * disabling the send buffer scaling as long as the situation persists.
1485  */
1486 void
1487 tcp_update_sndspace(struct tcpcb *tp)
1488 {
1489 	struct socket *so = tp->t_inpcb->inp_socket;
1490 	u_long nmax = so->so_snd.sb_hiwat;
1491 
1492 	if (sbchecklowmem()) {
1493 		/* low on memory try to get rid of some */
1494 		if (tcp_sendspace < nmax)
1495 			nmax = tcp_sendspace;
1496 	} else if (so->so_snd.sb_wat != tcp_sendspace)
1497 		/* user requested buffer size, auto-scaling disabled */
1498 		nmax = so->so_snd.sb_wat;
1499 	else
1500 		/* automatic buffer scaling */
1501 		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
1502 		    tp->snd_una);
1503 
1504 	/* a writable socket must be preserved because of poll(2) semantics */
1505 	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
1506 		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
1507 			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
1508 		/* keep in sync with sbreserve() calculation */
1509 		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
1510 			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
1511 	}
1512 
1513 	/* round to MSS boundary */
1514 	nmax = roundup(nmax, tp->t_maxseg);
1515 
1516 	if (nmax != so->so_snd.sb_hiwat)
1517 		sbreserve(so, &so->so_snd, nmax);
1518 }
1519 
1520 /*
1521  * Scale the recv buffer by looking at how much data was transferred in
1522  * one approximated RTT. If more than a big part of the recv buffer was
1523  * transferred during that time we increase the buffer by a constant.
1524  * In low memory situation try to shrink the buffer to the initial size.
1525  */
1526 void
1527 tcp_update_rcvspace(struct tcpcb *tp)
1528 {
1529 	struct socket *so = tp->t_inpcb->inp_socket;
1530 	u_long nmax = so->so_rcv.sb_hiwat;
1531 
1532 	if (sbchecklowmem()) {
1533 		/* low on memory try to get rid of some */
1534 		if (tcp_recvspace < nmax)
1535 			nmax = tcp_recvspace;
1536 	} else if (so->so_rcv.sb_wat != tcp_recvspace)
1537 		/* user requested buffer size, auto-scaling disabled */
1538 		nmax = so->so_rcv.sb_wat;
1539 	else {
1540 		/* automatic buffer scaling */
1541 		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1542 			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1543 			    tcp_autorcvbuf_inc);
1544 	}
1545 
1546 	/* a readable socket must be preserved because of poll(2) semantics */
1547 	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
1548 	    nmax < so->so_snd.sb_lowat)
1549 		nmax = so->so_snd.sb_lowat;
1550 
1551 	if (nmax == so->so_rcv.sb_hiwat)
1552 		return;
1553 
1554 	/* round to MSS boundary */
1555 	nmax = roundup(nmax, tp->t_maxseg);
1556 	sbreserve(so, &so->so_rcv, nmax);
1557 }
1558