xref: /openbsd-src/sys/netinet/tcp_usrreq.c (revision 1a8dbaac879b9f3335ad7fb25429ce63ac1d6bac)
1 /*	$OpenBSD: tcp_usrreq.c,v 1.176 2020/08/18 18:19:30 gnezdo Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/kernel.h>
81 #include <sys/pool.h>
82 
83 #include <net/if.h>
84 #include <net/if_var.h>
85 #include <net/route.h>
86 
87 #include <netinet/in.h>
88 #include <netinet/in_var.h>
89 #include <netinet/ip.h>
90 #include <netinet/in_pcb.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/tcp_debug.h>
98 
99 #ifdef INET6
100 #include <netinet6/in6_var.h>
101 #endif
102 
/*
 * Default send/receive socket buffer reservations for TCP sockets
 * (see tcp_attach()).  The compile-time defaults may be overridden;
 * the variables themselves are runtime tunables.
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
/* Step by which the receive buffer is grown when auto-sizing. */
u_int	tcp_autorcvbuf_inc = 16 * 1024;
112 
/*
 * Integer TCP sysctls handled generically by sysctl_bounded_arr() in
 * tcp_sysctl(): each entry names the sysctl id, the backing variable,
 * and the inclusive [min, max] bounds enforced on writes.
 */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT },
	{ TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE },
	{ TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
};
129 
/* Table of internet PCBs for all TCP sockets (see tcp_attach()). */
struct	inpcbtable tcbtable;

/* Look up (and, if dodrop, abort) a connection for ident/tcpdrop sysctls. */
int tcp_ident(void *, size_t *, void *, size_t, int);
133 
/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 *
 * Returns 0 or an errno.  On error paths that do not consume the mbufs,
 * m and control are freed here (except for receive-style requests where
 * m is caller-supplied storage, not data).
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp = NULL;
	int error = 0;
	short ostate;

	/*
	 * PRU_CONTROL is an ioctl on the socket: m/nam/control carry the
	 * command, argument and interface, not mbufs, so dispatch it to
	 * the address family before the checks below.
	 */
	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}

	soassertlocked(so);

	/* TCP accepts no ancillary (control) data on any request. */
	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		goto release;
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		goto release;
	/* Remember entry state so the transition can be traced on exit. */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	switch (req) {

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6) {
			struct sockaddr_in6 *sin6;

			if ((error = in6_nam2sin6(nam, &sin6)))
				break;
			/* Reject unspecified and multicast destinations. */
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
				error = EINVAL;
				break;
			}
			error = in6_pcbconnect(inp, nam);
		} else
#endif /* INET6 */
		{
			struct sockaddr_in *sin;

			if ((error = in_nam2sin(nam, &sin)))
				break;
			/* Reject wildcard, broadcast and multicast peers. */
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}
			error = in_pcbconnect(inp, nam);
		}
		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			/* Undo the connect; leave the socket usable. */
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request.  */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat_inc(tcps_connattempt);
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
		tp->snd_last = tp->snd_una;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket. We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(so, &so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	/* fstat(2): report the send buffer high-water mark as block size. */
	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		break;

	case PRU_RCVOOB:
		/*
		 * OOB data is unavailable if there is no mark pending and
		 * we are not at the mark, if OOB is delivered inline, or
		 * if the byte has already been consumed.
		 */
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		/* nam carries the recv flags here; keep the byte on MSG_PEEK. */
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		/* Allow OOB to overcommit the send buffer by up to 512 bytes. */
		if (sbspace(so, &so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream(so, &so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		/* Force out a segment even if the window is closed. */
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, req, 0);
	return (error);

 release:
	/*
	 * For receive-style requests m points at caller storage rather
	 * than data owned by this function, so it must not be freed.
	 */
	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
		m_freem(control);
		m_freem(m);
	}
	return (error);
}
424 
/*
 * Get/set socket options at the IPPROTO_TCP level.  Options for other
 * levels are passed down to ip_ctloutput()/ip6_ctloutput().  For
 * PRCO_SETOPT, m carries the new value; for PRCO_GETOPT, the current
 * value is written into m (note: callers are expected to supply m for
 * PRCO_GETOPT — it is dereferenced unconditionally there).
 * Returns 0 or an errno.
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	/* Not a TCP-level option: hand off to the address family. */
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				/* Flush any data held back while NOPUSH was set. */
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* The MSS may only be lowered, never raised. */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* SACK use is negotiated at SYN time; too late now. */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* SACK and MD5 signatures are mutually exclusive. */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				/* Signatures disable SACK (see above). */
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		/* All readable options below are int-sized. */
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
569 
/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block and
 * buffer space, and initializing the control block to the
 * CLOSED state.
 * Returns 0 or an errno; on failure the socket is left unattached.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* Reserve default buffer space unless usable reservations exist. */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	/* SO_LINGER with no time set gets the TCP default linger time. */
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}
623 
/*
 * Detach TCP from an attached socket, tearing down the connection
 * (via tcp_disconnect()) as required by its current state.
 * Returns 0 or an errno.
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		return (error);
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		return (0);
	/* Remember entry state so the transition can be traced below. */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_disconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (error);
}
668 
669 /*
670  * Initiate (or continue) disconnect.
671  * If embryonic state, just send reset (once).
672  * If in ``let data drain'' option and linger null, just drop.
673  * Otherwise (hard), mark socket disconnecting and drop
674  * current input data; switch states based on user close, and
675  * send segment to peer (with FIN).
676  */
677 struct tcpcb *
678 tcp_disconnect(struct tcpcb *tp)
679 {
680 	struct socket *so = tp->t_inpcb->inp_socket;
681 
682 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
683 		tp = tcp_close(tp);
684 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
685 		tp = tcp_drop(tp, 0);
686 	else {
687 		soisdisconnecting(so);
688 		sbflush(so, &so->so_rcv);
689 		tp = tcp_usrclosed(tp);
690 		if (tp)
691 			(void) tcp_output(tp);
692 	}
693 	return (tp);
694 }
695 
/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 *
 * Returns the tcpcb, or NULL if tcp_close() freed it.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		/* No SYN received yet: the connection can vanish silently. */
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		/* Our FIN has not been sent yet: start active close. */
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		/* Peer's FIN already received: just our FIN remains. */
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}
742 
/*
 * Look up a socket for ident or tcpdrop, ...
 *
 * Implements the TCPCTL_IDENT (dodrop == 0) and TCPCTL_DROP
 * (dodrop != 0) sysctls.  A struct tcp_ident_mapping naming the
 * foreign/local endpoints is copied in; for ident the owning socket's
 * real and effective uids are copied back out, for tcpdrop the matched
 * connection is aborted.  Returns 0 or an errno.
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/*
	 * Validate the sysctl buffers: tcpdrop takes its argument via
	 * newp and returns nothing; ident takes it via oldp and writes
	 * the result back to the same buffer.
	 */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	/* Pick apart the endpoint addresses by family. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	/* Exact four-tuple lookup in the connection hash. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* Never drop a listening socket, only real connections. */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		return (error);
	}

	/* No established match: fall back to a listening-socket lookup. */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	/* Only report uids for connections this host initiated. */
	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}
853 
/*
 * Export the TCP statistics (TCPCTL_STATS) as a struct tcpstat.
 * The per-CPU counters are read into an array and copied field by
 * field; the ASSIGN() invocations below consume counters[] in order,
 * so their sequence must match the tcps_* counter enumeration exactly.
 * Syn-cache gauge values are filled in from the active cache set
 * afterwards, overriding the counter-derived placeholders.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);

#undef ASSIGN

	/* Snapshot live syn-cache gauges from the active cache set. */
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
				set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}
984 
/*
 * Sysctl for tcp variables.
 *
 * Cases that touch mutable networking state take the net lock around
 * the access; simple bounded integers fall through to the generic
 * tcpctl_vars handling in the default case.  Returns 0 or an errno.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_SLOWHZ:
		/* Read-only constant. */
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_ALWAYS_KEEPALIVE:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive);
		NET_UNLOCK();
		return (error);

	case TCPCTL_REASS_LIMIT:
		/*
		 * Update via a scratch copy so the pool hard limit and
		 * the variable only change together on success.
		 */
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000) {
				error = EINVAL;
			} else {
				/*
				 * If global hash size has been changed,
				 * switch sets as soon as possible.  Then
				 * the actual hash array will be reallocated.
				 */
				if (tcp_syn_cache[0].scs_size != nval)
					tcp_syn_cache[0].scs_use = 0;
				if (tcp_syn_cache[1].scs_size != nval)
					tcp_syn_cache[1].scs_use = 0;
				tcp_syn_hash_size = nval;
			}
		}
		NET_UNLOCK();
		return (error);

	default:
		/* Simple bounded integers: generic handling. */
		NET_LOCK();
		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name,
		     namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		return (error);
	}
	/* NOTREACHED */
}
1113 
1114 /*
1115  * Scale the send buffer so that inflight data is not accounted against
1116  * the limit. The buffer will scale with the congestion window, if the
1117  * the receiver stops acking data the window will shrink and therefor
1118  * the buffer size will shrink as well.
1119  * In low memory situation try to shrink the buffer to the initial size
1120  * disabling the send buffer scaling as long as the situation persists.
1121  */
1122 void
1123 tcp_update_sndspace(struct tcpcb *tp)
1124 {
1125 	struct socket *so = tp->t_inpcb->inp_socket;
1126 	u_long nmax = so->so_snd.sb_hiwat;
1127 
1128 	if (sbchecklowmem()) {
1129 		/* low on memory try to get rid of some */
1130 		if (tcp_sendspace < nmax)
1131 			nmax = tcp_sendspace;
1132 	} else if (so->so_snd.sb_wat != tcp_sendspace)
1133 		/* user requested buffer size, auto-scaling disabled */
1134 		nmax = so->so_snd.sb_wat;
1135 	else
1136 		/* automatic buffer scaling */
1137 		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
1138 		    tp->snd_una);
1139 
1140 	/* a writable socket must be preserved because of poll(2) semantics */
1141 	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
1142 		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
1143 			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
1144 		/* keep in sync with sbreserve() calculation */
1145 		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
1146 			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
1147 	}
1148 
1149 	/* round to MSS boundary */
1150 	nmax = roundup(nmax, tp->t_maxseg);
1151 
1152 	if (nmax != so->so_snd.sb_hiwat)
1153 		sbreserve(so, &so->so_snd, nmax);
1154 }
1155 
1156 /*
1157  * Scale the recv buffer by looking at how much data was transferred in
1158  * on approximated RTT. If more than a big part of the recv buffer was
1159  * transferred during that time we increase the buffer by a constant.
1160  * In low memory situation try to shrink the buffer to the initial size.
1161  */
1162 void
1163 tcp_update_rcvspace(struct tcpcb *tp)
1164 {
1165 	struct socket *so = tp->t_inpcb->inp_socket;
1166 	u_long nmax = so->so_rcv.sb_hiwat;
1167 
1168 	if (sbchecklowmem()) {
1169 		/* low on memory try to get rid of some */
1170 		if (tcp_recvspace < nmax)
1171 			nmax = tcp_recvspace;
1172 	} else if (so->so_rcv.sb_wat != tcp_recvspace)
1173 		/* user requested buffer size, auto-scaling disabled */
1174 		nmax = so->so_rcv.sb_wat;
1175 	else {
1176 		/* automatic buffer scaling */
1177 		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1178 			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1179 			    tcp_autorcvbuf_inc);
1180 	}
1181 
1182 	/* a readable socket must be preserved because of poll(2) semantics */
1183 	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
1184 	    nmax < so->so_snd.sb_lowat)
1185 		nmax = so->so_snd.sb_lowat;
1186 
1187 	if (nmax == so->so_rcv.sb_hiwat)
1188 		return;
1189 
1190 	/* round to MSS boundary */
1191 	nmax = roundup(nmax, tp->t_maxseg);
1192 	sbreserve(so, &so->so_rcv, nmax);
1193 }
1194