xref: /netbsd-src/sys/netinet/tcp_usrreq.c (revision ba65fde2d7fefa7d39838fa5fa855e62bd606b5e)
1 /*	$NetBSD: tcp_usrreq.c,v 1.165 2012/06/02 21:36:47 dsl Exp $	*/
2 
3 /*
4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the project nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1997, 1998, 2005, 2006 The NetBSD Foundation, Inc.
34  * All rights reserved.
35  *
36  * This code is derived from software contributed to The NetBSD Foundation
37  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
38  * Facility, NASA Ames Research Center.
39  * This code is derived from software contributed to The NetBSD Foundation
40  * by Charles M. Hannum.
41  * This code is derived from software contributed to The NetBSD Foundation
42  * by Rui Paulo.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
54  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
56  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
57  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
58  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
59  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
60  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
61  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
62  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
63  * POSSIBILITY OF SUCH DAMAGE.
64  */
65 
66 /*
67  * Copyright (c) 1982, 1986, 1988, 1993, 1995
68  *	The Regents of the University of California.  All rights reserved.
69  *
70  * Redistribution and use in source and binary forms, with or without
71  * modification, are permitted provided that the following conditions
72  * are met:
73  * 1. Redistributions of source code must retain the above copyright
74  *    notice, this list of conditions and the following disclaimer.
75  * 2. Redistributions in binary form must reproduce the above copyright
76  *    notice, this list of conditions and the following disclaimer in the
77  *    documentation and/or other materials provided with the distribution.
78  * 3. Neither the name of the University nor the names of its contributors
79  *    may be used to endorse or promote products derived from this software
80  *    without specific prior written permission.
81  *
82  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
83  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
84  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
85  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
86  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
87  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
88  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
89  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
90  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
91  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
92  * SUCH DAMAGE.
93  *
94  *	@(#)tcp_usrreq.c	8.5 (Berkeley) 6/21/95
95  */
96 
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.165 2012/06/02 21:36:47 dsl Exp $");
99 
100 #include "opt_inet.h"
101 #include "opt_ipsec.h"
102 #include "opt_tcp_debug.h"
103 #include "opt_mbuftrace.h"
104 
105 
106 #include <sys/param.h>
107 #include <sys/systm.h>
108 #include <sys/kernel.h>
109 #include <sys/malloc.h>
110 #include <sys/mbuf.h>
111 #include <sys/socket.h>
112 #include <sys/socketvar.h>
113 #include <sys/protosw.h>
114 #include <sys/errno.h>
115 #include <sys/stat.h>
116 #include <sys/proc.h>
117 #include <sys/domain.h>
118 #include <sys/sysctl.h>
119 #include <sys/kauth.h>
120 #include <sys/uidinfo.h>
121 
122 #include <net/if.h>
123 #include <net/route.h>
124 
125 #include <netinet/in.h>
126 #include <netinet/in_systm.h>
127 #include <netinet/in_var.h>
128 #include <netinet/ip.h>
129 #include <netinet/in_pcb.h>
130 #include <netinet/ip_var.h>
131 #include <netinet/in_offload.h>
132 
133 #ifdef INET6
134 #ifndef INET
135 #include <netinet/in.h>
136 #endif
137 #include <netinet/ip6.h>
138 #include <netinet6/in6_pcb.h>
139 #include <netinet6/ip6_var.h>
140 #include <netinet6/scope6_var.h>
141 #endif
142 
143 #include <netinet/tcp.h>
144 #include <netinet/tcp_fsm.h>
145 #include <netinet/tcp_seq.h>
146 #include <netinet/tcp_timer.h>
147 #include <netinet/tcp_var.h>
148 #include <netinet/tcp_private.h>
149 #include <netinet/tcp_congctl.h>
150 #include <netinet/tcpip.h>
151 #include <netinet/tcp_debug.h>
152 #include <netinet/tcp_vtw.h>
153 
154 #include "opt_tcp_space.h"
155 
156 /*
157  * TCP protocol interface to socket abstraction.
158  */
159 
160 /*
161  * Process a TCP user request for TCP tb.  If this is a send request
162  * then m is the mbuf chain of send data.  If this is a timer expiration
163  * (called from the software clock routine), then timertype tells which timer.
164  */
165 /*ARGSUSED*/
166 int
167 tcp_usrreq(struct socket *so, int req,
168     struct mbuf *m, struct mbuf *nam, struct mbuf *control, struct lwp *l)
169 {
170 	struct inpcb *inp;
171 #ifdef INET6
172 	struct in6pcb *in6p;
173 #endif
174 	struct tcpcb *tp = NULL;
175 	int s;
176 	int error = 0;
177 #ifdef TCP_DEBUG
178 	int ostate = 0;
179 #endif
180 	int family;	/* family of the socket */
181 
182 	family = so->so_proto->pr_domain->dom_family;
183 
184 	if (req == PRU_CONTROL) {
185 		switch (family) {
186 #ifdef INET
187 		case PF_INET:
188 			return (in_control(so, (long)m, (void *)nam,
189 			    (struct ifnet *)control, l));
190 #endif
191 #ifdef INET6
192 		case PF_INET6:
193 			return (in6_control(so, (long)m, (void *)nam,
194 			    (struct ifnet *)control, l));
195 #endif
196 		default:
197 			return EAFNOSUPPORT;
198 		}
199 	}
200 
201 	s = splsoftnet();
202 
203 	if (req == PRU_PURGEIF) {
204 		mutex_enter(softnet_lock);
205 		switch (family) {
206 #ifdef INET
207 		case PF_INET:
208 			in_pcbpurgeif0(&tcbtable, (struct ifnet *)control);
209 			in_purgeif((struct ifnet *)control);
210 			in_pcbpurgeif(&tcbtable, (struct ifnet *)control);
211 			break;
212 #endif
213 #ifdef INET6
214 		case PF_INET6:
215 			in6_pcbpurgeif0(&tcbtable, (struct ifnet *)control);
216 			in6_purgeif((struct ifnet *)control);
217 			in6_pcbpurgeif(&tcbtable, (struct ifnet *)control);
218 			break;
219 #endif
220 		default:
221 			mutex_exit(softnet_lock);
222 			splx(s);
223 			return (EAFNOSUPPORT);
224 		}
225 		mutex_exit(softnet_lock);
226 		splx(s);
227 		return (0);
228 	}
229 
230 	if (req == PRU_ATTACH)
231 		sosetlock(so);
232 
233 	switch (family) {
234 #ifdef INET
235 	case PF_INET:
236 		inp = sotoinpcb(so);
237 #ifdef INET6
238 		in6p = NULL;
239 #endif
240 		break;
241 #endif
242 #ifdef INET6
243 	case PF_INET6:
244 		inp = NULL;
245 		in6p = sotoin6pcb(so);
246 		break;
247 #endif
248 	default:
249 		splx(s);
250 		return EAFNOSUPPORT;
251 	}
252 
253 #ifdef DIAGNOSTIC
254 #ifdef INET6
255 	if (inp && in6p)
256 		panic("tcp_usrreq: both inp and in6p set to non-NULL");
257 #endif
258 	if (req != PRU_SEND && req != PRU_SENDOOB && control)
259 		panic("tcp_usrreq: unexpected control mbuf");
260 #endif
261 	/*
262 	 * When a TCP is attached to a socket, then there will be
263 	 * a (struct inpcb) pointed at by the socket, and this
264 	 * structure will point at a subsidary (struct tcpcb).
265 	 */
266 	if ((inp == 0
267 #ifdef INET6
268 	    && in6p == 0
269 #endif
270 	    ) && (req != PRU_ATTACH && req != PRU_SENSE))
271 	{
272 		error = EINVAL;
273 		goto release;
274 	}
275 #ifdef INET
276 	if (inp) {
277 		tp = intotcpcb(inp);
278 		/* WHAT IF TP IS 0? */
279 #ifdef KPROF
280 		tcp_acounts[tp->t_state][req]++;
281 #endif
282 #ifdef TCP_DEBUG
283 		ostate = tp->t_state;
284 #endif
285 	}
286 #endif
287 #ifdef INET6
288 	if (in6p) {
289 		tp = in6totcpcb(in6p);
290 		/* WHAT IF TP IS 0? */
291 #ifdef KPROF
292 		tcp_acounts[tp->t_state][req]++;
293 #endif
294 #ifdef TCP_DEBUG
295 		ostate = tp->t_state;
296 #endif
297 	}
298 #endif
299 
300 	switch (req) {
301 
302 	/*
303 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
304 	 * and an internet control block.
305 	 */
306 	case PRU_ATTACH:
307 #ifndef INET6
308 		if (inp != 0)
309 #else
310 		if (inp != 0 || in6p != 0)
311 #endif
312 		{
313 			error = EISCONN;
314 			break;
315 		}
316 		error = tcp_attach(so);
317 		if (error)
318 			break;
319 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
320 			so->so_linger = TCP_LINGERTIME;
321 		tp = sototcpcb(so);
322 		break;
323 
324 	/*
325 	 * PRU_DETACH detaches the TCP protocol from the socket.
326 	 */
327 	case PRU_DETACH:
328 		tp = tcp_disconnect(tp);
329 		break;
330 
331 	/*
332 	 * Give the socket an address.
333 	 */
334 	case PRU_BIND:
335 		switch (family) {
336 #ifdef INET
337 		case PF_INET:
338 			error = in_pcbbind(inp, nam, l);
339 			break;
340 #endif
341 #ifdef INET6
342 		case PF_INET6:
343 			error = in6_pcbbind(in6p, nam, l);
344 			if (!error) {
345 				/* mapped addr case */
346 				if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_laddr))
347 					tp->t_family = AF_INET;
348 				else
349 					tp->t_family = AF_INET6;
350 			}
351 			break;
352 #endif
353 		}
354 		break;
355 
356 	/*
357 	 * Prepare to accept connections.
358 	 */
359 	case PRU_LISTEN:
360 #ifdef INET
361 		if (inp && inp->inp_lport == 0) {
362 			error = in_pcbbind(inp, NULL, l);
363 			if (error)
364 				break;
365 		}
366 #endif
367 #ifdef INET6
368 		if (in6p && in6p->in6p_lport == 0) {
369 			error = in6_pcbbind(in6p, NULL, l);
370 			if (error)
371 				break;
372 		}
373 #endif
374 		tp->t_state = TCPS_LISTEN;
375 		break;
376 
377 	/*
378 	 * Initiate connection to peer.
379 	 * Create a template for use in transmissions on this connection.
380 	 * Enter SYN_SENT state, and mark socket as connecting.
381 	 * Start keep-alive timer, and seed output sequence space.
382 	 * Send initial segment on connection.
383 	 */
384 	case PRU_CONNECT:
385 #ifdef INET
386 		if (inp) {
387 			if (inp->inp_lport == 0) {
388 				error = in_pcbbind(inp, NULL, l);
389 				if (error)
390 					break;
391 			}
392 			error = in_pcbconnect(inp, nam, l);
393 		}
394 #endif
395 #ifdef INET6
396 		if (in6p) {
397 			if (in6p->in6p_lport == 0) {
398 				error = in6_pcbbind(in6p, NULL, l);
399 				if (error)
400 					break;
401 			}
402 			error = in6_pcbconnect(in6p, nam, l);
403 			if (!error) {
404 				/* mapped addr case */
405 				if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr))
406 					tp->t_family = AF_INET;
407 				else
408 					tp->t_family = AF_INET6;
409 			}
410 		}
411 #endif
412 		if (error)
413 			break;
414 		tp->t_template = tcp_template(tp);
415 		if (tp->t_template == 0) {
416 #ifdef INET
417 			if (inp)
418 				in_pcbdisconnect(inp);
419 #endif
420 #ifdef INET6
421 			if (in6p)
422 				in6_pcbdisconnect(in6p);
423 #endif
424 			error = ENOBUFS;
425 			break;
426 		}
427 		/*
428 		 * Compute window scaling to request.
429 		 * XXX: This should be moved to tcp_output().
430 		 */
431 		while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
432 		    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
433 			tp->request_r_scale++;
434 		soisconnecting(so);
435 		TCP_STATINC(TCP_STAT_CONNATTEMPT);
436 		tp->t_state = TCPS_SYN_SENT;
437 		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
438 		tp->iss = tcp_new_iss(tp, 0);
439 		tcp_sendseqinit(tp);
440 		error = tcp_output(tp);
441 		break;
442 
443 	/*
444 	 * Create a TCP connection between two sockets.
445 	 */
446 	case PRU_CONNECT2:
447 		error = EOPNOTSUPP;
448 		break;
449 
450 	/*
451 	 * Initiate disconnect from peer.
452 	 * If connection never passed embryonic stage, just drop;
453 	 * else if don't need to let data drain, then can just drop anyways,
454 	 * else have to begin TCP shutdown process: mark socket disconnecting,
455 	 * drain unread data, state switch to reflect user close, and
456 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
457 	 * when peer sends FIN and acks ours.
458 	 *
459 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
460 	 */
461 	case PRU_DISCONNECT:
462 		tp = tcp_disconnect(tp);
463 		break;
464 
465 	/*
466 	 * Accept a connection.  Essentially all the work is
467 	 * done at higher levels; just return the address
468 	 * of the peer, storing through addr.
469 	 */
470 	case PRU_ACCEPT:
471 #ifdef INET
472 		if (inp)
473 			in_setpeeraddr(inp, nam);
474 #endif
475 #ifdef INET6
476 		if (in6p)
477 			in6_setpeeraddr(in6p, nam);
478 #endif
479 		break;
480 
481 	/*
482 	 * Mark the connection as being incapable of further output.
483 	 */
484 	case PRU_SHUTDOWN:
485 		socantsendmore(so);
486 		tp = tcp_usrclosed(tp);
487 		if (tp)
488 			error = tcp_output(tp);
489 		break;
490 
491 	/*
492 	 * After a receive, possibly send window update to peer.
493 	 */
494 	case PRU_RCVD:
495 		/*
496 		 * soreceive() calls this function when a user receives
497 		 * ancillary data on a listening socket. We don't call
498 		 * tcp_output in such a case, since there is no header
499 		 * template for a listening socket and hence the kernel
500 		 * will panic.
501 		 */
502 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
503 			(void) tcp_output(tp);
504 		break;
505 
506 	/*
507 	 * Do a send by putting data in output queue and updating urgent
508 	 * marker if URG set.  Possibly send more data.
509 	 */
510 	case PRU_SEND:
511 		if (control && control->m_len) {
512 			m_freem(control);
513 			m_freem(m);
514 			error = EINVAL;
515 			break;
516 		}
517 		sbappendstream(&so->so_snd, m);
518 		error = tcp_output(tp);
519 		break;
520 
521 	/*
522 	 * Abort the TCP.
523 	 */
524 	case PRU_ABORT:
525 		tp = tcp_drop(tp, ECONNABORTED);
526 		break;
527 
528 	case PRU_SENSE:
529 		/*
530 		 * stat: don't bother with a blocksize.
531 		 */
532 		splx(s);
533 		return (0);
534 
535 	case PRU_RCVOOB:
536 		if (control && control->m_len) {
537 			m_freem(control);
538 			m_freem(m);
539 			error = EINVAL;
540 			break;
541 		}
542 		if ((so->so_oobmark == 0 &&
543 		    (so->so_state & SS_RCVATMARK) == 0) ||
544 		    so->so_options & SO_OOBINLINE ||
545 		    tp->t_oobflags & TCPOOB_HADDATA) {
546 			error = EINVAL;
547 			break;
548 		}
549 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
550 			error = EWOULDBLOCK;
551 			break;
552 		}
553 		m->m_len = 1;
554 		*mtod(m, char *) = tp->t_iobc;
555 		if (((long)nam & MSG_PEEK) == 0)
556 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
557 		break;
558 
559 	case PRU_SENDOOB:
560 		if (sbspace(&so->so_snd) < -512) {
561 			m_freem(m);
562 			error = ENOBUFS;
563 			break;
564 		}
565 		/*
566 		 * According to RFC961 (Assigned Protocols),
567 		 * the urgent pointer points to the last octet
568 		 * of urgent data.  We continue, however,
569 		 * to consider it to indicate the first octet
570 		 * of data past the urgent section.
571 		 * Otherwise, snd_up should be one lower.
572 		 */
573 		sbappendstream(&so->so_snd, m);
574 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
575 		tp->t_force = 1;
576 		error = tcp_output(tp);
577 		tp->t_force = 0;
578 		break;
579 
580 	case PRU_SOCKADDR:
581 #ifdef INET
582 		if (inp)
583 			in_setsockaddr(inp, nam);
584 #endif
585 #ifdef INET6
586 		if (in6p)
587 			in6_setsockaddr(in6p, nam);
588 #endif
589 		break;
590 
591 	case PRU_PEERADDR:
592 #ifdef INET
593 		if (inp)
594 			in_setpeeraddr(inp, nam);
595 #endif
596 #ifdef INET6
597 		if (in6p)
598 			in6_setpeeraddr(in6p, nam);
599 #endif
600 		break;
601 
602 	default:
603 		panic("tcp_usrreq");
604 	}
605 #ifdef TCP_DEBUG
606 	if (tp && (so->so_options & SO_DEBUG))
607 		tcp_trace(TA_USER, ostate, tp, NULL, req);
608 #endif
609 
610 release:
611 	splx(s);
612 	return (error);
613 }
614 
615 static void
616 change_keepalive(struct socket *so, struct tcpcb *tp)
617 {
618 	tp->t_maxidle = tp->t_keepcnt * tp->t_keepintvl;
619 	TCP_TIMER_DISARM(tp, TCPT_KEEP);
620 	TCP_TIMER_DISARM(tp, TCPT_2MSL);
621 
622 	if (tp->t_state == TCPS_SYN_RECEIVED ||
623 	    tp->t_state == TCPS_SYN_SENT) {
624 		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
625 	} else if (so->so_options & SO_KEEPALIVE &&
626 	    tp->t_state <= TCPS_CLOSE_WAIT) {
627 		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepintvl);
628 	} else {
629 		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
630 	}
631 
632 	if ((tp->t_state == TCPS_FIN_WAIT_2) && (tp->t_maxidle > 0))
633 		TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
634 }
635 
636 
637 int
638 tcp_ctloutput(int op, struct socket *so, struct sockopt *sopt)
639 {
640 	int error = 0, s;
641 	struct inpcb *inp;
642 #ifdef INET6
643 	struct in6pcb *in6p;
644 #endif
645 	struct tcpcb *tp;
646 	u_int ui;
647 	int family;	/* family of the socket */
648 	int level, optname, optval;
649 
650 	level = sopt->sopt_level;
651 	optname = sopt->sopt_name;
652 
653 	family = so->so_proto->pr_domain->dom_family;
654 
655 	s = splsoftnet();
656 	switch (family) {
657 #ifdef INET
658 	case PF_INET:
659 		inp = sotoinpcb(so);
660 #ifdef INET6
661 		in6p = NULL;
662 #endif
663 		break;
664 #endif
665 #ifdef INET6
666 	case PF_INET6:
667 		inp = NULL;
668 		in6p = sotoin6pcb(so);
669 		break;
670 #endif
671 	default:
672 		splx(s);
673 		panic("%s: af %d", __func__, family);
674 	}
675 #ifndef INET6
676 	if (inp == NULL)
677 #else
678 	if (inp == NULL && in6p == NULL)
679 #endif
680 	{
681 		splx(s);
682 		return (ECONNRESET);
683 	}
684 	if (level != IPPROTO_TCP) {
685 		switch (family) {
686 #ifdef INET
687 		case PF_INET:
688 			error = ip_ctloutput(op, so, sopt);
689 			break;
690 #endif
691 #ifdef INET6
692 		case PF_INET6:
693 			error = ip6_ctloutput(op, so, sopt);
694 			break;
695 #endif
696 		}
697 		splx(s);
698 		return (error);
699 	}
700 	if (inp)
701 		tp = intotcpcb(inp);
702 #ifdef INET6
703 	else if (in6p)
704 		tp = in6totcpcb(in6p);
705 #endif
706 	else
707 		tp = NULL;
708 
709 	switch (op) {
710 	case PRCO_SETOPT:
711 		switch (optname) {
712 #ifdef TCP_SIGNATURE
713 		case TCP_MD5SIG:
714 			error = sockopt_getint(sopt, &optval);
715 			if (error)
716 				break;
717 			if (optval > 0)
718 				tp->t_flags |= TF_SIGNATURE;
719 			else
720 				tp->t_flags &= ~TF_SIGNATURE;
721 			break;
722 #endif /* TCP_SIGNATURE */
723 
724 		case TCP_NODELAY:
725 			error = sockopt_getint(sopt, &optval);
726 			if (error)
727 				break;
728 			if (optval)
729 				tp->t_flags |= TF_NODELAY;
730 			else
731 				tp->t_flags &= ~TF_NODELAY;
732 			break;
733 
734 		case TCP_MAXSEG:
735 			error = sockopt_getint(sopt, &optval);
736 			if (error)
737 				break;
738 			if (optval > 0 && optval <= tp->t_peermss)
739 				tp->t_peermss = optval; /* limit on send size */
740 			else
741 				error = EINVAL;
742 			break;
743 #ifdef notyet
744 		case TCP_CONGCTL:
745 			/* XXX string overflow XXX */
746 			error = tcp_congctl_select(tp, sopt->sopt_data);
747 			break;
748 #endif
749 
750 		case TCP_KEEPIDLE:
751 			error = sockopt_get(sopt, &ui, sizeof(ui));
752 			if (error)
753 				break;
754 			if (ui > 0) {
755 				tp->t_keepidle = ui;
756 				change_keepalive(so, tp);
757 			} else
758 				error = EINVAL;
759 			break;
760 
761 		case TCP_KEEPINTVL:
762 			error = sockopt_get(sopt, &ui, sizeof(ui));
763 			if (error)
764 				break;
765 			if (ui > 0) {
766 				tp->t_keepintvl = ui;
767 				change_keepalive(so, tp);
768 			} else
769 				error = EINVAL;
770 			break;
771 
772 		case TCP_KEEPCNT:
773 			error = sockopt_get(sopt, &ui, sizeof(ui));
774 			if (error)
775 				break;
776 			if (ui > 0) {
777 				tp->t_keepcnt = ui;
778 				change_keepalive(so, tp);
779 			} else
780 				error = EINVAL;
781 			break;
782 
783 		case TCP_KEEPINIT:
784 			error = sockopt_get(sopt, &ui, sizeof(ui));
785 			if (error)
786 				break;
787 			if (ui > 0) {
788 				tp->t_keepinit = ui;
789 				change_keepalive(so, tp);
790 			} else
791 				error = EINVAL;
792 			break;
793 
794 		default:
795 			error = ENOPROTOOPT;
796 			break;
797 		}
798 		break;
799 
800 	case PRCO_GETOPT:
801 		switch (optname) {
802 #ifdef TCP_SIGNATURE
803 		case TCP_MD5SIG:
804 			optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
805 			error = sockopt_set(sopt, &optval, sizeof(optval));
806 			break;
807 #endif
808 		case TCP_NODELAY:
809 			optval = tp->t_flags & TF_NODELAY;
810 			error = sockopt_set(sopt, &optval, sizeof(optval));
811 			break;
812 		case TCP_MAXSEG:
813 			optval = tp->t_peermss;
814 			error = sockopt_set(sopt, &optval, sizeof(optval));
815 			break;
816 #ifdef notyet
817 		case TCP_CONGCTL:
818 			break;
819 #endif
820 		default:
821 			error = ENOPROTOOPT;
822 			break;
823 		}
824 		break;
825 	}
826 	splx(s);
827 	return (error);
828 }
829 
/*
 * Default send and receive buffer reservations, in bytes, passed to
 * soreserve() by tcp_attach().  The defaults are parenthesized so the
 * macros expand correctly inside larger expressions.
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024 * 32)
#endif
int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024 * 32)
#endif
int	tcp_recvspace = TCP_RECVSPACE;
838 
839 /*
840  * Attach TCP protocol to socket, allocating
841  * internet protocol control block, tcp control block,
842  * bufer space, and entering LISTEN state if to accept connections.
843  */
844 int
845 tcp_attach(struct socket *so)
846 {
847 	struct tcpcb *tp;
848 	struct inpcb *inp;
849 #ifdef INET6
850 	struct in6pcb *in6p;
851 #endif
852 	int error;
853 	int family;	/* family of the socket */
854 
855 	family = so->so_proto->pr_domain->dom_family;
856 
857 #ifdef MBUFTRACE
858 	so->so_mowner = &tcp_sock_mowner;
859 	so->so_rcv.sb_mowner = &tcp_sock_rx_mowner;
860 	so->so_snd.sb_mowner = &tcp_sock_tx_mowner;
861 #endif
862 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
863 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
864 		if (error)
865 			return (error);
866 	}
867 
868 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
869 	so->so_snd.sb_flags |= SB_AUTOSIZE;
870 
871 	switch (family) {
872 #ifdef INET
873 	case PF_INET:
874 		error = in_pcballoc(so, &tcbtable);
875 		if (error)
876 			return (error);
877 		inp = sotoinpcb(so);
878 #ifdef INET6
879 		in6p = NULL;
880 #endif
881 		break;
882 #endif
883 #ifdef INET6
884 	case PF_INET6:
885 		error = in6_pcballoc(so, &tcbtable);
886 		if (error)
887 			return (error);
888 		inp = NULL;
889 		in6p = sotoin6pcb(so);
890 		break;
891 #endif
892 	default:
893 		return EAFNOSUPPORT;
894 	}
895 	if (inp)
896 		tp = tcp_newtcpcb(family, (void *)inp);
897 #ifdef INET6
898 	else if (in6p)
899 		tp = tcp_newtcpcb(family, (void *)in6p);
900 #endif
901 	else
902 		tp = NULL;
903 
904 	if (tp == 0) {
905 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
906 
907 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
908 #ifdef INET
909 		if (inp)
910 			in_pcbdetach(inp);
911 #endif
912 #ifdef INET6
913 		if (in6p)
914 			in6_pcbdetach(in6p);
915 #endif
916 		so->so_state |= nofd;
917 		return (ENOBUFS);
918 	}
919 	tp->t_state = TCPS_CLOSED;
920 	return (0);
921 }
922 
923 /*
924  * Initiate (or continue) disconnect.
925  * If embryonic state, just send reset (once).
926  * If in ``let data drain'' option and linger null, just drop.
927  * Otherwise (hard), mark socket disconnecting and drop
928  * current input data; switch states based on user close, and
929  * send segment to peer (with FIN).
930  */
931 struct tcpcb *
932 tcp_disconnect(struct tcpcb *tp)
933 {
934 	struct socket *so;
935 
936 	if (tp->t_inpcb)
937 		so = tp->t_inpcb->inp_socket;
938 #ifdef INET6
939 	else if (tp->t_in6pcb)
940 		so = tp->t_in6pcb->in6p_socket;
941 #endif
942 	else
943 		so = NULL;
944 
945 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
946 		tp = tcp_close(tp);
947 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
948 		tp = tcp_drop(tp, 0);
949 	else {
950 		soisdisconnecting(so);
951 		sbflush(&so->so_rcv);
952 		tp = tcp_usrclosed(tp);
953 		if (tp)
954 			(void) tcp_output(tp);
955 	}
956 	return (tp);
957 }
958 
959 /*
960  * User issued close, and wish to trail through shutdown states:
961  * if never received SYN, just forget it.  If got a SYN from peer,
962  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
963  * If already got a FIN from peer, then almost done; go to LAST_ACK
964  * state.  In all other cases, have already sent FIN to peer (e.g.
965  * after PRU_SHUTDOWN), and just have to play tedious game waiting
966  * for peer to send FIN or not respond to keep-alives, etc.
967  * We can let the user exit from the close as soon as the FIN is acked.
968  */
969 struct tcpcb *
970 tcp_usrclosed(struct tcpcb *tp)
971 {
972 
973 	switch (tp->t_state) {
974 
975 	case TCPS_CLOSED:
976 	case TCPS_LISTEN:
977 	case TCPS_SYN_SENT:
978 		tp->t_state = TCPS_CLOSED;
979 		tp = tcp_close(tp);
980 		break;
981 
982 	case TCPS_SYN_RECEIVED:
983 	case TCPS_ESTABLISHED:
984 		tp->t_state = TCPS_FIN_WAIT_1;
985 		break;
986 
987 	case TCPS_CLOSE_WAIT:
988 		tp->t_state = TCPS_LAST_ACK;
989 		break;
990 	}
991 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
992 		struct socket *so;
993 		if (tp->t_inpcb)
994 			so = tp->t_inpcb->inp_socket;
995 #ifdef INET6
996 		else if (tp->t_in6pcb)
997 			so = tp->t_in6pcb->in6p_socket;
998 #endif
999 		else
1000 			so = NULL;
1001 		if (so)
1002 			soisdisconnected(so);
1003 		/*
1004 		 * If we are in FIN_WAIT_2, we arrived here because the
1005 		 * application did a shutdown of the send side.  Like the
1006 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
1007 		 * a full close, we start a timer to make sure sockets are
1008 		 * not left in FIN_WAIT_2 forever.
1009 		 */
1010 		if ((tp->t_state == TCPS_FIN_WAIT_2) && (tp->t_maxidle > 0))
1011 			TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
1012 		else if (tp->t_state == TCPS_TIME_WAIT
1013 			 && ((tp->t_inpcb
1014 			      && (tcp4_vtw_enable & 1)
1015 			      && vtw_add(AF_INET, tp))
1016 			     ||
1017 			     (tp->t_in6pcb
1018 			      && (tcp6_vtw_enable & 1)
1019 			      && vtw_add(AF_INET6, tp)))) {
1020 			tp = 0;
1021 		}
1022 	}
1023 	return (tp);
1024 }
1025 
1026 /*
1027  * sysctl helper routine for net.inet.ip.mssdflt.  it can't be less
1028  * than 32.
1029  */
1030 static int
1031 sysctl_net_inet_tcp_mssdflt(SYSCTLFN_ARGS)
1032 {
1033 	int error, mssdflt;
1034 	struct sysctlnode node;
1035 
1036 	mssdflt = tcp_mssdflt;
1037 	node = *rnode;
1038 	node.sysctl_data = &mssdflt;
1039 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1040 	if (error || newp == NULL)
1041 		return (error);
1042 
1043 	if (mssdflt < 32)
1044 		return (EINVAL);
1045 	tcp_mssdflt = mssdflt;
1046 
1047 	return (0);
1048 }
1049 
1050 /*
1051  * sysctl helper routine for setting port related values under
1052  * net.inet.ip and net.inet6.ip6.  does basic range checking and does
1053  * additional checks for each type.  this code has placed in
1054  * tcp_input.c since INET and INET6 both use the same tcp code.
1055  *
1056  * this helper is not static so that both inet and inet6 can use it.
1057  */
1058 int
1059 sysctl_net_inet_ip_ports(SYSCTLFN_ARGS)
1060 {
1061 	int error, tmp;
1062 	int apmin, apmax;
1063 #ifndef IPNOPRIVPORTS
1064 	int lpmin, lpmax;
1065 #endif /* IPNOPRIVPORTS */
1066 	struct sysctlnode node;
1067 
1068 	if (namelen != 0)
1069 		return (EINVAL);
1070 
1071 	switch (name[-3]) {
1072 #ifdef INET
1073 	    case PF_INET:
1074 		apmin = anonportmin;
1075 		apmax = anonportmax;
1076 #ifndef IPNOPRIVPORTS
1077 		lpmin = lowportmin;
1078 		lpmax = lowportmax;
1079 #endif /* IPNOPRIVPORTS */
1080 		break;
1081 #endif /* INET */
1082 #ifdef INET6
1083 	    case PF_INET6:
1084 		apmin = ip6_anonportmin;
1085 		apmax = ip6_anonportmax;
1086 #ifndef IPNOPRIVPORTS
1087 		lpmin = ip6_lowportmin;
1088 		lpmax = ip6_lowportmax;
1089 #endif /* IPNOPRIVPORTS */
1090 		break;
1091 #endif /* INET6 */
1092 	    default:
1093 		return (EINVAL);
1094 	}
1095 
1096 	/*
1097 	 * insert temporary copy into node, perform lookup on
1098 	 * temporary, then restore pointer
1099 	 */
1100 	node = *rnode;
1101 	tmp = *(int*)rnode->sysctl_data;
1102 	node.sysctl_data = &tmp;
1103 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1104 	if (error || newp == NULL)
1105 		return (error);
1106 
1107 	/*
1108 	 * simple port range check
1109 	 */
1110 	if (tmp < 0 || tmp > 65535)
1111 		return (EINVAL);
1112 
1113 	/*
1114 	 * per-node range checks
1115 	 */
1116 	switch (rnode->sysctl_num) {
1117 	case IPCTL_ANONPORTMIN:
1118 	case IPV6CTL_ANONPORTMIN:
1119 		if (tmp >= apmax)
1120 			return (EINVAL);
1121 #ifndef IPNOPRIVPORTS
1122 		if (tmp < IPPORT_RESERVED)
1123                         return (EINVAL);
1124 #endif /* IPNOPRIVPORTS */
1125 		break;
1126 
1127 	case IPCTL_ANONPORTMAX:
1128 	case IPV6CTL_ANONPORTMAX:
1129                 if (apmin >= tmp)
1130 			return (EINVAL);
1131 #ifndef IPNOPRIVPORTS
1132 		if (tmp < IPPORT_RESERVED)
1133                         return (EINVAL);
1134 #endif /* IPNOPRIVPORTS */
1135 		break;
1136 
1137 #ifndef IPNOPRIVPORTS
1138 	case IPCTL_LOWPORTMIN:
1139 	case IPV6CTL_LOWPORTMIN:
1140 		if (tmp >= lpmax ||
1141 		    tmp > IPPORT_RESERVEDMAX ||
1142 		    tmp < IPPORT_RESERVEDMIN)
1143 			return (EINVAL);
1144 		break;
1145 
1146 	case IPCTL_LOWPORTMAX:
1147 	case IPV6CTL_LOWPORTMAX:
1148 		if (lpmin >= tmp ||
1149 		    tmp > IPPORT_RESERVEDMAX ||
1150 		    tmp < IPPORT_RESERVEDMIN)
1151 			return (EINVAL);
1152 		break;
1153 #endif /* IPNOPRIVPORTS */
1154 
1155 	default:
1156 		return (EINVAL);
1157 	}
1158 
1159 	*(int*)rnode->sysctl_data = tmp;
1160 
1161 	return (0);
1162 }
1163 
1164 static inline int
1165 copyout_uid(struct socket *sockp, void *oldp, size_t *oldlenp)
1166 {
1167 	size_t sz;
1168 	int error;
1169 	uid_t uid;
1170 
1171 	uid = kauth_cred_geteuid(sockp->so_cred);
1172 	if (oldp) {
1173 		sz = MIN(sizeof(uid), *oldlenp);
1174 		error = copyout(&uid, oldp, sz);
1175 		if (error)
1176 			return error;
1177 	}
1178 	*oldlenp = sizeof(uid);
1179 	return 0;
1180 }
1181 
1182 static inline int
1183 inet4_ident_core(struct in_addr raddr, u_int rport,
1184     struct in_addr laddr, u_int lport,
1185     void *oldp, size_t *oldlenp,
1186     struct lwp *l, int dodrop)
1187 {
1188 	struct inpcb *inp;
1189 	struct socket *sockp;
1190 
1191 	inp = in_pcblookup_connect(&tcbtable, raddr, rport, laddr, lport, 0);
1192 
1193 	if (inp == NULL || (sockp = inp->inp_socket) == NULL)
1194 		return ESRCH;
1195 
1196 	if (dodrop) {
1197 		struct tcpcb *tp;
1198 		int error;
1199 
1200 		if (inp == NULL || (tp = intotcpcb(inp)) == NULL ||
1201 		    (inp->inp_socket->so_options & SO_ACCEPTCONN) != 0)
1202 			return ESRCH;
1203 
1204 		error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
1205 		    KAUTH_REQ_NETWORK_SOCKET_DROP, inp->inp_socket, tp, NULL);
1206 		if (error)
1207 			return (error);
1208 
1209 		(void)tcp_drop(tp, ECONNABORTED);
1210 		return 0;
1211 	}
1212 	else
1213 		return copyout_uid(sockp, oldp, oldlenp);
1214 }
1215 
1216 #ifdef INET6
1217 static inline int
1218 inet6_ident_core(struct in6_addr *raddr, u_int rport,
1219     struct in6_addr *laddr, u_int lport,
1220     void *oldp, size_t *oldlenp,
1221     struct lwp *l, int dodrop)
1222 {
1223 	struct in6pcb *in6p;
1224 	struct socket *sockp;
1225 
1226 	in6p = in6_pcblookup_connect(&tcbtable, raddr, rport, laddr, lport, 0, 0);
1227 
1228 	if (in6p == NULL || (sockp = in6p->in6p_socket) == NULL)
1229 		return ESRCH;
1230 
1231 	if (dodrop) {
1232 		struct tcpcb *tp;
1233 		int error;
1234 
1235 		if (in6p == NULL || (tp = in6totcpcb(in6p)) == NULL ||
1236 		    (in6p->in6p_socket->so_options & SO_ACCEPTCONN) != 0)
1237 			return ESRCH;
1238 
1239 		error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
1240 		    KAUTH_REQ_NETWORK_SOCKET_DROP, in6p->in6p_socket, tp, NULL);
1241 		if (error)
1242 			return (error);
1243 
1244 		(void)tcp_drop(tp, ECONNABORTED);
1245 		return 0;
1246 	}
1247 	else
1248 		return copyout_uid(sockp, oldp, oldlenp);
1249 }
1250 #endif
1251 
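/*
 * sysctl helper routine for the net.inet.tcp.drop and
 * net.inet6.tcp6.drop nodes.  This is only an alias: drop requests are
 * handled by sysctl_net_inet_tcp_ident(), which tells them apart from
 * ident lookups by the node number (TCPCTL_DROP).
 */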
1256 #define sysctl_net_inet_tcp_drop sysctl_net_inet_tcp_ident
1257 
1258 /*
1259  * sysctl helper routine for the net.inet.tcp.ident and
1260  * net.inet6.tcp6.ident nodes.  contains backwards compat code for the
1261  * old way of looking up the ident information for ipv4 which involves
1262  * stuffing the port/addr pairs into the mib lookup.
1263  */
1264 static int
1265 sysctl_net_inet_tcp_ident(SYSCTLFN_ARGS)
1266 {
1267 #ifdef INET
1268 	struct sockaddr_in *si4[2];
1269 #endif /* INET */
1270 #ifdef INET6
1271 	struct sockaddr_in6 *si6[2];
1272 #endif /* INET6 */
1273 	struct sockaddr_storage sa[2];
1274 	int error, pf, dodrop;
1275 
1276 	dodrop = name[-1] == TCPCTL_DROP;
1277 	if (dodrop) {
1278 		if (oldp != NULL || *oldlenp != 0)
1279 			return EINVAL;
1280 		if (newp == NULL)
1281 			return EPERM;
1282 		if (newlen < sizeof(sa))
1283 			return ENOMEM;
1284 	}
1285 	if (namelen != 4 && namelen != 0)
1286 		return EINVAL;
1287 	if (name[-2] != IPPROTO_TCP)
1288 		return EINVAL;
1289 	pf = name[-3];
1290 
1291 	/* old style lookup, ipv4 only */
1292 	if (namelen == 4) {
1293 #ifdef INET
1294 		struct in_addr laddr, raddr;
1295 		u_int lport, rport;
1296 
1297 		if (pf != PF_INET)
1298 			return EPROTONOSUPPORT;
1299 		raddr.s_addr = (uint32_t)name[0];
1300 		rport = (u_int)name[1];
1301 		laddr.s_addr = (uint32_t)name[2];
1302 		lport = (u_int)name[3];
1303 
1304 		mutex_enter(softnet_lock);
1305 		error = inet4_ident_core(raddr, rport, laddr, lport,
1306 		    oldp, oldlenp, l, dodrop);
1307 		mutex_exit(softnet_lock);
1308 		return error;
1309 #else /* INET */
1310 		return EINVAL;
1311 #endif /* INET */
1312 	}
1313 
1314 	if (newp == NULL || newlen != sizeof(sa))
1315 		return EINVAL;
1316 	error = copyin(newp, &sa, newlen);
1317 	if (error)
1318 		return error;
1319 
1320 	/*
1321 	 * requested families must match
1322 	 */
1323 	if (pf != sa[0].ss_family || sa[0].ss_family != sa[1].ss_family)
1324 		return EINVAL;
1325 
1326 	switch (pf) {
1327 #ifdef INET6
1328 	case PF_INET6:
1329 		si6[0] = (struct sockaddr_in6*)&sa[0];
1330 		si6[1] = (struct sockaddr_in6*)&sa[1];
1331 		if (si6[0]->sin6_len != sizeof(*si6[0]) ||
1332 		    si6[1]->sin6_len != sizeof(*si6[1]))
1333 			return EINVAL;
1334 
1335 		if (!IN6_IS_ADDR_V4MAPPED(&si6[0]->sin6_addr) &&
1336 		    !IN6_IS_ADDR_V4MAPPED(&si6[1]->sin6_addr)) {
1337 			error = sa6_embedscope(si6[0], ip6_use_defzone);
1338 			if (error)
1339 				return error;
1340 			error = sa6_embedscope(si6[1], ip6_use_defzone);
1341 			if (error)
1342 				return error;
1343 
1344 			mutex_enter(softnet_lock);
1345 			error = inet6_ident_core(&si6[0]->sin6_addr,
1346 			    si6[0]->sin6_port, &si6[1]->sin6_addr,
1347 			    si6[1]->sin6_port, oldp, oldlenp, l, dodrop);
1348 			mutex_exit(softnet_lock);
1349 			return error;
1350 		}
1351 
1352 		if (IN6_IS_ADDR_V4MAPPED(&si6[0]->sin6_addr) !=
1353 		    IN6_IS_ADDR_V4MAPPED(&si6[1]->sin6_addr))
1354 			return EINVAL;
1355 
1356 		in6_sin6_2_sin_in_sock((struct sockaddr *)&sa[0]);
1357 		in6_sin6_2_sin_in_sock((struct sockaddr *)&sa[1]);
1358 		/*FALLTHROUGH*/
1359 #endif /* INET6 */
1360 #ifdef INET
1361 	case PF_INET:
1362 		si4[0] = (struct sockaddr_in*)&sa[0];
1363 		si4[1] = (struct sockaddr_in*)&sa[1];
1364 		if (si4[0]->sin_len != sizeof(*si4[0]) ||
1365 		    si4[0]->sin_len != sizeof(*si4[1]))
1366 			return EINVAL;
1367 
1368 		mutex_enter(softnet_lock);
1369 		error = inet4_ident_core(si4[0]->sin_addr, si4[0]->sin_port,
1370 		    si4[1]->sin_addr, si4[1]->sin_port,
1371 		    oldp, oldlenp, l, dodrop);
1372 		mutex_exit(softnet_lock);
1373 		return error;
1374 #endif /* INET */
1375 	default:
1376 		return EPROTONOSUPPORT;
1377 	}
1378 }
1379 
1380 /*
1381  * sysctl helper for the inet and inet6 pcblists.  handles tcp/udp and
1382  * inet/inet6, as well as raw pcbs for each.  specifically not
1383  * declared static so that raw sockets and udp/udp6 can use it as
1384  * well.
1385  */
1386 int
1387 sysctl_inpcblist(SYSCTLFN_ARGS)
1388 {
1389 #ifdef INET
1390 	struct sockaddr_in *in;
1391 	const struct inpcb *inp;
1392 #endif
1393 #ifdef INET6
1394 	struct sockaddr_in6 *in6;
1395 	const struct in6pcb *in6p;
1396 #endif
1397 	/*
1398 	 * sysctl_data is const, but CIRCLEQ_FOREACH can't use a const
1399 	 * struct inpcbtable pointer, so we have to discard const.  :-/
1400 	 */
1401 	struct inpcbtable *pcbtbl = __UNCONST(rnode->sysctl_data);
1402 	const struct inpcb_hdr *inph;
1403 	struct tcpcb *tp;
1404 	struct kinfo_pcb pcb;
1405 	char *dp;
1406 	u_int op, arg;
1407 	size_t len, needed, elem_size, out_size;
1408 	int error, elem_count, pf, proto, pf2;
1409 
1410 	if (namelen != 4)
1411 		return (EINVAL);
1412 
1413 	if (oldp != NULL) {
1414 		    len = *oldlenp;
1415 		    elem_size = name[2];
1416 		    elem_count = name[3];
1417 		    if (elem_size != sizeof(pcb))
1418 			    return EINVAL;
1419 	} else {
1420 		    len = 0;
1421 		    elem_count = INT_MAX;
1422 		    elem_size = sizeof(pcb);
1423 	}
1424 	error = 0;
1425 	dp = oldp;
1426 	op = name[0];
1427 	arg = name[1];
1428 	out_size = elem_size;
1429 	needed = 0;
1430 
1431 	if (namelen == 1 && name[0] == CTL_QUERY)
1432 		return (sysctl_query(SYSCTLFN_CALL(rnode)));
1433 
1434 	if (name - oname != 4)
1435 		return (EINVAL);
1436 
1437 	pf = oname[1];
1438 	proto = oname[2];
1439 	pf2 = (oldp != NULL) ? pf : 0;
1440 
1441 	mutex_enter(softnet_lock);
1442 
1443 	CIRCLEQ_FOREACH(inph, &pcbtbl->inpt_queue, inph_queue) {
1444 #ifdef INET
1445 		inp = (const struct inpcb *)inph;
1446 #endif
1447 #ifdef INET6
1448 		in6p = (const struct in6pcb *)inph;
1449 #endif
1450 
1451 		if (inph->inph_af != pf)
1452 			continue;
1453 
1454 		if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
1455 		    KAUTH_REQ_NETWORK_SOCKET_CANSEE, inph->inph_socket, NULL,
1456 		    NULL) != 0)
1457 			continue;
1458 
1459 		memset(&pcb, 0, sizeof(pcb));
1460 
1461 		pcb.ki_family = pf;
1462 		pcb.ki_type = proto;
1463 
1464 		switch (pf2) {
1465 		case 0:
1466 			/* just probing for size */
1467 			break;
1468 #ifdef INET
1469 		case PF_INET:
1470 			pcb.ki_family = inp->inp_socket->so_proto->
1471 			    pr_domain->dom_family;
1472 			pcb.ki_type = inp->inp_socket->so_proto->
1473 			    pr_type;
1474 			pcb.ki_protocol = inp->inp_socket->so_proto->
1475 			    pr_protocol;
1476 			pcb.ki_pflags = inp->inp_flags;
1477 
1478 			pcb.ki_sostate = inp->inp_socket->so_state;
1479 			pcb.ki_prstate = inp->inp_state;
1480 			if (proto == IPPROTO_TCP) {
1481 				tp = intotcpcb(inp);
1482 				pcb.ki_tstate = tp->t_state;
1483 				pcb.ki_tflags = tp->t_flags;
1484 			}
1485 
1486 			pcb.ki_pcbaddr = PTRTOUINT64(inp);
1487 			pcb.ki_ppcbaddr = PTRTOUINT64(inp->inp_ppcb);
1488 			pcb.ki_sockaddr = PTRTOUINT64(inp->inp_socket);
1489 
1490 			pcb.ki_rcvq = inp->inp_socket->so_rcv.sb_cc;
1491 			pcb.ki_sndq = inp->inp_socket->so_snd.sb_cc;
1492 
1493 			in = satosin(&pcb.ki_src);
1494 			in->sin_len = sizeof(*in);
1495 			in->sin_family = pf;
1496 			in->sin_port = inp->inp_lport;
1497 			in->sin_addr = inp->inp_laddr;
1498 			if (pcb.ki_prstate >= INP_CONNECTED) {
1499 				in = satosin(&pcb.ki_dst);
1500 				in->sin_len = sizeof(*in);
1501 				in->sin_family = pf;
1502 				in->sin_port = inp->inp_fport;
1503 				in->sin_addr = inp->inp_faddr;
1504 			}
1505 			break;
1506 #endif
1507 #ifdef INET6
1508 		case PF_INET6:
1509 			pcb.ki_family = in6p->in6p_socket->so_proto->
1510 			    pr_domain->dom_family;
1511 			pcb.ki_type = in6p->in6p_socket->so_proto->pr_type;
1512 			pcb.ki_protocol = in6p->in6p_socket->so_proto->
1513 			    pr_protocol;
1514 			pcb.ki_pflags = in6p->in6p_flags;
1515 
1516 			pcb.ki_sostate = in6p->in6p_socket->so_state;
1517 			pcb.ki_prstate = in6p->in6p_state;
1518 			if (proto == IPPROTO_TCP) {
1519 				tp = in6totcpcb(in6p);
1520 				pcb.ki_tstate = tp->t_state;
1521 				pcb.ki_tflags = tp->t_flags;
1522 			}
1523 
1524 			pcb.ki_pcbaddr = PTRTOUINT64(in6p);
1525 			pcb.ki_ppcbaddr = PTRTOUINT64(in6p->in6p_ppcb);
1526 			pcb.ki_sockaddr = PTRTOUINT64(in6p->in6p_socket);
1527 
1528 			pcb.ki_rcvq = in6p->in6p_socket->so_rcv.sb_cc;
1529 			pcb.ki_sndq = in6p->in6p_socket->so_snd.sb_cc;
1530 
1531 			in6 = satosin6(&pcb.ki_src);
1532 			in6->sin6_len = sizeof(*in6);
1533 			in6->sin6_family = pf;
1534 			in6->sin6_port = in6p->in6p_lport;
1535 			in6->sin6_flowinfo = in6p->in6p_flowinfo;
1536 			in6->sin6_addr = in6p->in6p_laddr;
1537 			in6->sin6_scope_id = 0; /* XXX? */
1538 
1539 			if (pcb.ki_prstate >= IN6P_CONNECTED) {
1540 				in6 = satosin6(&pcb.ki_dst);
1541 				in6->sin6_len = sizeof(*in6);
1542 				in6->sin6_family = pf;
1543 				in6->sin6_port = in6p->in6p_fport;
1544 				in6->sin6_flowinfo = in6p->in6p_flowinfo;
1545 				in6->sin6_addr = in6p->in6p_faddr;
1546 				in6->sin6_scope_id = 0; /* XXX? */
1547 			}
1548 			break;
1549 #endif
1550 		}
1551 
1552 		if (len >= elem_size && elem_count > 0) {
1553 			error = copyout(&pcb, dp, out_size);
1554 			if (error) {
1555 				mutex_exit(softnet_lock);
1556 				return (error);
1557 			}
1558 			dp += elem_size;
1559 			len -= elem_size;
1560 		}
1561 		needed += elem_size;
1562 		if (elem_count > 0 && elem_count != INT_MAX)
1563 			elem_count--;
1564 	}
1565 
1566 	*oldlenp = needed;
1567 	if (oldp == NULL)
1568 		*oldlenp += PCB_SLOP * sizeof(struct kinfo_pcb);
1569 
1570 	mutex_exit(softnet_lock);
1571 
1572 	return (error);
1573 }
1574 
1575 static int
1576 sysctl_tcp_congctl(SYSCTLFN_ARGS)
1577 {
1578 	struct sysctlnode node;
1579 	int error;
1580 	char newname[TCPCC_MAXLEN];
1581 
1582 	strlcpy(newname, tcp_congctl_global_name, sizeof(newname) - 1);
1583 
1584 	node = *rnode;
1585 	node.sysctl_data = newname;
1586 	node.sysctl_size = sizeof(newname);
1587 
1588 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1589 
1590 	if (error ||
1591 	    newp == NULL ||
1592 	    strncmp(newname, tcp_congctl_global_name, sizeof(newname)) == 0)
1593 		return error;
1594 
1595 	mutex_enter(softnet_lock);
1596 	error = tcp_congctl_select(NULL, newname);
1597 	mutex_exit(softnet_lock);
1598 
1599 	return error;
1600 }
1601 
1602 static int
1603 sysctl_tcp_keep(SYSCTLFN_ARGS)
1604 {
1605 	int error;
1606 	u_int tmp;
1607 	struct sysctlnode node;
1608 
1609 	node = *rnode;
1610 	tmp = *(u_int *)rnode->sysctl_data;
1611 	node.sysctl_data = &tmp;
1612 
1613 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1614 	if (error || newp == NULL)
1615 		return error;
1616 
1617 	mutex_enter(softnet_lock);
1618 
1619 	*(u_int *)rnode->sysctl_data = tmp;
1620 	tcp_tcpcb_template();	/* update the template */
1621 
1622 	mutex_exit(softnet_lock);
1623 	return 0;
1624 }
1625 
1626 static int
1627 sysctl_net_inet_tcp_stats(SYSCTLFN_ARGS)
1628 {
1629 
1630 	return (NETSTAT_SYSCTL(tcpstat_percpu, TCP_NSTATS));
1631 }
1632 
1633 /*
1634  * this (second stage) setup routine is a replacement for tcp_sysctl()
1635  * (which is currently used for ipv4 and ipv6)
1636  */
1637 static void
1638 sysctl_net_inet_tcp_setup2(struct sysctllog **clog, int pf, const char *pfname,
1639 			   const char *tcpname)
1640 {
1641 	const struct sysctlnode *sack_node;
1642 	const struct sysctlnode *abc_node;
1643 	const struct sysctlnode *ecn_node;
1644 	const struct sysctlnode *congctl_node;
1645 	const struct sysctlnode *mslt_node;
1646 	const struct sysctlnode *vtw_node;
1647 #ifdef TCP_DEBUG
1648 	extern struct tcp_debug tcp_debug[TCP_NDEBUG];
1649 	extern int tcp_debx;
1650 #endif
1651 
1652 	sysctl_createv(clog, 0, NULL, NULL,
1653 		       CTLFLAG_PERMANENT,
1654 		       CTLTYPE_NODE, "net", NULL,
1655 		       NULL, 0, NULL, 0,
1656 		       CTL_NET, CTL_EOL);
1657 	sysctl_createv(clog, 0, NULL, NULL,
1658 		       CTLFLAG_PERMANENT,
1659 		       CTLTYPE_NODE, pfname, NULL,
1660 		       NULL, 0, NULL, 0,
1661 		       CTL_NET, pf, CTL_EOL);
1662 	sysctl_createv(clog, 0, NULL, NULL,
1663 		       CTLFLAG_PERMANENT,
1664 		       CTLTYPE_NODE, tcpname,
1665 		       SYSCTL_DESCR("TCP related settings"),
1666 		       NULL, 0, NULL, 0,
1667 		       CTL_NET, pf, IPPROTO_TCP, CTL_EOL);
1668 
1669 	sysctl_createv(clog, 0, NULL, NULL,
1670 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1671 		       CTLTYPE_INT, "rfc1323",
1672 		       SYSCTL_DESCR("Enable RFC1323 TCP extensions"),
1673 		       NULL, 0, &tcp_do_rfc1323, 0,
1674 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_RFC1323, CTL_EOL);
1675 	sysctl_createv(clog, 0, NULL, NULL,
1676 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1677 		       CTLTYPE_INT, "sendspace",
1678 		       SYSCTL_DESCR("Default TCP send buffer size"),
1679 		       NULL, 0, &tcp_sendspace, 0,
1680 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SENDSPACE, CTL_EOL);
1681 	sysctl_createv(clog, 0, NULL, NULL,
1682 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1683 		       CTLTYPE_INT, "recvspace",
1684 		       SYSCTL_DESCR("Default TCP receive buffer size"),
1685 		       NULL, 0, &tcp_recvspace, 0,
1686 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_RECVSPACE, CTL_EOL);
1687 	sysctl_createv(clog, 0, NULL, NULL,
1688 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1689 		       CTLTYPE_INT, "mssdflt",
1690 		       SYSCTL_DESCR("Default maximum segment size"),
1691 		       sysctl_net_inet_tcp_mssdflt, 0, &tcp_mssdflt, 0,
1692 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSSDFLT, CTL_EOL);
1693 	sysctl_createv(clog, 0, NULL, NULL,
1694 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1695 		       CTLTYPE_INT, "minmss",
1696 		       SYSCTL_DESCR("Lower limit for TCP maximum segment size"),
1697 		       NULL, 0, &tcp_minmss, 0,
1698 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1699 	sysctl_createv(clog, 0, NULL, NULL,
1700 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1701 		       CTLTYPE_INT, "msl",
1702 		       SYSCTL_DESCR("Maximum Segment Life"),
1703 		       NULL, 0, &tcp_msl, 0,
1704 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSL, CTL_EOL);
1705 	sysctl_createv(clog, 0, NULL, NULL,
1706 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1707 		       CTLTYPE_INT, "syn_cache_limit",
1708 		       SYSCTL_DESCR("Maximum number of entries in the TCP "
1709 				    "compressed state engine"),
1710 		       NULL, 0, &tcp_syn_cache_limit, 0,
1711 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_CACHE_LIMIT,
1712 		       CTL_EOL);
1713 	sysctl_createv(clog, 0, NULL, NULL,
1714 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1715 		       CTLTYPE_INT, "syn_bucket_limit",
1716 		       SYSCTL_DESCR("Maximum number of entries per hash "
1717 				    "bucket in the TCP compressed state "
1718 				    "engine"),
1719 		       NULL, 0, &tcp_syn_bucket_limit, 0,
1720 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_BUCKET_LIMIT,
1721 		       CTL_EOL);
1722 #if 0 /* obsoleted */
1723 	sysctl_createv(clog, 0, NULL, NULL,
1724 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1725 		       CTLTYPE_INT, "syn_cache_interval",
1726 		       SYSCTL_DESCR("TCP compressed state engine's timer interval"),
1727 		       NULL, 0, &tcp_syn_cache_interval, 0,
1728 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_CACHE_INTER,
1729 		       CTL_EOL);
1730 #endif
1731 	sysctl_createv(clog, 0, NULL, NULL,
1732 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1733 		       CTLTYPE_INT, "init_win",
1734 		       SYSCTL_DESCR("Initial TCP congestion window"),
1735 		       NULL, 0, &tcp_init_win, 0,
1736 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_INIT_WIN, CTL_EOL);
1737 	sysctl_createv(clog, 0, NULL, NULL,
1738 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1739 		       CTLTYPE_INT, "mss_ifmtu",
1740 		       SYSCTL_DESCR("Use interface MTU for calculating MSS"),
1741 		       NULL, 0, &tcp_mss_ifmtu, 0,
1742 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSS_IFMTU, CTL_EOL);
1743 	sysctl_createv(clog, 0, NULL, &sack_node,
1744 		       CTLFLAG_PERMANENT,
1745 		       CTLTYPE_NODE, "sack",
1746 		       SYSCTL_DESCR("RFC2018 Selective ACKnowledgement tunables"),
1747 		       NULL, 0, NULL, 0,
1748 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_EOL);
1749 
1750 	/* Congctl subtree */
1751 	sysctl_createv(clog, 0, NULL, &congctl_node,
1752 		       CTLFLAG_PERMANENT,
1753 		       CTLTYPE_NODE, "congctl",
1754 		       SYSCTL_DESCR("TCP Congestion Control"),
1755 	    	       NULL, 0, NULL, 0,
1756 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1757 	sysctl_createv(clog, 0, &congctl_node, NULL,
1758 		       CTLFLAG_PERMANENT,
1759 		       CTLTYPE_STRING, "available",
1760 		       SYSCTL_DESCR("Available Congestion Control Mechanisms"),
1761 		       NULL, 0, tcp_congctl_avail, 0, CTL_CREATE, CTL_EOL);
1762 	sysctl_createv(clog, 0, &congctl_node, NULL,
1763 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1764 		       CTLTYPE_STRING, "selected",
1765 		       SYSCTL_DESCR("Selected Congestion Control Mechanism"),
1766 		       sysctl_tcp_congctl, 0, NULL, TCPCC_MAXLEN,
1767 		       CTL_CREATE, CTL_EOL);
1768 
1769 	sysctl_createv(clog, 0, NULL, NULL,
1770 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1771 		       CTLTYPE_INT, "win_scale",
1772 		       SYSCTL_DESCR("Use RFC1323 window scale options"),
1773 		       NULL, 0, &tcp_do_win_scale, 0,
1774 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_WSCALE, CTL_EOL);
1775 	sysctl_createv(clog, 0, NULL, NULL,
1776 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1777 		       CTLTYPE_INT, "timestamps",
1778 		       SYSCTL_DESCR("Use RFC1323 time stamp options"),
1779 		       NULL, 0, &tcp_do_timestamps, 0,
1780 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_TSTAMP, CTL_EOL);
1781 	sysctl_createv(clog, 0, NULL, NULL,
1782 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1783 		       CTLTYPE_INT, "compat_42",
1784 		       SYSCTL_DESCR("Enable workarounds for 4.2BSD TCP bugs"),
1785 		       NULL, 0, &tcp_compat_42, 0,
1786 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_COMPAT_42, CTL_EOL);
1787 	sysctl_createv(clog, 0, NULL, NULL,
1788 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1789 		       CTLTYPE_INT, "cwm",
1790 		       SYSCTL_DESCR("Hughes/Touch/Heidemann Congestion Window "
1791 				    "Monitoring"),
1792 		       NULL, 0, &tcp_cwm, 0,
1793 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_CWM, CTL_EOL);
1794 	sysctl_createv(clog, 0, NULL, NULL,
1795 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1796 		       CTLTYPE_INT, "cwm_burstsize",
1797 		       SYSCTL_DESCR("Congestion Window Monitoring allowed "
1798 				    "burst count in packets"),
1799 		       NULL, 0, &tcp_cwm_burstsize, 0,
1800 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_CWM_BURSTSIZE,
1801 		       CTL_EOL);
1802 	sysctl_createv(clog, 0, NULL, NULL,
1803 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1804 		       CTLTYPE_INT, "ack_on_push",
1805 		       SYSCTL_DESCR("Immediately return ACK when PSH is "
1806 				    "received"),
1807 		       NULL, 0, &tcp_ack_on_push, 0,
1808 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_ACK_ON_PUSH, CTL_EOL);
1809 	sysctl_createv(clog, 0, NULL, NULL,
1810 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1811 		       CTLTYPE_INT, "keepidle",
1812 		       SYSCTL_DESCR("Allowed connection idle ticks before a "
1813 				    "keepalive probe is sent"),
1814 		       sysctl_tcp_keep, 0, &tcp_keepidle, 0,
1815 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPIDLE, CTL_EOL);
1816 	sysctl_createv(clog, 0, NULL, NULL,
1817 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1818 		       CTLTYPE_INT, "keepintvl",
1819 		       SYSCTL_DESCR("Ticks before next keepalive probe is sent"),
1820 		       sysctl_tcp_keep, 0, &tcp_keepintvl, 0,
1821 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPINTVL, CTL_EOL);
1822 	sysctl_createv(clog, 0, NULL, NULL,
1823 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1824 		       CTLTYPE_INT, "keepcnt",
1825 		       SYSCTL_DESCR("Number of keepalive probes to send"),
1826 		       sysctl_tcp_keep, 0, &tcp_keepcnt, 0,
1827 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPCNT, CTL_EOL);
1828 	sysctl_createv(clog, 0, NULL, NULL,
1829 		       CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
1830 		       CTLTYPE_INT, "slowhz",
1831 		       SYSCTL_DESCR("Keepalive ticks per second"),
1832 		       NULL, PR_SLOWHZ, NULL, 0,
1833 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SLOWHZ, CTL_EOL);
1834 	sysctl_createv(clog, 0, NULL, NULL,
1835 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1836 		       CTLTYPE_INT, "log_refused",
1837 		       SYSCTL_DESCR("Log refused TCP connections"),
1838 		       NULL, 0, &tcp_log_refused, 0,
1839 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_LOG_REFUSED, CTL_EOL);
1840 #if 0 /* obsoleted */
1841 	sysctl_createv(clog, 0, NULL, NULL,
1842 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1843 		       CTLTYPE_INT, "rstratelimit", NULL,
1844 		       NULL, 0, &tcp_rst_ratelim, 0,
1845 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_RSTRATELIMIT, CTL_EOL);
1846 #endif
1847 	sysctl_createv(clog, 0, NULL, NULL,
1848 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1849 		       CTLTYPE_INT, "rstppslimit",
1850 		       SYSCTL_DESCR("Maximum number of RST packets to send "
1851 				    "per second"),
1852 		       NULL, 0, &tcp_rst_ppslim, 0,
1853 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_RSTPPSLIMIT, CTL_EOL);
1854 	sysctl_createv(clog, 0, NULL, NULL,
1855 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1856 		       CTLTYPE_INT, "delack_ticks",
1857 		       SYSCTL_DESCR("Number of ticks to delay sending an ACK"),
1858 		       NULL, 0, &tcp_delack_ticks, 0,
1859 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_DELACK_TICKS, CTL_EOL);
1860 	sysctl_createv(clog, 0, NULL, NULL,
1861 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1862 		       CTLTYPE_INT, "init_win_local",
1863 		       SYSCTL_DESCR("Initial TCP window size (in segments)"),
1864 		       NULL, 0, &tcp_init_win_local, 0,
1865 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_INIT_WIN_LOCAL,
1866 		       CTL_EOL);
1867 	sysctl_createv(clog, 0, NULL, NULL,
1868 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1869 		       CTLTYPE_STRUCT, "ident",
1870 		       SYSCTL_DESCR("RFC1413 Identification Protocol lookups"),
1871 		       sysctl_net_inet_tcp_ident, 0, NULL, sizeof(uid_t),
1872 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_IDENT, CTL_EOL);
1873 	sysctl_createv(clog, 0, NULL, NULL,
1874 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1875 		       CTLTYPE_INT, "do_loopback_cksum",
1876 		       SYSCTL_DESCR("Perform TCP checksum on loopback"),
1877 		       NULL, 0, &tcp_do_loopback_cksum, 0,
1878 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_LOOPBACKCKSUM,
1879 		       CTL_EOL);
1880 	sysctl_createv(clog, 0, NULL, NULL,
1881 		       CTLFLAG_PERMANENT,
1882 		       CTLTYPE_STRUCT, "pcblist",
1883 		       SYSCTL_DESCR("TCP protocol control block list"),
1884 		       sysctl_inpcblist, 0, &tcbtable, 0,
1885 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE,
1886 		       CTL_EOL);
1887 	sysctl_createv(clog, 0, NULL, NULL,
1888 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1889 		       CTLTYPE_INT, "keepinit",
1890 		       SYSCTL_DESCR("Ticks before initial tcp connection times out"),
1891 		       sysctl_tcp_keep, 0, &tcp_keepinit, 0,
1892 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1893 
1894 	/* TCP socket buffers auto-sizing nodes */
1895 	sysctl_createv(clog, 0, NULL, NULL,
1896 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1897 		       CTLTYPE_INT, "recvbuf_auto",
1898 		       SYSCTL_DESCR("Enable automatic receive "
1899 		           "buffer sizing (experimental)"),
1900 		       NULL, 0, &tcp_do_autorcvbuf, 0,
1901 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1902 	sysctl_createv(clog, 0, NULL, NULL,
1903 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1904 		       CTLTYPE_INT, "recvbuf_inc",
1905 		       SYSCTL_DESCR("Incrementor step size of "
1906 		           "automatic receive buffer"),
1907 		       NULL, 0, &tcp_autorcvbuf_inc, 0,
1908 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1909 	sysctl_createv(clog, 0, NULL, NULL,
1910 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1911 		       CTLTYPE_INT, "recvbuf_max",
1912 		       SYSCTL_DESCR("Max size of automatic receive buffer"),
1913 		       NULL, 0, &tcp_autorcvbuf_max, 0,
1914 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1915 
1916 	sysctl_createv(clog, 0, NULL, NULL,
1917 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1918 		       CTLTYPE_INT, "sendbuf_auto",
1919 		       SYSCTL_DESCR("Enable automatic send "
1920 		           "buffer sizing (experimental)"),
1921 		       NULL, 0, &tcp_do_autosndbuf, 0,
1922 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1923 	sysctl_createv(clog, 0, NULL, NULL,
1924 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1925 		       CTLTYPE_INT, "sendbuf_inc",
1926 		       SYSCTL_DESCR("Incrementor step size of "
1927 		           "automatic send buffer"),
1928 		       NULL, 0, &tcp_autosndbuf_inc, 0,
1929 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1930 	sysctl_createv(clog, 0, NULL, NULL,
1931 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1932 		       CTLTYPE_INT, "sendbuf_max",
1933 		       SYSCTL_DESCR("Max size of automatic send buffer"),
1934 		       NULL, 0, &tcp_autosndbuf_max, 0,
1935 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1936 
1937 	/* ECN subtree */
1938 	sysctl_createv(clog, 0, NULL, &ecn_node,
1939 	    	       CTLFLAG_PERMANENT,
1940 		       CTLTYPE_NODE, "ecn",
1941 	    	       SYSCTL_DESCR("RFC3168 Explicit Congestion Notification"),
1942 	    	       NULL, 0, NULL, 0,
1943 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1944 	sysctl_createv(clog, 0, &ecn_node, NULL,
1945 	    	       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1946 		       CTLTYPE_INT, "enable",
1947 		       SYSCTL_DESCR("Enable TCP Explicit Congestion "
1948 			   "Notification"),
1949 	    	       NULL, 0, &tcp_do_ecn, 0, CTL_CREATE, CTL_EOL);
1950 	sysctl_createv(clog, 0, &ecn_node, NULL,
1951 	    	       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1952 		       CTLTYPE_INT, "maxretries",
1953 		       SYSCTL_DESCR("Number of times to retry ECN setup "
1954 			       "before disabling ECN on the connection"),
1955 	    	       NULL, 0, &tcp_ecn_maxretries, 0, CTL_CREATE, CTL_EOL);
1956 
	/* SACK gets its own little subtree. */
1958 	sysctl_createv(clog, 0, NULL, &sack_node,
1959 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1960 		       CTLTYPE_INT, "enable",
1961 		       SYSCTL_DESCR("Enable RFC2018 Selective ACKnowledgement"),
1962 		       NULL, 0, &tcp_do_sack, 0,
1963 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL);
1964 	sysctl_createv(clog, 0, NULL, &sack_node,
1965 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1966 		       CTLTYPE_INT, "maxholes",
1967 		       SYSCTL_DESCR("Maximum number of TCP SACK holes allowed per connection"),
1968 		       NULL, 0, &tcp_sack_tp_maxholes, 0,
1969 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL);
1970 	sysctl_createv(clog, 0, NULL, &sack_node,
1971 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1972 		       CTLTYPE_INT, "globalmaxholes",
1973 		       SYSCTL_DESCR("Global maximum number of TCP SACK holes"),
1974 		       NULL, 0, &tcp_sack_globalmaxholes, 0,
1975 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL);
1976 	sysctl_createv(clog, 0, NULL, &sack_node,
1977 		       CTLFLAG_PERMANENT,
1978 		       CTLTYPE_INT, "globalholes",
1979 		       SYSCTL_DESCR("Global number of TCP SACK holes"),
1980 		       NULL, 0, &tcp_sack_globalholes, 0,
1981 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL);
1982 
1983 	sysctl_createv(clog, 0, NULL, NULL,
1984 		       CTLFLAG_PERMANENT,
1985 		       CTLTYPE_STRUCT, "stats",
1986 		       SYSCTL_DESCR("TCP statistics"),
1987 		       sysctl_net_inet_tcp_stats, 0, NULL, 0,
1988 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_STATS,
1989 		       CTL_EOL);
1990         sysctl_createv(clog, 0, NULL, NULL,
1991                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1992                        CTLTYPE_INT, "local_by_rtt",
1993                        SYSCTL_DESCR("Use RTT estimator to decide which hosts "
1994 				    "are local"),
1995 		       NULL, 0, &tcp_rttlocal, 0,
1996 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
1997 #ifdef TCP_DEBUG
1998 	sysctl_createv(clog, 0, NULL, NULL,
1999 		       CTLFLAG_PERMANENT,
2000 		       CTLTYPE_STRUCT, "debug",
2001 		       SYSCTL_DESCR("TCP sockets debug information"),
2002 		       NULL, 0, &tcp_debug, sizeof(tcp_debug),
2003 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_DEBUG,
2004 		       CTL_EOL);
2005 	sysctl_createv(clog, 0, NULL, NULL,
2006 		       CTLFLAG_PERMANENT,
2007 		       CTLTYPE_INT, "debx",
2008 		       SYSCTL_DESCR("Number of TCP debug sockets messages"),
2009 		       NULL, 0, &tcp_debx, sizeof(tcp_debx),
2010 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_DEBX,
2011 		       CTL_EOL);
2012 #endif
2013 	sysctl_createv(clog, 0, NULL, NULL,
2014 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2015 		       CTLTYPE_STRUCT, "drop",
2016 		       SYSCTL_DESCR("TCP drop connection"),
2017 		       sysctl_net_inet_tcp_drop, 0, NULL, 0,
2018 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_DROP, CTL_EOL);
2019 	sysctl_createv(clog, 0, NULL, NULL,
2020 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2021 		       CTLTYPE_INT, "iss_hash",
2022 		       SYSCTL_DESCR("Enable RFC 1948 ISS by cryptographic "
2023 				    "hash computation"),
2024 		       NULL, 0, &tcp_do_rfc1948, sizeof(tcp_do_rfc1948),
2025 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE,
2026 		       CTL_EOL);
2027 
2028 	/* ABC subtree */
2029 
2030 	sysctl_createv(clog, 0, NULL, &abc_node,
2031 		       CTLFLAG_PERMANENT, CTLTYPE_NODE, "abc",
2032 		       SYSCTL_DESCR("RFC3465 Appropriate Byte Counting (ABC)"),
2033 		       NULL, 0, NULL, 0,
2034 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
2035 	sysctl_createv(clog, 0, &abc_node, NULL,
2036 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2037 		       CTLTYPE_INT, "enable",
2038 		       SYSCTL_DESCR("Enable RFC3465 Appropriate Byte Counting"),
2039 		       NULL, 0, &tcp_do_abc, 0, CTL_CREATE, CTL_EOL);
2040 	sysctl_createv(clog, 0, &abc_node, NULL,
2041 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2042 		       CTLTYPE_INT, "aggressive",
2043 		       SYSCTL_DESCR("1: L=2*SMSS 0: L=1*SMSS"),
2044 		       NULL, 0, &tcp_abc_aggressive, 0, CTL_CREATE, CTL_EOL);
2045 
2046 	/* MSL tuning subtree */
2047 
2048 	sysctl_createv(clog, 0, NULL, &mslt_node,
2049 		       CTLFLAG_PERMANENT, CTLTYPE_NODE, "mslt",
2050 		       SYSCTL_DESCR("MSL Tuning for TIME_WAIT truncation"),
2051 		       NULL, 0, NULL, 0,
2052 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
2053 	sysctl_createv(clog, 0, &mslt_node, NULL,
2054 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2055 		       CTLTYPE_INT, "enable",
2056 		       SYSCTL_DESCR("Enable TIME_WAIT truncation"),
2057 		       NULL, 0, &tcp_msl_enable, 0, CTL_CREATE, CTL_EOL);
2058 	sysctl_createv(clog, 0, &mslt_node, NULL,
2059 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2060 		       CTLTYPE_INT, "loopback",
2061 		       SYSCTL_DESCR("MSL value to use for loopback connections"),
2062 		       NULL, 0, &tcp_msl_loop, 0, CTL_CREATE, CTL_EOL);
2063 	sysctl_createv(clog, 0, &mslt_node, NULL,
2064 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2065 		       CTLTYPE_INT, "local",
2066 		       SYSCTL_DESCR("MSL value to use for local connections"),
2067 		       NULL, 0, &tcp_msl_local, 0, CTL_CREATE, CTL_EOL);
2068 	sysctl_createv(clog, 0, &mslt_node, NULL,
2069 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2070 		       CTLTYPE_INT, "remote",
2071 		       SYSCTL_DESCR("MSL value to use for remote connections"),
2072 		       NULL, 0, &tcp_msl_remote, 0, CTL_CREATE, CTL_EOL);
2073 	sysctl_createv(clog, 0, &mslt_node, NULL,
2074 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2075 		       CTLTYPE_INT, "remote_threshold",
2076 		       SYSCTL_DESCR("RTT estimate value to promote local to remote"),
2077 		       NULL, 0, &tcp_msl_remote_threshold, 0, CTL_CREATE, CTL_EOL);
2078 
2079 	/* vestigial TIME_WAIT tuning subtree */
2080 
2081 	sysctl_createv(clog, 0, NULL, &vtw_node,
2082 		       CTLFLAG_PERMANENT, CTLTYPE_NODE, "vtw",
2083 		       SYSCTL_DESCR("Tuning for Vestigial TIME_WAIT"),
2084 		       NULL, 0, NULL, 0,
2085 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
2086 	sysctl_createv(clog, 0, &vtw_node, NULL,
2087 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2088 		       CTLTYPE_INT, "enable",
2089 		       SYSCTL_DESCR("Enable Vestigial TIME_WAIT"),
2090 		       sysctl_tcp_vtw_enable, 0,
2091 	               (pf == AF_INET) ? &tcp4_vtw_enable : &tcp6_vtw_enable,
2092 		       0, CTL_CREATE, CTL_EOL);
2093 	sysctl_createv(clog, 0, &vtw_node, NULL,
2094 		       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2095 		       CTLTYPE_INT, "entries",
2096 		       SYSCTL_DESCR("Maximum number of vestigial TIME_WAIT entries"),
2097 		       NULL, 0, &tcp_vtw_entries, 0, CTL_CREATE, CTL_EOL);
2098 }
2099 
/*
 * tcp_usrreq_init:
 *
 *	Invoke sysctl_net_inet_tcp_setup2() once for each address family
 *	enabled at build time: PF_INET with the names "inet"/"tcp" when
 *	INET is defined, and PF_INET6 with the names "inet6"/"tcp6" when
 *	INET6 is defined.  Both calls pass NULL as the first argument.
 *	If neither option is defined this function does nothing.
 */
void
tcp_usrreq_init(void)
{
#if defined(INET)
	sysctl_net_inet_tcp_setup2(NULL, PF_INET, "inet", "tcp");
#endif /* INET */

#if defined(INET6)
	sysctl_net_inet_tcp_setup2(NULL, PF_INET6, "inet6", "tcp6");
#endif /* INET6 */
}
2111