xref: /dflybsd-src/sys/netinet/tcp_usrreq.c (revision 881dac8bcf7f6e26635fa38f071b93347ef92192)
1 /*
2  * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1993
36  *	The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *	This product includes software developed by the University of
49  *	California, Berkeley and its contributors.
50  * 4. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
67  * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $
68  * $DragonFly: src/sys/netinet/tcp_usrreq.c,v 1.51 2008/09/29 20:52:23 dillon Exp $
69  */
70 
71 #include "opt_ipsec.h"
72 #include "opt_inet.h"
73 #include "opt_inet6.h"
74 #include "opt_tcpdebug.h"
75 
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/kernel.h>
79 #include <sys/malloc.h>
80 #include <sys/sysctl.h>
81 #include <sys/globaldata.h>
82 #include <sys/thread.h>
83 
84 #include <sys/mbuf.h>
85 #ifdef INET6
86 #include <sys/domain.h>
87 #endif /* INET6 */
88 #include <sys/socket.h>
89 #include <sys/socketvar.h>
90 #include <sys/protosw.h>
91 
92 #include <sys/thread2.h>
93 #include <sys/msgport2.h>
94 
95 #include <net/if.h>
96 #include <net/netisr.h>
97 #include <net/route.h>
98 
99 #include <net/netmsg2.h>
100 
101 #include <netinet/in.h>
102 #include <netinet/in_systm.h>
103 #ifdef INET6
104 #include <netinet/ip6.h>
105 #endif
106 #include <netinet/in_pcb.h>
107 #ifdef INET6
108 #include <netinet6/in6_pcb.h>
109 #endif
110 #include <netinet/in_var.h>
111 #include <netinet/ip_var.h>
112 #ifdef INET6
113 #include <netinet6/ip6_var.h>
114 #include <netinet6/tcp6_var.h>
115 #endif
116 #include <netinet/tcp.h>
117 #include <netinet/tcp_fsm.h>
118 #include <netinet/tcp_seq.h>
119 #include <netinet/tcp_timer.h>
120 #include <netinet/tcp_timer2.h>
121 #include <netinet/tcp_var.h>
122 #include <netinet/tcpip.h>
123 #ifdef TCPDEBUG
124 #include <netinet/tcp_debug.h>
125 #endif
126 
127 #ifdef IPSEC
128 #include <netinet6/ipsec.h>
129 #endif /*IPSEC*/
130 
131 /*
132  * TCP protocol interface to socket abstraction.
133  */
extern	char *tcpstates[];	/* XXX ??? */

/*
 * Forward declarations for the file-local helpers used by the
 * pru_*() handlers below.
 */
static int	tcp_attach(struct socket *so, struct pru_attach_info *ai);
static int	tcp_connect(struct tcpcb *tp, int flags, struct mbuf *m,
			    struct sockaddr *nam, struct thread *td);
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, int flags, struct mbuf *m,
			     struct sockaddr *nam, struct thread *td);
static int	tcp6_connect_oncpu(struct tcpcb *tp, int flags,
				   struct mbuf *m,
				   struct sockaddr_in6 *sin6,
				   struct in6_addr *addr6);
#endif /* INET6 */
static struct tcpcb *tcp_disconnect(struct tcpcb *tp);
static struct tcpcb *tcp_usrclosed(struct tcpcb *tp);
150 
/*
 * Optional tracing of user requests.
 *
 * TCPDEBUG0 declares the 'ostate' local, TCPDEBUG1() snapshots the
 * current state of the caller's 'tp' (0 when tp is NULL) and
 * TCPDEBUG2(req) emits a TA_USER trace record when the caller's 'so'
 * has SO_DEBUG set.  All three expand to nothing when TCPDEBUG is not
 * configured.
 *
 * TCPDEBUG2() is wrapped in do { } while (0) so that it behaves as a
 * single statement; a bare 'if' would silently capture a following
 * 'else' at the call site.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, (req));		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
161 
162 /*
163  * TCP attaches to socket via pru_attach(), reserving space,
164  * and an internet control block.
165  */
166 static int
167 tcp_usr_attach(struct socket *so, int proto, struct pru_attach_info *ai)
168 {
169 	int error;
170 	struct inpcb *inp;
171 	struct tcpcb *tp = 0;
172 	TCPDEBUG0;
173 
174 	crit_enter();
175 	inp = so->so_pcb;
176 	TCPDEBUG1();
177 	if (inp) {
178 		error = EISCONN;
179 		goto out;
180 	}
181 
182 	error = tcp_attach(so, ai);
183 	if (error)
184 		goto out;
185 
186 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
187 		so->so_linger = TCP_LINGERTIME;
188 	tp = sototcpcb(so);
189 out:
190 	TCPDEBUG2(PRU_ATTACH);
191 	crit_exit();
192 	return error;
193 }
194 
195 /*
196  * pru_detach() detaches the TCP protocol from the socket.
197  * If the protocol state is non-embryonic, then can't
198  * do this directly: have to initiate a pru_disconnect(),
199  * which may finish later; embryonic TCB's can just
200  * be discarded here.
201  */
202 static int
203 tcp_usr_detach(struct socket *so)
204 {
205 	int error = 0;
206 	struct inpcb *inp;
207 	struct tcpcb *tp;
208 	TCPDEBUG0;
209 
210 	crit_enter();
211 	inp = so->so_pcb;
212 
213 	/*
214 	 * If the inp is already detached it may have been due to an async
215 	 * close.  Just return as if no error occured.
216 	 */
217 	if (inp == NULL) {
218 		crit_exit();
219 		return 0;
220 	}
221 
222 	/*
223 	 * It's possible for the tcpcb (tp) to disconnect from the inp due
224 	 * to tcp_drop()->tcp_close() being called.  This may occur *after*
225 	 * the detach message has been queued so we may find a NULL tp here.
226 	 */
227 	if ((tp = intotcpcb(inp)) != NULL) {
228 		TCPDEBUG1();
229 		tp = tcp_disconnect(tp);
230 		TCPDEBUG2(PRU_DETACH);
231 	}
232 	crit_exit();
233 	return error;
234 }
235 
/*
 * Shared prologue and epilogue for the pru_*() handlers below.
 *
 * COMMON_START() declares the trace state, enters a critical section,
 * loads the socket's pcb into 'inp' and the matching tcpcb into the
 * caller's local 'tp'.  When the pcb is already gone the handler
 * returns at once: 0 if ignore_error is non-zero (used for
 * disconnection races that should be silently allowed so close() does
 * not report an unexpected error), EINVAL otherwise.
 *
 * COMMON_END() provides the 'out' label, emits the trace record, leaves
 * the critical section and returns the caller's local 'error'.  The
 * trailing 'goto out' consumes the caller's semicolon and keeps the
 * label referenced even in handlers that never jump to it.
 */
#define	COMMON_START(so, inp, ignore_error)				\
	TCPDEBUG0;							\
									\
	crit_enter();							\
	(inp) = (so)->so_pcb;						\
	do {								\
		if ((inp) == NULL) {					\
			crit_exit();					\
			return ((ignore_error) ? 0 : EINVAL);		\
		}							\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
	} while (0)

#define	COMMON_END(req)							\
out:									\
	TCPDEBUG2(req);							\
	crit_exit();							\
	return error;							\
	goto out

256 
257 
258 /*
259  * Give the socket an address.
260  */
261 static int
262 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
263 {
264 	int error = 0;
265 	struct inpcb *inp;
266 	struct tcpcb *tp;
267 	struct sockaddr_in *sinp;
268 
269 	COMMON_START(so, inp, 0);
270 
271 	/*
272 	 * Must check for multicast addresses and disallow binding
273 	 * to them.
274 	 */
275 	sinp = (struct sockaddr_in *)nam;
276 	if (sinp->sin_family == AF_INET &&
277 	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
278 		error = EAFNOSUPPORT;
279 		goto out;
280 	}
281 	error = in_pcbbind(inp, nam, td);
282 	if (error)
283 		goto out;
284 	COMMON_END(PRU_BIND);
285 
286 }
287 
288 #ifdef INET6
289 static int
290 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
291 {
292 	int error = 0;
293 	struct inpcb *inp;
294 	struct tcpcb *tp;
295 	struct sockaddr_in6 *sin6p;
296 
297 	COMMON_START(so, inp, 0);
298 
299 	/*
300 	 * Must check for multicast addresses and disallow binding
301 	 * to them.
302 	 */
303 	sin6p = (struct sockaddr_in6 *)nam;
304 	if (sin6p->sin6_family == AF_INET6 &&
305 	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
306 		error = EAFNOSUPPORT;
307 		goto out;
308 	}
309 	inp->inp_vflag &= ~INP_IPV4;
310 	inp->inp_vflag |= INP_IPV6;
311 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
312 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
313 			inp->inp_vflag |= INP_IPV4;
314 		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
315 			struct sockaddr_in sin;
316 
317 			in6_sin6_2_sin(&sin, sin6p);
318 			inp->inp_vflag |= INP_IPV4;
319 			inp->inp_vflag &= ~INP_IPV6;
320 			error = in_pcbbind(inp, (struct sockaddr *)&sin, td);
321 			goto out;
322 		}
323 	}
324 	error = in6_pcbbind(inp, nam, td);
325 	if (error)
326 		goto out;
327 	COMMON_END(PRU_BIND);
328 }
329 #endif /* INET6 */
330 
331 #ifdef SMP
332 struct netmsg_inswildcard {
333 	struct netmsg		nm_netmsg;
334 	struct inpcb		*nm_inp;
335 	struct inpcbinfo	*nm_pcbinfo;
336 };
337 
338 static void
339 in_pcbinswildcardhash_handler(struct netmsg *msg0)
340 {
341 	struct netmsg_inswildcard *msg = (struct netmsg_inswildcard *)msg0;
342 
343 	in_pcbinswildcardhash_oncpu(msg->nm_inp, msg->nm_pcbinfo);
344 	lwkt_replymsg(&msg->nm_netmsg.nm_lmsg, 0);
345 }
346 #endif
347 
348 /*
349  * Prepare to accept connections.
350  */
351 static int
352 tcp_usr_listen(struct socket *so, struct thread *td)
353 {
354 	int error = 0;
355 	struct inpcb *inp;
356 	struct tcpcb *tp;
357 #ifdef SMP
358 	int cpu;
359 #endif
360 
361 	COMMON_START(so, inp, 0);
362 	if (inp->inp_lport == 0) {
363 		error = in_pcbbind(inp, NULL, td);
364 		if (error != 0)
365 			goto out;
366 	}
367 
368 	tp->t_state = TCPS_LISTEN;
369 	tp->tt_msg = NULL; /* Catch any invalid timer usage */
370 #ifdef SMP
371 	/*
372 	 * We have to set the flag because we can't have other cpus
373 	 * messing with our inp's flags.
374 	 */
375 	inp->inp_flags |= INP_WILDCARD_MP;
376 	for (cpu = 0; cpu < ncpus2; cpu++) {
377 		struct netmsg_inswildcard *msg;
378 
379 		if (cpu == mycpu->gd_cpuid) {
380 			in_pcbinswildcardhash(inp);
381 			continue;
382 		}
383 
384 		msg = kmalloc(sizeof(struct netmsg_inswildcard), M_LWKTMSG,
385 			      M_INTWAIT);
386 		netmsg_init(&msg->nm_netmsg, NULL, &netisr_afree_rport,
387 			    0, in_pcbinswildcardhash_handler);
388 		msg->nm_inp = inp;
389 		msg->nm_pcbinfo = &tcbinfo[cpu];
390 		lwkt_sendmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg);
391 	}
392 #else
393 	in_pcbinswildcardhash(inp);
394 #endif
395 	COMMON_END(PRU_LISTEN);
396 }
397 
398 #ifdef INET6
399 static int
400 tcp6_usr_listen(struct socket *so, struct thread *td)
401 {
402 	int error = 0;
403 	struct inpcb *inp;
404 	struct tcpcb *tp;
405 #ifdef SMP
406 	int cpu;
407 #endif
408 
409 	COMMON_START(so, inp, 0);
410 	if (inp->inp_lport == 0) {
411 		if (!(inp->inp_flags & IN6P_IPV6_V6ONLY))
412 			inp->inp_vflag |= INP_IPV4;
413 		else
414 			inp->inp_vflag &= ~INP_IPV4;
415 		error = in6_pcbbind(inp, NULL, td);
416 	}
417 	if (error == 0)
418 		tp->t_state = TCPS_LISTEN;
419 #ifdef SMP
420 	/*
421 	 * We have to set the flag because we can't have other cpus
422 	 * messing with our inp's flags.
423 	 */
424 	inp->inp_flags |= INP_WILDCARD_MP;
425 	for (cpu = 0; cpu < ncpus2; cpu++) {
426 		struct netmsg_inswildcard *msg;
427 
428 		if (cpu == mycpu->gd_cpuid) {
429 			in_pcbinswildcardhash(inp);
430 			continue;
431 		}
432 
433 		msg = kmalloc(sizeof(struct netmsg_inswildcard), M_LWKTMSG,
434 			      M_INTWAIT);
435 		netmsg_init(&msg->nm_netmsg, NULL, &netisr_afree_rport,
436 			    0, in_pcbinswildcardhash_handler);
437 		msg->nm_inp = inp;
438 		msg->nm_pcbinfo = &tcbinfo[cpu];
439 		lwkt_sendmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg);
440 	}
441 #else
442 	in_pcbinswildcardhash(inp);
443 #endif
444 	COMMON_END(PRU_LISTEN);
445 }
446 #endif /* INET6 */
447 
448 /*
449  * Initiate connection to peer.
450  * Create a template for use in transmissions on this connection.
451  * Enter SYN_SENT state, and mark socket as connecting.
452  * Start keep-alive timer, and seed output sequence space.
453  * Send initial segment on connection.
454  */
455 static int
456 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
457 {
458 	int error = 0;
459 	struct inpcb *inp;
460 	struct tcpcb *tp;
461 	struct sockaddr_in *sinp;
462 
463 	COMMON_START(so, inp, 0);
464 
465 	/*
466 	 * Must disallow TCP ``connections'' to multicast addresses.
467 	 */
468 	sinp = (struct sockaddr_in *)nam;
469 	if (sinp->sin_family == AF_INET
470 	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
471 		error = EAFNOSUPPORT;
472 		goto out;
473 	}
474 
475 	if (!prison_remote_ip(td, (struct sockaddr*)sinp)) {
476 		error = EAFNOSUPPORT; /* IPv6 only jail */
477 		goto out;
478 	}
479 
480 	if ((error = tcp_connect(tp, 0, NULL, nam, td)) != 0)
481 		goto out;
482 	COMMON_END(PRU_CONNECT);
483 }
484 
485 #ifdef INET6
486 static int
487 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
488 {
489 	int error = 0;
490 	struct inpcb *inp;
491 	struct tcpcb *tp;
492 	struct sockaddr_in6 *sin6p;
493 
494 	COMMON_START(so, inp, 0);
495 
496 	/*
497 	 * Must disallow TCP ``connections'' to multicast addresses.
498 	 */
499 	sin6p = (struct sockaddr_in6 *)nam;
500 	if (sin6p->sin6_family == AF_INET6
501 	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
502 		error = EAFNOSUPPORT;
503 		goto out;
504 	}
505 
506 	if (!prison_remote_ip(td, nam)) {
507 		error = EAFNOSUPPORT; /* IPv4 only jail */
508 		goto out;
509 	}
510 
511 	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
512 		struct sockaddr_in sin;
513 
514 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
515 			error = EINVAL;
516 			goto out;
517 		}
518 
519 		in6_sin6_2_sin(&sin, sin6p);
520 		inp->inp_vflag |= INP_IPV4;
521 		inp->inp_vflag &= ~INP_IPV6;
522 		error = tcp_connect(tp, 0, NULL, (struct sockaddr *)&sin, td);
523 		if (error)
524 			goto out;
525 		goto out;
526 	}
527 	inp->inp_vflag &= ~INP_IPV4;
528 	inp->inp_vflag |= INP_IPV6;
529 	inp->inp_inc.inc_isipv6 = 1;
530 	if ((error = tcp6_connect(tp, 0, NULL, nam, td)) != 0)
531 		goto out;
532 	error = tcp_output(tp);
533 	COMMON_END(PRU_CONNECT);
534 }
535 #endif /* INET6 */
536 
537 /*
538  * Initiate disconnect from peer.
539  * If connection never passed embryonic stage, just drop;
540  * else if don't need to let data drain, then can just drop anyways,
541  * else have to begin TCP shutdown process: mark socket disconnecting,
542  * drain unread data, state switch to reflect user close, and
543  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
544  * when peer sends FIN and acks ours.
545  *
546  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
547  */
548 static int
549 tcp_usr_disconnect(struct socket *so)
550 {
551 	int error = 0;
552 	struct inpcb *inp;
553 	struct tcpcb *tp;
554 
555 	COMMON_START(so, inp, 1);
556 	tp = tcp_disconnect(tp);
557 	COMMON_END(PRU_DISCONNECT);
558 }
559 
560 /*
561  * Accept a connection.  Essentially all the work is
562  * done at higher levels; just return the address
563  * of the peer, storing through addr.
564  */
565 static int
566 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
567 {
568 	int error = 0;
569 	struct inpcb *inp;
570 	struct tcpcb *tp = NULL;
571 	TCPDEBUG0;
572 
573 	crit_enter();
574 	inp = so->so_pcb;
575 	if (so->so_state & SS_ISDISCONNECTED) {
576 		error = ECONNABORTED;
577 		goto out;
578 	}
579 	if (inp == 0) {
580 		crit_exit();
581 		return (EINVAL);
582 	}
583 	tp = intotcpcb(inp);
584 	TCPDEBUG1();
585 	in_setpeeraddr(so, nam);
586 	COMMON_END(PRU_ACCEPT);
587 }
588 
589 #ifdef INET6
590 static int
591 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
592 {
593 	int error = 0;
594 	struct inpcb *inp;
595 	struct tcpcb *tp = NULL;
596 	TCPDEBUG0;
597 
598 	crit_enter();
599 	inp = so->so_pcb;
600 
601 	if (so->so_state & SS_ISDISCONNECTED) {
602 		error = ECONNABORTED;
603 		goto out;
604 	}
605 	if (inp == 0) {
606 		crit_exit();
607 		return (EINVAL);
608 	}
609 	tp = intotcpcb(inp);
610 	TCPDEBUG1();
611 	in6_mapped_peeraddr(so, nam);
612 	COMMON_END(PRU_ACCEPT);
613 }
614 #endif /* INET6 */
615 /*
616  * Mark the connection as being incapable of further output.
617  */
618 static int
619 tcp_usr_shutdown(struct socket *so)
620 {
621 	int error = 0;
622 	struct inpcb *inp;
623 	struct tcpcb *tp;
624 
625 	COMMON_START(so, inp, 0);
626 	socantsendmore(so);
627 	tp = tcp_usrclosed(tp);
628 	if (tp)
629 		error = tcp_output(tp);
630 	COMMON_END(PRU_SHUTDOWN);
631 }
632 
633 /*
634  * After a receive, possibly send window update to peer.
635  */
636 static int
637 tcp_usr_rcvd(struct socket *so, int flags)
638 {
639 	int error = 0;
640 	struct inpcb *inp;
641 	struct tcpcb *tp;
642 
643 	COMMON_START(so, inp, 0);
644 	tcp_output(tp);
645 	COMMON_END(PRU_RCVD);
646 }
647 
648 /*
649  * Do a send by putting data in output queue and updating urgent
650  * marker if URG set.  Possibly send more data.  Unlike the other
651  * pru_*() routines, the mbuf chains are our responsibility.  We
652  * must either enqueue them or free them.  The other pru_* routines
653  * generally are caller-frees.
654  */
655 static int
656 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
657 	     struct sockaddr *nam, struct mbuf *control, struct thread *td)
658 {
659 	int error = 0;
660 	struct inpcb *inp;
661 	struct tcpcb *tp;
662 #ifdef INET6
663 	int isipv6;
664 #endif
665 	TCPDEBUG0;
666 
667 	crit_enter();
668 	inp = so->so_pcb;
669 
670 	if (inp == NULL) {
671 		/*
672 		 * OOPS! we lost a race, the TCP session got reset after
673 		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
674 		 * network interrupt in the non-critical section of sosend().
675 		 */
676 		m_freem(m);
677 		if (control)
678 			m_freem(control);
679 		error = ECONNRESET;	/* XXX EPIPE? */
680 		tp = NULL;
681 		TCPDEBUG1();
682 		goto out;
683 	}
684 #ifdef INET6
685 	isipv6 = nam && nam->sa_family == AF_INET6;
686 #endif /* INET6 */
687 	tp = intotcpcb(inp);
688 	TCPDEBUG1();
689 	if (control) {
690 		/* TCP doesn't do control messages (rights, creds, etc) */
691 		if (control->m_len) {
692 			m_freem(control);
693 			m_freem(m);
694 			error = EINVAL;
695 			goto out;
696 		}
697 		m_freem(control);	/* empty control, just free it */
698 	}
699 
700 	/*
701 	 * Don't let too much OOB data build up
702 	 */
703 	if (flags & PRUS_OOB) {
704 		if (ssb_space(&so->so_snd) < -512) {
705 			m_freem(m);
706 			error = ENOBUFS;
707 			goto out;
708 		}
709 	}
710 
711 	/*
712 	 * Do implied connect if not yet connected.  Any data sent
713 	 * with the connect is handled by tcp_connect() and friends.
714 	 *
715 	 * NOTE!  PROTOCOL THREAD MAY BE CHANGED BY THE CONNECT!
716 	 */
717 	if (nam && tp->t_state < TCPS_SYN_SENT) {
718 #ifdef INET6
719 		if (isipv6)
720 			error = tcp6_connect(tp, flags, m, nam, td);
721 		else
722 #endif /* INET6 */
723 		error = tcp_connect(tp, flags, m, nam, td);
724 #if 0
725 		/* WTF is this doing here? */
726 		tp->snd_wnd = TTCP_CLIENT_SND_WND;
727 		tcp_mss(tp, -1);
728 #endif
729 		goto out;
730 	}
731 
732 	/*
733 	 * Pump the data into the socket.
734 	 */
735 	if (m)
736 		ssb_appendstream(&so->so_snd, m);
737 	if (flags & PRUS_OOB) {
738 		/*
739 		 * According to RFC961 (Assigned Protocols),
740 		 * the urgent pointer points to the last octet
741 		 * of urgent data.  We continue, however,
742 		 * to consider it to indicate the first octet
743 		 * of data past the urgent section.
744 		 * Otherwise, snd_up should be one lower.
745 		 */
746 		tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
747 		tp->t_flags |= TF_FORCE;
748 		error = tcp_output(tp);
749 		tp->t_flags &= ~TF_FORCE;
750 	} else {
751 		if (flags & PRUS_EOF) {
752 			/*
753 			 * Close the send side of the connection after
754 			 * the data is sent.
755 			 */
756 			socantsendmore(so);
757 			tp = tcp_usrclosed(tp);
758 		}
759 		if (tp != NULL) {
760 			if (flags & PRUS_MORETOCOME)
761 				tp->t_flags |= TF_MORETOCOME;
762 			error = tcp_output(tp);
763 			if (flags & PRUS_MORETOCOME)
764 				tp->t_flags &= ~TF_MORETOCOME;
765 		}
766 	}
767 	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
768 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
769 }
770 
771 /*
772  * Abort the TCP.
773  */
774 static int
775 tcp_usr_abort(struct socket *so)
776 {
777 	int error = 0;
778 	struct inpcb *inp;
779 	struct tcpcb *tp;
780 
781 	COMMON_START(so, inp, 1);
782 	tp = tcp_drop(tp, ECONNABORTED);
783 	COMMON_END(PRU_ABORT);
784 }
785 
786 /*
787  * Receive out-of-band data.
788  */
789 static int
790 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
791 {
792 	int error = 0;
793 	struct inpcb *inp;
794 	struct tcpcb *tp;
795 
796 	COMMON_START(so, inp, 0);
797 	if ((so->so_oobmark == 0 &&
798 	     (so->so_state & SS_RCVATMARK) == 0) ||
799 	    so->so_options & SO_OOBINLINE ||
800 	    tp->t_oobflags & TCPOOB_HADDATA) {
801 		error = EINVAL;
802 		goto out;
803 	}
804 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
805 		error = EWOULDBLOCK;
806 		goto out;
807 	}
808 	m->m_len = 1;
809 	*mtod(m, caddr_t) = tp->t_iobc;
810 	if ((flags & MSG_PEEK) == 0)
811 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
812 	COMMON_END(PRU_RCVOOB);
813 }
814 
815 /* xxx - should be const */
816 struct pr_usrreqs tcp_usrreqs = {
817 	.pru_abort = tcp_usr_abort,
818 	.pru_accept = tcp_usr_accept,
819 	.pru_attach = tcp_usr_attach,
820 	.pru_bind = tcp_usr_bind,
821 	.pru_connect = tcp_usr_connect,
822 	.pru_connect2 = pru_connect2_notsupp,
823 	.pru_control = in_control,
824 	.pru_detach = tcp_usr_detach,
825 	.pru_disconnect = tcp_usr_disconnect,
826 	.pru_listen = tcp_usr_listen,
827 	.pru_peeraddr = in_setpeeraddr,
828 	.pru_rcvd = tcp_usr_rcvd,
829 	.pru_rcvoob = tcp_usr_rcvoob,
830 	.pru_send = tcp_usr_send,
831 	.pru_sense = pru_sense_null,
832 	.pru_shutdown = tcp_usr_shutdown,
833 	.pru_sockaddr = in_setsockaddr,
834 	.pru_sosend = sosend,
835 	.pru_soreceive = soreceive
836 };
837 
838 #ifdef INET6
839 struct pr_usrreqs tcp6_usrreqs = {
840 	.pru_abort = tcp_usr_abort,
841 	.pru_accept = tcp6_usr_accept,
842 	.pru_attach = tcp_usr_attach,
843 	.pru_bind = tcp6_usr_bind,
844 	.pru_connect = tcp6_usr_connect,
845 	.pru_connect2 = pru_connect2_notsupp,
846 	.pru_control = in6_control,
847 	.pru_detach = tcp_usr_detach,
848 	.pru_disconnect = tcp_usr_disconnect,
849 	.pru_listen = tcp6_usr_listen,
850 	.pru_peeraddr = in6_mapped_peeraddr,
851 	.pru_rcvd = tcp_usr_rcvd,
852 	.pru_rcvoob = tcp_usr_rcvoob,
853 	.pru_send = tcp_usr_send,
854 	.pru_sense = pru_sense_null,
855 	.pru_shutdown = tcp_usr_shutdown,
856 	.pru_sockaddr = in6_mapped_sockaddr,
857 	.pru_sosend = sosend,
858 	.pru_soreceive = soreceive
859 };
860 #endif /* INET6 */
861 
862 static int
863 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
864 		  struct sockaddr_in *sin, struct sockaddr_in *if_sin)
865 {
866 	struct inpcb *inp = tp->t_inpcb, *oinp;
867 	struct socket *so = inp->inp_socket;
868 	struct route *ro = &inp->inp_route;
869 
870 	oinp = in_pcblookup_hash(&tcbinfo[mycpu->gd_cpuid],
871 	    sin->sin_addr, sin->sin_port,
872 	    inp->inp_laddr.s_addr != INADDR_ANY ?
873 		inp->inp_laddr : if_sin->sin_addr,
874 	    inp->inp_lport, 0, NULL);
875 	if (oinp != NULL) {
876 		m_freem(m);
877 		return (EADDRINUSE);
878 	}
879 	if (inp->inp_laddr.s_addr == INADDR_ANY)
880 		inp->inp_laddr = if_sin->sin_addr;
881 	inp->inp_faddr = sin->sin_addr;
882 	inp->inp_fport = sin->sin_port;
883 	inp->inp_cpcbinfo = &tcbinfo[mycpu->gd_cpuid];
884 	in_pcbinsconnhash(inp);
885 
886 	/*
887 	 * We are now on the inpcb's owner CPU, if the cached route was
888 	 * freed because the rtentry's owner CPU is not the current CPU
889 	 * (e.g. in tcp_connect()), then we try to reallocate it here with
890 	 * the hope that a rtentry may be cloned from a RTF_PRCLONING
891 	 * rtentry.
892 	 */
893 	if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
894 	    ro->ro_rt == NULL) {
895 		bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
896 		ro->ro_dst.sa_family = AF_INET;
897 		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
898 		((struct sockaddr_in *)&ro->ro_dst)->sin_addr =
899 			sin->sin_addr;
900 		rtalloc(ro);
901 	}
902 
903 	/*
904 	 * Now that no more errors can occur, change the protocol processing
905 	 * port to the current thread (which is the correct thread).
906 	 *
907 	 * Create TCP timer message now; we are on the tcpcb's owner
908 	 * CPU/thread.
909 	 */
910 	sosetport(so, &curthread->td_msgport);
911 	tcp_create_timermsg(tp, &curthread->td_msgport);
912 
913 	/*
914 	 * Compute window scaling to request.  Use a larger scaling then
915 	 * needed for the initial receive buffer in case the receive buffer
916 	 * gets expanded.
917 	 */
918 	if (tp->request_r_scale < TCP_MIN_WINSHIFT)
919 		tp->request_r_scale = TCP_MIN_WINSHIFT;
920 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
921 	       (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat
922 	) {
923 		tp->request_r_scale++;
924 	}
925 
926 	soisconnecting(so);
927 	tcpstat.tcps_connattempt++;
928 	tp->t_state = TCPS_SYN_SENT;
929 	tcp_callout_reset(tp, tp->tt_keep, tcp_keepinit, tcp_timer_keep);
930 	tp->iss = tcp_new_isn(tp);
931 	tcp_sendseqinit(tp);
932 	if (m) {
933 		ssb_appendstream(&so->so_snd, m);
934 		m = NULL;
935 		if (flags & PRUS_OOB)
936 			tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
937 	}
938 
939 	/*
940 	 * Close the send side of the connection after
941 	 * the data is sent if flagged.
942 	 */
943 	if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
944 		socantsendmore(so);
945 		tp = tcp_usrclosed(tp);
946 	}
947 	return (tcp_output(tp));
948 }
949 
950 #ifdef SMP
951 
952 struct netmsg_tcp_connect {
953 	struct netmsg		nm_netmsg;
954 	struct tcpcb		*nm_tp;
955 	struct sockaddr_in	*nm_sin;
956 	struct sockaddr_in	*nm_ifsin;
957 	int			nm_flags;
958 	struct mbuf		*nm_m;
959 };
960 
961 static void
962 tcp_connect_handler(netmsg_t netmsg)
963 {
964 	struct netmsg_tcp_connect *msg = (void *)netmsg;
965 	int error;
966 
967 	error = tcp_connect_oncpu(msg->nm_tp, msg->nm_flags, msg->nm_m,
968 				  msg->nm_sin, msg->nm_ifsin);
969 	lwkt_replymsg(&msg->nm_netmsg.nm_lmsg, error);
970 }
971 
972 struct netmsg_tcp6_connect {
973 	struct netmsg		nm_netmsg;
974 	struct tcpcb		*nm_tp;
975 	struct sockaddr_in6	*nm_sin6;
976 	struct in6_addr		*nm_addr6;
977 	int			nm_flags;
978 	struct mbuf		*nm_m;
979 };
980 
981 #ifdef INET6
982 static void
983 tcp6_connect_handler(netmsg_t netmsg)
984 {
985 	struct netmsg_tcp6_connect *msg = (void *)netmsg;
986 	int error;
987 
988 	error = tcp6_connect_oncpu(msg->nm_tp, msg->nm_flags, msg->nm_m,
989 				   msg->nm_sin6, msg->nm_addr6);
990 	lwkt_replymsg(&msg->nm_netmsg.nm_lmsg, error);
991 }
992 #endif
993 
994 #endif /* SMP */
995 
996 /*
997  * Common subroutine to open a TCP connection to remote host specified
998  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
999  * port number if needed.  Call in_pcbladdr to do the routing and to choose
1000  * a local host address (interface).
1001  * Initialize connection parameters and enter SYN-SENT state.
1002  */
1003 static int
1004 tcp_connect(struct tcpcb *tp, int flags, struct mbuf *m,
1005 	    struct sockaddr *nam, struct thread *td)
1006 {
1007 	struct inpcb *inp = tp->t_inpcb;
1008 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1009 	struct sockaddr_in *if_sin;
1010 	int error;
1011 #ifdef SMP
1012 	lwkt_port_t port;
1013 #endif
1014 
1015 	/*
1016 	 * Bind if we have to
1017 	 */
1018 	if (inp->inp_lport == 0) {
1019 		error = in_pcbbind(inp, NULL, td);
1020 		if (error) {
1021 			m_freem(m);
1022 			return (error);
1023 		}
1024 	}
1025 
1026 	/*
1027 	 * Calculate the correct protocol processing thread.  The connect
1028 	 * operation must run there.
1029 	 */
1030 	error = in_pcbladdr(inp, nam, &if_sin, td);
1031 	if (error) {
1032 		m_freem(m);
1033 		return (error);
1034 	}
1035 
1036 #ifdef SMP
1037 	port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port,
1038 	    inp->inp_laddr.s_addr ?
1039 		inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr,
1040 	    inp->inp_lport);
1041 
1042 	if (port != &curthread->td_msgport) {
1043 		struct netmsg_tcp_connect msg;
1044 		struct route *ro = &inp->inp_route;
1045 
1046 		/*
1047 		 * in_pcbladdr() may have allocated a route entry for us
1048 		 * on the current CPU, but we need a route entry on the
1049 		 * inpcb's owner CPU, so free it here.
1050 		 */
1051 		if (ro->ro_rt != NULL)
1052 			RTFREE(ro->ro_rt);
1053 		bzero(ro, sizeof(*ro));
1054 
1055 		/*
1056 		 * NOTE: We haven't set so->so_port yet do not pass so
1057 		 *	 to netmsg_init() or it will be improperly forwarded.
1058 		 */
1059 		netmsg_init(&msg.nm_netmsg, NULL, &curthread->td_msgport,
1060 			    0, tcp_connect_handler);
1061 		msg.nm_tp = tp;
1062 		msg.nm_sin = sin;
1063 		msg.nm_ifsin = if_sin;
1064 		msg.nm_flags = flags;
1065 		msg.nm_m = m;
1066 		error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, 0);
1067 	} else {
1068 		error = tcp_connect_oncpu(tp, flags, m, sin, if_sin);
1069 	}
1070 #else
1071 	error = tcp_connect_oncpu(tp, flags, m, sin, if_sin);
1072 #endif
1073 	return (error);
1074 }
1075 
1076 #ifdef INET6
1077 
1078 static int
1079 tcp6_connect(struct tcpcb *tp, int flags, struct mbuf *m,
1080 	     struct sockaddr *nam, struct thread *td)
1081 {
1082 	struct inpcb *inp = tp->t_inpcb;
1083 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
1084 	struct in6_addr *addr6;
1085 #ifdef SMP
1086 	lwkt_port_t port;
1087 #endif
1088 	int error;
1089 
1090 	if (inp->inp_lport == 0) {
1091 		error = in6_pcbbind(inp, NULL, td);
1092 		if (error) {
1093 			m_freem(m);
1094 			return (error);
1095 		}
1096 	}
1097 
1098 	/*
1099 	 * Cannot simply call in_pcbconnect, because there might be an
1100 	 * earlier incarnation of this same connection still in
1101 	 * TIME_WAIT state, creating an ADDRINUSE error.
1102 	 */
1103 	error = in6_pcbladdr(inp, nam, &addr6, td);
1104 	if (error) {
1105 		m_freem(m);
1106 		return (error);
1107 	}
1108 
1109 #ifdef SMP
1110 	port = tcp6_addrport();	/* XXX hack for now, always cpu0 */
1111 
1112 	if (port != &curthread->td_msgport) {
1113 		struct netmsg_tcp6_connect msg;
1114 		struct route *ro = &inp->inp_route;
1115 
1116 		/*
1117 		 * in_pcbladdr() may have allocated a route entry for us
1118 		 * on the current CPU, but we need a route entry on the
1119 		 * inpcb's owner CPU, so free it here.
1120 		 */
1121 		if (ro->ro_rt != NULL)
1122 			RTFREE(ro->ro_rt);
1123 		bzero(ro, sizeof(*ro));
1124 
1125 		netmsg_init(&msg.nm_netmsg, NULL, &curthread->td_msgport,
1126 			    0, tcp6_connect_handler);
1127 		msg.nm_tp = tp;
1128 		msg.nm_sin6 = sin6;
1129 		msg.nm_addr6 = addr6;
1130 		msg.nm_flags = flags;
1131 		msg.nm_m = m;
1132 		error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, 0);
1133 	} else {
1134 		error = tcp6_connect_oncpu(tp, flags, m, sin6, addr6);
1135 	}
1136 #else
1137 	error = tcp6_connect_oncpu(tp, flags, m, sin6, addr6);
1138 #endif
1139 	return (error);
1140 }
1141 
1142 static int
1143 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
1144 		   struct sockaddr_in6 *sin6, struct in6_addr *addr6)
1145 {
1146 	struct inpcb *inp = tp->t_inpcb;
1147 	struct socket *so = inp->inp_socket;
1148 	struct inpcb *oinp;
1149 
1150 	/*
1151 	 * Cannot simply call in_pcbconnect, because there might be an
1152 	 * earlier incarnation of this same connection still in
1153 	 * TIME_WAIT state, creating an ADDRINUSE error.
1154 	 */
1155 	oinp = in6_pcblookup_hash(inp->inp_cpcbinfo,
1156 				  &sin6->sin6_addr, sin6->sin6_port,
1157 				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ?
1158 				      addr6 : &inp->in6p_laddr,
1159 				  inp->inp_lport,  0, NULL);
1160 	if (oinp) {
1161 		m_freem(m);
1162 		return (EADDRINUSE);
1163 	}
1164 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1165 		inp->in6p_laddr = *addr6;
1166 	inp->in6p_faddr = sin6->sin6_addr;
1167 	inp->inp_fport = sin6->sin6_port;
1168 	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
1169 		inp->in6p_flowinfo = sin6->sin6_flowinfo;
1170 	in_pcbinsconnhash(inp);
1171 
1172 	/*
1173 	 * Now that no more errors can occur, change the protocol processing
1174 	 * port to the current thread (which is the correct thread).
1175 	 *
1176 	 * Create TCP timer message now; we are on the tcpcb's owner
1177 	 * CPU/thread.
1178 	 */
1179 	sosetport(so, &curthread->td_msgport);
1180 	tcp_create_timermsg(tp, &curthread->td_msgport);
1181 
1182 	/* Compute window scaling to request.  */
1183 	if (tp->request_r_scale < TCP_MIN_WINSHIFT)
1184 		tp->request_r_scale = TCP_MIN_WINSHIFT;
1185 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1186 	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) {
1187 		tp->request_r_scale++;
1188 	}
1189 
1190 	soisconnecting(so);
1191 	tcpstat.tcps_connattempt++;
1192 	tp->t_state = TCPS_SYN_SENT;
1193 	tcp_callout_reset(tp, tp->tt_keep, tcp_keepinit, tcp_timer_keep);
1194 	tp->iss = tcp_new_isn(tp);
1195 	tcp_sendseqinit(tp);
1196 	if (m) {
1197 		ssb_appendstream(&so->so_snd, m);
1198 		m = NULL;
1199 		if (flags & PRUS_OOB)
1200 			tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
1201 	}
1202 
1203 	/*
1204 	 * Close the send side of the connection after
1205 	 * the data is sent if flagged.
1206 	 */
1207 	if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
1208 		socantsendmore(so);
1209 		tp = tcp_usrclosed(tp);
1210 	}
1211 	return (tcp_output(tp));
1212 }
1213 
1214 #endif /* INET6 */
1215 
1216 /*
1217  * The new sockopt interface makes it possible for us to block in the
1218  * copyin/out step (if we take a page fault).  Taking a page fault while
1219  * in a critical section is probably a Bad Thing.  (Since sockets and pcbs
1220  * both now use TSM, there probably isn't any need for this function to
1221  * run in a critical section any more.  This needs more examination.)
1222  */
1223 int
1224 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1225 {
1226 	int	error, opt, optval;
1227 	struct	inpcb *inp;
1228 	struct	tcpcb *tp;
1229 
1230 	error = 0;
1231 	crit_enter();		/* XXX */
1232 	inp = so->so_pcb;
1233 	if (inp == NULL) {
1234 		crit_exit();
1235 		return (ECONNRESET);
1236 	}
1237 	if (sopt->sopt_level != IPPROTO_TCP) {
1238 #ifdef INET6
1239 		if (INP_CHECK_SOCKAF(so, AF_INET6))
1240 			error = ip6_ctloutput(so, sopt);
1241 		else
1242 #endif /* INET6 */
1243 		error = ip_ctloutput(so, sopt);
1244 		crit_exit();
1245 		return (error);
1246 	}
1247 	tp = intotcpcb(inp);
1248 
1249 	switch (sopt->sopt_dir) {
1250 	case SOPT_SET:
1251 		error = soopt_to_kbuf(sopt, &optval, sizeof optval,
1252 				      sizeof optval);
1253 		if (error)
1254 			break;
1255 		switch (sopt->sopt_name) {
1256 #ifdef TCP_SIGNATURE
1257 		case TCP_SIGNATURE_ENABLE:
1258 			if (optval > 0)
1259 				tp->t_flags |= TF_SIGNATURE;
1260 			else
1261 				tp->t_flags &= ~TF_SIGNATURE;
1262 			break;
1263 #endif /* TCP_SIGNATURE */
1264 		case TCP_NODELAY:
1265 		case TCP_NOOPT:
1266 			switch (sopt->sopt_name) {
1267 			case TCP_NODELAY:
1268 				opt = TF_NODELAY;
1269 				break;
1270 			case TCP_NOOPT:
1271 				opt = TF_NOOPT;
1272 				break;
1273 			default:
1274 				opt = 0; /* dead code to fool gcc */
1275 				break;
1276 			}
1277 
1278 			if (optval)
1279 				tp->t_flags |= opt;
1280 			else
1281 				tp->t_flags &= ~opt;
1282 			break;
1283 
1284 		case TCP_NOPUSH:
1285 			if (optval)
1286 				tp->t_flags |= TF_NOPUSH;
1287 			else {
1288 				tp->t_flags &= ~TF_NOPUSH;
1289 				error = tcp_output(tp);
1290 			}
1291 			break;
1292 
1293 		case TCP_MAXSEG:
1294 			/*
1295 			 * Must be between 0 and maxseg.  If the requested
1296 			 * maxseg is too small to satisfy the desired minmss,
1297 			 * pump it up (silently so sysctl modifications of
1298 			 * minmss do not create unexpected program failures).
1299 			 * Handle degenerate cases.
1300 			 */
1301 			if (optval > 0 && optval <= tp->t_maxseg) {
1302 				if (optval + 40 < tcp_minmss) {
1303 					optval = tcp_minmss - 40;
1304 					if (optval < 0)
1305 						optval = 1;
1306 				}
1307 				tp->t_maxseg = optval;
1308 			} else {
1309 				error = EINVAL;
1310 			}
1311 			break;
1312 
1313 		default:
1314 			error = ENOPROTOOPT;
1315 			break;
1316 		}
1317 		break;
1318 
1319 	case SOPT_GET:
1320 		switch (sopt->sopt_name) {
1321 #ifdef TCP_SIGNATURE
1322 		case TCP_SIGNATURE_ENABLE:
1323 			optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1324 			break;
1325 #endif /* TCP_SIGNATURE */
1326 		case TCP_NODELAY:
1327 			optval = tp->t_flags & TF_NODELAY;
1328 			break;
1329 		case TCP_MAXSEG:
1330 			optval = tp->t_maxseg;
1331 			break;
1332 		case TCP_NOOPT:
1333 			optval = tp->t_flags & TF_NOOPT;
1334 			break;
1335 		case TCP_NOPUSH:
1336 			optval = tp->t_flags & TF_NOPUSH;
1337 			break;
1338 		default:
1339 			error = ENOPROTOOPT;
1340 			break;
1341 		}
1342 		if (error == 0)
1343 			soopt_from_kbuf(sopt, &optval, sizeof optval);
1344 		break;
1345 	}
1346 	crit_exit();
1347 	return (error);
1348 }
1349 
1350 /*
1351  * tcp_sendspace and tcp_recvspace are the default send and receive window
1352  * sizes, respectively.  These are obsolescent (this information should
1353  * be set by the route).
1354  *
1355  * Use a default that does not require tcp window scaling to be turned
1356  * on.  Individual programs or the administrator can increase the default.
1357  */
1358 u_long	tcp_sendspace = 57344;	/* largest multiple of PAGE_SIZE < 64k */
1359 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1360     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1361 u_long	tcp_recvspace = 57344;	/* largest multiple of PAGE_SIZE < 64k */
1362 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1363     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1364 
1365 /*
1366  * Attach TCP protocol to socket, allocating
1367  * internet protocol control block, tcp control block,
1368  * bufer space, and entering LISTEN state if to accept connections.
1369  */
1370 static int
1371 tcp_attach(struct socket *so, struct pru_attach_info *ai)
1372 {
1373 	struct tcpcb *tp;
1374 	struct inpcb *inp;
1375 	int error;
1376 	int cpu;
1377 #ifdef INET6
1378 	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1379 #endif
1380 
1381 	if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) {
1382 		error = soreserve(so, tcp_sendspace, tcp_recvspace,
1383 				  ai->sb_rlimit);
1384 		if (error)
1385 			return (error);
1386 	}
1387 	atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
1388 	atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
1389 	cpu = mycpu->gd_cpuid;
1390 	error = in_pcballoc(so, &tcbinfo[cpu]);
1391 	if (error)
1392 		return (error);
1393 	inp = so->so_pcb;
1394 #ifdef INET6
1395 	if (isipv6) {
1396 		inp->inp_vflag |= INP_IPV6;
1397 		inp->in6p_hops = -1;	/* use kernel default */
1398 	}
1399 	else
1400 #endif
1401 	inp->inp_vflag |= INP_IPV4;
1402 	tp = tcp_newtcpcb(inp);
1403 	if (tp == 0) {
1404 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1405 
1406 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1407 #ifdef INET6
1408 		if (isipv6)
1409 			in6_pcbdetach(inp);
1410 		else
1411 #endif
1412 		in_pcbdetach(inp);
1413 		so->so_state |= nofd;
1414 		return (ENOBUFS);
1415 	}
1416 	tp->t_state = TCPS_CLOSED;
1417 	so->so_port = tcp_soport_attach(so);
1418 	return (0);
1419 }
1420 
1421 /*
1422  * Initiate (or continue) disconnect.
1423  * If embryonic state, just send reset (once).
1424  * If in ``let data drain'' option and linger null, just drop.
1425  * Otherwise (hard), mark socket disconnecting and drop
1426  * current input data; switch states based on user close, and
1427  * send segment to peer (with FIN).
1428  */
1429 static struct tcpcb *
1430 tcp_disconnect(struct tcpcb *tp)
1431 {
1432 	struct socket *so = tp->t_inpcb->inp_socket;
1433 
1434 	if (tp->t_state < TCPS_ESTABLISHED)
1435 		tp = tcp_close(tp);
1436 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1437 		tp = tcp_drop(tp, 0);
1438 	else {
1439 		soisdisconnecting(so);
1440 		sbflush(&so->so_rcv.sb);
1441 		tp = tcp_usrclosed(tp);
1442 		if (tp)
1443 			tcp_output(tp);
1444 	}
1445 	return (tp);
1446 }
1447 
1448 /*
1449  * User issued close, and wish to trail through shutdown states:
1450  * if never received SYN, just forget it.  If got a SYN from peer,
1451  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1452  * If already got a FIN from peer, then almost done; go to LAST_ACK
1453  * state.  In all other cases, have already sent FIN to peer (e.g.
1454  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1455  * for peer to send FIN or not respond to keep-alives, etc.
1456  * We can let the user exit from the close as soon as the FIN is acked.
1457  */
1458 static struct tcpcb *
1459 tcp_usrclosed(struct tcpcb *tp)
1460 {
1461 
1462 	switch (tp->t_state) {
1463 
1464 	case TCPS_CLOSED:
1465 	case TCPS_LISTEN:
1466 		tp->t_state = TCPS_CLOSED;
1467 		tp = tcp_close(tp);
1468 		break;
1469 
1470 	case TCPS_SYN_SENT:
1471 	case TCPS_SYN_RECEIVED:
1472 		tp->t_flags |= TF_NEEDFIN;
1473 		break;
1474 
1475 	case TCPS_ESTABLISHED:
1476 		tp->t_state = TCPS_FIN_WAIT_1;
1477 		break;
1478 
1479 	case TCPS_CLOSE_WAIT:
1480 		tp->t_state = TCPS_LAST_ACK;
1481 		break;
1482 	}
1483 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1484 		soisdisconnected(tp->t_inpcb->inp_socket);
1485 		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1486 		if (tp->t_state == TCPS_FIN_WAIT_2) {
1487 			tcp_callout_reset(tp, tp->tt_2msl, tcp_maxidle,
1488 			    tcp_timer_2msl);
1489 		}
1490 	}
1491 	return (tp);
1492 }
1493