xref: /openbsd-src/sys/netinet/tcp_usrreq.c (revision daf88648c0e349d5c02e1504293082072c981640)
1 /*	$OpenBSD: tcp_usrreq.c,v 1.89 2005/03/04 13:21:42 markus Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  * 	This product includes software developed by the University of
46  * 	California, Berkeley and its contributors.
47  * 	This product includes software developed at the Information
48  * 	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/kernel.h>
81 
82 #include <dev/rndvar.h>
83 
84 #include <net/if.h>
85 #include <net/route.h>
86 
87 #include <netinet/in.h>
88 #include <netinet/in_systm.h>
89 #include <netinet/in_var.h>
90 #include <netinet/ip.h>
91 #include <netinet/in_pcb.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_fsm.h>
95 #include <netinet/tcp_seq.h>
96 #include <netinet/tcp_timer.h>
97 #include <netinet/tcp_var.h>
98 #include <netinet/tcpip.h>
99 #include <netinet/tcp_debug.h>
100 
101 /*
102  * TCP protocol interface to socket abstraction.
103  */
104 extern	char *tcpstates[];
105 extern	int tcptv_keep_init;
106 
107 extern int tcp_rst_ppslim;
108 
109 /* from in_pcb.c */
110 extern	struct baddynamicports baddynamicports;
111 
112 #ifndef TCP_SENDSPACE
113 #define	TCP_SENDSPACE	1024*16
114 #endif
115 u_int	tcp_sendspace = TCP_SENDSPACE;
116 #ifndef TCP_RECVSPACE
117 #define	TCP_RECVSPACE	1024*16
118 #endif
119 u_int	tcp_recvspace = TCP_RECVSPACE;
120 
121 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;
122 
123 struct	inpcbtable tcbtable;
124 
125 int tcp_ident(void *, size_t *, void *, size_t, int);
126 
127 #ifdef INET6
/*
 * IPv6 entry point for TCP user requests.  The request is handed
 * unchanged to tcp_usrreq(); the process pointer p is not used.
 */
int
tcp6_usrreq(so, req, m, nam, control, p)
	struct socket *so;
	int req;
	struct mbuf *m, *nam, *control;
	struct proc *p;
{
	int error;

	error = tcp_usrreq(so, req, m, nam, control);
	return (error);
}
138 #endif
139 
140 /*
141  * Process a TCP user request for TCP tb.  If this is a send request
142  * then m is the mbuf chain of send data.  If this is a timer expiration
143  * (called from the software clock routine), then timertype tells which timer.
144  */
145 /*ARGSUSED*/
146 int
147 tcp_usrreq(so, req, m, nam, control)
148 	struct socket *so;
149 	int req;
150 	struct mbuf *m, *nam, *control;
151 {
152 	struct sockaddr_in *sin;
153 	struct inpcb *inp;
154 	struct tcpcb *tp = NULL;
155 	int s;
156 	int error = 0;
157 	short ostate;
158 
159 	if (req == PRU_CONTROL) {
160 #ifdef INET6
161 		if (sotopf(so) == PF_INET6)
162 			return in6_control(so, (u_long)m, (caddr_t)nam,
163 			    (struct ifnet *)control, 0);
164 		else
165 #endif /* INET6 */
166 			return (in_control(so, (u_long)m, (caddr_t)nam,
167 			    (struct ifnet *)control));
168 	}
169 	if (control && control->m_len) {
170 		m_freem(control);
171 		if (m)
172 			m_freem(m);
173 		return (EINVAL);
174 	}
175 
176 	s = splsoftnet();
177 	inp = sotoinpcb(so);
178 	/*
179 	 * When a TCP is attached to a socket, then there will be
180 	 * a (struct inpcb) pointed at by the socket, and this
181 	 * structure will point at a subsidiary (struct tcpcb).
182 	 */
183 	if (inp == 0 && req != PRU_ATTACH) {
184 		splx(s);
185 		/*
186 		 * The following corrects an mbuf leak under rare
187 		 * circumstances
188 		 */
189 		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
190 			m_freem(m);
191 		return (EINVAL);		/* XXX */
192 	}
193 	if (inp) {
194 		tp = intotcpcb(inp);
195 		/* WHAT IF TP IS 0? */
196 #ifdef KPROF
197 		tcp_acounts[tp->t_state][req]++;
198 #endif
199 		ostate = tp->t_state;
200 	} else
201 		ostate = 0;
202 	switch (req) {
203 
204 	/*
205 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
206 	 * and an internet control block.
207 	 */
208 	case PRU_ATTACH:
209 		if (inp) {
210 			error = EISCONN;
211 			break;
212 		}
213 		error = tcp_attach(so);
214 		if (error)
215 			break;
216 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
217 			so->so_linger = TCP_LINGERTIME;
218 		tp = sototcpcb(so);
219 		break;
220 
221 	/*
222 	 * PRU_DETACH detaches the TCP protocol from the socket.
223 	 * If the protocol state is non-embryonic, then can't
224 	 * do this directly: have to initiate a PRU_DISCONNECT,
225 	 * which may finish later; embryonic TCB's can just
226 	 * be discarded here.
227 	 */
228 	case PRU_DETACH:
229 		tp = tcp_disconnect(tp);
230 		break;
231 
232 	/*
233 	 * Give the socket an address.
234 	 */
235 	case PRU_BIND:
236 #ifdef INET6
237 		if (inp->inp_flags & INP_IPV6)
238 			error = in6_pcbbind(inp, nam);
239 		else
240 #endif
241 			error = in_pcbbind(inp, nam);
242 		if (error)
243 			break;
244 		break;
245 
246 	/*
247 	 * Prepare to accept connections.
248 	 */
249 	case PRU_LISTEN:
250 		if (inp->inp_lport == 0) {
251 #ifdef INET6
252 			if (inp->inp_flags & INP_IPV6)
253 				error = in6_pcbbind(inp, NULL);
254 			else
255 #endif
256 				error = in_pcbbind(inp, NULL);
257 		}
258 		/* If the in_pcbbind() above is called, the tp->pf
259 		   should still be whatever it was before. */
260 		if (error == 0)
261 			tp->t_state = TCPS_LISTEN;
262 		break;
263 
264 	/*
265 	 * Initiate connection to peer.
266 	 * Create a template for use in transmissions on this connection.
267 	 * Enter SYN_SENT state, and mark socket as connecting.
268 	 * Start keep-alive timer, and seed output sequence space.
269 	 * Send initial segment on connection.
270 	 */
271 	case PRU_CONNECT:
272 		sin = mtod(nam, struct sockaddr_in *);
273 
274 #ifdef INET6
275 		if (sin->sin_family == AF_INET6) {
276 			struct in6_addr *in6_addr = &mtod(nam,
277 			    struct sockaddr_in6 *)->sin6_addr;
278 
279 			if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) ||
280 			    IN6_IS_ADDR_MULTICAST(in6_addr) ||
281 			    (IN6_IS_ADDR_V4MAPPED(in6_addr) &&
282 			    ((in6_addr->s6_addr32[3] == INADDR_ANY) ||
283 			    IN_MULTICAST(in6_addr->s6_addr32[3]) ||
284 			    in_broadcast(sin->sin_addr, NULL)))) {
285 				error = EINVAL;
286 				break;
287 			}
288 
289 			if (inp->inp_lport == 0) {
290 				error = in6_pcbbind(inp, NULL);
291 				if (error)
292 					break;
293 			}
294 			error = in6_pcbconnect(inp, nam);
295 		} else if (sin->sin_family == AF_INET)
296 #endif /* INET6 */
297 		{
298 			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
299 			    IN_MULTICAST(sin->sin_addr.s_addr) ||
300 			    in_broadcast(sin->sin_addr, NULL)) {
301 				error = EINVAL;
302 				break;
303 			}
304 
305 			if (inp->inp_lport == 0) {
306 				error = in_pcbbind(inp, NULL);
307 				if (error)
308 					break;
309 			}
310 			error = in_pcbconnect(inp, nam);
311 		}
312 
313 		if (error)
314 			break;
315 
316 		tp->t_template = tcp_template(tp);
317 		if (tp->t_template == 0) {
318 			in_pcbdisconnect(inp);
319 			error = ENOBUFS;
320 			break;
321 		}
322 
323 		so->so_state |= SS_CONNECTOUT;
324 
325 		/* initialise the timestamp modulator */
326 		if (tp->t_flags & TF_REQ_TSTMP)
327 			tp->ts_modulate = arc4random();
328 
329 		/* Compute window scaling to request.  */
330 		tcp_rscale(tp, so->so_rcv.sb_hiwat);
331 
332 		soisconnecting(so);
333 		tcpstat.tcps_connattempt++;
334 		tp->t_state = TCPS_SYN_SENT;
335 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
336 #ifdef TCP_COMPAT_42
337 		tp->iss = tcp_iss;
338 		tcp_iss += TCP_ISSINCR/2;
339 #else  /* TCP_COMPAT_42 */
340 		tp->iss = tcp_rndiss_next();
341 #endif /* !TCP_COMPAT_42 */
342 		tcp_sendseqinit(tp);
343 #if defined(TCP_SACK)
344 		tp->snd_last = tp->snd_una;
345 #endif
346 #if defined(TCP_SACK) && defined(TCP_FACK)
347 		tp->snd_fack = tp->snd_una;
348 		tp->retran_data = 0;
349 		tp->snd_awnd = 0;
350 #endif
351 		error = tcp_output(tp);
352 		break;
353 
354 	/*
355 	 * Create a TCP connection between two sockets.
356 	 */
357 	case PRU_CONNECT2:
358 		error = EOPNOTSUPP;
359 		break;
360 
361 	/*
362 	 * Initiate disconnect from peer.
363 	 * If connection never passed embryonic stage, just drop;
364 	 * else if don't need to let data drain, then can just drop anyways,
365 	 * else have to begin TCP shutdown process: mark socket disconnecting,
366 	 * drain unread data, state switch to reflect user close, and
367 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
368 	 * when peer sends FIN and acks ours.
369 	 *
370 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
371 	 */
372 	case PRU_DISCONNECT:
373 		tp = tcp_disconnect(tp);
374 		break;
375 
376 	/*
377 	 * Accept a connection.  Essentially all the work is
378 	 * done at higher levels; just return the address
379 	 * of the peer, storing through addr.
380 	 */
381 	case PRU_ACCEPT:
382 #ifdef INET6
383 		if (inp->inp_flags & INP_IPV6)
384 			in6_setpeeraddr(inp, nam);
385 		else
386 #endif
387 			in_setpeeraddr(inp, nam);
388 		break;
389 
390 	/*
391 	 * Mark the connection as being incapable of further output.
392 	 */
393 	case PRU_SHUTDOWN:
394 		if (so->so_state & SS_CANTSENDMORE)
395 			break;
396 		socantsendmore(so);
397 		tp = tcp_usrclosed(tp);
398 		if (tp)
399 			error = tcp_output(tp);
400 		break;
401 
402 	/*
403 	 * After a receive, possibly send window update to peer.
404 	 */
405 	case PRU_RCVD:
406 		/*
407 		 * soreceive() calls this function when a user receives
408 		 * ancillary data on a listening socket. We don't call
409 		 * tcp_output in such a case, since there is no header
410 		 * template for a listening socket and hence the kernel
411 		 * will panic.
412 		 */
413 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
414 			(void) tcp_output(tp);
415 		break;
416 
417 	/*
418 	 * Do a send by putting data in output queue and updating urgent
419 	 * marker if URG set.  Possibly send more data.
420 	 */
421 	case PRU_SEND:
422 		sbappendstream(&so->so_snd, m);
423 		error = tcp_output(tp);
424 		break;
425 
426 	/*
427 	 * Abort the TCP.
428 	 */
429 	case PRU_ABORT:
430 		tp = tcp_drop(tp, ECONNABORTED);
431 		break;
432 
433 	case PRU_SENSE:
434 		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
435 		splx(s);
436 		return (0);
437 
438 	case PRU_RCVOOB:
439 		if ((so->so_oobmark == 0 &&
440 		    (so->so_state & SS_RCVATMARK) == 0) ||
441 		    so->so_options & SO_OOBINLINE ||
442 		    tp->t_oobflags & TCPOOB_HADDATA) {
443 			error = EINVAL;
444 			break;
445 		}
446 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
447 			error = EWOULDBLOCK;
448 			break;
449 		}
450 		m->m_len = 1;
451 		*mtod(m, caddr_t) = tp->t_iobc;
452 		if (((long)nam & MSG_PEEK) == 0)
453 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
454 		break;
455 
456 	case PRU_SENDOOB:
457 		if (sbspace(&so->so_snd) < -512) {
458 			m_freem(m);
459 			error = ENOBUFS;
460 			break;
461 		}
462 		/*
463 		 * According to RFC961 (Assigned Protocols),
464 		 * the urgent pointer points to the last octet
465 		 * of urgent data.  We continue, however,
466 		 * to consider it to indicate the first octet
467 		 * of data past the urgent section.
468 		 * Otherwise, snd_up should be one lower.
469 		 */
470 		sbappendstream(&so->so_snd, m);
471 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
472 		tp->t_force = 1;
473 		error = tcp_output(tp);
474 		tp->t_force = 0;
475 		break;
476 
477 	case PRU_SOCKADDR:
478 #ifdef INET6
479 		if (inp->inp_flags & INP_IPV6)
480 			in6_setsockaddr(inp, nam);
481 		else
482 #endif
483 			in_setsockaddr(inp, nam);
484 		break;
485 
486 	case PRU_PEERADDR:
487 #ifdef INET6
488 		if (inp->inp_flags & INP_IPV6)
489 			in6_setpeeraddr(inp, nam);
490 		else
491 #endif
492 			in_setpeeraddr(inp, nam);
493 		break;
494 
495 	default:
496 		panic("tcp_usrreq");
497 	}
498 	if (tp && (so->so_options & SO_DEBUG))
499 		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
500 	splx(s);
501 	return (error);
502 }
503 
504 int
505 tcp_ctloutput(op, so, level, optname, mp)
506 	int op;
507 	struct socket *so;
508 	int level, optname;
509 	struct mbuf **mp;
510 {
511 	int error = 0, s;
512 	struct inpcb *inp;
513 	struct tcpcb *tp;
514 	struct mbuf *m;
515 	int i;
516 
517 	s = splsoftnet();
518 	inp = sotoinpcb(so);
519 	if (inp == NULL) {
520 		splx(s);
521 		if (op == PRCO_SETOPT && *mp)
522 			(void) m_free(*mp);
523 		return (ECONNRESET);
524 	}
525 #ifdef INET6
526 	tp = intotcpcb(inp);
527 #endif /* INET6 */
528 	if (level != IPPROTO_TCP) {
529 		switch (so->so_proto->pr_domain->dom_family) {
530 #ifdef INET6
531 		case PF_INET6:
532 			error = ip6_ctloutput(op, so, level, optname, mp);
533 			break;
534 #endif /* INET6 */
535 		case PF_INET:
536 			error = ip_ctloutput(op, so, level, optname, mp);
537 			break;
538 		default:
539 			error = EAFNOSUPPORT;	/*?*/
540 			break;
541 		}
542 		splx(s);
543 		return (error);
544 	}
545 #ifndef INET6
546 	tp = intotcpcb(inp);
547 #endif /* !INET6 */
548 
549 	switch (op) {
550 
551 	case PRCO_SETOPT:
552 		m = *mp;
553 		switch (optname) {
554 
555 		case TCP_NODELAY:
556 			if (m == NULL || m->m_len < sizeof (int))
557 				error = EINVAL;
558 			else if (*mtod(m, int *))
559 				tp->t_flags |= TF_NODELAY;
560 			else
561 				tp->t_flags &= ~TF_NODELAY;
562 			break;
563 
564 		case TCP_MAXSEG:
565 			if (m == NULL || m->m_len < sizeof (int)) {
566 				error = EINVAL;
567 				break;
568 			}
569 
570 			i = *mtod(m, int *);
571 			if (i > 0 && i <= tp->t_maxseg)
572 				tp->t_maxseg = i;
573 			else
574 				error = EINVAL;
575 			break;
576 
577 #ifdef TCP_SACK
578 		case TCP_SACK_ENABLE:
579 			if (m == NULL || m->m_len < sizeof (int)) {
580 				error = EINVAL;
581 				break;
582 			}
583 
584 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
585 				error = EPERM;
586 				break;
587 			}
588 
589 			if (tp->t_flags & TF_SIGNATURE) {
590 				error = EPERM;
591 				break;
592 			}
593 
594 			if (*mtod(m, int *))
595 				tp->sack_enable = 1;
596 			else
597 				tp->sack_enable = 0;
598 			break;
599 #endif
600 #ifdef TCP_SIGNATURE
601 		case TCP_MD5SIG:
602 			if (m == NULL || m->m_len < sizeof (int)) {
603 				error = EINVAL;
604 				break;
605 			}
606 
607 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
608 				error = EPERM;
609 				break;
610 			}
611 
612 			if (*mtod(m, int *)) {
613 				tp->t_flags |= TF_SIGNATURE;
614 #ifdef TCP_SACK
615 				tp->sack_enable = 0;
616 #endif /* TCP_SACK */
617 			} else
618 				tp->t_flags &= ~TF_SIGNATURE;
619 			break;
620 #endif /* TCP_SIGNATURE */
621 		default:
622 			error = ENOPROTOOPT;
623 			break;
624 		}
625 		if (m)
626 			(void) m_free(m);
627 		break;
628 
629 	case PRCO_GETOPT:
630 		*mp = m = m_get(M_WAIT, MT_SOOPTS);
631 		m->m_len = sizeof(int);
632 
633 		switch (optname) {
634 		case TCP_NODELAY:
635 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
636 			break;
637 		case TCP_MAXSEG:
638 			*mtod(m, int *) = tp->t_maxseg;
639 			break;
640 #ifdef TCP_SACK
641 		case TCP_SACK_ENABLE:
642 			*mtod(m, int *) = tp->sack_enable;
643 			break;
644 #endif
645 #ifdef TCP_SIGNATURE
646 		case TCP_MD5SIG:
647 			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
648 			break;
649 #endif
650 		default:
651 			error = ENOPROTOOPT;
652 			break;
653 		}
654 		break;
655 	}
656 	splx(s);
657 	return (error);
658 }
659 
660 /*
661  * Attach TCP protocol to socket, allocating
662  * internet protocol control block, tcp control block,
663  * bufer space, and entering LISTEN state if to accept connections.
664  */
665 int
666 tcp_attach(so)
667 	struct socket *so;
668 {
669 	struct tcpcb *tp;
670 	struct inpcb *inp;
671 	int error;
672 
673 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
674 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
675 		if (error)
676 			return (error);
677 	}
678 	error = in_pcballoc(so, &tcbtable);
679 	if (error)
680 		return (error);
681 	inp = sotoinpcb(so);
682 	tp = tcp_newtcpcb(inp);
683 	if (tp == NULL) {
684 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
685 
686 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
687 		in_pcbdetach(inp);
688 		so->so_state |= nofd;
689 		return (ENOBUFS);
690 	}
691 	tp->t_state = TCPS_CLOSED;
692 #ifdef INET6
693 	/* we disallow IPv4 mapped address completely. */
694 	if (inp->inp_flags & INP_IPV6)
695 		tp->pf = PF_INET6;
696 	else
697 		tp->pf = PF_INET;
698 #else
699 	tp->pf = PF_INET;
700 #endif
701 	return (0);
702 }
703 
704 /*
705  * Initiate (or continue) disconnect.
706  * If embryonic state, just send reset (once).
707  * If in ``let data drain'' option and linger null, just drop.
708  * Otherwise (hard), mark socket disconnecting and drop
709  * current input data; switch states based on user close, and
710  * send segment to peer (with FIN).
711  */
712 struct tcpcb *
713 tcp_disconnect(tp)
714 	struct tcpcb *tp;
715 {
716 	struct socket *so = tp->t_inpcb->inp_socket;
717 
718 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
719 		tp = tcp_close(tp);
720 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
721 		tp = tcp_drop(tp, 0);
722 	else {
723 		soisdisconnecting(so);
724 		sbflush(&so->so_rcv);
725 		tp = tcp_usrclosed(tp);
726 		if (tp)
727 			(void) tcp_output(tp);
728 	}
729 	return (tp);
730 }
731 
732 /*
733  * User issued close, and wish to trail through shutdown states:
734  * if never received SYN, just forget it.  If got a SYN from peer,
735  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
736  * If already got a FIN from peer, then almost done; go to LAST_ACK
737  * state.  In all other cases, have already sent FIN to peer (e.g.
738  * after PRU_SHUTDOWN), and just have to play tedious game waiting
739  * for peer to send FIN or not respond to keep-alives, etc.
740  * We can let the user exit from the close as soon as the FIN is acked.
741  */
742 struct tcpcb *
743 tcp_usrclosed(tp)
744 	struct tcpcb *tp;
745 {
746 
747 	switch (tp->t_state) {
748 
749 	case TCPS_CLOSED:
750 	case TCPS_LISTEN:
751 	case TCPS_SYN_SENT:
752 		tp->t_state = TCPS_CLOSED;
753 		tp = tcp_close(tp);
754 		break;
755 
756 	case TCPS_SYN_RECEIVED:
757 	case TCPS_ESTABLISHED:
758 		tp->t_state = TCPS_FIN_WAIT_1;
759 		break;
760 
761 	case TCPS_CLOSE_WAIT:
762 		tp->t_state = TCPS_LAST_ACK;
763 		break;
764 	}
765 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
766 		soisdisconnected(tp->t_inpcb->inp_socket);
767 		/*
768 		 * If we are in FIN_WAIT_2, we arrived here because the
769 		 * application did a shutdown of the send side.  Like the
770 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
771 		 * a full close, we start a timer to make sure sockets are
772 		 * not left in FIN_WAIT_2 forever.
773 		 */
774 		if (tp->t_state == TCPS_FIN_WAIT_2)
775 			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
776 	}
777 	return (tp);
778 }
779 
780 /*
781  * Look up a socket for ident or tcpdrop, ...
782  */
783 int
784 tcp_ident(oldp, oldlenp, newp, newlen, dodrop)
785 	void *oldp;
786 	size_t *oldlenp;
787 	void *newp;
788 	size_t newlen;
789 	int dodrop;
790 {
791 	int error = 0, s;
792 	struct tcp_ident_mapping tir;
793 	struct inpcb *inp;
794 	struct tcpcb *tp = NULL;
795 	struct sockaddr_in *fin, *lin;
796 #ifdef INET6
797 	struct sockaddr_in6 *fin6, *lin6;
798 	struct in6_addr f6, l6;
799 #endif
800 	if (dodrop) {
801 		if (oldp != NULL || *oldlenp != 0)
802 			return (EINVAL);
803 		if (newp == NULL)
804 			return (EPERM);
805 		if (newlen < sizeof(tir))
806 			return (ENOMEM);
807 		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
808 			return (error);
809 	} else {
810 		if (oldp == NULL)
811 			return (EINVAL);
812 		if (*oldlenp < sizeof(tir))
813 			return (ENOMEM);
814 		if (newp != NULL || newlen != 0)
815 			return (EINVAL);
816 		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
817 			return (error);
818 	}
819 	switch (tir.faddr.ss_family) {
820 #ifdef INET6
821 	case AF_INET6:
822 		fin6 = (struct sockaddr_in6 *)&tir.faddr;
823 		error = in6_embedscope(&f6, fin6, NULL, NULL);
824 		if (error)
825 			return EINVAL;	/*?*/
826 		lin6 = (struct sockaddr_in6 *)&tir.laddr;
827 		error = in6_embedscope(&l6, lin6, NULL, NULL);
828 		if (error)
829 			return EINVAL;	/*?*/
830 		break;
831 #endif
832 	case AF_INET:
833 	  	fin = (struct sockaddr_in *)&tir.faddr;
834 		lin = (struct sockaddr_in *)&tir.laddr;
835 		break;
836 	default:
837 		return (EINVAL);
838 	}
839 
840 	s = splsoftnet();
841 	switch (tir.faddr.ss_family) {
842 #ifdef INET6
843 	case AF_INET6:
844 		inp = in6_pcbhashlookup(&tcbtable, &f6,
845 		    fin6->sin6_port, &l6, lin6->sin6_port);
846 		break;
847 #endif
848 	case AF_INET:
849 		inp = in_pcbhashlookup(&tcbtable,  fin->sin_addr,
850 		    fin->sin_port, lin->sin_addr, lin->sin_port);
851 		break;
852 	}
853 
854 	if (dodrop) {
855 		if (inp && (tp = intotcpcb(inp)) &&
856 		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
857 			tp = tcp_drop(tp, ECONNABORTED);
858 		else
859 			error = ESRCH;
860 		splx(s);
861 		return (error);
862 	}
863 
864 	if (inp == NULL) {
865 		++tcpstat.tcps_pcbhashmiss;
866 		switch (tir.faddr.ss_family) {
867 #ifdef INET6
868 		case AF_INET6:
869 			inp = in6_pcblookup_listen(&tcbtable,
870 			    &l6, lin6->sin6_port, 0);
871 			break;
872 #endif
873 		case AF_INET:
874 			inp = in_pcblookup_listen(&tcbtable,
875 			    lin->sin_addr, lin->sin_port, 0);
876 			break;
877 		}
878 	}
879 
880 	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
881 		tir.ruid = inp->inp_socket->so_ruid;
882 		tir.euid = inp->inp_socket->so_euid;
883 	} else {
884 		tir.ruid = -1;
885 		tir.euid = -1;
886 	}
887 	splx(s);
888 
889 	*oldlenp = sizeof (tir);
890 	error = copyout((void *)&tir, oldp, sizeof (tir));
891 	return (error);
892 }
893 
894 /*
895  * Sysctl for tcp variables.
896  */
897 int
898 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
899 	int *name;
900 	u_int namelen;
901 	void *oldp;
902 	size_t *oldlenp;
903 	void *newp;
904 	size_t newlen;
905 {
906 	int error, nval;
907 
908 	/* All sysctl names at this level are terminal. */
909 	if (namelen != 1)
910 		return (ENOTDIR);
911 
912 	switch (name[0]) {
913 #ifdef TCP_SACK
914 	case TCPCTL_SACK:
915 		return (sysctl_int(oldp, oldlenp, newp, newlen,
916 		    &tcp_do_sack));
917 #endif
918 	case TCPCTL_SLOWHZ:
919 		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));
920 
921 	case TCPCTL_BADDYNAMIC:
922 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
923 		    baddynamicports.tcp, sizeof(baddynamicports.tcp)));
924 
925 	case TCPCTL_IDENT:
926 		return (tcp_ident(oldp, oldlenp, newp, newlen, 0));
927 
928 	case TCPCTL_DROP:
929 		return (tcp_ident(oldp, oldlenp, newp, newlen, 1));
930 
931 #ifdef TCP_ECN
932 	case TCPCTL_ECN:
933 		return (sysctl_int(oldp, oldlenp, newp, newlen,
934 		   &tcp_do_ecn));
935 #endif
936 	case TCPCTL_REASS_LIMIT:
937 		nval = tcp_reass_limit;
938 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
939 		if (error)
940 			return (error);
941 		if (nval != tcp_reass_limit) {
942 			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
943 			if (error)
944 				return (error);
945 			tcp_reass_limit = nval;
946 		}
947 		return (0);
948 #ifdef TCP_SACK
949 	case TCPCTL_SACKHOLE_LIMIT:
950 		nval = tcp_sackhole_limit;
951 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
952 		if (error)
953 			return (error);
954 		if (nval != tcp_sackhole_limit) {
955 			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
956 			if (error)
957 				return (error);
958 			tcp_sackhole_limit = nval;
959 		}
960 		return (0);
961 #endif
962 	default:
963 		if (name[0] < TCPCTL_MAXID)
964 			return (sysctl_int_arr(tcpctl_vars, name, namelen,
965 			    oldp, oldlenp, newp, newlen));
966 		return (ENOPROTOOPT);
967 	}
968 	/* NOTREACHED */
969 }
970