xref: /openbsd-src/sys/netinet/tcp_usrreq.c (revision 5ad04d351680822078003e2b066cfc9680d6157d)
1 /*	$OpenBSD: tcp_usrreq.c,v 1.118 2014/04/06 16:49:40 chrisz Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  * 	This product includes software developed by the University of
46  * 	California, Berkeley and its contributors.
47  * 	This product includes software developed at the Information
48  * 	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/proc.h>
79 #include <sys/sysctl.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/pool.h>
83 
84 #include <dev/rndvar.h>
85 
86 #include <net/if.h>
87 #include <net/route.h>
88 
89 #include <netinet/in.h>
90 #include <netinet/in_systm.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip.h>
93 #include <netinet/in_pcb.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_timer.h>
99 #include <netinet/tcp_var.h>
100 #include <netinet/tcpip.h>
101 #include <netinet/tcp_debug.h>
102 
103 #ifdef INET6
104 #include <netinet6/in6_var.h>
105 #endif
106 
/*
 * Default send and receive buffer reservations for TCP sockets.  Both
 * macros may be overridden at build time.  Their expansions are
 * parenthesized so that they act as a single value when used inside a
 * larger expression (e.g. "x % TCP_SENDSPACE").
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024*16)
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024*16)
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
/* amount added to an auto-scaled receive buffer by tcp_update_rcvspace() */
u_int	tcp_autorcvbuf_inc = 16 * 1024;
116 
117 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;
118 
119 struct	inpcbtable tcbtable;
120 
121 int tcp_ident(void *, size_t *, void *, size_t, int);
122 
123 /*
124  * Process a TCP user request for TCP tb.  If this is a send request
125  * then m is the mbuf chain of send data.  If this is a timer expiration
126  * (called from the software clock routine), then timertype tells which timer.
127  */
128 /*ARGSUSED*/
129 int
130 tcp_usrreq(so, req, m, nam, control, p)
131 	struct socket *so;
132 	int req;
133 	struct mbuf *m, *nam, *control;
134 	struct proc *p;
135 {
136 	struct sockaddr_in *sin;
137 	struct inpcb *inp;
138 	struct tcpcb *tp = NULL;
139 	int s;
140 	int error = 0;
141 	short ostate;
142 
143 	if (req == PRU_CONTROL) {
144 #ifdef INET6
145 		if (sotopf(so) == PF_INET6)
146 			return in6_control(so, (u_long)m, (caddr_t)nam,
147 			    (struct ifnet *)control);
148 		else
149 #endif /* INET6 */
150 			return (in_control(so, (u_long)m, (caddr_t)nam,
151 			    (struct ifnet *)control));
152 	}
153 	if (control && control->m_len) {
154 		m_freem(control);
155 		if (m)
156 			m_freem(m);
157 		return (EINVAL);
158 	}
159 
160 	s = splsoftnet();
161 	inp = sotoinpcb(so);
162 	/*
163 	 * When a TCP is attached to a socket, then there will be
164 	 * a (struct inpcb) pointed at by the socket, and this
165 	 * structure will point at a subsidiary (struct tcpcb).
166 	 */
167 	if (inp == 0 && req != PRU_ATTACH) {
168 		error = so->so_error;
169 		if (error == 0)
170 			error = EINVAL;
171 		splx(s);
172 		/*
173 		 * The following corrects an mbuf leak under rare
174 		 * circumstances
175 		 */
176 		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
177 			m_freem(m);
178 		return (error);
179 	}
180 	if (inp) {
181 		tp = intotcpcb(inp);
182 		/* tp might get 0 when using socket splicing */
183 		if (tp == NULL) {
184 			splx(s);
185 			return (0);
186 		}
187 #ifdef KPROF
188 		tcp_acounts[tp->t_state][req]++;
189 #endif
190 		ostate = tp->t_state;
191 	} else
192 		ostate = 0;
193 	switch (req) {
194 
195 	/*
196 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
197 	 * and an internet control block.
198 	 */
199 	case PRU_ATTACH:
200 		if (inp) {
201 			error = EISCONN;
202 			break;
203 		}
204 		error = tcp_attach(so);
205 		if (error)
206 			break;
207 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
208 			so->so_linger = TCP_LINGERTIME;
209 		tp = sototcpcb(so);
210 		break;
211 
212 	/*
213 	 * PRU_DETACH detaches the TCP protocol from the socket.
214 	 * If the protocol state is non-embryonic, then can't
215 	 * do this directly: have to initiate a PRU_DISCONNECT,
216 	 * which may finish later; embryonic TCB's can just
217 	 * be discarded here.
218 	 */
219 	case PRU_DETACH:
220 		tp = tcp_disconnect(tp);
221 		break;
222 
223 	/*
224 	 * Give the socket an address.
225 	 */
226 	case PRU_BIND:
227 #ifdef INET6
228 		if (inp->inp_flags & INP_IPV6)
229 			error = in6_pcbbind(inp, nam, p);
230 		else
231 #endif
232 			error = in_pcbbind(inp, nam, p);
233 		if (error)
234 			break;
235 		break;
236 
237 	/*
238 	 * Prepare to accept connections.
239 	 */
240 	case PRU_LISTEN:
241 		if (inp->inp_lport == 0) {
242 #ifdef INET6
243 			if (inp->inp_flags & INP_IPV6)
244 				error = in6_pcbbind(inp, NULL, p);
245 			else
246 #endif
247 				error = in_pcbbind(inp, NULL, p);
248 		}
249 		/* If the in_pcbbind() above is called, the tp->pf
250 		   should still be whatever it was before. */
251 		if (error == 0)
252 			tp->t_state = TCPS_LISTEN;
253 		break;
254 
255 	/*
256 	 * Initiate connection to peer.
257 	 * Create a template for use in transmissions on this connection.
258 	 * Enter SYN_SENT state, and mark socket as connecting.
259 	 * Start keep-alive timer, and seed output sequence space.
260 	 * Send initial segment on connection.
261 	 */
262 	case PRU_CONNECT:
263 		sin = mtod(nam, struct sockaddr_in *);
264 
265 #ifdef INET6
266 		if (sin->sin_family == AF_INET6) {
267 			struct in6_addr *in6_addr = &mtod(nam,
268 			    struct sockaddr_in6 *)->sin6_addr;
269 
270 			if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) ||
271 			    IN6_IS_ADDR_MULTICAST(in6_addr) ||
272 			    IN6_IS_ADDR_V4MAPPED(in6_addr)) {
273 				error = EINVAL;
274 				break;
275 			}
276 
277 			error = in6_pcbconnect(inp, nam);
278 		} else if (sin->sin_family == AF_INET)
279 #endif /* INET6 */
280 		{
281 			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
282 			    IN_MULTICAST(sin->sin_addr.s_addr) ||
283 			    in_broadcast(sin->sin_addr, NULL,
284 			    inp->inp_rtableid)) {
285 				error = EINVAL;
286 				break;
287 			}
288 
289 			error = in_pcbconnect(inp, nam);
290 		}
291 
292 		if (error)
293 			break;
294 
295 		tp->t_template = tcp_template(tp);
296 		if (tp->t_template == 0) {
297 			in_pcbdisconnect(inp);
298 			error = ENOBUFS;
299 			break;
300 		}
301 
302 		so->so_state |= SS_CONNECTOUT;
303 
304 		/* Compute window scaling to request.  */
305 		tcp_rscale(tp, sb_max);
306 
307 		soisconnecting(so);
308 		tcpstat.tcps_connattempt++;
309 		tp->t_state = TCPS_SYN_SENT;
310 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
311 		tcp_set_iss_tsm(tp);
312 		tcp_sendseqinit(tp);
313 #if defined(TCP_SACK)
314 		tp->snd_last = tp->snd_una;
315 #endif
316 #if defined(TCP_SACK) && defined(TCP_FACK)
317 		tp->snd_fack = tp->snd_una;
318 		tp->retran_data = 0;
319 		tp->snd_awnd = 0;
320 #endif
321 		error = tcp_output(tp);
322 		break;
323 
324 	/*
325 	 * Create a TCP connection between two sockets.
326 	 */
327 	case PRU_CONNECT2:
328 		error = EOPNOTSUPP;
329 		break;
330 
331 	/*
332 	 * Initiate disconnect from peer.
333 	 * If connection never passed embryonic stage, just drop;
334 	 * else if don't need to let data drain, then can just drop anyways,
335 	 * else have to begin TCP shutdown process: mark socket disconnecting,
336 	 * drain unread data, state switch to reflect user close, and
337 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
338 	 * when peer sends FIN and acks ours.
339 	 *
340 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
341 	 */
342 	case PRU_DISCONNECT:
343 		tp = tcp_disconnect(tp);
344 		break;
345 
346 	/*
347 	 * Accept a connection.  Essentially all the work is
348 	 * done at higher levels; just return the address
349 	 * of the peer, storing through addr.
350 	 */
351 	case PRU_ACCEPT:
352 #ifdef INET6
353 		if (inp->inp_flags & INP_IPV6)
354 			in6_setpeeraddr(inp, nam);
355 		else
356 #endif
357 			in_setpeeraddr(inp, nam);
358 		break;
359 
360 	/*
361 	 * Mark the connection as being incapable of further output.
362 	 */
363 	case PRU_SHUTDOWN:
364 		if (so->so_state & SS_CANTSENDMORE)
365 			break;
366 		socantsendmore(so);
367 		tp = tcp_usrclosed(tp);
368 		if (tp)
369 			error = tcp_output(tp);
370 		break;
371 
372 	/*
373 	 * After a receive, possibly send window update to peer.
374 	 */
375 	case PRU_RCVD:
376 		/*
377 		 * soreceive() calls this function when a user receives
378 		 * ancillary data on a listening socket. We don't call
379 		 * tcp_output in such a case, since there is no header
380 		 * template for a listening socket and hence the kernel
381 		 * will panic.
382 		 */
383 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
384 			(void) tcp_output(tp);
385 		break;
386 
387 	/*
388 	 * Do a send by putting data in output queue and updating urgent
389 	 * marker if URG set.  Possibly send more data.
390 	 */
391 	case PRU_SEND:
392 		sbappendstream(&so->so_snd, m);
393 		error = tcp_output(tp);
394 		break;
395 
396 	/*
397 	 * Abort the TCP.
398 	 */
399 	case PRU_ABORT:
400 		tp = tcp_drop(tp, ECONNABORTED);
401 		break;
402 
403 	case PRU_SENSE:
404 		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
405 		splx(s);
406 		return (0);
407 
408 	case PRU_RCVOOB:
409 		if ((so->so_oobmark == 0 &&
410 		    (so->so_state & SS_RCVATMARK) == 0) ||
411 		    so->so_options & SO_OOBINLINE ||
412 		    tp->t_oobflags & TCPOOB_HADDATA) {
413 			error = EINVAL;
414 			break;
415 		}
416 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
417 			error = EWOULDBLOCK;
418 			break;
419 		}
420 		m->m_len = 1;
421 		*mtod(m, caddr_t) = tp->t_iobc;
422 		if (((long)nam & MSG_PEEK) == 0)
423 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
424 		break;
425 
426 	case PRU_SENDOOB:
427 		if (sbspace(&so->so_snd) < -512) {
428 			m_freem(m);
429 			error = ENOBUFS;
430 			break;
431 		}
432 		/*
433 		 * According to RFC961 (Assigned Protocols),
434 		 * the urgent pointer points to the last octet
435 		 * of urgent data.  We continue, however,
436 		 * to consider it to indicate the first octet
437 		 * of data past the urgent section.
438 		 * Otherwise, snd_up should be one lower.
439 		 */
440 		sbappendstream(&so->so_snd, m);
441 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
442 		tp->t_force = 1;
443 		error = tcp_output(tp);
444 		tp->t_force = 0;
445 		break;
446 
447 	case PRU_SOCKADDR:
448 #ifdef INET6
449 		if (inp->inp_flags & INP_IPV6)
450 			in6_setsockaddr(inp, nam);
451 		else
452 #endif
453 			in_setsockaddr(inp, nam);
454 		break;
455 
456 	case PRU_PEERADDR:
457 #ifdef INET6
458 		if (inp->inp_flags & INP_IPV6)
459 			in6_setpeeraddr(inp, nam);
460 		else
461 #endif
462 			in_setpeeraddr(inp, nam);
463 		break;
464 
465 	default:
466 		panic("tcp_usrreq");
467 	}
468 	if (tp && (so->so_options & SO_DEBUG))
469 		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
470 	splx(s);
471 	return (error);
472 }
473 
474 int
475 tcp_ctloutput(op, so, level, optname, mp)
476 	int op;
477 	struct socket *so;
478 	int level, optname;
479 	struct mbuf **mp;
480 {
481 	int error = 0, s;
482 	struct inpcb *inp;
483 	struct tcpcb *tp;
484 	struct mbuf *m;
485 	int i;
486 
487 	s = splsoftnet();
488 	inp = sotoinpcb(so);
489 	if (inp == NULL) {
490 		splx(s);
491 		if (op == PRCO_SETOPT && *mp)
492 			(void) m_free(*mp);
493 		return (ECONNRESET);
494 	}
495 	if (level != IPPROTO_TCP) {
496 		switch (so->so_proto->pr_domain->dom_family) {
497 #ifdef INET6
498 		case PF_INET6:
499 			error = ip6_ctloutput(op, so, level, optname, mp);
500 			break;
501 #endif /* INET6 */
502 		case PF_INET:
503 			error = ip_ctloutput(op, so, level, optname, mp);
504 			break;
505 		default:
506 			error = EAFNOSUPPORT;	/*?*/
507 			break;
508 		}
509 		splx(s);
510 		return (error);
511 	}
512 	tp = intotcpcb(inp);
513 
514 	switch (op) {
515 
516 	case PRCO_SETOPT:
517 		m = *mp;
518 		switch (optname) {
519 
520 		case TCP_NODELAY:
521 			if (m == NULL || m->m_len < sizeof (int))
522 				error = EINVAL;
523 			else if (*mtod(m, int *))
524 				tp->t_flags |= TF_NODELAY;
525 			else
526 				tp->t_flags &= ~TF_NODELAY;
527 			break;
528 
529 		case TCP_NOPUSH:
530 			if (m == NULL || m->m_len < sizeof (int))
531 				error = EINVAL;
532 			else if (*mtod(m, int *))
533 				tp->t_flags |= TF_NOPUSH;
534 			else if (tp->t_flags & TF_NOPUSH) {
535 				tp->t_flags &= ~TF_NOPUSH;
536 				if (TCPS_HAVEESTABLISHED(tp->t_state))
537 					error = tcp_output(tp);
538 			}
539 			break;
540 
541 		case TCP_MAXSEG:
542 			if (m == NULL || m->m_len < sizeof (int)) {
543 				error = EINVAL;
544 				break;
545 			}
546 
547 			i = *mtod(m, int *);
548 			if (i > 0 && i <= tp->t_maxseg)
549 				tp->t_maxseg = i;
550 			else
551 				error = EINVAL;
552 			break;
553 
554 #ifdef TCP_SACK
555 		case TCP_SACK_ENABLE:
556 			if (m == NULL || m->m_len < sizeof (int)) {
557 				error = EINVAL;
558 				break;
559 			}
560 
561 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
562 				error = EPERM;
563 				break;
564 			}
565 
566 			if (tp->t_flags & TF_SIGNATURE) {
567 				error = EPERM;
568 				break;
569 			}
570 
571 			if (*mtod(m, int *))
572 				tp->sack_enable = 1;
573 			else
574 				tp->sack_enable = 0;
575 			break;
576 #endif
577 #ifdef TCP_SIGNATURE
578 		case TCP_MD5SIG:
579 			if (m == NULL || m->m_len < sizeof (int)) {
580 				error = EINVAL;
581 				break;
582 			}
583 
584 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
585 				error = EPERM;
586 				break;
587 			}
588 
589 			if (*mtod(m, int *)) {
590 				tp->t_flags |= TF_SIGNATURE;
591 #ifdef TCP_SACK
592 				tp->sack_enable = 0;
593 #endif /* TCP_SACK */
594 			} else
595 				tp->t_flags &= ~TF_SIGNATURE;
596 			break;
597 #endif /* TCP_SIGNATURE */
598 		default:
599 			error = ENOPROTOOPT;
600 			break;
601 		}
602 		if (m)
603 			(void) m_free(m);
604 		break;
605 
606 	case PRCO_GETOPT:
607 		*mp = m = m_get(M_WAIT, MT_SOOPTS);
608 		m->m_len = sizeof(int);
609 
610 		switch (optname) {
611 		case TCP_NODELAY:
612 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
613 			break;
614 		case TCP_NOPUSH:
615 			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
616 			break;
617 		case TCP_MAXSEG:
618 			*mtod(m, int *) = tp->t_maxseg;
619 			break;
620 #ifdef TCP_SACK
621 		case TCP_SACK_ENABLE:
622 			*mtod(m, int *) = tp->sack_enable;
623 			break;
624 #endif
625 #ifdef TCP_SIGNATURE
626 		case TCP_MD5SIG:
627 			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
628 			break;
629 #endif
630 		default:
631 			error = ENOPROTOOPT;
632 			break;
633 		}
634 		break;
635 	}
636 	splx(s);
637 	return (error);
638 }
639 
640 /*
641  * Attach TCP protocol to socket, allocating
642  * internet protocol control block, tcp control block,
643  * bufer space, and entering LISTEN state if to accept connections.
644  */
645 int
646 tcp_attach(so)
647 	struct socket *so;
648 {
649 	struct tcpcb *tp;
650 	struct inpcb *inp;
651 	int error;
652 
653 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
654 	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
655 	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
656 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
657 		if (error)
658 			return (error);
659 	}
660 
661 	error = in_pcballoc(so, &tcbtable);
662 	if (error)
663 		return (error);
664 	inp = sotoinpcb(so);
665 	tp = tcp_newtcpcb(inp);
666 	if (tp == NULL) {
667 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
668 
669 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
670 		in_pcbdetach(inp);
671 		so->so_state |= nofd;
672 		return (ENOBUFS);
673 	}
674 	tp->t_state = TCPS_CLOSED;
675 #ifdef INET6
676 	/* we disallow IPv4 mapped address completely. */
677 	if (inp->inp_flags & INP_IPV6)
678 		tp->pf = PF_INET6;
679 	else
680 		tp->pf = PF_INET;
681 #else
682 	tp->pf = PF_INET;
683 #endif
684 	return (0);
685 }
686 
687 /*
688  * Initiate (or continue) disconnect.
689  * If embryonic state, just send reset (once).
690  * If in ``let data drain'' option and linger null, just drop.
691  * Otherwise (hard), mark socket disconnecting and drop
692  * current input data; switch states based on user close, and
693  * send segment to peer (with FIN).
694  */
695 struct tcpcb *
696 tcp_disconnect(tp)
697 	struct tcpcb *tp;
698 {
699 	struct socket *so = tp->t_inpcb->inp_socket;
700 
701 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
702 		tp = tcp_close(tp);
703 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
704 		tp = tcp_drop(tp, 0);
705 	else {
706 		soisdisconnecting(so);
707 		sbflush(&so->so_rcv);
708 		tp = tcp_usrclosed(tp);
709 		if (tp)
710 			(void) tcp_output(tp);
711 	}
712 	return (tp);
713 }
714 
715 /*
716  * User issued close, and wish to trail through shutdown states:
717  * if never received SYN, just forget it.  If got a SYN from peer,
718  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
719  * If already got a FIN from peer, then almost done; go to LAST_ACK
720  * state.  In all other cases, have already sent FIN to peer (e.g.
721  * after PRU_SHUTDOWN), and just have to play tedious game waiting
722  * for peer to send FIN or not respond to keep-alives, etc.
723  * We can let the user exit from the close as soon as the FIN is acked.
724  */
725 struct tcpcb *
726 tcp_usrclosed(tp)
727 	struct tcpcb *tp;
728 {
729 
730 	switch (tp->t_state) {
731 
732 	case TCPS_CLOSED:
733 	case TCPS_LISTEN:
734 	case TCPS_SYN_SENT:
735 		tp->t_state = TCPS_CLOSED;
736 		tp = tcp_close(tp);
737 		break;
738 
739 	case TCPS_SYN_RECEIVED:
740 	case TCPS_ESTABLISHED:
741 		tp->t_state = TCPS_FIN_WAIT_1;
742 		break;
743 
744 	case TCPS_CLOSE_WAIT:
745 		tp->t_state = TCPS_LAST_ACK;
746 		break;
747 	}
748 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
749 		soisdisconnected(tp->t_inpcb->inp_socket);
750 		/*
751 		 * If we are in FIN_WAIT_2, we arrived here because the
752 		 * application did a shutdown of the send side.  Like the
753 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
754 		 * a full close, we start a timer to make sure sockets are
755 		 * not left in FIN_WAIT_2 forever.
756 		 */
757 		if (tp->t_state == TCPS_FIN_WAIT_2)
758 			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
759 	}
760 	return (tp);
761 }
762 
763 /*
764  * Look up a socket for ident or tcpdrop, ...
765  */
766 int
767 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
768 {
769 	int error = 0, s;
770 	struct tcp_ident_mapping tir;
771 	struct inpcb *inp;
772 	struct tcpcb *tp = NULL;
773 	struct sockaddr_in *fin, *lin;
774 #ifdef INET6
775 	struct sockaddr_in6 *fin6, *lin6;
776 	struct in6_addr f6, l6;
777 #endif
778 	if (dodrop) {
779 		if (oldp != NULL || *oldlenp != 0)
780 			return (EINVAL);
781 		if (newp == NULL)
782 			return (EPERM);
783 		if (newlen < sizeof(tir))
784 			return (ENOMEM);
785 		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
786 			return (error);
787 	} else {
788 		if (oldp == NULL)
789 			return (EINVAL);
790 		if (*oldlenp < sizeof(tir))
791 			return (ENOMEM);
792 		if (newp != NULL || newlen != 0)
793 			return (EINVAL);
794 		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
795 			return (error);
796 	}
797 	switch (tir.faddr.ss_family) {
798 #ifdef INET6
799 	case AF_INET6:
800 		fin6 = (struct sockaddr_in6 *)&tir.faddr;
801 		error = in6_embedscope(&f6, fin6, NULL, NULL);
802 		if (error)
803 			return EINVAL;	/*?*/
804 		lin6 = (struct sockaddr_in6 *)&tir.laddr;
805 		error = in6_embedscope(&l6, lin6, NULL, NULL);
806 		if (error)
807 			return EINVAL;	/*?*/
808 		break;
809 #endif
810 	case AF_INET:
811 	  	fin = (struct sockaddr_in *)&tir.faddr;
812 		lin = (struct sockaddr_in *)&tir.laddr;
813 		break;
814 	default:
815 		return (EINVAL);
816 	}
817 
818 	s = splsoftnet();
819 	switch (tir.faddr.ss_family) {
820 #ifdef INET6
821 	case AF_INET6:
822 		inp = in6_pcbhashlookup(&tcbtable, &f6,
823 		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
824 		break;
825 #endif
826 	case AF_INET:
827 		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
828 		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
829 		break;
830 	}
831 
832 	if (dodrop) {
833 		if (inp && (tp = intotcpcb(inp)) &&
834 		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
835 			tp = tcp_drop(tp, ECONNABORTED);
836 		else
837 			error = ESRCH;
838 		splx(s);
839 		return (error);
840 	}
841 
842 	if (inp == NULL) {
843 		++tcpstat.tcps_pcbhashmiss;
844 		switch (tir.faddr.ss_family) {
845 #ifdef INET6
846 		case AF_INET6:
847 			inp = in6_pcblookup_listen(&tcbtable,
848 			    &l6, lin6->sin6_port, 0, NULL, tir.rdomain);
849 			break;
850 #endif
851 		case AF_INET:
852 			inp = in_pcblookup_listen(&tcbtable,
853 			    lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain);
854 			break;
855 		}
856 	}
857 
858 	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
859 		tir.ruid = inp->inp_socket->so_ruid;
860 		tir.euid = inp->inp_socket->so_euid;
861 	} else {
862 		tir.ruid = -1;
863 		tir.euid = -1;
864 	}
865 	splx(s);
866 
867 	*oldlenp = sizeof (tir);
868 	error = copyout((void *)&tir, oldp, sizeof (tir));
869 	return (error);
870 }
871 
872 /*
873  * Sysctl for tcp variables.
874  */
875 int
876 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
877 	int *name;
878 	u_int namelen;
879 	void *oldp;
880 	size_t *oldlenp;
881 	void *newp;
882 	size_t newlen;
883 {
884 	int error, nval;
885 
886 	/* All sysctl names at this level are terminal. */
887 	if (namelen != 1)
888 		return (ENOTDIR);
889 
890 	switch (name[0]) {
891 #ifdef TCP_SACK
892 	case TCPCTL_SACK:
893 		return (sysctl_int(oldp, oldlenp, newp, newlen,
894 		    &tcp_do_sack));
895 #endif
896 	case TCPCTL_SLOWHZ:
897 		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));
898 
899 	case TCPCTL_BADDYNAMIC:
900 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
901 		    baddynamicports.tcp, sizeof(baddynamicports.tcp)));
902 
903 	case TCPCTL_IDENT:
904 		return (tcp_ident(oldp, oldlenp, newp, newlen, 0));
905 
906 	case TCPCTL_DROP:
907 		return (tcp_ident(oldp, oldlenp, newp, newlen, 1));
908 
909 	case TCPCTL_ALWAYS_KEEPALIVE:
910 		return (sysctl_int(oldp, oldlenp, newp, newlen,
911 		    &tcp_always_keepalive));
912 
913 #ifdef TCP_ECN
914 	case TCPCTL_ECN:
915 		return (sysctl_int(oldp, oldlenp, newp, newlen,
916 		   &tcp_do_ecn));
917 #endif
918 	case TCPCTL_REASS_LIMIT:
919 		nval = tcp_reass_limit;
920 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
921 		if (error)
922 			return (error);
923 		if (nval != tcp_reass_limit) {
924 			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
925 			if (error)
926 				return (error);
927 			tcp_reass_limit = nval;
928 		}
929 		return (0);
930 #ifdef TCP_SACK
931 	case TCPCTL_SACKHOLE_LIMIT:
932 		nval = tcp_sackhole_limit;
933 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
934 		if (error)
935 			return (error);
936 		if (nval != tcp_sackhole_limit) {
937 			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
938 			if (error)
939 				return (error);
940 			tcp_sackhole_limit = nval;
941 		}
942 		return (0);
943 #endif
944 
945 	case TCPCTL_STATS:
946 		if (newp != NULL)
947 			return (EPERM);
948 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
949 		    &tcpstat, sizeof(tcpstat)));
950 
951 	default:
952 		if (name[0] < TCPCTL_MAXID)
953 			return (sysctl_int_arr(tcpctl_vars, name, namelen,
954 			    oldp, oldlenp, newp, newlen));
955 		return (ENOPROTOOPT);
956 	}
957 	/* NOTREACHED */
958 }
959 
960 /*
961  * Scale the send buffer so that inflight data is not accounted against
962  * the limit. The buffer will scale with the congestion window, if the
963  * the receiver stops acking data the window will shrink and therefor
964  * the buffer size will shrink as well.
965  * In low memory situation try to shrink the buffer to the initial size
966  * disabling the send buffer scaling as long as the situation persists.
967  */
968 void
969 tcp_update_sndspace(struct tcpcb *tp)
970 {
971 	struct socket *so = tp->t_inpcb->inp_socket;
972 	u_long nmax;
973 
974 	if (sbchecklowmem())
975 		/* low on memory try to get rid of some */
976 		nmax = tcp_sendspace;
977 	else if (so->so_snd.sb_wat != tcp_sendspace)
978 		/* user requested buffer size, auto-scaling disabled */
979 		nmax = so->so_snd.sb_wat;
980 	else
981 		/* automatic buffer scaling */
982 		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
983 		    tp->snd_una);
984 
985 	/* round to MSS boundary */
986 	nmax = roundup(nmax, tp->t_maxseg);
987 
988 	if (nmax != so->so_snd.sb_hiwat)
989 		sbreserve(&so->so_snd, nmax);
990 }
991 
992 /*
993  * Scale the recv buffer by looking at how much data was transferred in
994  * on approximated RTT. If more then a big part of the recv buffer was
995  * transferred during that time we increase the buffer by a constant.
996  * In low memory situation try to shrink the buffer to the initial size.
997  */
998 void
999 tcp_update_rcvspace(struct tcpcb *tp)
1000 {
1001 	struct socket *so = tp->t_inpcb->inp_socket;
1002 	u_long nmax = so->so_rcv.sb_hiwat;
1003 
1004 	if (sbchecklowmem())
1005 		/* low on memory try to get rid of some */
1006 		nmax = tcp_recvspace;
1007 	else if (so->so_rcv.sb_wat != tcp_recvspace)
1008 		/* user requested buffer size, auto-scaling disabled */
1009 		nmax = so->so_rcv.sb_wat;
1010 	else {
1011 		/* automatic buffer scaling */
1012 		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1013 			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1014 			    tcp_autorcvbuf_inc);
1015 	}
1016 
1017 	if (nmax == so->so_rcv.sb_hiwat)
1018 		return;
1019 
1020 	/* round to MSS boundary */
1021 	nmax = roundup(nmax, tp->t_maxseg);
1022 	sbreserve(&so->so_rcv, nmax);
1023 }
1024