xref: /openbsd-src/sys/netinet/tcp_usrreq.c (revision 48950c12d106c85f315112191a0228d7b83b9510)
1 /*	$OpenBSD: tcp_usrreq.c,v 1.110 2012/02/24 06:19:00 guenther Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  * 	This product includes software developed by the University of
46  * 	California, Berkeley and its contributors.
47  * 	This product includes software developed at the Information
48  * 	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/proc.h>
79 #include <sys/sysctl.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/pool.h>
83 
84 #include <dev/rndvar.h>
85 
86 #include <net/if.h>
87 #include <net/route.h>
88 
89 #include <netinet/in.h>
90 #include <netinet/in_systm.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip.h>
93 #include <netinet/in_pcb.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_timer.h>
99 #include <netinet/tcp_var.h>
100 #include <netinet/tcpip.h>
101 #include <netinet/tcp_debug.h>
102 
103 /*
104  * TCP protocol interface to socket abstraction.
105  */
106 extern	char *tcpstates[];
107 extern	int tcptv_keep_init;
108 
109 extern int tcp_rst_ppslim;
110 
111 /* from in_pcb.c */
112 extern	struct baddynamicports baddynamicports;
113 
114 #ifndef TCP_SENDSPACE
115 #define	TCP_SENDSPACE	1024*16
116 #endif
117 u_int	tcp_sendspace = TCP_SENDSPACE;
118 #ifndef TCP_RECVSPACE
119 #define	TCP_RECVSPACE	1024*16
120 #endif
121 u_int	tcp_recvspace = TCP_RECVSPACE;
122 u_int	tcp_autorcvbuf_inc = 16 * 1024;
123 
124 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;
125 
126 struct	inpcbtable tcbtable;
127 
128 int tcp_ident(void *, size_t *, void *, size_t, int);
129 
130 /*
131  * Process a TCP user request for TCP tb.  If this is a send request
132  * then m is the mbuf chain of send data.  If this is a timer expiration
133  * (called from the software clock routine), then timertype tells which timer.
134  */
135 /*ARGSUSED*/
136 int
137 tcp_usrreq(so, req, m, nam, control, p)
138 	struct socket *so;
139 	int req;
140 	struct mbuf *m, *nam, *control;
141 	struct proc *p;
142 {
143 	struct sockaddr_in *sin;
144 	struct inpcb *inp;
145 	struct tcpcb *tp = NULL;
146 	int s;
147 	int error = 0;
148 	short ostate;
149 
150 	if (req == PRU_CONTROL) {
151 #ifdef INET6
152 		if (sotopf(so) == PF_INET6)
153 			return in6_control(so, (u_long)m, (caddr_t)nam,
154 			    (struct ifnet *)control, 0);
155 		else
156 #endif /* INET6 */
157 			return (in_control(so, (u_long)m, (caddr_t)nam,
158 			    (struct ifnet *)control));
159 	}
160 	if (control && control->m_len) {
161 		m_freem(control);
162 		if (m)
163 			m_freem(m);
164 		return (EINVAL);
165 	}
166 
167 	s = splsoftnet();
168 	inp = sotoinpcb(so);
169 	/*
170 	 * When a TCP is attached to a socket, then there will be
171 	 * a (struct inpcb) pointed at by the socket, and this
172 	 * structure will point at a subsidiary (struct tcpcb).
173 	 */
174 	if (inp == 0 && req != PRU_ATTACH) {
175 		error = so->so_error;
176 		if (error == 0)
177 			error = EINVAL;
178 		splx(s);
179 		/*
180 		 * The following corrects an mbuf leak under rare
181 		 * circumstances
182 		 */
183 		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
184 			m_freem(m);
185 		return (error);
186 	}
187 	if (inp) {
188 		tp = intotcpcb(inp);
189 		/* tp might get 0 when using socket splicing */
190 		if (tp == NULL) {
191 			splx(s);
192 			return (0);
193 		}
194 #ifdef KPROF
195 		tcp_acounts[tp->t_state][req]++;
196 #endif
197 		ostate = tp->t_state;
198 	} else
199 		ostate = 0;
200 	switch (req) {
201 
202 	/*
203 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
204 	 * and an internet control block.
205 	 */
206 	case PRU_ATTACH:
207 		if (inp) {
208 			error = EISCONN;
209 			break;
210 		}
211 		error = tcp_attach(so);
212 		if (error)
213 			break;
214 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
215 			so->so_linger = TCP_LINGERTIME;
216 		tp = sototcpcb(so);
217 		break;
218 
219 	/*
220 	 * PRU_DETACH detaches the TCP protocol from the socket.
221 	 * If the protocol state is non-embryonic, then can't
222 	 * do this directly: have to initiate a PRU_DISCONNECT,
223 	 * which may finish later; embryonic TCB's can just
224 	 * be discarded here.
225 	 */
226 	case PRU_DETACH:
227 		tp = tcp_disconnect(tp);
228 		break;
229 
230 	/*
231 	 * Give the socket an address.
232 	 */
233 	case PRU_BIND:
234 #ifdef INET6
235 		if (inp->inp_flags & INP_IPV6)
236 			error = in6_pcbbind(inp, nam, p);
237 		else
238 #endif
239 			error = in_pcbbind(inp, nam, p);
240 		if (error)
241 			break;
242 		break;
243 
244 	/*
245 	 * Prepare to accept connections.
246 	 */
247 	case PRU_LISTEN:
248 		if (inp->inp_lport == 0) {
249 #ifdef INET6
250 			if (inp->inp_flags & INP_IPV6)
251 				error = in6_pcbbind(inp, NULL, p);
252 			else
253 #endif
254 				error = in_pcbbind(inp, NULL, p);
255 		}
256 		/* If the in_pcbbind() above is called, the tp->pf
257 		   should still be whatever it was before. */
258 		if (error == 0)
259 			tp->t_state = TCPS_LISTEN;
260 		break;
261 
262 	/*
263 	 * Initiate connection to peer.
264 	 * Create a template for use in transmissions on this connection.
265 	 * Enter SYN_SENT state, and mark socket as connecting.
266 	 * Start keep-alive timer, and seed output sequence space.
267 	 * Send initial segment on connection.
268 	 */
269 	case PRU_CONNECT:
270 		sin = mtod(nam, struct sockaddr_in *);
271 
272 #ifdef INET6
273 		if (sin->sin_family == AF_INET6) {
274 			struct in6_addr *in6_addr = &mtod(nam,
275 			    struct sockaddr_in6 *)->sin6_addr;
276 
277 			if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) ||
278 			    IN6_IS_ADDR_MULTICAST(in6_addr) ||
279 			    IN6_IS_ADDR_V4MAPPED(in6_addr)) {
280 				error = EINVAL;
281 				break;
282 			}
283 
284 			if (inp->inp_lport == 0) {
285 				error = in6_pcbbind(inp, NULL, p);
286 				if (error)
287 					break;
288 			}
289 			error = in6_pcbconnect(inp, nam);
290 		} else if (sin->sin_family == AF_INET)
291 #endif /* INET6 */
292 		{
293 			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
294 			    IN_MULTICAST(sin->sin_addr.s_addr) ||
295 			    in_broadcast(sin->sin_addr, NULL,
296 			    inp->inp_rtableid)) {
297 				error = EINVAL;
298 				break;
299 			}
300 
301 			if (inp->inp_lport == 0) {
302 				error = in_pcbbind(inp, NULL, p);
303 				if (error)
304 					break;
305 			}
306 			error = in_pcbconnect(inp, nam);
307 		}
308 
309 		if (error)
310 			break;
311 
312 		tp->t_template = tcp_template(tp);
313 		if (tp->t_template == 0) {
314 			in_pcbdisconnect(inp);
315 			error = ENOBUFS;
316 			break;
317 		}
318 
319 		so->so_state |= SS_CONNECTOUT;
320 
321 		/* Compute window scaling to request.  */
322 		tcp_rscale(tp, sb_max);
323 
324 		soisconnecting(so);
325 		tcpstat.tcps_connattempt++;
326 		tp->t_state = TCPS_SYN_SENT;
327 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
328 		tcp_set_iss_tsm(tp);
329 		tcp_sendseqinit(tp);
330 #if defined(TCP_SACK)
331 		tp->snd_last = tp->snd_una;
332 #endif
333 #if defined(TCP_SACK) && defined(TCP_FACK)
334 		tp->snd_fack = tp->snd_una;
335 		tp->retran_data = 0;
336 		tp->snd_awnd = 0;
337 #endif
338 		error = tcp_output(tp);
339 		break;
340 
341 	/*
342 	 * Create a TCP connection between two sockets.
343 	 */
344 	case PRU_CONNECT2:
345 		error = EOPNOTSUPP;
346 		break;
347 
348 	/*
349 	 * Initiate disconnect from peer.
350 	 * If connection never passed embryonic stage, just drop;
351 	 * else if don't need to let data drain, then can just drop anyways,
352 	 * else have to begin TCP shutdown process: mark socket disconnecting,
353 	 * drain unread data, state switch to reflect user close, and
354 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
355 	 * when peer sends FIN and acks ours.
356 	 *
357 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
358 	 */
359 	case PRU_DISCONNECT:
360 		tp = tcp_disconnect(tp);
361 		break;
362 
363 	/*
364 	 * Accept a connection.  Essentially all the work is
365 	 * done at higher levels; just return the address
366 	 * of the peer, storing through addr.
367 	 */
368 	case PRU_ACCEPT:
369 #ifdef INET6
370 		if (inp->inp_flags & INP_IPV6)
371 			in6_setpeeraddr(inp, nam);
372 		else
373 #endif
374 			in_setpeeraddr(inp, nam);
375 		break;
376 
377 	/*
378 	 * Mark the connection as being incapable of further output.
379 	 */
380 	case PRU_SHUTDOWN:
381 		if (so->so_state & SS_CANTSENDMORE)
382 			break;
383 		socantsendmore(so);
384 		tp = tcp_usrclosed(tp);
385 		if (tp)
386 			error = tcp_output(tp);
387 		break;
388 
389 	/*
390 	 * After a receive, possibly send window update to peer.
391 	 */
392 	case PRU_RCVD:
393 		/*
394 		 * soreceive() calls this function when a user receives
395 		 * ancillary data on a listening socket. We don't call
396 		 * tcp_output in such a case, since there is no header
397 		 * template for a listening socket and hence the kernel
398 		 * will panic.
399 		 */
400 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
401 			(void) tcp_output(tp);
402 		break;
403 
404 	/*
405 	 * Do a send by putting data in output queue and updating urgent
406 	 * marker if URG set.  Possibly send more data.
407 	 */
408 	case PRU_SEND:
409 		sbappendstream(&so->so_snd, m);
410 		error = tcp_output(tp);
411 		break;
412 
413 	/*
414 	 * Abort the TCP.
415 	 */
416 	case PRU_ABORT:
417 		tp = tcp_drop(tp, ECONNABORTED);
418 		break;
419 
420 	case PRU_SENSE:
421 		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
422 		splx(s);
423 		return (0);
424 
425 	case PRU_RCVOOB:
426 		if ((so->so_oobmark == 0 &&
427 		    (so->so_state & SS_RCVATMARK) == 0) ||
428 		    so->so_options & SO_OOBINLINE ||
429 		    tp->t_oobflags & TCPOOB_HADDATA) {
430 			error = EINVAL;
431 			break;
432 		}
433 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
434 			error = EWOULDBLOCK;
435 			break;
436 		}
437 		m->m_len = 1;
438 		*mtod(m, caddr_t) = tp->t_iobc;
439 		if (((long)nam & MSG_PEEK) == 0)
440 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
441 		break;
442 
443 	case PRU_SENDOOB:
444 		if (sbspace(&so->so_snd) < -512) {
445 			m_freem(m);
446 			error = ENOBUFS;
447 			break;
448 		}
449 		/*
450 		 * According to RFC961 (Assigned Protocols),
451 		 * the urgent pointer points to the last octet
452 		 * of urgent data.  We continue, however,
453 		 * to consider it to indicate the first octet
454 		 * of data past the urgent section.
455 		 * Otherwise, snd_up should be one lower.
456 		 */
457 		sbappendstream(&so->so_snd, m);
458 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
459 		tp->t_force = 1;
460 		error = tcp_output(tp);
461 		tp->t_force = 0;
462 		break;
463 
464 	case PRU_SOCKADDR:
465 #ifdef INET6
466 		if (inp->inp_flags & INP_IPV6)
467 			in6_setsockaddr(inp, nam);
468 		else
469 #endif
470 			in_setsockaddr(inp, nam);
471 		break;
472 
473 	case PRU_PEERADDR:
474 #ifdef INET6
475 		if (inp->inp_flags & INP_IPV6)
476 			in6_setpeeraddr(inp, nam);
477 		else
478 #endif
479 			in_setpeeraddr(inp, nam);
480 		break;
481 
482 	default:
483 		panic("tcp_usrreq");
484 	}
485 	if (tp && (so->so_options & SO_DEBUG))
486 		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
487 	splx(s);
488 	return (error);
489 }
490 
491 int
492 tcp_ctloutput(op, so, level, optname, mp)
493 	int op;
494 	struct socket *so;
495 	int level, optname;
496 	struct mbuf **mp;
497 {
498 	int error = 0, s;
499 	struct inpcb *inp;
500 	struct tcpcb *tp;
501 	struct mbuf *m;
502 	int i;
503 
504 	s = splsoftnet();
505 	inp = sotoinpcb(so);
506 	if (inp == NULL) {
507 		splx(s);
508 		if (op == PRCO_SETOPT && *mp)
509 			(void) m_free(*mp);
510 		return (ECONNRESET);
511 	}
512 #ifdef INET6
513 	tp = intotcpcb(inp);
514 #endif /* INET6 */
515 	if (level != IPPROTO_TCP) {
516 		switch (so->so_proto->pr_domain->dom_family) {
517 #ifdef INET6
518 		case PF_INET6:
519 			error = ip6_ctloutput(op, so, level, optname, mp);
520 			break;
521 #endif /* INET6 */
522 		case PF_INET:
523 			error = ip_ctloutput(op, so, level, optname, mp);
524 			break;
525 		default:
526 			error = EAFNOSUPPORT;	/*?*/
527 			break;
528 		}
529 		splx(s);
530 		return (error);
531 	}
532 #ifndef INET6
533 	tp = intotcpcb(inp);
534 #endif /* !INET6 */
535 
536 	switch (op) {
537 
538 	case PRCO_SETOPT:
539 		m = *mp;
540 		switch (optname) {
541 
542 		case TCP_NODELAY:
543 			if (m == NULL || m->m_len < sizeof (int))
544 				error = EINVAL;
545 			else if (*mtod(m, int *))
546 				tp->t_flags |= TF_NODELAY;
547 			else
548 				tp->t_flags &= ~TF_NODELAY;
549 			break;
550 
551 		case TCP_MAXSEG:
552 			if (m == NULL || m->m_len < sizeof (int)) {
553 				error = EINVAL;
554 				break;
555 			}
556 
557 			i = *mtod(m, int *);
558 			if (i > 0 && i <= tp->t_maxseg)
559 				tp->t_maxseg = i;
560 			else
561 				error = EINVAL;
562 			break;
563 
564 #ifdef TCP_SACK
565 		case TCP_SACK_ENABLE:
566 			if (m == NULL || m->m_len < sizeof (int)) {
567 				error = EINVAL;
568 				break;
569 			}
570 
571 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
572 				error = EPERM;
573 				break;
574 			}
575 
576 			if (tp->t_flags & TF_SIGNATURE) {
577 				error = EPERM;
578 				break;
579 			}
580 
581 			if (*mtod(m, int *))
582 				tp->sack_enable = 1;
583 			else
584 				tp->sack_enable = 0;
585 			break;
586 #endif
587 #ifdef TCP_SIGNATURE
588 		case TCP_MD5SIG:
589 			if (m == NULL || m->m_len < sizeof (int)) {
590 				error = EINVAL;
591 				break;
592 			}
593 
594 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
595 				error = EPERM;
596 				break;
597 			}
598 
599 			if (*mtod(m, int *)) {
600 				tp->t_flags |= TF_SIGNATURE;
601 #ifdef TCP_SACK
602 				tp->sack_enable = 0;
603 #endif /* TCP_SACK */
604 			} else
605 				tp->t_flags &= ~TF_SIGNATURE;
606 			break;
607 #endif /* TCP_SIGNATURE */
608 		default:
609 			error = ENOPROTOOPT;
610 			break;
611 		}
612 		if (m)
613 			(void) m_free(m);
614 		break;
615 
616 	case PRCO_GETOPT:
617 		*mp = m = m_get(M_WAIT, MT_SOOPTS);
618 		m->m_len = sizeof(int);
619 
620 		switch (optname) {
621 		case TCP_NODELAY:
622 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
623 			break;
624 		case TCP_MAXSEG:
625 			*mtod(m, int *) = tp->t_maxseg;
626 			break;
627 #ifdef TCP_SACK
628 		case TCP_SACK_ENABLE:
629 			*mtod(m, int *) = tp->sack_enable;
630 			break;
631 #endif
632 #ifdef TCP_SIGNATURE
633 		case TCP_MD5SIG:
634 			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
635 			break;
636 #endif
637 		default:
638 			error = ENOPROTOOPT;
639 			break;
640 		}
641 		break;
642 	}
643 	splx(s);
644 	return (error);
645 }
646 
647 /*
648  * Attach TCP protocol to socket, allocating
649  * internet protocol control block, tcp control block,
650  * bufer space, and entering LISTEN state if to accept connections.
651  */
652 int
653 tcp_attach(so)
654 	struct socket *so;
655 {
656 	struct tcpcb *tp;
657 	struct inpcb *inp;
658 	int error;
659 
660 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
661 	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
662 	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
663 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
664 		if (error)
665 			return (error);
666 	}
667 
668 	error = in_pcballoc(so, &tcbtable);
669 	if (error)
670 		return (error);
671 	inp = sotoinpcb(so);
672 	tp = tcp_newtcpcb(inp);
673 	if (tp == NULL) {
674 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
675 
676 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
677 		in_pcbdetach(inp);
678 		so->so_state |= nofd;
679 		return (ENOBUFS);
680 	}
681 	tp->t_state = TCPS_CLOSED;
682 #ifdef INET6
683 	/* we disallow IPv4 mapped address completely. */
684 	if (inp->inp_flags & INP_IPV6)
685 		tp->pf = PF_INET6;
686 	else
687 		tp->pf = PF_INET;
688 #else
689 	tp->pf = PF_INET;
690 #endif
691 	return (0);
692 }
693 
694 /*
695  * Initiate (or continue) disconnect.
696  * If embryonic state, just send reset (once).
697  * If in ``let data drain'' option and linger null, just drop.
698  * Otherwise (hard), mark socket disconnecting and drop
699  * current input data; switch states based on user close, and
700  * send segment to peer (with FIN).
701  */
702 struct tcpcb *
703 tcp_disconnect(tp)
704 	struct tcpcb *tp;
705 {
706 	struct socket *so = tp->t_inpcb->inp_socket;
707 
708 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
709 		tp = tcp_close(tp);
710 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
711 		tp = tcp_drop(tp, 0);
712 	else {
713 		soisdisconnecting(so);
714 		sbflush(&so->so_rcv);
715 		tp = tcp_usrclosed(tp);
716 		if (tp)
717 			(void) tcp_output(tp);
718 	}
719 	return (tp);
720 }
721 
722 /*
723  * User issued close, and wish to trail through shutdown states:
724  * if never received SYN, just forget it.  If got a SYN from peer,
725  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
726  * If already got a FIN from peer, then almost done; go to LAST_ACK
727  * state.  In all other cases, have already sent FIN to peer (e.g.
728  * after PRU_SHUTDOWN), and just have to play tedious game waiting
729  * for peer to send FIN or not respond to keep-alives, etc.
730  * We can let the user exit from the close as soon as the FIN is acked.
731  */
732 struct tcpcb *
733 tcp_usrclosed(tp)
734 	struct tcpcb *tp;
735 {
736 
737 	switch (tp->t_state) {
738 
739 	case TCPS_CLOSED:
740 	case TCPS_LISTEN:
741 	case TCPS_SYN_SENT:
742 		tp->t_state = TCPS_CLOSED;
743 		tp = tcp_close(tp);
744 		break;
745 
746 	case TCPS_SYN_RECEIVED:
747 	case TCPS_ESTABLISHED:
748 		tp->t_state = TCPS_FIN_WAIT_1;
749 		break;
750 
751 	case TCPS_CLOSE_WAIT:
752 		tp->t_state = TCPS_LAST_ACK;
753 		break;
754 	}
755 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
756 		soisdisconnected(tp->t_inpcb->inp_socket);
757 		/*
758 		 * If we are in FIN_WAIT_2, we arrived here because the
759 		 * application did a shutdown of the send side.  Like the
760 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
761 		 * a full close, we start a timer to make sure sockets are
762 		 * not left in FIN_WAIT_2 forever.
763 		 */
764 		if (tp->t_state == TCPS_FIN_WAIT_2)
765 			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
766 	}
767 	return (tp);
768 }
769 
770 /*
771  * Look up a socket for ident or tcpdrop, ...
772  */
773 int
774 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
775 {
776 	int error = 0, s;
777 	struct tcp_ident_mapping tir;
778 	struct inpcb *inp;
779 	struct tcpcb *tp = NULL;
780 	struct sockaddr_in *fin, *lin;
781 #ifdef INET6
782 	struct sockaddr_in6 *fin6, *lin6;
783 	struct in6_addr f6, l6;
784 #endif
785 	if (dodrop) {
786 		if (oldp != NULL || *oldlenp != 0)
787 			return (EINVAL);
788 		if (newp == NULL)
789 			return (EPERM);
790 		if (newlen < sizeof(tir))
791 			return (ENOMEM);
792 		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
793 			return (error);
794 	} else {
795 		if (oldp == NULL)
796 			return (EINVAL);
797 		if (*oldlenp < sizeof(tir))
798 			return (ENOMEM);
799 		if (newp != NULL || newlen != 0)
800 			return (EINVAL);
801 		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
802 			return (error);
803 	}
804 	switch (tir.faddr.ss_family) {
805 #ifdef INET6
806 	case AF_INET6:
807 		fin6 = (struct sockaddr_in6 *)&tir.faddr;
808 		error = in6_embedscope(&f6, fin6, NULL, NULL);
809 		if (error)
810 			return EINVAL;	/*?*/
811 		lin6 = (struct sockaddr_in6 *)&tir.laddr;
812 		error = in6_embedscope(&l6, lin6, NULL, NULL);
813 		if (error)
814 			return EINVAL;	/*?*/
815 		break;
816 #endif
817 	case AF_INET:
818 	  	fin = (struct sockaddr_in *)&tir.faddr;
819 		lin = (struct sockaddr_in *)&tir.laddr;
820 		break;
821 	default:
822 		return (EINVAL);
823 	}
824 
825 	s = splsoftnet();
826 	switch (tir.faddr.ss_family) {
827 #ifdef INET6
828 	case AF_INET6:
829 		inp = in6_pcbhashlookup(&tcbtable, &f6,
830 		    fin6->sin6_port, &l6, lin6->sin6_port);
831 		break;
832 #endif
833 	case AF_INET:
834 		inp = in_pcbhashlookup(&tcbtable,  fin->sin_addr,
835 		    fin->sin_port, lin->sin_addr, lin->sin_port , tir.rdomain);
836 		break;
837 	}
838 
839 	if (dodrop) {
840 		if (inp && (tp = intotcpcb(inp)) &&
841 		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
842 			tp = tcp_drop(tp, ECONNABORTED);
843 		else
844 			error = ESRCH;
845 		splx(s);
846 		return (error);
847 	}
848 
849 	if (inp == NULL) {
850 		++tcpstat.tcps_pcbhashmiss;
851 		switch (tir.faddr.ss_family) {
852 #ifdef INET6
853 		case AF_INET6:
854 			inp = in6_pcblookup_listen(&tcbtable,
855 			    &l6, lin6->sin6_port, 0, NULL);
856 			break;
857 #endif
858 		case AF_INET:
859 			inp = in_pcblookup_listen(&tcbtable,
860 			    lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain);
861 			break;
862 		}
863 	}
864 
865 	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
866 		tir.ruid = inp->inp_socket->so_ruid;
867 		tir.euid = inp->inp_socket->so_euid;
868 	} else {
869 		tir.ruid = -1;
870 		tir.euid = -1;
871 	}
872 	splx(s);
873 
874 	*oldlenp = sizeof (tir);
875 	error = copyout((void *)&tir, oldp, sizeof (tir));
876 	return (error);
877 }
878 
879 /*
880  * Sysctl for tcp variables.
881  */
882 int
883 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
884 	int *name;
885 	u_int namelen;
886 	void *oldp;
887 	size_t *oldlenp;
888 	void *newp;
889 	size_t newlen;
890 {
891 	int error, nval;
892 
893 	/* All sysctl names at this level are terminal. */
894 	if (namelen != 1)
895 		return (ENOTDIR);
896 
897 	switch (name[0]) {
898 #ifdef TCP_SACK
899 	case TCPCTL_SACK:
900 		return (sysctl_int(oldp, oldlenp, newp, newlen,
901 		    &tcp_do_sack));
902 #endif
903 	case TCPCTL_SLOWHZ:
904 		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));
905 
906 	case TCPCTL_BADDYNAMIC:
907 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
908 		    baddynamicports.tcp, sizeof(baddynamicports.tcp)));
909 
910 	case TCPCTL_IDENT:
911 		return (tcp_ident(oldp, oldlenp, newp, newlen, 0));
912 
913 	case TCPCTL_DROP:
914 		return (tcp_ident(oldp, oldlenp, newp, newlen, 1));
915 
916 	case TCPCTL_ALWAYS_KEEPALIVE:
917 		return (sysctl_int(oldp, oldlenp, newp, newlen,
918 		    &tcp_always_keepalive));
919 
920 #ifdef TCP_ECN
921 	case TCPCTL_ECN:
922 		return (sysctl_int(oldp, oldlenp, newp, newlen,
923 		   &tcp_do_ecn));
924 #endif
925 	case TCPCTL_REASS_LIMIT:
926 		nval = tcp_reass_limit;
927 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
928 		if (error)
929 			return (error);
930 		if (nval != tcp_reass_limit) {
931 			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
932 			if (error)
933 				return (error);
934 			tcp_reass_limit = nval;
935 		}
936 		return (0);
937 #ifdef TCP_SACK
938 	case TCPCTL_SACKHOLE_LIMIT:
939 		nval = tcp_sackhole_limit;
940 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
941 		if (error)
942 			return (error);
943 		if (nval != tcp_sackhole_limit) {
944 			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
945 			if (error)
946 				return (error);
947 			tcp_sackhole_limit = nval;
948 		}
949 		return (0);
950 #endif
951 
952 	case TCPCTL_STATS:
953 		if (newp != NULL)
954 			return (EPERM);
955 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
956 		    &tcpstat, sizeof(tcpstat)));
957 
958 	default:
959 		if (name[0] < TCPCTL_MAXID)
960 			return (sysctl_int_arr(tcpctl_vars, name, namelen,
961 			    oldp, oldlenp, newp, newlen));
962 		return (ENOPROTOOPT);
963 	}
964 	/* NOTREACHED */
965 }
966 
967 /*
968  * Scale the send buffer so that inflight data is not accounted against
969  * the limit. The buffer will scale with the congestion window, if the
970  * the receiver stops acking data the window will shrink and therefor
971  * the buffer size will shrink as well.
972  * In low memory situation try to shrink the buffer to the initial size
973  * disabling the send buffer scaling as long as the situation persists.
974  */
975 void
976 tcp_update_sndspace(struct tcpcb *tp)
977 {
978 	struct socket *so = tp->t_inpcb->inp_socket;
979 	u_long nmax;
980 
981 	if (sbchecklowmem())
982 		/* low on memory try to get rid of some */
983 		nmax = tcp_sendspace;
984 	else if (so->so_snd.sb_wat != tcp_sendspace)
985 		/* user requested buffer size, auto-scaling disabled */
986 		nmax = so->so_snd.sb_wat;
987 	else
988 		/* automatic buffer scaling */
989 		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
990 		    tp->snd_una);
991 
992 	/* round to MSS boundary */
993 	nmax = roundup(nmax, tp->t_maxseg);
994 
995 	if (nmax != so->so_snd.sb_hiwat)
996 		sbreserve(&so->so_snd, nmax);
997 }
998 
999 /*
1000  * Scale the recv buffer by looking at how much data was transferred in
1001  * on approximated RTT. If more then a big part of the recv buffer was
1002  * transferred during that time we increase the buffer by a constant.
1003  * In low memory situation try to shrink the buffer to the initial size.
1004  */
1005 void
1006 tcp_update_rcvspace(struct tcpcb *tp)
1007 {
1008 	struct socket *so = tp->t_inpcb->inp_socket;
1009 	u_long nmax = so->so_rcv.sb_hiwat;
1010 
1011 	if (sbchecklowmem())
1012 		/* low on memory try to get rid of some */
1013 		nmax = tcp_recvspace;
1014 	else if (so->so_rcv.sb_wat != tcp_recvspace)
1015 		/* user requested buffer size, auto-scaling disabled */
1016 		nmax = so->so_rcv.sb_wat;
1017 	else {
1018 		/* automatic buffer scaling */
1019 		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1020 			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1021 			    tcp_autorcvbuf_inc);
1022 	}
1023 
1024 	if (nmax == so->so_rcv.sb_hiwat)
1025 		return;
1026 
1027 	/* round to MSS boundary */
1028 	nmax = roundup(nmax, tp->t_maxseg);
1029 	sbreserve(&so->so_rcv, nmax);
1030 }
1031