xref: /openbsd-src/sys/netinet/tcp_usrreq.c (revision cd1eb269cafb12c415be1749cd4a4b5422710415)
1 /*	$OpenBSD: tcp_usrreq.c,v 1.101 2010/04/20 22:05:43 tedu Exp $	*/
2 /*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  * 	This product includes software developed by the University of
46  * 	California, Berkeley and its contributors.
47  * 	This product includes software developed at the Information
48  * 	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/proc.h>
79 #include <sys/sysctl.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/pool.h>
83 
84 #include <dev/rndvar.h>
85 
86 #include <net/if.h>
87 #include <net/route.h>
88 
89 #include <netinet/in.h>
90 #include <netinet/in_systm.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip.h>
93 #include <netinet/in_pcb.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_timer.h>
99 #include <netinet/tcp_var.h>
100 #include <netinet/tcpip.h>
101 #include <netinet/tcp_debug.h>
102 
103 /*
104  * TCP protocol interface to socket abstraction.
105  */
106 extern	char *tcpstates[];
107 extern	int tcptv_keep_init;
108 
109 extern int tcp_rst_ppslim;
110 
111 /* from in_pcb.c */
112 extern	struct baddynamicports baddynamicports;
113 
114 #ifndef TCP_SENDSPACE
115 #define	TCP_SENDSPACE	1024*16
116 #endif
117 u_int	tcp_sendspace = TCP_SENDSPACE;
118 #ifndef TCP_RECVSPACE
119 #define	TCP_RECVSPACE	1024*16
120 #endif
121 u_int	tcp_recvspace = TCP_RECVSPACE;
122 
123 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;
124 
125 struct	inpcbtable tcbtable;
126 
127 int tcp_ident(void *, size_t *, void *, size_t, int);
128 
129 /*
130  * Process a TCP user request for TCP tb.  If this is a send request
131  * then m is the mbuf chain of send data.  If this is a timer expiration
132  * (called from the software clock routine), then timertype tells which timer.
133  */
134 /*ARGSUSED*/
135 int
136 tcp_usrreq(so, req, m, nam, control, p)
137 	struct socket *so;
138 	int req;
139 	struct mbuf *m, *nam, *control;
140 	struct proc *p;
141 {
142 	struct sockaddr_in *sin;
143 	struct inpcb *inp;
144 	struct tcpcb *tp = NULL;
145 	int s;
146 	int error = 0;
147 	short ostate;
148 
149 	if (req == PRU_CONTROL) {
150 #ifdef INET6
151 		if (sotopf(so) == PF_INET6)
152 			return in6_control(so, (u_long)m, (caddr_t)nam,
153 			    (struct ifnet *)control, 0);
154 		else
155 #endif /* INET6 */
156 			return (in_control(so, (u_long)m, (caddr_t)nam,
157 			    (struct ifnet *)control));
158 	}
159 	if (control && control->m_len) {
160 		m_freem(control);
161 		if (m)
162 			m_freem(m);
163 		return (EINVAL);
164 	}
165 
166 	s = splsoftnet();
167 	inp = sotoinpcb(so);
168 	/*
169 	 * When a TCP is attached to a socket, then there will be
170 	 * a (struct inpcb) pointed at by the socket, and this
171 	 * structure will point at a subsidiary (struct tcpcb).
172 	 */
173 	if (inp == 0 && req != PRU_ATTACH) {
174 		error = so->so_error;
175 		if (error == 0)
176 			error = EINVAL;
177 		splx(s);
178 		/*
179 		 * The following corrects an mbuf leak under rare
180 		 * circumstances
181 		 */
182 		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
183 			m_freem(m);
184 		return (error);
185 	}
186 	if (inp) {
187 		tp = intotcpcb(inp);
188 		/* WHAT IF TP IS 0? */
189 #ifdef KPROF
190 		tcp_acounts[tp->t_state][req]++;
191 #endif
192 		ostate = tp->t_state;
193 	} else
194 		ostate = 0;
195 	switch (req) {
196 
197 	/*
198 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
199 	 * and an internet control block.
200 	 */
201 	case PRU_ATTACH:
202 		if (inp) {
203 			error = EISCONN;
204 			break;
205 		}
206 		error = tcp_attach(so);
207 		if (error)
208 			break;
209 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
210 			so->so_linger = TCP_LINGERTIME;
211 		tp = sototcpcb(so);
212 		break;
213 
214 	/*
215 	 * PRU_DETACH detaches the TCP protocol from the socket.
216 	 * If the protocol state is non-embryonic, then can't
217 	 * do this directly: have to initiate a PRU_DISCONNECT,
218 	 * which may finish later; embryonic TCB's can just
219 	 * be discarded here.
220 	 */
221 	case PRU_DETACH:
222 		tp = tcp_disconnect(tp);
223 		break;
224 
225 	/*
226 	 * Give the socket an address.
227 	 */
228 	case PRU_BIND:
229 #ifdef INET6
230 		if (inp->inp_flags & INP_IPV6)
231 			error = in6_pcbbind(inp, nam, p);
232 		else
233 #endif
234 			error = in_pcbbind(inp, nam, p);
235 		if (error)
236 			break;
237 		break;
238 
239 	/*
240 	 * Prepare to accept connections.
241 	 */
242 	case PRU_LISTEN:
243 		if (inp->inp_lport == 0) {
244 #ifdef INET6
245 			if (inp->inp_flags & INP_IPV6)
246 				error = in6_pcbbind(inp, NULL, p);
247 			else
248 #endif
249 				error = in_pcbbind(inp, NULL, p);
250 		}
251 		/* If the in_pcbbind() above is called, the tp->pf
252 		   should still be whatever it was before. */
253 		if (error == 0)
254 			tp->t_state = TCPS_LISTEN;
255 		break;
256 
257 	/*
258 	 * Initiate connection to peer.
259 	 * Create a template for use in transmissions on this connection.
260 	 * Enter SYN_SENT state, and mark socket as connecting.
261 	 * Start keep-alive timer, and seed output sequence space.
262 	 * Send initial segment on connection.
263 	 */
264 	case PRU_CONNECT:
265 		sin = mtod(nam, struct sockaddr_in *);
266 
267 #ifdef INET6
268 		if (sin->sin_family == AF_INET6) {
269 			struct in6_addr *in6_addr = &mtod(nam,
270 			    struct sockaddr_in6 *)->sin6_addr;
271 
272 			if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) ||
273 			    IN6_IS_ADDR_MULTICAST(in6_addr) ||
274 			    (IN6_IS_ADDR_V4MAPPED(in6_addr) &&
275 			    ((in6_addr->s6_addr32[3] == INADDR_ANY) ||
276 			    IN_MULTICAST(in6_addr->s6_addr32[3]) ||
277 			    in_broadcast(sin->sin_addr, NULL)))) {
278 				error = EINVAL;
279 				break;
280 			}
281 
282 			if (inp->inp_lport == 0) {
283 				error = in6_pcbbind(inp, NULL, p);
284 				if (error)
285 					break;
286 			}
287 			error = in6_pcbconnect(inp, nam);
288 		} else if (sin->sin_family == AF_INET)
289 #endif /* INET6 */
290 		{
291 			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
292 			    IN_MULTICAST(sin->sin_addr.s_addr) ||
293 			    in_broadcast(sin->sin_addr, NULL)) {
294 				error = EINVAL;
295 				break;
296 			}
297 
298 			if (inp->inp_lport == 0) {
299 				error = in_pcbbind(inp, NULL, p);
300 				if (error)
301 					break;
302 			}
303 			error = in_pcbconnect(inp, nam);
304 		}
305 
306 		if (error)
307 			break;
308 
309 		tp->t_template = tcp_template(tp);
310 		if (tp->t_template == 0) {
311 			in_pcbdisconnect(inp);
312 			error = ENOBUFS;
313 			break;
314 		}
315 
316 		so->so_state |= SS_CONNECTOUT;
317 
318 		/* Compute window scaling to request.  */
319 		tcp_rscale(tp, so->so_rcv.sb_hiwat);
320 
321 		soisconnecting(so);
322 		tcpstat.tcps_connattempt++;
323 		tp->t_state = TCPS_SYN_SENT;
324 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
325 		tcp_set_iss_tsm(tp);
326 		tcp_sendseqinit(tp);
327 #if defined(TCP_SACK)
328 		tp->snd_last = tp->snd_una;
329 #endif
330 #if defined(TCP_SACK) && defined(TCP_FACK)
331 		tp->snd_fack = tp->snd_una;
332 		tp->retran_data = 0;
333 		tp->snd_awnd = 0;
334 #endif
335 		error = tcp_output(tp);
336 		break;
337 
338 	/*
339 	 * Create a TCP connection between two sockets.
340 	 */
341 	case PRU_CONNECT2:
342 		error = EOPNOTSUPP;
343 		break;
344 
345 	/*
346 	 * Initiate disconnect from peer.
347 	 * If connection never passed embryonic stage, just drop;
348 	 * else if don't need to let data drain, then can just drop anyways,
349 	 * else have to begin TCP shutdown process: mark socket disconnecting,
350 	 * drain unread data, state switch to reflect user close, and
351 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
352 	 * when peer sends FIN and acks ours.
353 	 *
354 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
355 	 */
356 	case PRU_DISCONNECT:
357 		tp = tcp_disconnect(tp);
358 		break;
359 
360 	/*
361 	 * Accept a connection.  Essentially all the work is
362 	 * done at higher levels; just return the address
363 	 * of the peer, storing through addr.
364 	 */
365 	case PRU_ACCEPT:
366 #ifdef INET6
367 		if (inp->inp_flags & INP_IPV6)
368 			in6_setpeeraddr(inp, nam);
369 		else
370 #endif
371 			in_setpeeraddr(inp, nam);
372 		break;
373 
374 	/*
375 	 * Mark the connection as being incapable of further output.
376 	 */
377 	case PRU_SHUTDOWN:
378 		if (so->so_state & SS_CANTSENDMORE)
379 			break;
380 		socantsendmore(so);
381 		tp = tcp_usrclosed(tp);
382 		if (tp)
383 			error = tcp_output(tp);
384 		break;
385 
386 	/*
387 	 * After a receive, possibly send window update to peer.
388 	 */
389 	case PRU_RCVD:
390 		/*
391 		 * soreceive() calls this function when a user receives
392 		 * ancillary data on a listening socket. We don't call
393 		 * tcp_output in such a case, since there is no header
394 		 * template for a listening socket and hence the kernel
395 		 * will panic.
396 		 */
397 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
398 			(void) tcp_output(tp);
399 		break;
400 
401 	/*
402 	 * Do a send by putting data in output queue and updating urgent
403 	 * marker if URG set.  Possibly send more data.
404 	 */
405 	case PRU_SEND:
406 		sbappendstream(&so->so_snd, m);
407 		error = tcp_output(tp);
408 		break;
409 
410 	/*
411 	 * Abort the TCP.
412 	 */
413 	case PRU_ABORT:
414 		tp = tcp_drop(tp, ECONNABORTED);
415 		break;
416 
417 	case PRU_SENSE:
418 		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
419 		splx(s);
420 		return (0);
421 
422 	case PRU_RCVOOB:
423 		if ((so->so_oobmark == 0 &&
424 		    (so->so_state & SS_RCVATMARK) == 0) ||
425 		    so->so_options & SO_OOBINLINE ||
426 		    tp->t_oobflags & TCPOOB_HADDATA) {
427 			error = EINVAL;
428 			break;
429 		}
430 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
431 			error = EWOULDBLOCK;
432 			break;
433 		}
434 		m->m_len = 1;
435 		*mtod(m, caddr_t) = tp->t_iobc;
436 		if (((long)nam & MSG_PEEK) == 0)
437 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
438 		break;
439 
440 	case PRU_SENDOOB:
441 		if (sbspace(&so->so_snd) < -512) {
442 			m_freem(m);
443 			error = ENOBUFS;
444 			break;
445 		}
446 		/*
447 		 * According to RFC961 (Assigned Protocols),
448 		 * the urgent pointer points to the last octet
449 		 * of urgent data.  We continue, however,
450 		 * to consider it to indicate the first octet
451 		 * of data past the urgent section.
452 		 * Otherwise, snd_up should be one lower.
453 		 */
454 		sbappendstream(&so->so_snd, m);
455 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
456 		tp->t_force = 1;
457 		error = tcp_output(tp);
458 		tp->t_force = 0;
459 		break;
460 
461 	case PRU_SOCKADDR:
462 #ifdef INET6
463 		if (inp->inp_flags & INP_IPV6)
464 			in6_setsockaddr(inp, nam);
465 		else
466 #endif
467 			in_setsockaddr(inp, nam);
468 		break;
469 
470 	case PRU_PEERADDR:
471 #ifdef INET6
472 		if (inp->inp_flags & INP_IPV6)
473 			in6_setpeeraddr(inp, nam);
474 		else
475 #endif
476 			in_setpeeraddr(inp, nam);
477 		break;
478 
479 	default:
480 		panic("tcp_usrreq");
481 	}
482 	if (tp && (so->so_options & SO_DEBUG))
483 		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
484 	splx(s);
485 	return (error);
486 }
487 
488 int
489 tcp_ctloutput(op, so, level, optname, mp)
490 	int op;
491 	struct socket *so;
492 	int level, optname;
493 	struct mbuf **mp;
494 {
495 	int error = 0, s;
496 	struct inpcb *inp;
497 	struct tcpcb *tp;
498 	struct mbuf *m;
499 	int i;
500 
501 	s = splsoftnet();
502 	inp = sotoinpcb(so);
503 	if (inp == NULL) {
504 		splx(s);
505 		if (op == PRCO_SETOPT && *mp)
506 			(void) m_free(*mp);
507 		return (ECONNRESET);
508 	}
509 #ifdef INET6
510 	tp = intotcpcb(inp);
511 #endif /* INET6 */
512 	if (level != IPPROTO_TCP) {
513 		switch (so->so_proto->pr_domain->dom_family) {
514 #ifdef INET6
515 		case PF_INET6:
516 			error = ip6_ctloutput(op, so, level, optname, mp);
517 			break;
518 #endif /* INET6 */
519 		case PF_INET:
520 			error = ip_ctloutput(op, so, level, optname, mp);
521 			break;
522 		default:
523 			error = EAFNOSUPPORT;	/*?*/
524 			break;
525 		}
526 		splx(s);
527 		return (error);
528 	}
529 #ifndef INET6
530 	tp = intotcpcb(inp);
531 #endif /* !INET6 */
532 
533 	switch (op) {
534 
535 	case PRCO_SETOPT:
536 		m = *mp;
537 		switch (optname) {
538 
539 		case TCP_NODELAY:
540 			if (m == NULL || m->m_len < sizeof (int))
541 				error = EINVAL;
542 			else if (*mtod(m, int *))
543 				tp->t_flags |= TF_NODELAY;
544 			else
545 				tp->t_flags &= ~TF_NODELAY;
546 			break;
547 
548 		case TCP_MAXSEG:
549 			if (m == NULL || m->m_len < sizeof (int)) {
550 				error = EINVAL;
551 				break;
552 			}
553 
554 			i = *mtod(m, int *);
555 			if (i > 0 && i <= tp->t_maxseg)
556 				tp->t_maxseg = i;
557 			else
558 				error = EINVAL;
559 			break;
560 
561 #ifdef TCP_SACK
562 		case TCP_SACK_ENABLE:
563 			if (m == NULL || m->m_len < sizeof (int)) {
564 				error = EINVAL;
565 				break;
566 			}
567 
568 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
569 				error = EPERM;
570 				break;
571 			}
572 
573 			if (tp->t_flags & TF_SIGNATURE) {
574 				error = EPERM;
575 				break;
576 			}
577 
578 			if (*mtod(m, int *))
579 				tp->sack_enable = 1;
580 			else
581 				tp->sack_enable = 0;
582 			break;
583 #endif
584 #ifdef TCP_SIGNATURE
585 		case TCP_MD5SIG:
586 			if (m == NULL || m->m_len < sizeof (int)) {
587 				error = EINVAL;
588 				break;
589 			}
590 
591 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
592 				error = EPERM;
593 				break;
594 			}
595 
596 			if (*mtod(m, int *)) {
597 				tp->t_flags |= TF_SIGNATURE;
598 #ifdef TCP_SACK
599 				tp->sack_enable = 0;
600 #endif /* TCP_SACK */
601 			} else
602 				tp->t_flags &= ~TF_SIGNATURE;
603 			break;
604 #endif /* TCP_SIGNATURE */
605 		default:
606 			error = ENOPROTOOPT;
607 			break;
608 		}
609 		if (m)
610 			(void) m_free(m);
611 		break;
612 
613 	case PRCO_GETOPT:
614 		*mp = m = m_get(M_WAIT, MT_SOOPTS);
615 		m->m_len = sizeof(int);
616 
617 		switch (optname) {
618 		case TCP_NODELAY:
619 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
620 			break;
621 		case TCP_MAXSEG:
622 			*mtod(m, int *) = tp->t_maxseg;
623 			break;
624 #ifdef TCP_SACK
625 		case TCP_SACK_ENABLE:
626 			*mtod(m, int *) = tp->sack_enable;
627 			break;
628 #endif
629 #ifdef TCP_SIGNATURE
630 		case TCP_MD5SIG:
631 			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
632 			break;
633 #endif
634 		default:
635 			error = ENOPROTOOPT;
636 			break;
637 		}
638 		break;
639 	}
640 	splx(s);
641 	return (error);
642 }
643 
644 /*
645  * Attach TCP protocol to socket, allocating
646  * internet protocol control block, tcp control block,
647  * bufer space, and entering LISTEN state if to accept connections.
648  */
649 int
650 tcp_attach(so)
651 	struct socket *so;
652 {
653 	struct tcpcb *tp;
654 	struct inpcb *inp;
655 	int error;
656 
657 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
658 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
659 		if (error)
660 			return (error);
661 	}
662 	error = in_pcballoc(so, &tcbtable);
663 	if (error)
664 		return (error);
665 	inp = sotoinpcb(so);
666 	tp = tcp_newtcpcb(inp);
667 	if (tp == NULL) {
668 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
669 
670 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
671 		in_pcbdetach(inp);
672 		so->so_state |= nofd;
673 		return (ENOBUFS);
674 	}
675 	tp->t_state = TCPS_CLOSED;
676 #ifdef INET6
677 	/* we disallow IPv4 mapped address completely. */
678 	if (inp->inp_flags & INP_IPV6)
679 		tp->pf = PF_INET6;
680 	else
681 		tp->pf = PF_INET;
682 #else
683 	tp->pf = PF_INET;
684 #endif
685 	return (0);
686 }
687 
688 /*
689  * Initiate (or continue) disconnect.
690  * If embryonic state, just send reset (once).
691  * If in ``let data drain'' option and linger null, just drop.
692  * Otherwise (hard), mark socket disconnecting and drop
693  * current input data; switch states based on user close, and
694  * send segment to peer (with FIN).
695  */
696 struct tcpcb *
697 tcp_disconnect(tp)
698 	struct tcpcb *tp;
699 {
700 	struct socket *so = tp->t_inpcb->inp_socket;
701 
702 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
703 		tp = tcp_close(tp);
704 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
705 		tp = tcp_drop(tp, 0);
706 	else {
707 		soisdisconnecting(so);
708 		sbflush(&so->so_rcv);
709 		tp = tcp_usrclosed(tp);
710 		if (tp)
711 			(void) tcp_output(tp);
712 	}
713 	return (tp);
714 }
715 
716 /*
717  * User issued close, and wish to trail through shutdown states:
718  * if never received SYN, just forget it.  If got a SYN from peer,
719  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
720  * If already got a FIN from peer, then almost done; go to LAST_ACK
721  * state.  In all other cases, have already sent FIN to peer (e.g.
722  * after PRU_SHUTDOWN), and just have to play tedious game waiting
723  * for peer to send FIN or not respond to keep-alives, etc.
724  * We can let the user exit from the close as soon as the FIN is acked.
725  */
726 struct tcpcb *
727 tcp_usrclosed(tp)
728 	struct tcpcb *tp;
729 {
730 
731 	switch (tp->t_state) {
732 
733 	case TCPS_CLOSED:
734 	case TCPS_LISTEN:
735 	case TCPS_SYN_SENT:
736 		tp->t_state = TCPS_CLOSED;
737 		tp = tcp_close(tp);
738 		break;
739 
740 	case TCPS_SYN_RECEIVED:
741 	case TCPS_ESTABLISHED:
742 		tp->t_state = TCPS_FIN_WAIT_1;
743 		break;
744 
745 	case TCPS_CLOSE_WAIT:
746 		tp->t_state = TCPS_LAST_ACK;
747 		break;
748 	}
749 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
750 		soisdisconnected(tp->t_inpcb->inp_socket);
751 		/*
752 		 * If we are in FIN_WAIT_2, we arrived here because the
753 		 * application did a shutdown of the send side.  Like the
754 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
755 		 * a full close, we start a timer to make sure sockets are
756 		 * not left in FIN_WAIT_2 forever.
757 		 */
758 		if (tp->t_state == TCPS_FIN_WAIT_2)
759 			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
760 	}
761 	return (tp);
762 }
763 
764 /*
765  * Look up a socket for ident or tcpdrop, ...
766  */
767 int
768 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
769 {
770 	int error = 0, s;
771 	struct tcp_ident_mapping tir;
772 	struct inpcb *inp;
773 	struct tcpcb *tp = NULL;
774 	struct sockaddr_in *fin, *lin;
775 #ifdef INET6
776 	struct sockaddr_in6 *fin6, *lin6;
777 	struct in6_addr f6, l6;
778 #endif
779 	if (dodrop) {
780 		if (oldp != NULL || *oldlenp != 0)
781 			return (EINVAL);
782 		if (newp == NULL)
783 			return (EPERM);
784 		if (newlen < sizeof(tir))
785 			return (ENOMEM);
786 		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
787 			return (error);
788 	} else {
789 		if (oldp == NULL)
790 			return (EINVAL);
791 		if (*oldlenp < sizeof(tir))
792 			return (ENOMEM);
793 		if (newp != NULL || newlen != 0)
794 			return (EINVAL);
795 		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
796 			return (error);
797 	}
798 	switch (tir.faddr.ss_family) {
799 #ifdef INET6
800 	case AF_INET6:
801 		fin6 = (struct sockaddr_in6 *)&tir.faddr;
802 		error = in6_embedscope(&f6, fin6, NULL, NULL);
803 		if (error)
804 			return EINVAL;	/*?*/
805 		lin6 = (struct sockaddr_in6 *)&tir.laddr;
806 		error = in6_embedscope(&l6, lin6, NULL, NULL);
807 		if (error)
808 			return EINVAL;	/*?*/
809 		break;
810 #endif
811 	case AF_INET:
812 	  	fin = (struct sockaddr_in *)&tir.faddr;
813 		lin = (struct sockaddr_in *)&tir.laddr;
814 		break;
815 	default:
816 		return (EINVAL);
817 	}
818 
819 	s = splsoftnet();
820 	switch (tir.faddr.ss_family) {
821 #ifdef INET6
822 	case AF_INET6:
823 		inp = in6_pcbhashlookup(&tcbtable, &f6,
824 		    fin6->sin6_port, &l6, lin6->sin6_port);
825 		break;
826 #endif
827 	case AF_INET:
828 		inp = in_pcbhashlookup(&tcbtable,  fin->sin_addr,
829 		    fin->sin_port, lin->sin_addr, lin->sin_port , tir.rdomain);
830 		break;
831 	}
832 
833 	if (dodrop) {
834 		if (inp && (tp = intotcpcb(inp)) &&
835 		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
836 			tp = tcp_drop(tp, ECONNABORTED);
837 		else
838 			error = ESRCH;
839 		splx(s);
840 		return (error);
841 	}
842 
843 	if (inp == NULL) {
844 		++tcpstat.tcps_pcbhashmiss;
845 		switch (tir.faddr.ss_family) {
846 #ifdef INET6
847 		case AF_INET6:
848 			inp = in6_pcblookup_listen(&tcbtable,
849 			    &l6, lin6->sin6_port, 0, NULL);
850 			break;
851 #endif
852 		case AF_INET:
853 			inp = in_pcblookup_listen(&tcbtable,
854 			    lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain);
855 			break;
856 		}
857 	}
858 
859 	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
860 		tir.ruid = inp->inp_socket->so_ruid;
861 		tir.euid = inp->inp_socket->so_euid;
862 	} else {
863 		tir.ruid = -1;
864 		tir.euid = -1;
865 	}
866 	splx(s);
867 
868 	*oldlenp = sizeof (tir);
869 	error = copyout((void *)&tir, oldp, sizeof (tir));
870 	return (error);
871 }
872 
873 /*
874  * Sysctl for tcp variables.
875  */
876 int
877 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
878 	int *name;
879 	u_int namelen;
880 	void *oldp;
881 	size_t *oldlenp;
882 	void *newp;
883 	size_t newlen;
884 {
885 	int error, nval;
886 
887 	/* All sysctl names at this level are terminal. */
888 	if (namelen != 1)
889 		return (ENOTDIR);
890 
891 	switch (name[0]) {
892 #ifdef TCP_SACK
893 	case TCPCTL_SACK:
894 		return (sysctl_int(oldp, oldlenp, newp, newlen,
895 		    &tcp_do_sack));
896 #endif
897 	case TCPCTL_SLOWHZ:
898 		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));
899 
900 	case TCPCTL_BADDYNAMIC:
901 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
902 		    baddynamicports.tcp, sizeof(baddynamicports.tcp)));
903 
904 	case TCPCTL_IDENT:
905 		return (tcp_ident(oldp, oldlenp, newp, newlen, 0));
906 
907 	case TCPCTL_DROP:
908 		return (tcp_ident(oldp, oldlenp, newp, newlen, 1));
909 
910 #ifdef TCP_ECN
911 	case TCPCTL_ECN:
912 		return (sysctl_int(oldp, oldlenp, newp, newlen,
913 		   &tcp_do_ecn));
914 #endif
915 	case TCPCTL_REASS_LIMIT:
916 		nval = tcp_reass_limit;
917 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
918 		if (error)
919 			return (error);
920 		if (nval != tcp_reass_limit) {
921 			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
922 			if (error)
923 				return (error);
924 			tcp_reass_limit = nval;
925 		}
926 		return (0);
927 #ifdef TCP_SACK
928 	case TCPCTL_SACKHOLE_LIMIT:
929 		nval = tcp_sackhole_limit;
930 		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
931 		if (error)
932 			return (error);
933 		if (nval != tcp_sackhole_limit) {
934 			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
935 			if (error)
936 				return (error);
937 			tcp_sackhole_limit = nval;
938 		}
939 		return (0);
940 #endif
941 
942 	case TCPCTL_STATS:
943 		if (newp != NULL)
944 			return (EPERM);
945 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
946 		    &tcpstat, sizeof(tcpstat)));
947 
948 	default:
949 		if (name[0] < TCPCTL_MAXID)
950 			return (sysctl_int_arr(tcpctl_vars, name, namelen,
951 			    oldp, oldlenp, newp, newlen));
952 		return (ENOPROTOOPT);
953 	}
954 	/* NOTREACHED */
955 }
956