xref: /netbsd-src/sys/netinet/tcp_usrreq.c (revision 93f9db1b75d415b78f73ed629beeb86235153473)
1 /*	$NetBSD: tcp_usrreq.c,v 1.39 1998/09/10 19:53:28 tv Exp $	*/
2 
3 /*-
4  * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
9  * Facility, NASA Ames Research Center.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1982, 1986, 1988, 1993, 1995
42  *	The Regents of the University of California.  All rights reserved.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. All advertising materials mentioning features or use of this software
53  *    must display the following acknowledgement:
54  *	This product includes software developed by the University of
55  *	California, Berkeley and its contributors.
56  * 4. Neither the name of the University nor the names of its contributors
57  *    may be used to endorse or promote products derived from this software
58  *    without specific prior written permission.
59  *
60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70  * SUCH DAMAGE.
71  *
72  *	@(#)tcp_usrreq.c	8.5 (Berkeley) 6/21/95
73  */
74 
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/kernel.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/protosw.h>
83 #include <sys/errno.h>
84 #include <sys/stat.h>
85 #include <sys/proc.h>
86 #include <sys/ucred.h>
87 
88 #include <vm/vm.h>
89 #include <sys/sysctl.h>
90 
91 #include <net/if.h>
92 #include <net/route.h>
93 
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/in_var.h>
97 #include <netinet/ip.h>
98 #include <netinet/in_pcb.h>
99 #include <netinet/ip_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_timer.h>
104 #include <netinet/tcp_var.h>
105 #include <netinet/tcpip.h>
106 #include <netinet/tcp_debug.h>
107 
108 #include "opt_tcp_recvspace.h"
109 #include "opt_tcp_sendspace.h"
110 
111 /*
112  * TCP protocol interface to socket abstraction.
113  */
114 extern	char *tcpstates[];
115 
116 /*
117  * Process a TCP user request for TCP tb.  If this is a send request
118  * then m is the mbuf chain of send data.  If this is a timer expiration
119  * (called from the software clock routine), then timertype tells which timer.
120  */
121 /*ARGSUSED*/
122 int
123 tcp_usrreq(so, req, m, nam, control, p)
124 	struct socket *so;
125 	int req;
126 	struct mbuf *m, *nam, *control;
127 	struct proc *p;
128 {
129 	register struct inpcb *inp;
130 	register struct tcpcb *tp = NULL;
131 	int s;
132 	int error = 0;
133 	int ostate;
134 
135 	if (req == PRU_CONTROL)
136 		return (in_control(so, (long)m, (caddr_t)nam,
137 		    (struct ifnet *)control, p));
138 
139 	s = splsoftnet();
140 	inp = sotoinpcb(so);
141 #ifdef DIAGNOSTIC
142 	if (req != PRU_SEND && req != PRU_SENDOOB && control)
143 		panic("tcp_usrreq: unexpected control mbuf");
144 #endif
145 	/*
146 	 * When a TCP is attached to a socket, then there will be
147 	 * a (struct inpcb) pointed at by the socket, and this
148 	 * structure will point at a subsidary (struct tcpcb).
149 	 */
150 	if (inp == 0 && req != PRU_ATTACH) {
151 		error = EINVAL;
152 		goto release;
153 	}
154 	if (inp) {
155 		tp = intotcpcb(inp);
156 		/* WHAT IF TP IS 0? */
157 #ifdef KPROF
158 		tcp_acounts[tp->t_state][req]++;
159 #endif
160 		ostate = tp->t_state;
161 	} else
162 		ostate = 0;
163 
164 	switch (req) {
165 
166 	/*
167 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
168 	 * and an internet control block.
169 	 */
170 	case PRU_ATTACH:
171 		if (inp != 0) {
172 			error = EISCONN;
173 			break;
174 		}
175 		error = tcp_attach(so);
176 		if (error)
177 			break;
178 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
179 			so->so_linger = TCP_LINGERTIME;
180 		tp = sototcpcb(so);
181 		break;
182 
183 	/*
184 	 * PRU_DETACH detaches the TCP protocol from the socket.
185 	 */
186 	case PRU_DETACH:
187 		tp = tcp_disconnect(tp);
188 		break;
189 
190 	/*
191 	 * Give the socket an address.
192 	 */
193 	case PRU_BIND:
194 		error = in_pcbbind(inp, nam, p);
195 		break;
196 
197 	/*
198 	 * Prepare to accept connections.
199 	 */
200 	case PRU_LISTEN:
201 		if (inp->inp_lport == 0) {
202 			error = in_pcbbind(inp, (struct mbuf *)0,
203 			    (struct proc *)0);
204 			if (error)
205 				break;
206 		}
207 		tp->t_state = TCPS_LISTEN;
208 		break;
209 
210 	/*
211 	 * Initiate connection to peer.
212 	 * Create a template for use in transmissions on this connection.
213 	 * Enter SYN_SENT state, and mark socket as connecting.
214 	 * Start keep-alive timer, and seed output sequence space.
215 	 * Send initial segment on connection.
216 	 */
217 	case PRU_CONNECT:
218 		if (inp->inp_lport == 0) {
219 			error = in_pcbbind(inp, (struct mbuf *)0,
220 			    (struct proc *)0);
221 			if (error)
222 				break;
223 		}
224 		error = in_pcbconnect(inp, nam);
225 		if (error)
226 			break;
227 		tp->t_template = tcp_template(tp);
228 		if (tp->t_template == 0) {
229 			in_pcbdisconnect(inp);
230 			error = ENOBUFS;
231 			break;
232 		}
233 		/* Compute window scaling to request.  */
234 		while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
235 		    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
236 			tp->request_r_scale++;
237 		soisconnecting(so);
238 		tcpstat.tcps_connattempt++;
239 		tp->t_state = TCPS_SYN_SENT;
240 		TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
241 		tp->iss = tcp_new_iss(tp, sizeof(struct tcpcb), 0);
242 		tcp_sendseqinit(tp);
243 		error = tcp_output(tp);
244 		break;
245 
246 	/*
247 	 * Create a TCP connection between two sockets.
248 	 */
249 	case PRU_CONNECT2:
250 		error = EOPNOTSUPP;
251 		break;
252 
253 	/*
254 	 * Initiate disconnect from peer.
255 	 * If connection never passed embryonic stage, just drop;
256 	 * else if don't need to let data drain, then can just drop anyways,
257 	 * else have to begin TCP shutdown process: mark socket disconnecting,
258 	 * drain unread data, state switch to reflect user close, and
259 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
260 	 * when peer sends FIN and acks ours.
261 	 *
262 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
263 	 */
264 	case PRU_DISCONNECT:
265 		tp = tcp_disconnect(tp);
266 		break;
267 
268 	/*
269 	 * Accept a connection.  Essentially all the work is
270 	 * done at higher levels; just return the address
271 	 * of the peer, storing through addr.
272 	 */
273 	case PRU_ACCEPT:
274 		in_setpeeraddr(inp, nam);
275 		break;
276 
277 	/*
278 	 * Mark the connection as being incapable of further output.
279 	 */
280 	case PRU_SHUTDOWN:
281 		socantsendmore(so);
282 		tp = tcp_usrclosed(tp);
283 		if (tp)
284 			error = tcp_output(tp);
285 		break;
286 
287 	/*
288 	 * After a receive, possibly send window update to peer.
289 	 */
290 	case PRU_RCVD:
291 		(void) tcp_output(tp);
292 		break;
293 
294 	/*
295 	 * Do a send by putting data in output queue and updating urgent
296 	 * marker if URG set.  Possibly send more data.
297 	 */
298 	case PRU_SEND:
299 		if (control && control->m_len) {
300 			m_freem(control);
301 			m_freem(m);
302 			error = EINVAL;
303 			break;
304 		}
305 		sbappend(&so->so_snd, m);
306 		error = tcp_output(tp);
307 		break;
308 
309 	/*
310 	 * Abort the TCP.
311 	 */
312 	case PRU_ABORT:
313 		tp = tcp_drop(tp, ECONNABORTED);
314 		break;
315 
316 	case PRU_SENSE:
317 		/*
318 		 * stat: don't bother with a blocksize.
319 		 */
320 		splx(s);
321 		return (0);
322 
323 	case PRU_RCVOOB:
324 		if (control && control->m_len) {
325 			m_freem(control);
326 			m_freem(m);
327 			error = EINVAL;
328 			break;
329 		}
330 		if ((so->so_oobmark == 0 &&
331 		    (so->so_state & SS_RCVATMARK) == 0) ||
332 		    so->so_options & SO_OOBINLINE ||
333 		    tp->t_oobflags & TCPOOB_HADDATA) {
334 			error = EINVAL;
335 			break;
336 		}
337 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
338 			error = EWOULDBLOCK;
339 			break;
340 		}
341 		m->m_len = 1;
342 		*mtod(m, caddr_t) = tp->t_iobc;
343 		if (((long)nam & MSG_PEEK) == 0)
344 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
345 		break;
346 
347 	case PRU_SENDOOB:
348 		if (sbspace(&so->so_snd) < -512) {
349 			m_freem(m);
350 			error = ENOBUFS;
351 			break;
352 		}
353 		/*
354 		 * According to RFC961 (Assigned Protocols),
355 		 * the urgent pointer points to the last octet
356 		 * of urgent data.  We continue, however,
357 		 * to consider it to indicate the first octet
358 		 * of data past the urgent section.
359 		 * Otherwise, snd_up should be one lower.
360 		 */
361 		sbappend(&so->so_snd, m);
362 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
363 		tp->t_force = 1;
364 		error = tcp_output(tp);
365 		tp->t_force = 0;
366 		break;
367 
368 	case PRU_SOCKADDR:
369 		in_setsockaddr(inp, nam);
370 		break;
371 
372 	case PRU_PEERADDR:
373 		in_setpeeraddr(inp, nam);
374 		break;
375 
376 	/*
377 	 * TCP slow timer went off; going through this
378 	 * routine for tracing's sake.
379 	 */
380 	case PRU_SLOWTIMO:
381 		tp = tcp_timers(tp, (long)nam);
382 		req |= (long)nam << 8;		/* for debug's sake */
383 		break;
384 
385 	default:
386 		panic("tcp_usrreq");
387 	}
388 	if (tp && (so->so_options & SO_DEBUG))
389 		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
390 
391 release:
392 	splx(s);
393 	return (error);
394 }
395 
396 int
397 tcp_ctloutput(op, so, level, optname, mp)
398 	int op;
399 	struct socket *so;
400 	int level, optname;
401 	struct mbuf **mp;
402 {
403 	int error = 0, s;
404 	struct inpcb *inp;
405 	register struct tcpcb *tp;
406 	register struct mbuf *m;
407 	register int i;
408 
409 	s = splsoftnet();
410 	inp = sotoinpcb(so);
411 	if (inp == NULL) {
412 		splx(s);
413 		if (op == PRCO_SETOPT && *mp)
414 			(void) m_free(*mp);
415 		return (ECONNRESET);
416 	}
417 	if (level != IPPROTO_TCP) {
418 		error = ip_ctloutput(op, so, level, optname, mp);
419 		splx(s);
420 		return (error);
421 	}
422 	tp = intotcpcb(inp);
423 
424 	switch (op) {
425 
426 	case PRCO_SETOPT:
427 		m = *mp;
428 		switch (optname) {
429 
430 		case TCP_NODELAY:
431 			if (m == NULL || m->m_len < sizeof (int))
432 				error = EINVAL;
433 			else if (*mtod(m, int *))
434 				tp->t_flags |= TF_NODELAY;
435 			else
436 				tp->t_flags &= ~TF_NODELAY;
437 			break;
438 
439 		case TCP_MAXSEG:
440 			if (m && (i = *mtod(m, int *)) > 0 &&
441 			    i <= tp->t_peermss)
442 				tp->t_peermss = i;  /* limit on send size */
443 			else
444 				error = EINVAL;
445 			break;
446 
447 		default:
448 			error = ENOPROTOOPT;
449 			break;
450 		}
451 		if (m)
452 			(void) m_free(m);
453 		break;
454 
455 	case PRCO_GETOPT:
456 		*mp = m = m_get(M_WAIT, MT_SOOPTS);
457 		m->m_len = sizeof(int);
458 
459 		switch (optname) {
460 		case TCP_NODELAY:
461 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
462 			break;
463 		case TCP_MAXSEG:
464 			*mtod(m, int *) = tp->t_peermss;
465 			break;
466 		default:
467 			error = ENOPROTOOPT;
468 			break;
469 		}
470 		break;
471 	}
472 	splx(s);
473 	return (error);
474 }
475 
/*
 * Default send and receive buffer sizes, in bytes, that tcp_attach()
 * passes to soreserve().  The defaults apply only when TCP_SENDSPACE /
 * TCP_RECVSPACE have not already been defined.  The fallback values are
 * parenthesised and carry no trailing semicolon so that the macros
 * expand correctly inside any expression.
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024*16)
#endif
int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024*16)
#endif
int	tcp_recvspace = TCP_RECVSPACE;
484 
485 /*
486  * Attach TCP protocol to socket, allocating
487  * internet protocol control block, tcp control block,
488  * bufer space, and entering LISTEN state if to accept connections.
489  */
490 int
491 tcp_attach(so)
492 	struct socket *so;
493 {
494 	register struct tcpcb *tp;
495 	struct inpcb *inp;
496 	int error;
497 
498 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
499 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
500 		if (error)
501 			return (error);
502 	}
503 	error = in_pcballoc(so, &tcbtable);
504 	if (error)
505 		return (error);
506 	inp = sotoinpcb(so);
507 	tp = tcp_newtcpcb(inp);
508 	if (tp == 0) {
509 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
510 
511 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
512 		in_pcbdetach(inp);
513 		so->so_state |= nofd;
514 		return (ENOBUFS);
515 	}
516 	tp->t_state = TCPS_CLOSED;
517 	return (0);
518 }
519 
520 /*
521  * Initiate (or continue) disconnect.
522  * If embryonic state, just send reset (once).
523  * If in ``let data drain'' option and linger null, just drop.
524  * Otherwise (hard), mark socket disconnecting and drop
525  * current input data; switch states based on user close, and
526  * send segment to peer (with FIN).
527  */
528 struct tcpcb *
529 tcp_disconnect(tp)
530 	register struct tcpcb *tp;
531 {
532 	struct socket *so = tp->t_inpcb->inp_socket;
533 
534 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
535 		tp = tcp_close(tp);
536 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
537 		tp = tcp_drop(tp, 0);
538 	else {
539 		soisdisconnecting(so);
540 		sbflush(&so->so_rcv);
541 		tp = tcp_usrclosed(tp);
542 		if (tp)
543 			(void) tcp_output(tp);
544 	}
545 	return (tp);
546 }
547 
548 /*
549  * User issued close, and wish to trail through shutdown states:
550  * if never received SYN, just forget it.  If got a SYN from peer,
551  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
552  * If already got a FIN from peer, then almost done; go to LAST_ACK
553  * state.  In all other cases, have already sent FIN to peer (e.g.
554  * after PRU_SHUTDOWN), and just have to play tedious game waiting
555  * for peer to send FIN or not respond to keep-alives, etc.
556  * We can let the user exit from the close as soon as the FIN is acked.
557  */
558 struct tcpcb *
559 tcp_usrclosed(tp)
560 	register struct tcpcb *tp;
561 {
562 
563 	switch (tp->t_state) {
564 
565 	case TCPS_CLOSED:
566 	case TCPS_LISTEN:
567 	case TCPS_SYN_SENT:
568 		tp->t_state = TCPS_CLOSED;
569 		tp = tcp_close(tp);
570 		break;
571 
572 	case TCPS_SYN_RECEIVED:
573 	case TCPS_ESTABLISHED:
574 		tp->t_state = TCPS_FIN_WAIT_1;
575 		break;
576 
577 	case TCPS_CLOSE_WAIT:
578 		tp->t_state = TCPS_LAST_ACK;
579 		break;
580 	}
581 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
582 		soisdisconnected(tp->t_inpcb->inp_socket);
583 		/*
584 		 * If we are in FIN_WAIT_2, we arrived here because the
585 		 * application did a shutdown of the send side.  Like the
586 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
587 		 * a full close, we start a timer to make sure sockets are
588 		 * not left in FIN_WAIT_2 forever.
589 		 */
590 		if ((tp->t_state == TCPS_FIN_WAIT_2) && (tcp_maxidle > 0))
591 			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
592 	}
593 	return (tp);
594 }
595 
/*
 * Table of TCP sysctl variables, initialised from TCPCTL_VARIABLES and
 * indexed by the sysctl name number.  The member order must not change:
 * the initializer is positional.
 */
static struct {
	unsigned int valid : 1;		/* entry is served by tcp_sysctl() */
	unsigned int rdonly : 1;	/* serve `val' read-only, not `var' */
	int *var;			/* variable given to sysctl_int() */
	int val;			/* value given to sysctl_rdint() */
} tcp_ctlvars[] = TCPCTL_VARIABLES;
602 
603 /*
604  * Sysctl for tcp variables.
605  */
606 int
607 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
608 	int *name;
609 	u_int namelen;
610 	void *oldp;
611 	size_t *oldlenp;
612 	void *newp;
613 	size_t newlen;
614 {
615 	/* All sysctl names at this level are terminal. */
616 	if (namelen != 1)
617 		return (ENOTDIR);
618 
619 	if (name[0] < sizeof(tcp_ctlvars)/sizeof(tcp_ctlvars[0])
620 	    && tcp_ctlvars[name[0]].valid) {
621 		if (tcp_ctlvars[name[0]].rdonly)
622 			return (sysctl_rdint(oldp, oldlenp, newp,
623 			    tcp_ctlvars[name[0]].val));
624 		else
625 			return (sysctl_int(oldp, oldlenp, newp, newlen,
626 			    tcp_ctlvars[name[0]].var));
627 	}
628 
629 	return (ENOPROTOOPT);
630 }
631