xref: /netbsd-src/sys/netinet/tcp_usrreq.c (revision 76dfffe33547c37f8bdd446e3e4ab0f3c16cea4b)
1 /*	$NetBSD: tcp_usrreq.c,v 1.23 1996/05/23 17:03:29 mycroft Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1988, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/protosw.h>
46 #include <sys/errno.h>
47 #include <sys/stat.h>
48 #include <sys/proc.h>
49 #include <sys/ucred.h>
50 
51 #include <vm/vm.h>
52 #include <sys/sysctl.h>
53 
54 #include <net/if.h>
55 #include <net/route.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/in_var.h>
60 #include <netinet/ip.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/ip_var.h>
63 #include <netinet/tcp.h>
64 #include <netinet/tcp_fsm.h>
65 #include <netinet/tcp_seq.h>
66 #include <netinet/tcp_timer.h>
67 #include <netinet/tcp_var.h>
68 #include <netinet/tcpip.h>
69 #include <netinet/tcp_debug.h>
70 
/*
 * Glue between the socket layer's user requests and the TCP protocol.
 *
 * tcpstates is only declared here; its definition lives in another
 * file.
 */
extern char	*tcpstates[];
75 
76 /*
77  * Process a TCP user request for TCP tb.  If this is a send request
78  * then m is the mbuf chain of send data.  If this is a timer expiration
79  * (called from the software clock routine), then timertype tells which timer.
80  */
81 /*ARGSUSED*/
82 int
83 tcp_usrreq(so, req, m, nam, control, p)
84 	struct socket *so;
85 	int req;
86 	struct mbuf *m, *nam, *control;
87 	struct proc *p;
88 {
89 	register struct inpcb *inp;
90 	register struct tcpcb *tp = NULL;
91 	int s;
92 	int error = 0;
93 	int ostate;
94 
95 	if (req == PRU_CONTROL)
96 		return (in_control(so, (long)m, (caddr_t)nam,
97 		    (struct ifnet *)control, p));
98 
99 	s = splsoftnet();
100 	inp = sotoinpcb(so);
101 #ifdef DIAGNOSTIC
102 	if (req != PRU_SEND && req != PRU_SENDOOB && control)
103 		panic("tcp_usrreq: unexpected control mbuf");
104 #endif
105 	/*
106 	 * When a TCP is attached to a socket, then there will be
107 	 * a (struct inpcb) pointed at by the socket, and this
108 	 * structure will point at a subsidary (struct tcpcb).
109 	 */
110 	if (inp == 0 && req != PRU_ATTACH) {
111 		error = EINVAL;
112 		goto release;
113 	}
114 	if (inp) {
115 		tp = intotcpcb(inp);
116 		/* WHAT IF TP IS 0? */
117 #ifdef KPROF
118 		tcp_acounts[tp->t_state][req]++;
119 #endif
120 		ostate = tp->t_state;
121 	} else
122 		ostate = 0;
123 
124 	switch (req) {
125 
126 	/*
127 	 * TCP attaches to socket via PRU_ATTACH, reserving space,
128 	 * and an internet control block.
129 	 */
130 	case PRU_ATTACH:
131 		if (inp != 0) {
132 			error = EISCONN;
133 			break;
134 		}
135 		error = tcp_attach(so);
136 		if (error)
137 			break;
138 		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
139 			so->so_linger = TCP_LINGERTIME * hz;
140 		tp = sototcpcb(so);
141 		break;
142 
143 	/*
144 	 * PRU_DETACH detaches the TCP protocol from the socket.
145 	 * If the protocol state is non-embryonic, then can't
146 	 * do this directly: have to initiate a PRU_DISCONNECT,
147 	 * which may finish later; embryonic TCB's can just
148 	 * be discarded here.
149 	 */
150 	case PRU_DETACH:
151 		if (tp->t_state > TCPS_LISTEN)
152 			tp = tcp_disconnect(tp);
153 		else
154 			tp = tcp_close(tp);
155 		break;
156 
157 	/*
158 	 * Give the socket an address.
159 	 */
160 	case PRU_BIND:
161 		error = in_pcbbind(inp, nam, p);
162 		break;
163 
164 	/*
165 	 * Prepare to accept connections.
166 	 */
167 	case PRU_LISTEN:
168 		if (inp->inp_lport == 0) {
169 			error = in_pcbbind(inp, (struct mbuf *)0,
170 			    (struct proc *)0);
171 			if (error)
172 				break;
173 		}
174 		tp->t_state = TCPS_LISTEN;
175 		break;
176 
177 	/*
178 	 * Initiate connection to peer.
179 	 * Create a template for use in transmissions on this connection.
180 	 * Enter SYN_SENT state, and mark socket as connecting.
181 	 * Start keep-alive timer, and seed output sequence space.
182 	 * Send initial segment on connection.
183 	 */
184 	case PRU_CONNECT:
185 		if (inp->inp_lport == 0) {
186 			error = in_pcbbind(inp, (struct mbuf *)0,
187 			    (struct proc *)0);
188 			if (error)
189 				break;
190 		}
191 		error = in_pcbconnect(inp, nam);
192 		if (error)
193 			break;
194 		tp->t_template = tcp_template(tp);
195 		if (tp->t_template == 0) {
196 			in_pcbdisconnect(inp);
197 			error = ENOBUFS;
198 			break;
199 		}
200 		/* Compute window scaling to request.  */
201 		while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
202 		    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
203 			tp->request_r_scale++;
204 		soisconnecting(so);
205 		tcpstat.tcps_connattempt++;
206 		tp->t_state = TCPS_SYN_SENT;
207 		tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
208 		tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
209 		tcp_sendseqinit(tp);
210 		error = tcp_output(tp);
211 		break;
212 
213 	/*
214 	 * Create a TCP connection between two sockets.
215 	 */
216 	case PRU_CONNECT2:
217 		error = EOPNOTSUPP;
218 		break;
219 
220 	/*
221 	 * Initiate disconnect from peer.
222 	 * If connection never passed embryonic stage, just drop;
223 	 * else if don't need to let data drain, then can just drop anyways,
224 	 * else have to begin TCP shutdown process: mark socket disconnecting,
225 	 * drain unread data, state switch to reflect user close, and
226 	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
227 	 * when peer sends FIN and acks ours.
228 	 *
229 	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
230 	 */
231 	case PRU_DISCONNECT:
232 		tp = tcp_disconnect(tp);
233 		break;
234 
235 	/*
236 	 * Accept a connection.  Essentially all the work is
237 	 * done at higher levels; just return the address
238 	 * of the peer, storing through addr.
239 	 */
240 	case PRU_ACCEPT:
241 		in_setpeeraddr(inp, nam);
242 		break;
243 
244 	/*
245 	 * Mark the connection as being incapable of further output.
246 	 */
247 	case PRU_SHUTDOWN:
248 		socantsendmore(so);
249 		tp = tcp_usrclosed(tp);
250 		if (tp)
251 			error = tcp_output(tp);
252 		break;
253 
254 	/*
255 	 * After a receive, possibly send window update to peer.
256 	 */
257 	case PRU_RCVD:
258 		(void) tcp_output(tp);
259 		break;
260 
261 	/*
262 	 * Do a send by putting data in output queue and updating urgent
263 	 * marker if URG set.  Possibly send more data.
264 	 */
265 	case PRU_SEND:
266 		if (control && control->m_len) {
267 			m_freem(control);
268 			m_freem(m);
269 			error = EINVAL;
270 			break;
271 		}
272 		sbappend(&so->so_snd, m);
273 		error = tcp_output(tp);
274 		break;
275 
276 	/*
277 	 * Abort the TCP.
278 	 */
279 	case PRU_ABORT:
280 		tp = tcp_drop(tp, ECONNABORTED);
281 		break;
282 
283 	case PRU_SENSE:
284 		/*
285 		 * stat: don't bother with a blocksize.
286 		 */
287 		splx(s);
288 		return (0);
289 
290 	case PRU_RCVOOB:
291 		if (control && control->m_len) {
292 			m_freem(control);
293 			m_freem(m);
294 			error = EINVAL;
295 			break;
296 		}
297 		if ((so->so_oobmark == 0 &&
298 		    (so->so_state & SS_RCVATMARK) == 0) ||
299 		    so->so_options & SO_OOBINLINE ||
300 		    tp->t_oobflags & TCPOOB_HADDATA) {
301 			error = EINVAL;
302 			break;
303 		}
304 		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
305 			error = EWOULDBLOCK;
306 			break;
307 		}
308 		m->m_len = 1;
309 		*mtod(m, caddr_t) = tp->t_iobc;
310 		if (((long)nam & MSG_PEEK) == 0)
311 			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
312 		break;
313 
314 	case PRU_SENDOOB:
315 		if (sbspace(&so->so_snd) < -512) {
316 			m_freem(m);
317 			error = ENOBUFS;
318 			break;
319 		}
320 		/*
321 		 * According to RFC961 (Assigned Protocols),
322 		 * the urgent pointer points to the last octet
323 		 * of urgent data.  We continue, however,
324 		 * to consider it to indicate the first octet
325 		 * of data past the urgent section.
326 		 * Otherwise, snd_up should be one lower.
327 		 */
328 		sbappend(&so->so_snd, m);
329 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
330 		tp->t_force = 1;
331 		error = tcp_output(tp);
332 		tp->t_force = 0;
333 		break;
334 
335 	case PRU_SOCKADDR:
336 		in_setsockaddr(inp, nam);
337 		break;
338 
339 	case PRU_PEERADDR:
340 		in_setpeeraddr(inp, nam);
341 		break;
342 
343 	/*
344 	 * TCP slow timer went off; going through this
345 	 * routine for tracing's sake.
346 	 */
347 	case PRU_SLOWTIMO:
348 		tp = tcp_timers(tp, (long)nam);
349 		req |= (long)nam << 8;		/* for debug's sake */
350 		break;
351 
352 	default:
353 		panic("tcp_usrreq");
354 	}
355 	if (tp && (so->so_options & SO_DEBUG))
356 		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
357 
358 release:
359 	splx(s);
360 	return (error);
361 }
362 
363 int
364 tcp_ctloutput(op, so, level, optname, mp)
365 	int op;
366 	struct socket *so;
367 	int level, optname;
368 	struct mbuf **mp;
369 {
370 	int error = 0, s;
371 	struct inpcb *inp;
372 	register struct tcpcb *tp;
373 	register struct mbuf *m;
374 	register int i;
375 
376 	s = splsoftnet();
377 	inp = sotoinpcb(so);
378 	if (inp == NULL) {
379 		splx(s);
380 		if (op == PRCO_SETOPT && *mp)
381 			(void) m_free(*mp);
382 		return (ECONNRESET);
383 	}
384 	if (level != IPPROTO_TCP) {
385 		error = ip_ctloutput(op, so, level, optname, mp);
386 		splx(s);
387 		return (error);
388 	}
389 	tp = intotcpcb(inp);
390 
391 	switch (op) {
392 
393 	case PRCO_SETOPT:
394 		m = *mp;
395 		switch (optname) {
396 
397 		case TCP_NODELAY:
398 			if (m == NULL || m->m_len < sizeof (int))
399 				error = EINVAL;
400 			else if (*mtod(m, int *))
401 				tp->t_flags |= TF_NODELAY;
402 			else
403 				tp->t_flags &= ~TF_NODELAY;
404 			break;
405 
406 		case TCP_MAXSEG:
407 			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
408 				tp->t_maxseg = i;
409 			else
410 				error = EINVAL;
411 			break;
412 
413 		default:
414 			error = ENOPROTOOPT;
415 			break;
416 		}
417 		if (m)
418 			(void) m_free(m);
419 		break;
420 
421 	case PRCO_GETOPT:
422 		*mp = m = m_get(M_WAIT, MT_SOOPTS);
423 		m->m_len = sizeof(int);
424 
425 		switch (optname) {
426 		case TCP_NODELAY:
427 			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
428 			break;
429 		case TCP_MAXSEG:
430 			*mtod(m, int *) = tp->t_maxseg;
431 			break;
432 		default:
433 			error = ENOPROTOOPT;
434 			break;
435 		}
436 		break;
437 	}
438 	splx(s);
439 	return (error);
440 }
441 
442 #ifndef TCP_SENDSPACE
443 #define	TCP_SENDSPACE	1024*16;
444 #endif
445 u_long	tcp_sendspace = TCP_SENDSPACE;
446 #ifndef TCP_RECVSPACE
447 #define	TCP_RECVSPACE	1024*16;
448 #endif
449 u_long	tcp_recvspace = TCP_RECVSPACE;
450 
451 /*
452  * Attach TCP protocol to socket, allocating
453  * internet protocol control block, tcp control block,
454  * bufer space, and entering LISTEN state if to accept connections.
455  */
456 int
457 tcp_attach(so)
458 	struct socket *so;
459 {
460 	register struct tcpcb *tp;
461 	struct inpcb *inp;
462 	int error;
463 
464 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
465 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
466 		if (error)
467 			return (error);
468 	}
469 	error = in_pcballoc(so, &tcbtable);
470 	if (error)
471 		return (error);
472 	inp = sotoinpcb(so);
473 	tp = tcp_newtcpcb(inp);
474 	if (tp == 0) {
475 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
476 
477 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
478 		in_pcbdetach(inp);
479 		so->so_state |= nofd;
480 		return (ENOBUFS);
481 	}
482 	tp->t_state = TCPS_CLOSED;
483 	return (0);
484 }
485 
486 /*
487  * Initiate (or continue) disconnect.
488  * If embryonic state, just send reset (once).
489  * If in ``let data drain'' option and linger null, just drop.
490  * Otherwise (hard), mark socket disconnecting and drop
491  * current input data; switch states based on user close, and
492  * send segment to peer (with FIN).
493  */
494 struct tcpcb *
495 tcp_disconnect(tp)
496 	register struct tcpcb *tp;
497 {
498 	struct socket *so = tp->t_inpcb->inp_socket;
499 
500 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
501 		tp = tcp_close(tp);
502 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
503 		tp = tcp_drop(tp, 0);
504 	else {
505 		soisdisconnecting(so);
506 		sbflush(&so->so_rcv);
507 		tp = tcp_usrclosed(tp);
508 		if (tp)
509 			(void) tcp_output(tp);
510 	}
511 	return (tp);
512 }
513 
514 /*
515  * User issued close, and wish to trail through shutdown states:
516  * if never received SYN, just forget it.  If got a SYN from peer,
517  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
518  * If already got a FIN from peer, then almost done; go to LAST_ACK
519  * state.  In all other cases, have already sent FIN to peer (e.g.
520  * after PRU_SHUTDOWN), and just have to play tedious game waiting
521  * for peer to send FIN or not respond to keep-alives, etc.
522  * We can let the user exit from the close as soon as the FIN is acked.
523  */
524 struct tcpcb *
525 tcp_usrclosed(tp)
526 	register struct tcpcb *tp;
527 {
528 
529 	switch (tp->t_state) {
530 
531 	case TCPS_CLOSED:
532 	case TCPS_LISTEN:
533 	case TCPS_SYN_SENT:
534 		tp->t_state = TCPS_CLOSED;
535 		tp = tcp_close(tp);
536 		break;
537 
538 	case TCPS_SYN_RECEIVED:
539 	case TCPS_ESTABLISHED:
540 		tp->t_state = TCPS_FIN_WAIT_1;
541 		break;
542 
543 	case TCPS_CLOSE_WAIT:
544 		tp->t_state = TCPS_LAST_ACK;
545 		break;
546 	}
547 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
548 		soisdisconnected(tp->t_inpcb->inp_socket);
549 		/*
550 		 * If we are in FIN_WAIT_2, we arrived here because the
551 		 * application did a shutdown of the send side.  Like the
552 		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
553 		 * a full close, we start a timer to make sure sockets are
554 		 * not left in FIN_WAIT_2 forever.
555 		 */
556 		if (tp->t_state == TCPS_FIN_WAIT_2)
557 			tp->t_timer[TCPT_2MSL] = tcp_maxidle;
558 	}
559 	return (tp);
560 }
561 
562 /*
563  * Sysctl for tcp variables.
564  */
565 int
566 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
567 	int *name;
568 	u_int namelen;
569 	void *oldp;
570 	size_t *oldlenp;
571 	void *newp;
572 	size_t newlen;
573 {
574 
575 	/* All sysctl names at this level are terminal. */
576 	if (namelen != 1)
577 		return (ENOTDIR);
578 
579 	switch (name[0]) {
580 	case TCPCTL_RFC1323:
581 		return (sysctl_int(oldp, oldlenp, newp, newlen,
582 		    &tcp_do_rfc1323));
583 
584 	default:
585 		return (ENOPROTOOPT);
586 	}
587 	/* NOTREACHED */
588 }
589