#ifdef	RCSIDENT
static char rcsident[] = "$Header: tcp_usrreq.c,v 1.30 85/07/31 09:43:43 walsh Exp $";
#endif RCSIDENT

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/mbuf.h"
#include "../h/socket.h"
#include "../h/socketvar.h"
#include "../h/protosw.h"
#include "../h/errno.h"
#include "../h/ioctl.h"
#include "../h/time.h"
#include "../h/kernel.h"

#include "../net/if.h"
#include "../net/route.h"

#include "../bbnnet/in.h"
#include "../bbnnet/in_var.h"
#include "../bbnnet/in_pcb.h"
#include "../bbnnet/net.h"
#include "../bbnnet/fsm.h"
#include "../bbnnet/tcp.h"
#include "../bbnnet/ip.h"
#include "../bbnnet/icmp.h"
#include "../bbnnet/macros.h"
#include "../bbnnet/sws.h"

/*
 * TCP protocol interface to socket abstraction.
 */

#ifdef GPROF
int	tcp_acounts[TCP_NSTATES][PRU_NREQ];
#endif

extern tcp_pcbdisconnect();
extern tcp_binding_used();

struct inpcb tcp;
struct tcp_stat tcpstat;
sequence tcp_iss;		/* tcp initial send seq # */

struct dfilter tcp_dfilter;

struct pr_advice tcp_advice =
{
    TCP_RESERVED,	/* application reserved */
    TCP_USERRESERVED,	/* user reserved */
    TCP_MAXPORT,	/* max port */
    TCP_USERRESERVED+1,	/* random last used */
    sizeof(u_short),	/* port size */
    tcp_binding_used,	/* confirmation routine */
} ;

dowedebug(inp, so, filter)
register struct inpcb	*inp;
struct socket	*so;
register struct dfilter *filter;
{
    register int		 count;

    count = 0;
    if (inp->inp_faddr.s_addr == filter->foreign_host.s_addr)
	count ++;
    if (inp->inp_fport == filter->foreign_port)
	count ++;
    if (inp->inp_laddr.s_addr == filter->local_host.s_addr)
	count ++;
    if (inp->inp_lport == filter->local_port)
	count ++;

    if (count >= filter->matches)
	so->so_options |= SO_DEBUG;
}
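
/*
 * Debug-filter usage sketch (an assumption, not spelled out in this file):
 * tcp_dfilter starts out "off" because tcp_init() sets matches to 5 while at
 * most four fields can ever match.  To trace a troublesome connection, patch
 * the filter in a running kernel (e.g. with a kernel debugger), filling in
 * the host/port fields of interest and lowering matches accordingly;
 * dowedebug() will then turn on SO_DEBUG automatically when a matching
 * connection is initiated.
 */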

int tcp_noact = 0; /* patchable */

/*
 * Allocate and initialize a new TCB
 * tcp_usrreq calls tcp_attach calls us.  tcp_usrreq splnet()'s
 */
struct tcpcb *tcp_newtcpcb(inp)
register struct inpcb *inp;
{
    register struct tcpcb  *tp;
    register struct mbuf   *m;

    m = m_getclr(M_WAIT, MT_PCB);
    if (m == NULL)
	return(NULL);
    tp = mtod(m, struct tcpcb *);

    /* initialize non-zero tcb fields */

    tp->t_rcv_next	= (struct th *)tp;
    tp->t_rcv_prev	= (struct th *)tp;
    /*
     * Don't start off assuming minimum srtt/rxmitime.  If we do, and
     * TCP_tvRXMIN is small and we decide to communicate over a
     * reliable, but slow, network then we may not find true values for
     * these.  We may assume an ACK was for a retransmission that
     * we're measuring the srtt of, not the original packet.
     *
     * Instead, start high and approach from above in a deterministic
     * fashion.  We should get close to the right values fairly rapidly.
     *
     * 7/85: start from above by special casing first round trip time
     * measurement.  If srtt == 0, do not reset rtt, and do not use
     * weighted averaging.  srtt starts as time to ack(xmit [+ rxmit...])
     * and then gets smoothed with new round trip times.  This compromise
     * for getting to long-term srtt more quickly on LANs should work
     * on the Internet as well.  It will only hurt Internet connections
     * if packet loss is high, and even then would only slow getting
     * to long term srtt.
     * This method can be turned off by initializing srtt with a non-zero
     * value.
     */
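    /*
     * Illustrative only (not necessarily this file's exact arithmetic):
     * once srtt is non-zero, a weighted average of the usual form
     *     srtt = (7 * srtt + measured_rtt) / 8
     * is applied, so taking the very first sample as-is reaches a stable
     * long-term srtt sooner than ramping down from a fixed maximum.
     */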
    /* tp->t_srtt   = TCP_tvMAXSRTT; */
    tp->t_rxmitime	= TCP_tvMAXSRTT + 1;
    tp->t_rttltimeo	= TCP_tvRTTL;
    tp->t_xmt_val = tp->snd_end = tp->seq_fin = tp->snd_nxt =
	tp->snd_hi = tp->snd_una = tp->iss = tcp_iss;
    tcp_iss += ISSINCR;

    /*
     * Imitate Berkeley code by setting push as a default.  This should
     * increase compatibility at the user code level.
     */
    tp->t_push	 = TRUE;

    /*
     * Berkeley 4.2 code sends a data byte beyond the window's edge to see
     * if the other end is up.  If other end does not respond, connection
     * times out and aborts.  This is dangerous since the byte may make its
     * way into the input stream if the recipient is coded keeping in mind
     * how expensive packets are.
     *
     * We'll provide for an optional method to send a well formed ack that
     * will catch remote failure and generate a tcp reset.  Note that we
     * don't care if the other end ignores the ack; we only hope for a well
     * coded tcp to respond with a reset in the right circumstances.  This
     * sort of handshaking/probing should really be done at the application
     * level, but not all specs (eg., SMTP) provide for such a noop.
     *
     * Optional, since some networks charge for packets and since some might
     * see this as unnecessary traffic.
     *
     * also see tcp_ioctl()
     */
    if (tp->t_noact = tcp_noact)
	tp->t_noactprobe = TRUE;

    /* attach the tcpcb to the in_pcb */

    inp->inp_ppcb = (caddr_t)tp;
    tp->t_in_pcb = inp;

    return(tp);
}

/*
 * Is a tcp port/address pair already in use by some socket on this machine?
 * Passed to in_pcbbind() to help it find a port/address binding
 * that is unique for tcp.
 */
int tcp_binding_used(inp, lport, lsaddr, reuselocal)
struct inpcb   *inp;
u_short	lport;
u_long	lsaddr;
int	reuselocal;
{
    register struct inpcb *i;

    for (i = tcp.inp_next; i != &tcp; i = i->inp_next)
    {
	/*
	 * Since our inpcb is in this linked list, don't want to know
	 * if we, ourselves, are already using this binding.
	 */
	if (i != inp)
	    if (i->inp_lport == lport)
		/*
		 * Our/His address is unbound (INADDR_ANY) iff
		 * not yet connected to foreign host.
		 */
		if ((i->inp_laddr.s_addr == lsaddr) ||
		    (i->inp_laddr.s_addr == INADDR_ANY) ||
		    (lsaddr == INADDR_ANY))
		{
		    if (!reuselocal)
			break;
		    if (i->inp_faddr.s_addr == INADDR_ANY)
			/*
			 * We're both waiting for foreign
			 * connection.  Could only re-use if
			 * he was already connected.
			 */
			break;
		}
    }
    return (i != &tcp);
}

/*
 * returns a (struct tcpcb *) cast to a (char *).  This is
 * so in_pcbconnect() can correctly handle return value. All
 * other uses promptly cast back.
 */

char *tcp_conn_used(inp, lport, lsaddr, fport, fsaddr)
struct inpcb   *inp;
u_short	lport;
u_long	lsaddr;
u_short	fport;
u_long	fsaddr;
{
    register struct inpcb *i;

    for (i = tcp.inp_next; i != &tcp; i = i->inp_next)
    {
	/*
	 * Since our inpcb is in this linked list, don't want to know
	 * if we, ourselves, are already using this connection.
	 */
	if (i != inp)
	    if ((i->inp_lport == lport) &&
		(i->inp_fport == fport) &&
		(i->inp_laddr.s_addr == lsaddr) &&
		(i->inp_faddr.s_addr == fsaddr))
		    return((char *)i->inp_ppcb);
    }
    return ((char *) NULL);
}

tcp_ioctl (tp, command, data)
struct tcpcb *tp;
int command;
caddr_t	data;
{
    switch (command)
    {
	/* push */
      case SIOCSPUSH:
	tp->t_push = TRUE;
	break;

      case SIOCCPUSH:
	tp->t_push = FALSE;
	break;

	/* no activity timer */
      case SIOCSNOACT:
	{
	u_long	value;

	value = *((u_long *) data);
	/*
	 * A shutdown socket should still be able to request some sort of
	 * check on the status of the remote end.  Also see tcp_newtcpcb().
	 */
	tp->t_noactprobe = (value & TCP_NOACTPROBE) ? TRUE : FALSE;
	tp->t_noactsig = (value & TCP_NOACTSIG) ? TRUE : FALSE;

	if ((tp->t_state <= ESTAB) || (tp->t_state == CLOSE_WAIT))
	{
	    /* don't interfere with system use of timer */
	    value &= ~(TCP_NOACTPROBE|TCP_NOACTSIG);
	    tp->t_noact = MIN (MAX_TCPTIMERVAL, value);
	    tp->t_timers[TNOACT] = tp->t_noact;
	}
	}
	break;

      case SIOCGNOACT:
	{
	u_long	value;

	value = tp->t_noact;
	if (tp->t_noactprobe)
	    value |= TCP_NOACTPROBE;
	if (tp->t_noactsig)
	    value |= TCP_NOACTSIG;

	*((u_long *) data) = value;
	}
	break;

	/* init timer */
      case SIOCSINIT:
	tp->t_itimeo = MIN (MAX_TCPTIMERVAL, *((unsigned *) data));
	break;

      case SIOCGINIT:
	*((int *) data) = tp->t_itimeo;
	break;

	/* retransmit took too long timer */
      case SIOCSRTTL:
	tp->t_rttltimeo = MIN (MAX_TCPTIMERVAL, *((unsigned *) data));
	break;

      case SIOCGRTTL:
	*((int *) data) = tp->t_rttltimeo;
	break;

      case SIOCABORT:
	{
	    struct socket *so;

	    /* there really should be a generic way for
	     * a user to get to soabort()
	     */

	    tp->usr_abort = TRUE;
	    /*
	     * Just in case asked to abort a LISTENing socket,
	     * don't leave unattached, unaccepted connections.
	     */
	    so = tp->t_in_pcb->inp_socket;
	    while (so->so_q0 && (so->so_q0 != so))
		(void) soabort(so->so_q0);
	    while (so->so_q  && (so->so_q  != so))
		(void) soabort(so->so_q);

	    w_alloc(IUABORT, 0, tp, tp->t_in_pcb);
	}
	break;

      default:
	/* not our ioctl, let lower level try ioctl */
	return ip_ioctl (tp->t_in_pcb, command, data);
    }

    return (0);
}
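
/*
 * User-level usage sketch for the no-activity ioctl above (not part of this
 * file; the socket descriptor s and the timer units are assumptions).
 * TCP_NOACTPROBE and/or TCP_NOACTSIG may be or'ed into the timer value,
 * exactly as SIOCSNOACT unpacks it above:
 *
 *	u_long val = 120 | TCP_NOACTPROBE;
 *	if (ioctl(s, SIOCSNOACT, (caddr_t) &val) < 0)
 *		perror("SIOCSNOACT");
 */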

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), it comes in as PRU_SLOWTIMO.
 */
/*ARGSUSED*/
tcp_usrreq(so, req, m, nam, rights)
struct socket *so;
int req;
struct mbuf *m, *nam, *rights;
{
    register struct inpcb *inp;
    register struct tcpcb *tp;
    register int s;
    register int act, newstate;
    int error = 0;

    s = splnet();
    inp = sotoinpcb(so);

    /* keep in mind call from ifioctl() */
    if (rights && req != PRU_CONTROL)
    {
	if (rights->m_len)
	{
	    splx(s);
	    return (EINVAL);
	}
    }
    /*
     * When a TCP is attached to a socket, then there will be
     * a (struct inpcb) pointed at by the socket, and this
     * structure will point at a subsidiary (struct tcpcb).
     */
    if (inp == NULL && req != PRU_ATTACH)
    {
	splx(s);
	return (EINVAL);	/* XXX */
    }
    if (inp)
    {
	tp = inptotcpcb(inp);
	/* WHAT IF TP IS 0? */
#ifdef GPROF
	tcp_acounts[tp->t_state][req]++;
#endif
    }

    /*
     * This switch becomes a 'caseb', so put common ones at top.
     */
    switch (req)
    {

      case PRU_RCVD:
	/*
	 * After a receive, possibly send window update to peer.
	 */
	W_ALLOC(IURECV, 0, tp, NULL, so, act, newstate);
	break;

      case PRU_SEND:
	/*
	 * Do a send by initiating the proper entry to the FSM.
	 * Don't let urgent continue.
	 */
	tp->t_urg = FALSE;
	W_ALLOC(IUSEND, 0, tp, m, so, act, newstate);
	break;

	/*
	 * TCP attaches to socket via PRU_ATTACH, reserving space,
	 * and an internet control block.
	 */
      case PRU_ATTACH:
	if (inp)
	{
	    error = EISCONN;
	    break;
	}
	error = tcp_attach(so);
	if (error)
	    break;
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
	    so->so_linger = T_LINGERTIME;
	tp = sototcpcb(so);
	break;

	/*
	 * PRU_DETACH detaches the TCP protocol from the socket.
	 * This is only done after SO_ISCONNECTED has been cleared.
	 */
      case PRU_DETACH:
	tcp_disconnect(tp);
	break;

	/*
	 * Give the socket an address.
	 */
      case PRU_BIND:
	error = in_pcbbind(inp, nam, &tcp_advice);
	break;

	/*
	 * Prepare to accept connections.
	 */
      case PRU_LISTEN:
	if (inp->inp_lport == 0)
	    error = in_pcbbind(inp, (struct mbuf *)0, &tcp_advice);
	if (error == 0)
	    w_alloc(IUOPENA, 0, tp, NULL);
	break;

	/*
	 * Initiate connection to peer.
	 * Bind the local end if not already.
	 * Set the routing.
	 * Crank up the TCP state machine.
	 */
      case PRU_CONNECT:
	{
	    struct in_addr laddr;

	    laddr = inp->inp_laddr;
	    if (inp->inp_lport == 0)
	    {
		error = in_pcbbind(inp, (struct mbuf *)0, &tcp_advice);
		if (error)
		    break;
	    }
	    error = in_pcbconnect(inp, nam, tcp_conn_used);
	    if (error)
		break;

	    if (in_broadcast(inp->inp_faddr))
	    {
		in_pcbdisconnect (inp, tcp_pcbdisconnect);
		inp->inp_laddr = laddr;
		error = EADDRNOTAVAIL;
		break;
	    }

	    if (! (tp->t_template = tcp_template(tp)))
	    {
		in_pcbdisconnect (inp, tcp_pcbdisconnect);
		inp->inp_laddr = laddr;
		error = ENOBUFS;
		break;
	    }

	    tp->sws_qff = SWS_QFF_DEF;

	    /*
	     * So connection problems can be debugged without having to
	     * change every program, or apply a debugging flag to each
	     * program every time it is run.
	     */
	    dowedebug(inp, so, &tcp_dfilter);

	    soisconnecting(so);
	    w_alloc(IUOPENR, 0, tp, NULL);
	}
	break;

	/*
	 * Create a TCP connection between two sockets.
	 */
      case PRU_CONNECT2:
	error = EOPNOTSUPP;
	break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if we don't need to let data drain, we can just drop anyway;
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 */
      case PRU_DISCONNECT:
	tcp_disconnect(tp);
	break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 *
	 * BBN-NOTE: upper levels do all the waiting;  this stays the same.
	 */
      case PRU_ACCEPT:
	{
	    struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);

	    nam->m_len = sizeof (struct sockaddr_in);
	    sin->sin_family = AF_INET;
	    sin->sin_port = inp->inp_fport;
	    sin->sin_addr = inp->inp_faddr;
	    break;
	}

	/*
	 * Mark the connection as being incapable of further output.
	 */
      case PRU_SHUTDOWN:
	socantsendmore(so);
	if (! tp->usr_closed)
	    w_alloc(IUCLOSE, 0, tp, inp);
	break;

	/*
	 * Abort the TCP.
	 */
      case PRU_ABORT:
	w_alloc(IUABORT, 0, tp, inp);
	break;

      case PRU_CONTROL:
	error = tcp_ioctl(tp, (int) m, (caddr_t) nam);
	break;


/* SOME AS YET UNIMPLEMENTED HOOKS */
      case PRU_SENSE:
	error = EOPNOTSUPP;
	break;
/* END UNIMPLEMENTED HOOKS */

      case PRU_RCVOOB:

	{
	    int	desired;

	    if (so->so_oobmark == 0 && (so->so_state & SS_RCVATMARK) == 0)
	    {
		error = EINVAL;
		break;
	    }
	    if (tp->oob_data == NULL)
	    {
		error = EWOULDBLOCK;
		break;
	    }
	    desired = *(mtod(m, int *));

	    while ((desired > 0) && (tp->oob_data))
	    {
		char	*p;
		unsigned count;

		p = mtod(m, caddr_t);
		count = MIN(desired, tp->oob_data->m_len);
		count = MIN(count, MLEN);
		bcopy(mtod(tp->oob_data, caddr_t), p, count);
		m->m_len = count;
		desired -= count;

		tp->oob_data->m_len -= count;
		tp->oob_data->m_off += count;
		if (tp->oob_data->m_len <= 0)
		    tp->oob_data = m_free(tp->oob_data);

		if ((desired > 0) && (tp->oob_data))
		{
		    m->m_next = m_get(M_WAIT, MT_DATA);
		    m = m->m_next;
		}
	    }

	}
	break;

      case PRU_SENDOOB:
	/*
	 * allows up to MAX_TCPOOB bytes of out of band data
	 * even if user has used up all his allocated space.
	 */
	if (sbspace(&so->so_snd) < (- MAX_TCPOOB))
	{
	    m_freem(m);
	    error = ENOBUFS;
	    break;

	}
	tp->t_urg = TRUE;
	w_alloc(IUSEND, 0, tp, m);
	break;

	/*
	 * Return the address of this socket (local-side binding)
	 */
      case PRU_SOCKADDR:
	in_setsockaddr(inp, nam);
	break;

      case PRU_PEERADDR:
	in_setpeeraddr(inp, nam);
	break;

	/*
	 * TCP slow timer went off; run down all those timers.
	 */
      case PRU_SLOWTIMO:
	tcp_timeo();
	break;

      default:
	panic("tcp_usrreq");
    }
    splx(s);
    return (error);
}

/*
 * getsockopt() / setsockopt()
 */
tcp_ctloutput (req, so, level, optname, optval)
int req;
struct socket *so;
int level, optname;
struct mbuf **optval;
{
    int s = splnet(); /* like PRU/packet/timer entry into net code */
    int error;
    struct inpcb *inp;

    /*
     * possibly for us?
     * Follow Berkeley methods: level is protocol number if meant for the
     * protocol layer.  (Why not say if=0, arp=1, ip=2, udp/tcp/rdp=3....?)
     *
     * Problem: tcp needs to know about IP options in order to use the right
     * maxseg.  This doesn't quite work with the layering.
     *
     * Why not combine the ioctl/setsockopt/getsockopt paths, since an ioctl
     * can be seen as a fixed size sockopt?  Tried at BBN; removed for 4.3.
     */

    /* should be a "mature" socket, so pointers are all valid... */
    inp = sotoinpcb(so);

    switch(req)
    {
	case PRCO_GETOPT:
	    error = tcp_getopt (inp, optname, optval);
	    break;

	case PRCO_SETOPT:
	    error = tcp_setopt (inp, optname, optval);
	    break;

	default:
	    panic("tcp_ctloutput");
    }

    splx(s);
    return (error);
}
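
/*
 * Usage note (an assumption based on the Berkeley convention described
 * above, not something this file enforces): a user program reaches this
 * routine by passing the protocol number as the level argument, e.g.
 *
 *	setsockopt(s, IPPROTO_TCP, optname, optval, optlen);
 *
 * Since no TCP-specific options exist yet, the request simply falls
 * through to ip_getopt()/ip_setopt() via the routines below.
 */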

tcp_getopt (inp, command, data)
struct inpcb	*inp;
int		command;
struct mbuf	**data;
{
    /*
     * no TCP specific options accessed by getsockopt() as yet.
     * let lower level at cmd
     */
    return ip_getopt (inp, command, data);
}

tcp_setopt (inp, command, data)
struct inpcb	*inp;
int		command;
struct mbuf	**data;
{
    int error;
    struct tcpcb *tp;

    /* no TCP specific options accessed by setsockopt() as yet */
    tp = inptotcpcb(inp);

    if (command == SO_IPROUTE)
	tp->t_maxseg += inp->inp_optlen;

    error =  ip_setopt(inp, command, data);

    if (command == SO_IPROUTE)
	tp->t_maxseg -= inp->inp_optlen;

    return (error);
}

/*
 * These numbers come from measurements described in the paper
 *	"Converting the BBN TCP/IP to 4.2BSD"  (S.L.C. USENIX)
 * If your network handles packets larger than an ethernet frame, you
 * could change tcp_init back to determine the largest net's packet size,
 * multiply that by some number, and round up to a multiple of a CLSIZE.
 */
int	tcp_recvspace = 4096;
int	tcp_sendspace = 4096;

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block, buffer space.
 */
tcp_attach(so)
struct socket *so;
{
    register struct tcpcb *tp;
    struct inpcb *inp;
    int error;

    if (! (error = soreserve(so, tcp_sendspace, tcp_recvspace)))
    {
	if (! (error = in_pcballoc(so, &tcp)))
	{
	    inp = sotoinpcb(so);
	    if (tp = tcp_newtcpcb(inp))
	    {
		/*
		 * Should change state tables to have an UNOPENED state like
		 * the butterfly's which is different from SAME.
		 */
		tp->t_state = 0;
		return (0);
	    }
	    error = ENOBUFS;
	    in_pcbdetach(inp, (int (*)())0);
	}
    }
    return (error);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If not in ``let data drain'' option, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */

tcp_disconnect(tp)
register struct tcpcb *tp;
{
    struct socket *so = tp->t_in_pcb->inp_socket;

    soisdisconnecting(so);
    sbflush(&so->so_rcv);
    tp->usr_abort = TRUE;
    if (!tp->usr_closed)
	w_alloc(IUCLOSE, 0, tp, tp->t_in_pcb);
}

tcp_init()
{
    /*
     * Leave these checks in!  It's a pain in the ass to find out
     * problems caused by too small mbufs if someone changes the
     * size of an mbuf.
     */
    if (sizeof(struct inpcb) > MLEN)
	panic("inpcb too big");

    if (sizeof(struct socket) > MLEN)
	panic("socket too big");

    if (sizeof(struct th) > MLEN)
	panic("th too big");

    if (sizeof(struct tcpcb) > MLEN)
	panic("tcpcb too big");

    if (sizeof(struct t_debug) > MLEN)
	panic("t_debug too big");

    /* init queue */
    tcp.inp_next = tcp.inp_prev = &tcp;

    /* there are only 4 things to match; turn the debug filter off for now */
    tcp_dfilter.matches = 5;

    tcp_iss = time.tv_sec;

    ipsw[IPPROTO_TCP].ipsw_hlen = sizeof(struct th);
}

tcp_ctlinput (prc_code, arg)
int	prc_code;
caddr_t arg;
{
    int error;

    error = inetctlerrmap[prc_code];

    switch (prc_code)
    {
	case PRC_UNREACH_PROTOCOL:	/* icmp message */
	case PRC_UNREACH_PORT:
	case PRC_MSGSIZE:
	    {
	    register struct th	*tp;
	    struct tcpcb	*t;

	    tp = (struct th *) (&((struct icmp *) arg)->ic_iphdr);
	    t = (struct tcpcb *)tcp_conn_used ((struct inpcb *) 0,
		tp->t_src, tp->t_s.s_addr,
		tp->t_dst, tp->t_d.s_addr);
	    if (t)
		t_close(t, error);
	    }
	    break;

	case PRC_UNREACH_NET:
	case PRC_UNREACH_HOST:
	    {
	    register struct th	*tp;
	    struct tcpcb	*t;

	    tp = (struct th *) (&((struct icmp *) arg)->ic_iphdr);
	    t = (struct tcpcb *)tcp_conn_used ((struct inpcb *) 0,
		tp->t_src, tp->t_s.s_addr,
		tp->t_dst, tp->t_d.s_addr);
	    if (t)
	    {
		struct socket *so;

		so = t->t_in_pcb->inp_socket;
		if ((so->so_state & SS_NOFDREF) == 0)
		    advise_user(so, error);
		else
		    t_close(t, error);
	    }
	    }
	    break;

	case PRC_GWDOWN:
	    in_gdown (&tcp, (u_long) arg);
	    break;

	case PRC_REDIRECT_NET:	/* icmp message */
	case PRC_REDIRECT_HOST:
	    {
	    struct tcpcb	*t;
	    register struct th	*tp;

	    tp = (struct th *) (&((struct icmp *) arg)->ic_iphdr);
	    t = (struct tcpcb *)tcp_conn_used ((struct inpcb *) 0,
		tp->t_src, tp->t_s.s_addr,
		tp->t_dst, tp->t_d.s_addr);
	    if (t)
		icmp_redirect_inp(t->t_in_pcb, (struct icmp *) arg,
		    prc_code == PRC_REDIRECT_NET ? rtnet : rthost);
	    }
	    break;

	case PRC_TIMXCEED_INTRANS:	/* icmp message */
	case PRC_TIMXCEED_REASS:
	case PRC_PARAMPROB:
	    break;

	case PRC_QUENCH:	/* icmp message */
	    /*
	     * See RFC 896.  The idea is that when we get a source quench
	     * message on a connection, we should send fewer packets.  This
	     * ties in with the silly window syndrome, whose solution is to
	     * send fewer, larger packets.  Deal with quenches by altering
	     * the threshold used by the silly window syndrome avoidance.
	     * This is similar to acting as if the window is smaller than it
	     * actually is for deciding when to send, except that when we do
	     * send, we use as much as there really is.
	     */
	    {
	    register struct th	*tp;
	    struct tcpcb	*t;

	    tp = (struct th *) (&((struct icmp *) arg)->ic_iphdr);
	    t = (struct tcpcb *)tcp_conn_used ((struct inpcb *) 0,
		tp->t_src, tp->t_s.s_addr,
		tp->t_dst, tp->t_d.s_addr);
	    if (t)
	    {
		t->sws_qff -= SWS_QFF_DEC;
		if (t->sws_qff < SWS_QFF_MIN)
		    t->sws_qff = SWS_QFF_MIN;
	    }
	    }
	    break;

	case PRC_IFDOWN:
	    {
	    u_long addr;

	    addr = ((struct sockaddr_in *)(arg))->sin_addr.s_addr;
	    inpcb_notify(&tcp, addr, (u_long) 0, error);
	    inpcb_notify(&tcp, (u_long) 0, addr, error);
	    }
	    break;

	case PRC_HOSTDEAD:	/* from imp interface */
	case PRC_HOSTUNREACH:
	    /*
	     * get same message for destination hosts and gateways.
	     */
	    {
	    u_long addr;

	    addr = ((struct sockaddr_in *)arg)->sin_addr.s_addr;
	    in_gdown (&tcp, addr);
	    inpcb_notify(&tcp, (u_long) 0, addr, error);
	    }
	    break;

	default:
	    panic("tcp_ctlinput");
    }
}