xref: /inferno-os/os/ip/tcp.c (revision 043f83732c06a092cd12b5ad4f92264dee44c61a)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  Constants used throughout the TCP implementation: header sizes,
 *  timer parameters, header flag bits, option codes, tcb flag bits,
 *  connection states and limbo table sizing.
 */
enum
{
	QMAX		= 64*1024-1,	/* queue limit and default (unscaled) receive window */
	IP_TCPPROTO	= 6,		/* value stored in the proto field of the headers */

	TCP4_IPLEN	= 8,		/* bytes of Tcp4hdr that precede the pseudo header */
	TCP4_PHDRSIZE	= 12,		/* size of the v4 pseudo header */
	TCP4_HDRSIZE	= 20,		/* tcp header size without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes of prototype header copied into each packet */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,	/* offset of the tcp header in Tcp4hdr */

	TCP6_IPLEN	= 0,		/* v6 pseudo header starts at the front of Tcp6hdr */
	TCP6_PHDRSIZE	= 40,		/* size of the v6 pseudo header */
	TCP6_HDRSIZE	= 20,		/* tcp header size without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes of prototype header copied into each packet */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,	/* offset of the tcp header in Tcp6hdr */

	TcptimerOFF	= 0,		/* Tcptimer.state values */
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	EOLOPT		= 0,		/* option kind: end of option list */
	NOOPOPT		= 1,		/* option kind: no-op, used as padding */
	MSSOPT		= 2,		/* option kind: maximum segment size */
	MSS_LENGTH	= 4,		/* length in bytes of the MSS option */
	WSOPT		= 3,		/* option kind: window scale */
	WS_LENGTH	= 3,		/* length in bytes of the window scale option */
	MSL2		= 10,		/* NOTE(review): presumably twice the max segment lifetime; units not visible here */
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment size */
	DEF_MSS6	= 1280,		/* Default maximum segment size (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits for Tcpctl.flags (FORCE, CLONE and ACTIVE are set in this file) */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,		/* srtt is kept scaled by 1<<LOGAGAIN (see inittcpctl) */
	LOGDGAIN	= 2,		/* NOTE(review): presumably the matching scale for mdev; confirm */

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into a window scale value to mark the option as present */
};
86 
/*
 *  Printable names of the connection states, indexed by state
 *  number; the order must match the Closed..Time_wait enumeration.
 */
char *tcpstates[] =
{
	"Closed",	/* 0 */
	"Listen",	/* 1 */
	"Syn_sent",	/* 2 */
	"Syn_received",	/* 3 */
	"Established",	/* 4 */
	"Finwait1",	/* 5 */
	"Finwait2",	/* 6 */
	"Close_wait",	/* 7 */
	"Closing",	/* 8 */
	"Last_ack",	/* 9 */
	"Time_wait",	/* 10 */
};
94 
/*
 *  A countdown timer driven by tcpackproc.  Running timers are kept
 *  on the doubly linked list headed by Tcppriv.timers.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* links on the list of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* links expired timers awaiting their callback */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* value loaded into count by tcpgo; 0 means never start */
	int	count;			/* decremented once per tcpackproc pass; fires at 0 */
	void	(*func)(void*);		/* called with arg when the timer expires */
	void	*arg;
};
107 
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 *
 *  Tcp4hdr overlays an IP header followed by a TCP header.  The
 *  first TCP4_IPLEN bytes are plain IP fields; the next
 *  TCP4_PHDRSIZE bytes (Unused..tcpdst) are laid out as the TCP
 *  pseudo header, and the checksum in htontcp4 starts there.
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;		/* first byte of the pseudo header */
	uchar	proto;		/* set to IP_TCPPROTO */
	uchar	tcplen[2];	/* tcp header plus data length, for the checksum */
	uchar	tcpsrc[4];	/* source address */
	uchar	tcpdst[4];	/* destination address */
	uchar	tcpsport[2];	/* source port */
	uchar	tcpdport[2];	/* destination port */
	uchar	tcpseq[4];	/* sequence number */
	uchar	tcpack[4];	/* acknowledgement number */
	uchar	tcpflag[2];	/* header length (shifted) and flag bits */
	uchar	tcpwin[2];	/* advertised window */
	uchar	tcpcksum[2];	/* checksum */
	uchar	tcpurg[2];	/* urgent pointer */
	/* Options segment */
	uchar	tcpopt[1];
};
136 
/*
 *  v6 counterpart of Tcp4hdr: an IPv6 header followed by a TCP
 *  header.  While the checksum is computed in htontcp6 the leading
 *  fields are reused as the pseudo header (vcf holds the length,
 *  ploadlen and proto are zero, ttl holds the protocol) and are
 *  restored to real IPv6 values afterwards.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];			/* version, class, flow; pseudo header length during checksum */
	uchar	ploadlen[2];		/* payload length */
	uchar	proto;			/* next header; set to IP_TCPPROTO */
	uchar	ttl;			/* hop limit; holds proto during checksum */
	uchar	tcpsrc[IPaddrlen];	/* source address */
	uchar	tcpdst[IPaddrlen];	/* destination address */
	uchar	tcpsport[2];		/* source port */
	uchar	tcpdport[2];		/* destination port */
	uchar	tcpseq[4];		/* sequence number */
	uchar	tcpack[4];		/* acknowledgement number */
	uchar	tcpflag[2];		/* header length (shifted) and flag bits */
	uchar	tcpwin[2];		/* advertised window */
	uchar	tcpcksum[2];		/* checksum */
	uchar	tcpurg[2];		/* urgent pointer */
	/* Options segment */
	uchar	tcpopt[1];
};
157 
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().  All values are
 *  in host byte order.
 */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;	/* source port */
	ushort	dest;	/* destination port */
	ulong	seq;	/* sequence number */
	ulong	ack;	/* acknowledgement number */
	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
	ushort	ws;	/* window scale option (if not zero) */
	ulong	wnd;	/* window */
	ushort	urg;	/* urgent pointer */
	ushort	mss;	/* max segment size option (if not zero) */
	ushort	len;	/* size of data */
};
178 
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced; the list hangs off Tcpctl.reseq
 *  and is released in localclose
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;		/* next queued segment */
	Tcp	seg;		/* control info of the queued segment */
	Block	*bp;		/* its data */
	ushort	length;		/* NOTE(review): presumably the data length of bp; set outside this view */
};
191 
/*
 *  Per-conversation TCP control block.
 *  the qlock in the Conv locks this structure
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		int	scale;		/* how much to right shift window in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		int	recovery;	/* loss recovery flag */
		ulong	rxt;		/* right window marker for recovery */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	urg;		/* Urgent pointer */
		int	blocked;	/* set by tcprcvwin when the receive window closes to 0 */
		int	una;		/* unacked data segs */
		int	scale;		/* how much to left shift window in rcved packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
	ulong	cwind;			/* Congestion window */
	int	scale;			/* desired snd.scale */
	ushort	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data re-received */
	ulong	window;			/* Receive window */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags (FORCE, CLONE, RETRAN, ACTIVE, SYNACK) */
	Reseq	*reseq;			/* Resequencing queue */
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip, scaled by 1<<LOGAGAIN */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (SYN, FIN) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};
254 
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;		/* next call on the same Tcppriv.lht chain */

	uchar	laddr[IPaddrlen];	/* local address */
	uchar	raddr[IPaddrlen];	/* remote address */
	ushort	lport;		/* local port */
	ushort	rport;		/* remote port */
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};
286 
/* tunable defaults; tcp_irtt (ms) seeds srtt and the activity timer in inittcpctl */
int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
289 
/*
 *  Indices into Tcppriv.stats; statnames[] holds the printable
 *  name of each counter and must be kept in step.
 */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,		/* bumped in tcpstart for TCP_CONNECT */
	PassiveOpens,		/* bumped in tcpstart for TCP_LISTEN */
	EstabResets,
	CurrEstab,		/* gauge maintained by tcpsetstate */
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,		/* bumped in sndrst */

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats			/* number of counters */
};
312 
/* printable names of the counters in Tcppriv.stats, indexed by the stats enumeration */
static char *statnames[] =
{
[MaxConn]	"MaxConn",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
};
331 
/*
 *  Per-protocol-instance private state, reached through Proto.priv.
 */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* held while the timer list is walked or changed */
	Tcptimer *timers;	/* head of the doubly linked list of running timers */

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;		/* serialises starting the ack proc in tcpstart */
	int	ackprocstarted;	/* non-zero once tcpackproc has been started */

	ulong	stats[Nstats];	/* counters indexed by the stats enumeration */
};
352 
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
363 
/* forward declarations for routines used before they are defined */
int	addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
void	localclose(Conv*, char*);
void	procsyn(Conv*, Tcp*);
void	tcpiput(Proto*, Ipifc*, Block*);
void	tcpoutput(Conv*);
int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
void	tcpstart(Conv*, int);
void	tcptimeout(void*);
void	tcpsndsyn(Conv*, Tcpctl*);
void	tcprcvwin(Conv*);
void	tcpacktimer(void*);
void	tcpkeepalive(void*);
void	tcpsetkacounter(Tcpctl*);
void	tcprxmit(Conv*);
void	tcpsettimer(Tcpctl*);
void	tcpsynackrtt(Conv*);
void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);

/* handling of calls held in limbo (see struct Limbo) */
static void limborexmit(Proto*);
static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
385 
386 void
387 tcpsetstate(Conv *s, uchar newstate)
388 {
389 	Tcpctl *tcb;
390 	uchar oldstate;
391 	Tcppriv *tpriv;
392 
393 	tpriv = s->p->priv;
394 
395 	tcb = (Tcpctl*)s->ptcl;
396 
397 	oldstate = tcb->state;
398 	if(oldstate == newstate)
399 		return;
400 
401 	if(oldstate == Established)
402 		tpriv->stats[CurrEstab]--;
403 	if(newstate == Established)
404 		tpriv->stats[CurrEstab]++;
405 
406 	/**
407 	print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
408 		tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
409 	**/
410 
411 	switch(newstate) {
412 	case Closed:
413 		qclose(s->rq);
414 		qclose(s->wq);
415 		qclose(s->eq);
416 		break;
417 
418 	case Close_wait:		/* Remote closes */
419 		qhangup(s->rq, nil);
420 		break;
421 	}
422 
423 	tcb->state = newstate;
424 
425 	if(oldstate == Syn_sent && newstate != Closed)
426 		Fsconnected(s, nil);
427 }
428 
429 static char*
430 tcpconnect(Conv *c, char **argv, int argc)
431 {
432 	char *e;
433 
434 	e = Fsstdconnect(c, argv, argc);
435 	if(e != nil)
436 		return e;
437 	tcpstart(c, TCP_CONNECT);
438 
439 	return nil;
440 }
441 
442 static int
443 tcpstate(Conv *c, char *state, int n)
444 {
445 	Tcpctl *s;
446 
447 	s = (Tcpctl*)(c->ptcl);
448 
449 	return snprint(state, n,
450 		"%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
451 		tcpstates[s->state],
452 		c->rq ? qlen(c->rq) : 0,
453 		c->wq ? qlen(c->wq) : 0,
454 		s->srtt, s->mdev,
455 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
456 		s->timer.start, s->timer.count, s->rerecv,
457 		s->katimer.start, s->katimer.count);
458 }
459 
460 static int
461 tcpinuse(Conv *c)
462 {
463 	Tcpctl *s;
464 
465 	s = (Tcpctl*)(c->ptcl);
466 	return s->state != Closed;
467 }
468 
469 static char*
470 tcpannounce(Conv *c, char **argv, int argc)
471 {
472 	char *e;
473 
474 	e = Fsstdannounce(c, argv, argc);
475 	if(e != nil)
476 		return e;
477 	tcpstart(c, TCP_LISTEN);
478 	Fsconnected(c, nil);
479 
480 	return nil;
481 }
482 
483 /*
484  *  tcpclose is always called with the q locked
485  */
486 static void
487 tcpclose(Conv *c)
488 {
489 	Tcpctl *tcb;
490 
491 	tcb = (Tcpctl*)c->ptcl;
492 
493 	qhangup(c->rq, nil);
494 	qhangup(c->wq, nil);
495 	qhangup(c->eq, nil);
496 	qflush(c->rq);
497 
498 	switch(tcb->state) {
499 	case Listen:
500 		/*
501 		 *  reset any incoming calls to this listener
502 		 */
503 		Fsconnected(c, "Hangup");
504 
505 		localclose(c, nil);
506 		break;
507 	case Closed:
508 	case Syn_sent:
509 		localclose(c, nil);
510 		break;
511 	case Syn_received:
512 	case Established:
513 		tcb->flgcnt++;
514 		tcb->snd.nxt++;
515 		tcpsetstate(c, Finwait1);
516 		tcpoutput(c);
517 		break;
518 	case Close_wait:
519 		tcb->flgcnt++;
520 		tcb->snd.nxt++;
521 		tcpsetstate(c, Last_ack);
522 		tcpoutput(c);
523 		break;
524 	}
525 }
526 
527 void
528 tcpkick(void *x)
529 {
530 	Conv *s = x;
531 	Tcpctl *tcb;
532 
533 	tcb = (Tcpctl*)s->ptcl;
534 
535 	if(waserror()){
536 		qunlock(s);
537 		nexterror();
538 	}
539 	qlock(s);
540 
541 	switch(tcb->state) {
542 	case Syn_sent:
543 	case Syn_received:
544 	case Established:
545 	case Close_wait:
546 		/*
547 		 * Push data
548 		 */
549 		tcprcvwin(s);
550 		tcpoutput(s);
551 		break;
552 	default:
553 		localclose(s, "Hangup");
554 		break;
555 	}
556 
557 	qunlock(s);
558 	poperror();
559 }
560 
561 void
562 tcprcvwin(Conv *s)				/* Call with tcb locked */
563 {
564 	int w;
565 	Tcpctl *tcb;
566 
567 	tcb = (Tcpctl*)s->ptcl;
568 	w = tcb->window - qlen(s->rq);
569 	if(w < 0)
570 		w = 0;
571 	tcb->rcv.wnd = w;
572 	if(w == 0)
573 		tcb->rcv.blocked = 1;
574 }
575 
576 void
577 tcpacktimer(void *v)
578 {
579 	Tcpctl *tcb;
580 	Conv *s;
581 
582 	s = v;
583 	tcb = (Tcpctl*)s->ptcl;
584 
585 	if(waserror()){
586 		qunlock(s);
587 		nexterror();
588 	}
589 	qlock(s);
590 	if(tcb->state != Closed){
591 		tcb->flags |= FORCE;
592 		tcprcvwin(s);
593 		tcpoutput(s);
594 	}
595 	qunlock(s);
596 	poperror();
597 }
598 
599 static void
600 tcpcreate(Conv *c)
601 {
602 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
603 	c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
604 }
605 
606 static void
607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
608 {
609 	if(newstate != TcptimerON){
610 		if(t->state == TcptimerON){
611 			// unchain
612 			if(priv->timers == t){
613 				priv->timers = t->next;
614 				if(t->prev != nil)
615 					panic("timerstate1");
616 			}
617 			if(t->next)
618 				t->next->prev = t->prev;
619 			if(t->prev)
620 				t->prev->next = t->next;
621 			t->next = t->prev = nil;
622 		}
623 	} else {
624 		if(t->state != TcptimerON){
625 			// chain
626 			if(t->prev != nil || t->next != nil)
627 				panic("timerstate2");
628 			t->prev = nil;
629 			t->next = priv->timers;
630 			if(t->next)
631 				t->next->prev = t;
632 			priv->timers = t;
633 		}
634 	}
635 	t->state = newstate;
636 }
637 
638 void
639 tcpackproc(void *a)
640 {
641 	Tcptimer *t, *tp, *timeo;
642 	Proto *tcp;
643 	Tcppriv *priv;
644 	int loop;
645 
646 	tcp = a;
647 	priv = tcp->priv;
648 
649 	for(;;) {
650 		tsleep(&up->sleep, return0, 0, MSPTICK);
651 
652 		qlock(&priv->tl);
653 		timeo = nil;
654 		loop = 0;
655 		for(t = priv->timers; t != nil; t = tp) {
656 			if(loop++ > 10000)
657 				panic("tcpackproc1");
658 			tp = t->next;
659  			if(t->state == TcptimerON) {
660 				t->count--;
661 				if(t->count == 0) {
662 					timerstate(priv, t, TcptimerDONE);
663 					t->readynext = timeo;
664 					timeo = t;
665 				}
666 			}
667 		}
668 		qunlock(&priv->tl);
669 
670 		loop = 0;
671 		for(t = timeo; t != nil; t = t->readynext) {
672 			if(loop++ > 10000)
673 				panic("tcpackproc2");
674 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
675 				(*t->func)(t->arg);
676 				poperror();
677 			}
678 		}
679 
680 		limborexmit(tcp);
681 	}
682 }
683 
684 void
685 tcpgo(Tcppriv *priv, Tcptimer *t)
686 {
687 	if(t == nil || t->start == 0)
688 		return;
689 
690 	qlock(&priv->tl);
691 	t->count = t->start;
692 	timerstate(priv, t, TcptimerON);
693 	qunlock(&priv->tl);
694 }
695 
696 void
697 tcphalt(Tcppriv *priv, Tcptimer *t)
698 {
699 	if(t == nil)
700 		return;
701 
702 	qlock(&priv->tl);
703 	timerstate(priv, t, TcptimerOFF);
704 	qunlock(&priv->tl);
705 }
706 
/*
 *  Exponential backoff multiplier: returns 2 to the power n.
 *
 *  Shifting an int by a negative count, or far enough to reach the
 *  sign bit, is undefined, so n is clamped to 0..30.  For counts
 *  already in that range the result is exactly 1<<n as before.
 */
int
backoff(int n)
{
	enum { Maxshift = 30 };

	if(n < 0)
		n = 0;
	else if(n > Maxshift)
		n = Maxshift;
	return 1 << n;
}
712 
713 void
714 localclose(Conv *s, char *reason)	/* called with tcb locked */
715 {
716 	Tcpctl *tcb;
717 	Reseq *rp,*rp1;
718 	Tcppriv *tpriv;
719 
720 	tpriv = s->p->priv;
721 	tcb = (Tcpctl*)s->ptcl;
722 
723 	iphtrem(&tpriv->ht, s);
724 
725 	tcphalt(tpriv, &tcb->timer);
726 	tcphalt(tpriv, &tcb->rtt_timer);
727 	tcphalt(tpriv, &tcb->acktimer);
728 	tcphalt(tpriv, &tcb->katimer);
729 
730 	/* Flush reassembly queue; nothing more can arrive */
731 	for(rp = tcb->reseq; rp != nil; rp = rp1) {
732 		rp1 = rp->next;
733 		freeblist(rp->bp);
734 		free(rp);
735 	}
736 	tcb->reseq = nil;
737 
738 	if(tcb->state == Syn_sent)
739 		Fsconnected(s, reason);
740 	if(s->state == Announced)
741 		wakeup(&s->listenr);
742 
743 	qhangup(s->rq, reason);
744 	qhangup(s->wq, reason);
745 
746 	tcpsetstate(s, Closed);
747 }
748 
749 /* mtu (- TCP + IP hdr len) of 1st hop */
750 int
751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
752 {
753 	Ipifc *ifc;
754 	int mtu;
755 
756 	ifc = findipifc(tcp->f, addr, 0);
757 	switch(version){
758 	default:
759 	case V4:
760 		mtu = DEF_MSS;
761 		if(ifc != nil)
762 			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
763 		break;
764 	case V6:
765 		mtu = DEF_MSS6;
766 		if(ifc != nil)
767 			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
768 		break;
769 	}
770 	if(ifc != nil){
771 		if(ifc->mbps > 100)
772 			*scale = HaveWS | 3;
773 		else if(ifc->mbps > 10)
774 			*scale = HaveWS | 1;
775 		else
776 			*scale = HaveWS | 0;
777 	} else
778 		*scale = HaveWS | 0;
779 
780 	return mtu;
781 }
782 
783 void
784 inittcpctl(Conv *s, int mode)
785 {
786 	Tcpctl *tcb;
787 	Tcp4hdr* h4;
788 	Tcp6hdr* h6;
789 	int mss;
790 
791 	tcb = (Tcpctl*)s->ptcl;
792 
793 	memset(tcb, 0, sizeof(Tcpctl));
794 
795 	tcb->ssthresh = 65535;
796 	tcb->srtt = tcp_irtt<<LOGAGAIN;
797 	tcb->mdev = 0;
798 
799 	/* setup timers */
800 	tcb->timer.start = tcp_irtt / MSPTICK;
801 	tcb->timer.func = tcptimeout;
802 	tcb->timer.arg = s;
803 	tcb->rtt_timer.start = MAX_TIME;
804 	tcb->acktimer.start = TCP_ACK / MSPTICK;
805 	tcb->acktimer.func = tcpacktimer;
806 	tcb->acktimer.arg = s;
807 	tcb->katimer.start = DEF_KAT / MSPTICK;
808 	tcb->katimer.func = tcpkeepalive;
809 	tcb->katimer.arg = s;
810 
811 	mss = DEF_MSS;
812 
813 	/* create a prototype(pseudo) header */
814 	if(mode != TCP_LISTEN){
815 		if(ipcmp(s->laddr, IPnoaddr) == 0)
816 			findlocalip(s->p->f, s->laddr, s->raddr);
817 
818 		switch(s->ipversion){
819 		case V4:
820 			h4 = &tcb->protohdr.tcp4hdr;
821 			memset(h4, 0, sizeof(*h4));
822 			h4->proto = IP_TCPPROTO;
823 			hnputs(h4->tcpsport, s->lport);
824 			hnputs(h4->tcpdport, s->rport);
825 			v6tov4(h4->tcpsrc, s->laddr);
826 			v6tov4(h4->tcpdst, s->raddr);
827 			break;
828 		case V6:
829 			h6 = &tcb->protohdr.tcp6hdr;
830 			memset(h6, 0, sizeof(*h6));
831 			h6->proto = IP_TCPPROTO;
832 			hnputs(h6->tcpsport, s->lport);
833 			hnputs(h6->tcpdport, s->rport);
834 			ipmove(h6->tcpsrc, s->laddr);
835 			ipmove(h6->tcpdst, s->raddr);
836 			mss = DEF_MSS6;
837 			break;
838 		default:
839 			panic("inittcpctl: version %d", s->ipversion);
840 		}
841 	}
842 
843 	tcb->mss = tcb->cwind = mss;
844 
845 	/* default is no window scaling */
846 	tcb->window = QMAX;
847 	tcb->rcv.wnd = QMAX;
848 	tcb->rcv.scale = 0;
849 	tcb->snd.scale = 0;
850 	qsetlimit(s->rq, QMAX);
851 }
852 
853 /*
854  *  called with s qlocked
855  */
856 void
857 tcpstart(Conv *s, int mode)
858 {
859 	Tcpctl *tcb;
860 	Tcppriv *tpriv;
861 	char kpname[KNAMELEN];
862 
863 	tpriv = s->p->priv;
864 
865 	if(tpriv->ackprocstarted == 0){
866 		qlock(&tpriv->apl);
867 		if(tpriv->ackprocstarted == 0){
868 			sprint(kpname, "#I%dtcpack", s->p->f->dev);
869 			kproc(kpname, tcpackproc, s->p, 0);
870 			tpriv->ackprocstarted = 1;
871 		}
872 		qunlock(&tpriv->apl);
873 	}
874 
875 	tcb = (Tcpctl*)s->ptcl;
876 
877 	inittcpctl(s, mode);
878 
879 	iphtadd(&tpriv->ht, s);
880 	switch(mode) {
881 	case TCP_LISTEN:
882 		tpriv->stats[PassiveOpens]++;
883 		tcb->flags |= CLONE;
884 		tcpsetstate(s, Listen);
885 		break;
886 
887 	case TCP_CONNECT:
888 		tpriv->stats[ActiveOpens]++;
889 		tcb->flags |= ACTIVE;
890 		tcpsndsyn(s, tcb);
891 		tcpsetstate(s, Syn_sent);
892 		tcpoutput(s);
893 		break;
894 	}
895 }
896 
897 static char*
898 tcpflag(ushort flag)
899 {
900 	static char buf[128];
901 
902 	sprint(buf, "%d", flag>>10);	/* Head len */
903 	if(flag & URG)
904 		strcat(buf, " URG");
905 	if(flag & ACK)
906 		strcat(buf, " ACK");
907 	if(flag & PSH)
908 		strcat(buf, " PSH");
909 	if(flag & RST)
910 		strcat(buf, " RST");
911 	if(flag & SYN)
912 		strcat(buf, " SYN");
913 	if(flag & FIN)
914 		strcat(buf, " FIN");
915 
916 	return buf;
917 }
918 
919 Block *
920 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
921 {
922 	int dlen;
923 	Tcp6hdr *h;
924 	ushort csum;
925 	ushort hdrlen, optpad = 0;
926 	uchar *opt;
927 
928 	hdrlen = TCP6_HDRSIZE;
929 	if(tcph->flags & SYN){
930 		if(tcph->mss)
931 			hdrlen += MSS_LENGTH;
932 		if(tcph->ws)
933 			hdrlen += WS_LENGTH;
934 		optpad = hdrlen & 3;
935 		if(optpad)
936 			optpad = 4 - optpad;
937 		hdrlen += optpad;
938 	}
939 
940 	if(data) {
941 		dlen = blocklen(data);
942 		data = padblock(data, hdrlen + TCP6_PKT);
943 		if(data == nil)
944 			return nil;
945 	}
946 	else {
947 		dlen = 0;
948 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
949 		if(data == nil)
950 			return nil;
951 		data->wp += hdrlen + TCP6_PKT;
952 	}
953 
954 	/* copy in pseudo ip header plus port numbers */
955 	h = (Tcp6hdr *)(data->rp);
956 	memmove(h, ph, TCP6_TCBPHDRSZ);
957 
958 	/* compose pseudo tcp header, do cksum calculation */
959 	hnputl(h->vcf, hdrlen + dlen);
960 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
961 	h->ttl = ph->proto;
962 
963 	/* copy in variable bits */
964 	hnputl(h->tcpseq, tcph->seq);
965 	hnputl(h->tcpack, tcph->ack);
966 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
967 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
968 	hnputs(h->tcpurg, tcph->urg);
969 
970 	if(tcph->flags & SYN){
971 		opt = h->tcpopt;
972 		if(tcph->mss != 0){
973 			*opt++ = MSSOPT;
974 			*opt++ = MSS_LENGTH;
975 			hnputs(opt, tcph->mss);
976 			opt += 2;
977 		}
978 		if(tcph->ws != 0){
979 			*opt++ = WSOPT;
980 			*opt++ = WS_LENGTH;
981 			*opt++ = tcph->ws;
982 		}
983 		while(optpad-- > 0)
984 			*opt++ = NOOPOPT;
985 	}
986 
987 	if(tcb != nil && tcb->nochecksum){
988 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
989 	} else {
990 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
991 		hnputs(h->tcpcksum, csum);
992 	}
993 
994 	/* move from pseudo header back to normal ip header */
995 	memset(h->vcf, 0, 4);
996 	h->vcf[0] = IP_VER6;
997 	hnputs(h->ploadlen, hdrlen+dlen);
998 	h->proto = ph->proto;
999 
1000 	return data;
1001 }
1002 
1003 Block *
1004 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1005 {
1006 	int dlen;
1007 	Tcp4hdr *h;
1008 	ushort csum;
1009 	ushort hdrlen, optpad = 0;
1010 	uchar *opt;
1011 
1012 	hdrlen = TCP4_HDRSIZE;
1013 	if(tcph->flags & SYN){
1014 		if(tcph->mss)
1015 			hdrlen += MSS_LENGTH;
1016 		if(tcph->ws)
1017 			hdrlen += WS_LENGTH;
1018 		optpad = hdrlen & 3;
1019 		if(optpad)
1020 			optpad = 4 - optpad;
1021 		hdrlen += optpad;
1022 	}
1023 
1024 	if(data) {
1025 		dlen = blocklen(data);
1026 		data = padblock(data, hdrlen + TCP4_PKT);
1027 		if(data == nil)
1028 			return nil;
1029 	}
1030 	else {
1031 		dlen = 0;
1032 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1033 		if(data == nil)
1034 			return nil;
1035 		data->wp += hdrlen + TCP4_PKT;
1036 	}
1037 
1038 	/* copy in pseudo ip header plus port numbers */
1039 	h = (Tcp4hdr *)(data->rp);
1040 	memmove(h, ph, TCP4_TCBPHDRSZ);
1041 
1042 	/* copy in variable bits */
1043 	hnputs(h->tcplen, hdrlen + dlen);
1044 	hnputl(h->tcpseq, tcph->seq);
1045 	hnputl(h->tcpack, tcph->ack);
1046 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1047 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1048 	hnputs(h->tcpurg, tcph->urg);
1049 
1050 	if(tcph->flags & SYN){
1051 		opt = h->tcpopt;
1052 		if(tcph->mss != 0){
1053 			*opt++ = MSSOPT;
1054 			*opt++ = MSS_LENGTH;
1055 			hnputs(opt, tcph->mss);
1056 			opt += 2;
1057 		}
1058 		if(tcph->ws != 0){
1059 			*opt++ = WSOPT;
1060 			*opt++ = WS_LENGTH;
1061 			*opt++ = tcph->ws;
1062 		}
1063 		while(optpad-- > 0)
1064 			*opt++ = NOOPOPT;
1065 	}
1066 
1067 	if(tcb != nil && tcb->nochecksum){
1068 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1069 	} else {
1070 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1071 		hnputs(h->tcpcksum, csum);
1072 	}
1073 
1074 	return data;
1075 }
1076 
1077 int
1078 ntohtcp6(Tcp *tcph, Block **bpp)
1079 {
1080 	Tcp6hdr *h;
1081 	uchar *optr;
1082 	ushort hdrlen;
1083 	ushort optlen;
1084 	int n;
1085 
1086 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1087 	if(*bpp == nil)
1088 		return -1;
1089 
1090 	h = (Tcp6hdr *)((*bpp)->rp);
1091 	tcph->source = nhgets(h->tcpsport);
1092 	tcph->dest = nhgets(h->tcpdport);
1093 	tcph->seq = nhgetl(h->tcpseq);
1094 	tcph->ack = nhgetl(h->tcpack);
1095 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1096 	if(hdrlen < TCP6_HDRSIZE) {
1097 		freeblist(*bpp);
1098 		return -1;
1099 	}
1100 
1101 	tcph->flags = h->tcpflag[1];
1102 	tcph->wnd = nhgets(h->tcpwin);
1103 	tcph->urg = nhgets(h->tcpurg);
1104 	tcph->mss = 0;
1105 	tcph->ws = 0;
1106 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1107 
1108 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1109 	if(*bpp == nil)
1110 		return -1;
1111 
1112 	optr = h->tcpopt;
1113 	n = hdrlen - TCP6_HDRSIZE;
1114 	while(n > 0 && *optr != EOLOPT) {
1115 		if(*optr == NOOPOPT) {
1116 			n--;
1117 			optr++;
1118 			continue;
1119 		}
1120 		optlen = optr[1];
1121 		if(optlen < 2 || optlen > n)
1122 			break;
1123 		switch(*optr) {
1124 		case MSSOPT:
1125 			if(optlen == MSS_LENGTH)
1126 				tcph->mss = nhgets(optr+2);
1127 			break;
1128 		case WSOPT:
1129 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1130 				tcph->ws = HaveWS | *(optr+2);
1131 			break;
1132 		}
1133 		n -= optlen;
1134 		optr += optlen;
1135 	}
1136 	return hdrlen;
1137 }
1138 
1139 int
1140 ntohtcp4(Tcp *tcph, Block **bpp)
1141 {
1142 	Tcp4hdr *h;
1143 	uchar *optr;
1144 	ushort hdrlen;
1145 	ushort optlen;
1146 	int n;
1147 
1148 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1149 	if(*bpp == nil)
1150 		return -1;
1151 
1152 	h = (Tcp4hdr *)((*bpp)->rp);
1153 	tcph->source = nhgets(h->tcpsport);
1154 	tcph->dest = nhgets(h->tcpdport);
1155 	tcph->seq = nhgetl(h->tcpseq);
1156 	tcph->ack = nhgetl(h->tcpack);
1157 
1158 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1159 	if(hdrlen < TCP4_HDRSIZE) {
1160 		freeblist(*bpp);
1161 		return -1;
1162 	}
1163 
1164 	tcph->flags = h->tcpflag[1];
1165 	tcph->wnd = nhgets(h->tcpwin);
1166 	tcph->urg = nhgets(h->tcpurg);
1167 	tcph->mss = 0;
1168 	tcph->ws = 0;
1169 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1170 
1171 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1172 	if(*bpp == nil)
1173 		return -1;
1174 
1175 	optr = h->tcpopt;
1176 	n = hdrlen - TCP4_HDRSIZE;
1177 	while(n > 0 && *optr != EOLOPT) {
1178 		if(*optr == NOOPOPT) {
1179 			n--;
1180 			optr++;
1181 			continue;
1182 		}
1183 		optlen = optr[1];
1184 		if(optlen < 2 || optlen > n)
1185 			break;
1186 		switch(*optr) {
1187 		case MSSOPT:
1188 			if(optlen == MSS_LENGTH)
1189 				tcph->mss = nhgets(optr+2);
1190 			break;
1191 		case WSOPT:
1192 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1193 				tcph->ws = HaveWS | *(optr+2);
1194 			break;
1195 		}
1196 		n -= optlen;
1197 		optr += optlen;
1198 	}
1199 	return hdrlen;
1200 }
1201 
1202 /*
1203  *  For outgiing calls, generate an initial sequence
1204  *  number and put a SYN on the send queue
1205  */
1206 void
1207 tcpsndsyn(Conv *s, Tcpctl *tcb)
1208 {
1209 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1210 	tcb->rttseq = tcb->iss;
1211 	tcb->snd.wl2 = tcb->iss;
1212 	tcb->snd.una = tcb->iss;
1213 	tcb->snd.ptr = tcb->rttseq;
1214 	tcb->snd.nxt = tcb->rttseq;
1215 	tcb->flgcnt++;
1216 	tcb->flags |= FORCE;
1217 	tcb->sndsyntime = NOW;
1218 
1219 	/* set desired mss and scale */
1220 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1221 }
1222 
1223 void
1224 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1225 {
1226 	Block *hbp;
1227 	uchar rflags;
1228 	Tcppriv *tpriv;
1229 	Tcp4hdr ph4;
1230 	Tcp6hdr ph6;
1231 
1232 	netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1233 
1234 	tpriv = tcp->priv;
1235 
1236 	if(seg->flags & RST)
1237 		return;
1238 
1239 	/* make pseudo header */
1240 	switch(version) {
1241 	case V4:
1242 		memset(&ph4, 0, sizeof(ph4));
1243 		ph4.vihl = IP_VER4;
1244 		v6tov4(ph4.tcpsrc, dest);
1245 		v6tov4(ph4.tcpdst, source);
1246 		ph4.proto = IP_TCPPROTO;
1247 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1248 		hnputs(ph4.tcpsport, seg->dest);
1249 		hnputs(ph4.tcpdport, seg->source);
1250 		break;
1251 	case V6:
1252 		memset(&ph6, 0, sizeof(ph6));
1253 		ph6.vcf[0] = IP_VER6;
1254 		ipmove(ph6.tcpsrc, dest);
1255 		ipmove(ph6.tcpdst, source);
1256 		ph6.proto = IP_TCPPROTO;
1257 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1258 		hnputs(ph6.tcpsport, seg->dest);
1259 		hnputs(ph6.tcpdport, seg->source);
1260 		break;
1261 	default:
1262 		panic("sndrst: version %d", version);
1263 	}
1264 
1265 	tpriv->stats[OutRsts]++;
1266 	rflags = RST;
1267 
1268 	/* convince the other end that this reset is in band */
1269 	if(seg->flags & ACK) {
1270 		seg->seq = seg->ack;
1271 		seg->ack = 0;
1272 	}
1273 	else {
1274 		rflags |= ACK;
1275 		seg->ack = seg->seq;
1276 		seg->seq = 0;
1277 		if(seg->flags & SYN)
1278 			seg->ack++;
1279 		seg->ack += length;
1280 		if(seg->flags & FIN)
1281 			seg->ack++;
1282 	}
1283 	seg->flags = rflags;
1284 	seg->wnd = 0;
1285 	seg->urg = 0;
1286 	seg->mss = 0;
1287 	seg->ws = 0;
1288 	switch(version) {
1289 	case V4:
1290 		hbp = htontcp4(seg, nil, &ph4, nil);
1291 		if(hbp == nil)
1292 			return;
1293 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1294 		break;
1295 	case V6:
1296 		hbp = htontcp6(seg, nil, &ph6, nil);
1297 		if(hbp == nil)
1298 			return;
1299 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1300 		break;
1301 	default:
1302 		panic("sndrst2: version %d", version);
1303 	}
1304 }
1305 
1306 /*
1307  *  send a reset to the remote side and close the conversation
1308  *  called with s qlocked
1309  */
1310 char*
1311 tcphangup(Conv *s)
1312 {
1313 	Tcp seg;
1314 	Tcpctl *tcb;
1315 	Block *hbp;
1316 
1317 	tcb = (Tcpctl*)s->ptcl;
1318 	if(waserror())
1319 		return commonerror();
1320 	if(s->raddr != 0) {
1321 		if(!waserror()){
1322 			seg.flags = RST | ACK;
1323 			seg.ack = tcb->rcv.nxt;
1324 			tcb->rcv.una = 0;
1325 			seg.seq = tcb->snd.ptr;
1326 			seg.wnd = 0;
1327 			seg.urg = 0;
1328 			seg.mss = 0;
1329 			seg.ws = 0;
1330 			switch(s->ipversion) {
1331 			case V4:
1332 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1333 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1334 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1335 				break;
1336 			case V6:
1337 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1338 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1339 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1340 				break;
1341 			default:
1342 				panic("tcphangup: version %d", s->ipversion);
1343 			}
1344 			poperror();
1345 		}
1346 	}
1347 	localclose(s, nil);
1348 	poperror();
1349 	return nil;
1350 }
1351 
1352 /*
1353  *  (re)send a SYN ACK
1354  */
1355 int
1356 sndsynack(Proto *tcp, Limbo *lp)
1357 {
1358 	Block *hbp;
1359 	Tcp4hdr ph4;
1360 	Tcp6hdr ph6;
1361 	Tcp seg;
1362 	int scale;
1363 
1364 	/* make pseudo header */
1365 	switch(lp->version) {
1366 	case V4:
1367 		memset(&ph4, 0, sizeof(ph4));
1368 		ph4.vihl = IP_VER4;
1369 		v6tov4(ph4.tcpsrc, lp->laddr);
1370 		v6tov4(ph4.tcpdst, lp->raddr);
1371 		ph4.proto = IP_TCPPROTO;
1372 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1373 		hnputs(ph4.tcpsport, lp->lport);
1374 		hnputs(ph4.tcpdport, lp->rport);
1375 		break;
1376 	case V6:
1377 		memset(&ph6, 0, sizeof(ph6));
1378 		ph6.vcf[0] = IP_VER6;
1379 		ipmove(ph6.tcpsrc, lp->laddr);
1380 		ipmove(ph6.tcpdst, lp->raddr);
1381 		ph6.proto = IP_TCPPROTO;
1382 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1383 		hnputs(ph6.tcpsport, lp->lport);
1384 		hnputs(ph6.tcpdport, lp->rport);
1385 		break;
1386 	default:
1387 		panic("sndrst: version %d", lp->version);
1388 	}
1389 
1390 	seg.seq = lp->iss;
1391 	seg.ack = lp->irs+1;
1392 	seg.flags = SYN|ACK;
1393 	seg.urg = 0;
1394 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1395 	seg.wnd = QMAX;
1396 
1397 	/* if the other side set scale, we should too */
1398 	if(lp->rcvscale){
1399 		seg.ws = scale;
1400 		lp->sndscale = scale;
1401 	} else {
1402 		seg.ws = 0;
1403 		lp->sndscale = 0;
1404 	}
1405 
1406 	switch(lp->version) {
1407 	case V4:
1408 		hbp = htontcp4(&seg, nil, &ph4, nil);
1409 		if(hbp == nil)
1410 			return -1;
1411 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1412 		break;
1413 	case V6:
1414 		hbp = htontcp6(&seg, nil, &ph6, nil);
1415 		if(hbp == nil)
1416 			return -1;
1417 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1418 		break;
1419 	default:
1420 		panic("sndsnack: version %d", lp->version);
1421 	}
1422 	lp->lastsend = NOW;
1423 	return 0;
1424 }
1425 
/*
 *  hash for the limbo table: last two bytes of the address plus the
 *  port, masked with LHTMASK.  every argument is parenthesized so an
 *  expression such as x|y can be passed as the port.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1427 
1428 /*
1429  *  put a call into limbo and respond with a SYN ACK
1430  *
1431  *  called with proto locked
1432  */
1433 static void
1434 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1435 {
1436 	Limbo *lp, **l;
1437 	Tcppriv *tpriv;
1438 	int h;
1439 
1440 	tpriv = s->p->priv;
1441 	h = hashipa(source, seg->source);
1442 
1443 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1444 		lp = *l;
1445 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1446 			continue;
1447 		if(ipcmp(lp->raddr, source) != 0)
1448 			continue;
1449 		if(ipcmp(lp->laddr, dest) != 0)
1450 			continue;
1451 
1452 		/* each new SYN restarts the retransmits */
1453 		lp->irs = seg->seq;
1454 		break;
1455 	}
1456 	lp = *l;
1457 	if(lp == nil){
1458 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1459 			lp = tpriv->lht[h];
1460 			tpriv->lht[h] = lp->next;
1461 			lp->next = nil;
1462 		} else {
1463 			lp = malloc(sizeof(*lp));
1464 			if(lp == nil)
1465 				return;
1466 			tpriv->nlimbo++;
1467 		}
1468 		*l = lp;
1469 		lp->version = version;
1470 		ipmove(lp->laddr, dest);
1471 		ipmove(lp->raddr, source);
1472 		lp->lport = seg->dest;
1473 		lp->rport = seg->source;
1474 		lp->mss = seg->mss;
1475 		lp->rcvscale = seg->ws;
1476 		lp->irs = seg->seq;
1477 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1478 	}
1479 
1480 	if(sndsynack(s->p, lp) < 0){
1481 		*l = lp->next;
1482 		tpriv->nlimbo--;
1483 		free(lp);
1484 	}
1485 }
1486 
1487 /*
1488  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1489  */
1490 static void
1491 limborexmit(Proto *tcp)
1492 {
1493 	Tcppriv *tpriv;
1494 	Limbo **l, *lp;
1495 	int h;
1496 	int seen;
1497 	ulong now;
1498 
1499 	tpriv = tcp->priv;
1500 
1501 	if(!canqlock(tcp))
1502 		return;
1503 	seen = 0;
1504 	now = NOW;
1505 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1506 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1507 			lp = *l;
1508 			seen++;
1509 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1510 				continue;
1511 
1512 			/* time it out after 1 second */
1513 			if(++(lp->rexmits) > 5){
1514 				tpriv->nlimbo--;
1515 				*l = lp->next;
1516 				free(lp);
1517 				continue;
1518 			}
1519 
1520 			/* if we're being attacked, don't bother resending SYN ACK's */
1521 			if(tpriv->nlimbo > 100)
1522 				continue;
1523 
1524 			if(sndsynack(tcp, lp) < 0){
1525 				tpriv->nlimbo--;
1526 				*l = lp->next;
1527 				free(lp);
1528 				continue;
1529 			}
1530 
1531 			l = &lp->next;
1532 		}
1533 	}
1534 	qunlock(tcp);
1535 }
1536 
1537 /*
1538  *  lookup call in limbo.  if found, throw it out.
1539  *
1540  *  called with proto locked
1541  */
1542 static void
1543 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1544 {
1545 	Limbo *lp, **l;
1546 	int h;
1547 	Tcppriv *tpriv;
1548 
1549 	tpriv = s->p->priv;
1550 
1551 	/* find a call in limbo */
1552 	h = hashipa(src, segp->source);
1553 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1554 		lp = *l;
1555 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1556 			continue;
1557 		if(ipcmp(lp->laddr, dst) != 0)
1558 			continue;
1559 		if(ipcmp(lp->raddr, src) != 0)
1560 			continue;
1561 
1562 		/* RST can only follow the SYN */
1563 		if(segp->seq == lp->irs+1){
1564 			tpriv->nlimbo--;
1565 			*l = lp->next;
1566 			free(lp);
1567 		}
1568 		break;
1569 	}
1570 }
1571 
1572 /*
1573  *  come here when we finally get an ACK to our SYN-ACK.
1574  *  lookup call in limbo.  if found, create a new conversation
1575  *
1576  *  called with proto locked
1577  */
1578 static Conv*
1579 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1580 {
1581 	Conv *new;
1582 	Tcpctl *tcb;
1583 	Tcppriv *tpriv;
1584 	Tcp4hdr *h4;
1585 	Tcp6hdr *h6;
1586 	Limbo *lp, **l;
1587 	int h;
1588 
1589 	/* unless it's just an ack, it can't be someone coming out of limbo */
1590 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1591 		return nil;
1592 
1593 	tpriv = s->p->priv;
1594 
1595 	/* find a call in limbo */
1596 	h = hashipa(src, segp->source);
1597 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1598 		netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d",
1599 			src, segp->source, lp->raddr, lp->rport,
1600 			dst, segp->dest, lp->laddr, lp->lport,
1601 			version, lp->version
1602  		);
1603 
1604 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1605 			continue;
1606 		if(ipcmp(lp->laddr, dst) != 0)
1607 			continue;
1608 		if(ipcmp(lp->raddr, src) != 0)
1609 			continue;
1610 
1611 		/* we're assuming no data with the initial SYN */
1612 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1613 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux",
1614 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1615 			lp = nil;
1616 		} else {
1617 			tpriv->nlimbo--;
1618 			*l = lp->next;
1619 		}
1620 		break;
1621 	}
1622 	if(lp == nil)
1623 		return nil;
1624 
1625 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1626 	if(new == nil)
1627 		return nil;
1628 
1629 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1630 	tcb = (Tcpctl*)new->ptcl;
1631 	tcb->flags &= ~CLONE;
1632 	tcb->timer.arg = new;
1633 	tcb->timer.state = TcptimerOFF;
1634 	tcb->acktimer.arg = new;
1635 	tcb->acktimer.state = TcptimerOFF;
1636 	tcb->katimer.arg = new;
1637 	tcb->katimer.state = TcptimerOFF;
1638 	tcb->rtt_timer.arg = new;
1639 	tcb->rtt_timer.state = TcptimerOFF;
1640 
1641 	tcb->irs = lp->irs;
1642 	tcb->rcv.nxt = tcb->irs+1;
1643 	tcb->rcv.urg = tcb->rcv.nxt;
1644 
1645 	tcb->iss = lp->iss;
1646 	tcb->rttseq = tcb->iss;
1647 	tcb->snd.wl2 = tcb->iss;
1648 	tcb->snd.una = tcb->iss+1;
1649 	tcb->snd.ptr = tcb->iss+1;
1650 	tcb->snd.nxt = tcb->iss+1;
1651 	tcb->flgcnt = 0;
1652 	tcb->flags |= SYNACK;
1653 
1654 	/* our sending max segment size cannot be bigger than what he asked for */
1655 	if(lp->mss != 0 && lp->mss < tcb->mss)
1656 		tcb->mss = lp->mss;
1657 
1658 	/* window scaling */
1659 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1660 
1661 	/* the congestion window always starts out as a single segment */
1662 	tcb->snd.wnd = segp->wnd;
1663 	tcb->cwind = tcb->mss;
1664 
1665 	/* set initial round trip time */
1666 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1667 	tcpsynackrtt(new);
1668 
1669 	free(lp);
1670 
1671 	/* set up proto header */
1672 	switch(version){
1673 	case V4:
1674 		h4 = &tcb->protohdr.tcp4hdr;
1675 		memset(h4, 0, sizeof(*h4));
1676 		h4->proto = IP_TCPPROTO;
1677 		hnputs(h4->tcpsport, new->lport);
1678 		hnputs(h4->tcpdport, new->rport);
1679 		v6tov4(h4->tcpsrc, dst);
1680 		v6tov4(h4->tcpdst, src);
1681 		break;
1682 	case V6:
1683 		h6 = &tcb->protohdr.tcp6hdr;
1684 		memset(h6, 0, sizeof(*h6));
1685 		h6->proto = IP_TCPPROTO;
1686 		hnputs(h6->tcpsport, new->lport);
1687 		hnputs(h6->tcpdport, new->rport);
1688 		ipmove(h6->tcpsrc, dst);
1689 		ipmove(h6->tcpdst, src);
1690 		break;
1691 	default:
1692 		panic("tcpincoming: version %d", new->ipversion);
1693 	}
1694 
1695 	tcpsetstate(new, Established);
1696 
1697 	iphtadd(&tpriv->ht, new);
1698 
1699 	return new;
1700 }
1701 
/*
 *  return 1 if x lies in the closed interval [low, high], 0 otherwise.
 *  when low > high the interval is taken to wrap around zero.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	/* wrapped interval: x is inside if it is on either side of the wrap */
	if(low > high)
		return x >= low || x <= high;

	return low <= x && x <= high;
}
1715 
/*
 *  sequence number comparison: x < y, judged by the sign of the
 *  difference x-y taken as an int.
 */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1721 
/*
 *  sequence number comparison: x <= y, judged by the sign of the
 *  difference x-y taken as an int.
 */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1727 
/*
 *  sequence number comparison: x > y, judged by the sign of the
 *  difference x-y taken as an int.
 */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1733 
/*
 *  sequence number comparison: x >= y, judged by the sign of the
 *  difference x-y taken as an int.
 */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1739 
1740 /*
1741  *  use the time between the first SYN and it's ack as the
1742  *  initial round trip time
1743  */
1744 void
1745 tcpsynackrtt(Conv *s)
1746 {
1747 	Tcpctl *tcb;
1748 	int delta;
1749 	Tcppriv *tpriv;
1750 
1751 	tcb = (Tcpctl*)s->ptcl;
1752 	tpriv = s->p->priv;
1753 
1754 	delta = NOW - tcb->sndsyntime;
1755 	tcb->srtt = delta<<LOGAGAIN;
1756 	tcb->mdev = delta<<LOGDGAIN;
1757 
1758 	/* halt round trip timer */
1759 	tcphalt(tpriv, &tcb->rtt_timer);
1760 }
1761 
/*
 *  process the ack and window fields of an incoming segment:
 *  count duplicate acks (calling tcprxmit when TCPREXMTTHRESH is
 *  reached), update the send window, grow the congestion window,
 *  take a round trip sample, and discard acked bytes from the
 *  write queue.
 *
 *  returns early, leaving snd.una untouched, if the ack is beyond
 *  snd.nxt or does not advance snd.una.
 */
1762 void
1763 update(Conv *s, Tcp *seg)
1764 {
1765 	int rtt, delta;
1766 	Tcpctl *tcb;
1767 	ulong acked;
1768 	ulong expand;
1769 	Tcppriv *tpriv;
1770 
1771 	tpriv = s->p->priv;
1772 	tcb = (Tcpctl*)s->ptcl;
1773 
1774 	/* ack is beyond anything we have sent (snd.nxt): ignore it and force output */
1775 	if(seq_gt(seg->ack, tcb->snd.nxt)) {
1776 		tcb->flags |= FORCE;
1777 		return;
1778 	}
1779 
1780 	/* added by Dong Lin for fast retransmission */
1781 	if(seg->ack == tcb->snd.una
1782 	&& tcb->snd.una != tcb->snd.nxt
1783 	&& seg->len == 0
1784 	&& seg->wnd == tcb->snd.wnd) {
1785 
1786 		/* this is a pure ack w/o window update */
1787 		netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
1788 			tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1789 
1790 		if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1791 			/*
1792 			 *  tahoe tcp rxt the packet, half sshthresh,
1793  			 *  and set cwnd to one packet
			 *  (only the recovery marks are set here; the rest is
			 *  presumably done inside tcprxmit -- confirm)
1794 			 */
1795 			tcb->snd.recovery = 1;
1796 			tcb->snd.rxt = tcb->snd.nxt;
1797 			netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1798 			tcprxmit(s);
1799 		} else {
1800 			/* do reno tcp here. */
1801 		}
1802 	}
1803 
1804 	/*
1805 	 *  update window
	 *  (taken from a newer ack, or from the same ack if it widens the window)
1806 	 */
1807 	if(seq_gt(seg->ack, tcb->snd.wl2)
1808 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1809 		tcb->snd.wnd = seg->wnd;
1810 		tcb->snd.wl2 = seg->ack;
1811 	}
1812 
	/* nothing new is acked: nothing more to do */
1813 	if(!seq_gt(seg->ack, tcb->snd.una)){
1814 		/*
1815 		 *  don't let us hangup if sending into a closed window and
1816 		 *  we're still getting acks
1817 		 */
1818 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1819 			tcb->backedoff = MAXBACKMS/4;
1820 		}
1821 		return;
1822 	}
1823 
1824 	/*
1825 	 *  any positive ack turns off fast rxt,
1826 	 *  (should we do new-reno on partial acks?)
	 *  while recovering, only an ack at or past snd.rxt ends recovery
1827 	 */
1828 	if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1829 		tcb->snd.dupacks = 0;
1830 		tcb->snd.recovery = 0;
1831 	} else
1832 		netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
1833 
1834 	/* Compute the new send window size */
1835 	acked = seg->ack - tcb->snd.una;
1836 
1837 	/* avoid slow start and timers for SYN acks */
	/*
	 *  first ack seen since our SYN went out: acked and flgcnt each
	 *  drop by one before the queue is trimmed -- presumably for the
	 *  sequence number used by the SYN itself.
	 */
1838 	if((tcb->flags & SYNACK) == 0) {
1839 		tcb->flags |= SYNACK;
1840 		acked--;
1841 		tcb->flgcnt--;
1842 		goto done;
1843 	}
1844 
1845 	/* slow start as long as we're not recovering from lost packets */
1846 	if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1847 		if(tcb->cwind < tcb->ssthresh) {
			/* below ssthresh: grow by the bytes acked, at most one mss */
1848 			expand = tcb->mss;
1849 			if(acked < expand)
1850 				expand = acked;
1851 		}
1852 		else
			/* at or above ssthresh: grow by mss*mss/cwind */
1853 			expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1854 
		/* cwind+expand wrapped around, or would pass the send window: clamp to it */
1855 		if(tcb->cwind + expand < tcb->cwind)
1856 			expand = tcb->snd.wnd - tcb->cwind;
1857 		if(tcb->cwind + expand > tcb->snd.wnd)
1858 			expand = tcb->snd.wnd - tcb->cwind;
1859 		tcb->cwind += expand;
1860 	}
1861 
1862 	/* Adjust the timers according to the round trip time */
1863 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1864 		tcphalt(tpriv, &tcb->rtt_timer);
		/* no sample is taken while the RETRAN flag is set */
1865 		if((tcb->flags&RETRAN) == 0) {
1866 			tcb->backoff = 0;
1867 			tcb->backedoff = 0;
			/* presumably count runs down from start, making this the elapsed ticks -- confirm in the timer code */
1868 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1869 			if(rtt == 0)
1870 				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
1871 			rtt *= MSPTICK;
			/* srtt and mdev are kept shifted left by LOGAGAIN and LOGDGAIN */
1872 			if(tcb->srtt == 0) {
1873 				tcb->srtt = rtt << LOGAGAIN;
1874 				tcb->mdev = rtt << LOGDGAIN;
1875 			} else {
1876 				delta = rtt - (tcb->srtt>>LOGAGAIN);
1877 				tcb->srtt += delta;
1878 				if(tcb->srtt <= 0)
1879 					tcb->srtt = 1;
1880 
1881 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1882 				tcb->mdev += delta;
1883 				if(tcb->mdev <= 0)
1884 					tcb->mdev = 1;
1885 			}
1886 			tcpsettimer(tcb);
1887 		}
1888 	}
1889 
1890 done:
	/*
	 *  drop the acked bytes from the write queue.  if fewer were
	 *  queued than acked, the difference is charged to flgcnt --
	 *  presumably an acked FIN; confirm.
	 */
1891 	if(qdiscard(s->wq, acked) < acked)
1892 		tcb->flgcnt--;
1893 
1894 	tcb->snd.una = seg->ack;
1895 	if(seq_gt(seg->ack, tcb->snd.urg))
1896 		tcb->snd.urg = seg->ack;
1897 
	/* keep the retransmit timer running only while data is outstanding */
1898 	if(tcb->snd.una != tcb->snd.nxt)
1899 		tcpgo(tpriv, &tcb->timer);
1900 	else
1901 		tcphalt(tpriv, &tcb->timer);
1902 
	/* never leave the send pointer behind what has been acked */
1903 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1904 		tcb->snd.ptr = tcb->snd.una;
1905 
1906 	tcb->flags &= ~RETRAN;
1907 	tcb->backoff = 0;
1908 	tcb->backedoff = 0;
1909 }
1910 
/*
 *  input routine for tcp segments (v4 or v6) handed up in bp.
 *
 *  verifies the checksum, converts the header into seg, trims the
 *  block to the length claimed by the datagram, then looks up the
 *  conversation with the proto locked.  with no conversation a reset
 *  is sent.  for a listener, RSTs and new SYNs are handled through
 *  the limbo table and an ACK may turn a limbo entry into a new
 *  conversation.  the rest runs with the conversation locked and
 *  ends either at output: (tcpoutput) or raise: (free bp, tcpkick).
 */
1911 void
1912 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1913 {
1914 	Tcp seg;
1915 	Tcp4hdr *h4;
1916 	Tcp6hdr *h6;
1917 	int hdrlen;
1918 	Tcpctl *tcb;
1919 	ushort length;
1920 	uchar source[IPaddrlen], dest[IPaddrlen];
1921 	Conv *s;
1922 	Fs *f;
1923 	Tcppriv *tpriv;
1924 	uchar version;
1925 
1926 	f = tcp->f;
1927 	tpriv = tcp->priv;
1928 
1929 	tpriv->stats[InSegs]++;
1930 
1931 	h4 = (Tcp4hdr*)(bp->rp);
1932 	h6 = (Tcp6hdr*)(bp->rp);
1933 
1934 	if((h4->vihl&0xF0)==IP_VER4) {
1935 		version = V4;
1936 		length = nhgets(h4->length);
1937 		v4tov6(dest, h4->tcpdst);
1938 		v4tov6(source, h4->tcpsrc);
1939 
		/* fill in the pseudo header fields, then checksum unless Btcpck is set or the checksum field is zero */
1940 		h4->Unused = 0;
1941 		hnputs(h4->tcplen, length-TCP4_PKT);
1942 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1943 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1944 			tpriv->stats[CsumErrs]++;
1945 			tpriv->stats[InErrs]++;
1946 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1947 			freeblist(bp);
1948 			return;
1949 		}
1950 
1951 		hdrlen = ntohtcp4(&seg, &bp);
		/* NOTE(review): bp is not freed here on failure -- presumably ntohtcp4 disposes of it; confirm */
1952 		if(hdrlen < 0){
1953 			tpriv->stats[HlenErrs]++;
1954 			tpriv->stats[InErrs]++;
1955 			netlog(f, Logtcp, "bad tcp hdr len\n");
1956 			return;
1957 		}
1958 
1959 		/* trim the packet to the size claimed by the datagram */
1960 		length -= hdrlen+TCP4_PKT;
1961 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1962 		if(bp == nil){
1963 			tpriv->stats[LenErrs]++;
1964 			tpriv->stats[InErrs]++;
1965 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
1966 			return;
1967 		}
1968 	}
1969 	else {
1970 		int ttl = h6->ttl;
1971 		int proto = h6->proto;
1972 
1973 		version = V6;
1974 		length = nhgets(h6->ploadlen);
1975 		ipmove(dest, h6->tcpdst);
1976 		ipmove(source, h6->tcpsrc);
1977 
		/*
		 *  temporarily rewrite the v6 header into pseudo header
		 *  form for the checksum; ttl, proto and ploadlen are
		 *  put back afterwards.
		 */
1978 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1979 		h6->ttl = proto;
1980 		hnputl(h6->vcf, length);
1981 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1982 			ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) {
1983 			tpriv->stats[CsumErrs]++;
1984 			tpriv->stats[InErrs]++;
1985 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1986 			freeblist(bp);
1987 			return;
1988 		}
1989 		h6->ttl = ttl;
1990 		h6->proto = proto;
1991 		hnputs(h6->ploadlen, length);
1992 
1993 		hdrlen = ntohtcp6(&seg, &bp);
		/* NOTE(review): as for v4, bp is not freed here on failure -- confirm ntohtcp6 does it */
1994 		if(hdrlen < 0){
1995 			tpriv->stats[HlenErrs]++;
1996 			tpriv->stats[InErrs]++;
1997 			netlog(f, Logtcp, "bad tcp hdr len\n");
1998 			return;
1999 		}
2000 
2001 		/* trim the packet to the size claimed by the datagram */
2002 		length -= hdrlen;
2003 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2004 		if(bp == nil){
2005 			tpriv->stats[LenErrs]++;
2006 			tpriv->stats[InErrs]++;
2007 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2008 			return;
2009 		}
2010 	}
2011 
2012 	/* lock protocol while searching for a conversation */
2013 	qlock(tcp);
2014 
2015 	/* Look for a matching conversation */
2016 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2017 	if(s == nil){
2018 		netlog(f, Logtcp, "iphtlook failed");
		/* also reached by goto from the listener code below, with the proto still locked */
2019 reset:
2020 		qunlock(tcp);
2021 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2022 		freeblist(bp);
2023 		return;
2024 	}
2025 
2026 	/* if it's a listener, look for the right flags and get a new conv */
2027 	tcb = (Tcpctl*)s->ptcl;
2028 	if(tcb->state == Listen){
2029 		if(seg.flags & RST){
2030 			limborst(s, &seg, source, dest, version);
2031 			qunlock(tcp);
2032 			freeblist(bp);
2033 			return;
2034 		}
2035 
2036 		/* if this is a new SYN, put the call into limbo */
2037 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2038 			limbo(s, source, dest, &seg, version);
2039 			qunlock(tcp);
2040 			freeblist(bp);
2041 			return;
2042 		}
2043 
2044 		/*
2045 		 *  if there's a matching call in limbo, tcpincoming will
2046 		 *  return a new conversation for it (tcpincoming sets its state to Established)
2047 		 */
2048 		s = tcpincoming(s, &seg, source, dest, version);
2049 		if(s == nil)
2050 			goto reset;
2051 	}
2052 
2053 	/* The rest of the input state machine is run with the control block
2054 	 * locked and implements the state machine directly out of the RFC.
2055 	 * Out-of-band data is ignored - it was always a bad idea.
2056 	 */
2057 	tcb = (Tcpctl*)s->ptcl;
2058 	if(waserror()){
2059 		qunlock(s);
2060 		nexterror();
2061 	}
	/* take the conversation lock before letting go of the proto lock */
2062 	qlock(s);
2063 	qunlock(tcp);
2064 
2065 	/* fix up window */
2066 	seg.wnd <<= tcb->rcv.scale;
2067 
2068 	/* every input packet in puts off the keep alive time out */
2069 	tcpsetkacounter(tcb);
2070 
2071 	switch(tcb->state) {
2072 	case Closed:
2073 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2074 		goto raise;
2075 	case Syn_sent:
2076 		if(seg.flags & ACK) {
2077 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2078 				sndrst(tcp, source, dest, length, &seg, version,
2079 					 "bad seq in Syn_sent");
2080 				goto raise;
2081 			}
2082 		}
2083 		if(seg.flags & RST) {
2084 			if(seg.flags & ACK)
2085 				localclose(s, Econrefused);
2086 			goto raise;
2087 		}
2088 
2089 		if(seg.flags & SYN) {
2090 			procsyn(s, &seg);
2091 			if(seg.flags & ACK){
2092 				update(s, &seg);
2093 				tcpsynackrtt(s);
2094 				tcpsetstate(s, Established);
2095 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2096 			}
2097 			else {
2098 				tcb->time = NOW;
2099 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2100 			}
2101 
			/* data or a FIN came with the SYN: carry on with normal processing */
2102 			if(length != 0 || (seg.flags & FIN))
2103 				break;
2104 
2105 			freeblist(bp);
2106 			goto output;
2107 		}
2108 		else
2109 			freeblist(bp);
2110 
2111 		qunlock(s);
2112 		poperror();
2113 		return;
2114 	case Syn_received:
2115 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2116 		if(seg.flags & ACK)
2117 			tcpsynackrtt(s);
2118 		break;
2119 	}
2120 
2121 	/*
2122 	 *  One DOS attack is to open connections to us and then forget about them,
2123 	 *  thereby tying up a conv at no long term cost to the attacker.
2124 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2125 	 *  corresponding code in tcpsendka().
2126 	 */
2127 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2128 		if(tcpporthogdefense
2129 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2130 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2131 				source, seg.source, dest, seg.dest, seg.flags,
2132 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2133 			localclose(s, "stateless hog");
2134 		}
2135 	}
2136 
2137 	/* Cut the data to fit the receive window */
2138 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2139 		netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
		/* the ack is still processed even though the data was rejected */
2140 		update(s, &seg);
2141 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2142 			tcphalt(tpriv, &tcb->rtt_timer);
2143 			tcphalt(tpriv, &tcb->acktimer);
2144 			tcphalt(tpriv, &tcb->katimer);
2145 			tcpsetstate(s, Time_wait);
2146 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2147 			tcpgo(tpriv, &tcb->timer);
2148 		}
2149 		if(!(seg.flags & RST)) {
2150 			tcb->flags |= FORCE;
2151 			goto output;
2152 		}
2153 		qunlock(s);
2154 		poperror();
2155 		return;
2156 	}
2157 
2158 	/* Cannot accept so answer with a rst */
2159 	if(length && tcb->state == Closed) {
2160 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2161 		goto raise;
2162 	}
2163 
2164 	/* The segment is beyond the current receive pointer so
2165 	 * queue the data in the resequence queue
2166 	 */
2167 	if(seg.seq != tcb->rcv.nxt)
2168 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2169 		update(s, &seg);
2170 		if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2171 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2172 		tcb->flags |= FORCE;
2173 		goto output;
2174 	}
2175 
2176 	/*
2177 	 *  keep looping till we've processed this packet plus any
2178 	 *  adjacent packets in the resequence queue
2179 	 */
2180 	for(;;) {
2181 		if(seg.flags & RST) {
2182 			if(tcb->state == Established) {
2183 				tpriv->stats[EstabResets]++;
2184 				if(tcb->rcv.nxt != seg.seq)
2185 					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2186 			}
2187 			localclose(s, Econrefused);
2188 			goto raise;
2189 		}
2190 
		/* past this point a segment without ACK is simply dropped */
2191 		if((seg.flags&ACK) == 0)
2192 			goto raise;
2193 
2194 		switch(tcb->state) {
2195 		case Syn_received:
2196 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2197 				sndrst(tcp, source, dest, length, &seg, version,
2198 					"bad seq in Syn_received");
2199 				goto raise;
2200 			}
2201 			update(s, &seg);
2202 			tcpsetstate(s, Established);
			/* no break: continues into Established, which calls update() a second time */
2203 		case Established:
2204 		case Close_wait:
2205 			update(s, &seg);
2206 			break;
2207 		case Finwait1:
2208 			update(s, &seg);
			/* nothing left unacked: move on to Finwait2 and start the katimer */
2209 			if(qlen(s->wq)+tcb->flgcnt == 0){
2210 				tcphalt(tpriv, &tcb->rtt_timer);
2211 				tcphalt(tpriv, &tcb->acktimer);
2212 				tcpsetkacounter(tcb);
2213 				tcb->time = NOW;
2214 				tcpsetstate(s, Finwait2);
2215 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2216 				tcpgo(tpriv, &tcb->katimer);
2217 			}
2218 			break;
2219 		case Finwait2:
2220 			update(s, &seg);
2221 			break;
2222 		case Closing:
2223 			update(s, &seg);
2224 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2225 				tcphalt(tpriv, &tcb->rtt_timer);
2226 				tcphalt(tpriv, &tcb->acktimer);
2227 				tcphalt(tpriv, &tcb->katimer);
2228 				tcpsetstate(s, Time_wait);
2229 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2230 				tcpgo(tpriv, &tcb->timer);
2231 			}
2232 			break;
2233 		case Last_ack:
2234 			update(s, &seg);
2235 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2236 				localclose(s, nil);
2237 				goto raise;
2238 			}
			/* no break: continues into Time_wait */
2239 		case Time_wait:
2240 			tcb->flags |= FORCE;
2241 			if(tcb->timer.state != TcptimerON)
2242 				tcpgo(tpriv, &tcb->timer);
2243 		}
2244 
2245 		if((seg.flags&URG) && seg.urg) {
2246 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2247 				tcb->rcv.urg = seg.urg + seg.seq;
2248 				pullblock(&bp, seg.urg);
2249 			}
2250 		}
2251 		else
2252 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2253 			tcb->rcv.urg = tcb->rcv.nxt;
2254 
2255 		if(length == 0) {
2256 			if(bp != nil)
2257 				freeblist(bp);
2258 		}
2259 		else {
2260 			switch(tcb->state){
2261 			default:
2262 				/* Ignore segment text */
2263 				if(bp != nil)
2264 					freeblist(bp);
2265 				break;
2266 
2267 			case Syn_received:
2268 			case Established:
2269 			case Finwait1:
2270 				/* If we still have some data place on
2271 				 * receive queue
2272 				 */
2273 				if(bp) {
2274 					bp = packblock(bp);
2275 					if(bp == nil)
2276 						panic("tcp packblock");
2277 					qpassnolim(s->rq, bp);
2278 					bp = nil;
2279 
2280 					/*
2281 					 *  Force an ack every 2 data messages.  This is
2282 					 *  a hack for rob to make his home system run
2283 					 *  faster.
2284 					 *
2285 					 *  this also keeps the standard TCP congestion
2286 					 *  control working since it needs an ack every
2287 					 *  2 max segs worth.  This is not quite that,
2288 					 *  but under a real stream is equivalent since
2289 					 *  every packet has a max seg in it.
2290 					 */
2291 					if(++(tcb->rcv.una) >= 2)
2292 						tcb->flags |= FORCE;
2293 				}
2294 				tcb->rcv.nxt += length;
2295 
2296 				/*
2297 				 *  update our rcv window
2298 				 */
2299 				tcprcvwin(s);
2300 
2301 				/*
2302 				 *  turn on the acktimer if there's something
2303 				 *  to ack
2304 				 */
2305 				if(tcb->acktimer.state != TcptimerON)
2306 					tcpgo(tpriv, &tcb->acktimer);
2307 
2308 				break;
2309 			case Finwait2:
2310 				/* no process to read the data, send a reset */
2311 				if(bp != nil)
2312 					freeblist(bp);
2313 				sndrst(tcp, source, dest, length, &seg, version,
2314 					"send to Finwait2");
2315 				qunlock(s);
2316 				poperror();
2317 				return;
2318 			}
2319 		}
2320 
2321 		if(seg.flags & FIN) {
2322 			tcb->flags |= FORCE;
2323 
			/* where a FIN is accepted, rcv.nxt is advanced by one for it */
2324 			switch(tcb->state) {
2325 			case Syn_received:
2326 			case Established:
2327 				tcb->rcv.nxt++;
2328 				tcpsetstate(s, Close_wait);
2329 				break;
2330 			case Finwait1:
2331 				tcb->rcv.nxt++;
2332 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2333 					tcphalt(tpriv, &tcb->rtt_timer);
2334 					tcphalt(tpriv, &tcb->acktimer);
2335 					tcphalt(tpriv, &tcb->katimer);
2336 					tcpsetstate(s, Time_wait);
2337 					tcb->timer.start = MSL2*(1000/MSPTICK);
2338 					tcpgo(tpriv, &tcb->timer);
2339 				}
2340 				else
2341 					tcpsetstate(s, Closing);
2342 				break;
2343 			case Finwait2:
2344 				tcb->rcv.nxt++;
2345 				tcphalt(tpriv, &tcb->rtt_timer);
2346 				tcphalt(tpriv, &tcb->acktimer);
2347 				tcphalt(tpriv, &tcb->katimer);
2348 				tcpsetstate(s, Time_wait);
2349 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2350 				tcpgo(tpriv, &tcb->timer);
2351 				break;
2352 			case Close_wait:
2353 			case Closing:
2354 			case Last_ack:
2355 				break;
2356 			case Time_wait:
2357 				tcpgo(tpriv, &tcb->timer);
2358 				break;
2359 			}
2360 		}
2361 
2362 		/*
2363 		 *  get next adjacent segment from the resequence queue.
2364 		 *  dump/trim any overlapping segments
2365 		 */
2366 		for(;;) {
2367 			if(tcb->reseq == nil)
2368 				goto output;
2369 
			/* the queued segment starts beyond rcv.nxt: not adjacent yet */
2370 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2371 				goto output;
2372 
			/* replaces seg, bp and length with the queued segment */
2373 			getreseq(tcb, &seg, &bp, &length);
2374 
2375 			if(tcptrim(tcb, &seg, &bp, &length) == 0)
2376 				break;
2377 		}
2378 	}
2379 output:
2380 	tcpoutput(s);
2381 	qunlock(s);
2382 	poperror();
2383 	return;
2384 raise:
2385 	qunlock(s);
2386 	poperror();
2387 	freeblist(bp);
2388 	tcpkick(s);
2389 }
2390 
2391 /*
2392  *  always enters and exits with the s locked.  We drop
2393  *  the lock to ipoput the packet so some care has to be
2394  *  taken by callers.
2395  */
2396 void
2397 tcpoutput(Conv *s)
2398 {
2399 	Tcp seg;
2400 	int msgs;
2401 	Tcpctl *tcb;
2402 	Block *hbp, *bp;
2403 	int sndcnt, n;
2404 	ulong ssize, dsize, usable, sent;
2405 	Fs *f;
2406 	Tcppriv *tpriv;
2407 	uchar version;
2408 
2409 	f = s->p->f;
2410 	tpriv = s->p->priv;
2411 	version = s->ipversion;
2412 
2413 	for(msgs = 0; msgs < 100; msgs++) {
2414 		tcb = (Tcpctl*)s->ptcl;
2415 
2416 		switch(tcb->state) {
2417 		case Listen:
2418 		case Closed:
2419 		case Finwait2:
2420 			return;
2421 		}
2422 
2423 		/* force an ack when a window has opened up */
2424 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2425 			tcb->rcv.blocked = 0;
2426 			tcb->flags |= FORCE;
2427 		}
2428 
2429 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2430 		sent = tcb->snd.ptr - tcb->snd.una;
2431 
2432 		/* Don't send anything else until our SYN has been acked */
2433 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2434 			break;
2435 
2436 		/* Compute usable segment based on offered window and limit
2437 		 * window probes to one
2438 		 */
2439 		if(tcb->snd.wnd == 0){
2440 			if(sent != 0) {
2441 				if((tcb->flags&FORCE) == 0)
2442 					break;
2443 //				tcb->snd.ptr = tcb->snd.una;
2444 			}
2445 			usable = 1;
2446 		}
2447 		else {
2448 			usable = tcb->cwind;
2449 			if(tcb->snd.wnd < usable)
2450 				usable = tcb->snd.wnd;
2451 			usable -= sent;
2452 		}
2453 		ssize = sndcnt-sent;
2454 		if(ssize && usable < 2)
2455 			netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2456 				tcb->snd.wnd, tcb->cwind);
2457 		if(usable < ssize)
2458 			ssize = usable;
2459 		if(tcb->mss < ssize)
2460 			ssize = tcb->mss;
2461 		dsize = ssize;
2462 		seg.urg = 0;
2463 
2464 		if(ssize == 0)
2465 		if((tcb->flags&FORCE) == 0)
2466 			break;
2467 
2468 		tcb->flags &= ~FORCE;
2469 		tcprcvwin(s);
2470 
2471 		/* By default we will generate an ack */
2472 		tcphalt(tpriv, &tcb->acktimer);
2473 		tcb->rcv.una = 0;
2474 		seg.source = s->lport;
2475 		seg.dest = s->rport;
2476 		seg.flags = ACK;
2477 		seg.mss = 0;
2478 		seg.ws = 0;
2479 		switch(tcb->state){
2480 		case Syn_sent:
2481 			seg.flags = 0;
2482 			if(tcb->snd.ptr == tcb->iss){
2483 				seg.flags |= SYN;
2484 				dsize--;
2485 				seg.mss = tcb->mss;
2486 				seg.ws = tcb->scale;
2487 			}
2488 			break;
2489 		case Syn_received:
2490 			/*
2491 			 *  don't send any data with a SYN/ACK packet
2492 			 *  because Linux rejects the packet in its
2493 			 *  attempt to solve the SYN attack problem
2494 			 */
2495 			if(tcb->snd.ptr == tcb->iss){
2496 				seg.flags |= SYN;
2497 				dsize = 0;
2498 				ssize = 1;
2499 				seg.mss = tcb->mss;
2500 				seg.ws = tcb->scale;
2501 			}
2502 			break;
2503 		}
2504 		seg.seq = tcb->snd.ptr;
2505 		seg.ack = tcb->rcv.nxt;
2506 		seg.wnd = tcb->rcv.wnd;
2507 
2508 		/* Pull out data to send */
2509 		bp = nil;
2510 		if(dsize != 0) {
2511 			bp = qcopy(s->wq, dsize, sent);
2512 			if(BLEN(bp) != dsize) {
2513 				seg.flags |= FIN;
2514 				dsize--;
2515 			}
2516 		}
2517 
2518 		if(sent+dsize == sndcnt)
2519 			seg.flags |= PSH;
2520 
2521 		/* keep track of balance of resent data */
2522 		if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2523 			n = tcb->snd.nxt - tcb->snd.ptr;
2524 			if(ssize < n)
2525 				n = ssize;
2526 			tcb->resent += n;
2527 			netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
2528 				s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2529 			tpriv->stats[RetransSegs]++;
2530 		}
2531 
2532 		tcb->snd.ptr += ssize;
2533 
2534 		/* Pull up the send pointer so we can accept acks
2535 		 * for this window
2536 		 */
2537 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2538 			tcb->snd.nxt = tcb->snd.ptr;
2539 
2540 		/* Build header, link data and compute cksum */
2541 		switch(version){
2542 		case V4:
2543 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2544 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2545 			if(hbp == nil) {
2546 				freeblist(bp);
2547 				return;
2548 			}
2549 			break;
2550 		case V6:
2551 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2552 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2553 			if(hbp == nil) {
2554 				freeblist(bp);
2555 				return;
2556 			}
2557 			break;
2558 		default:
2559 			hbp = nil;	/* to suppress a warning */
2560 			panic("tcpoutput: version %d", version);
2561 		}
2562 
2563 		/* Start the transmission timers if there is new data and we
2564 		 * expect acknowledges
2565 		 */
2566 		if(ssize != 0){
2567 			if(tcb->timer.state != TcptimerON)
2568 				tcpgo(tpriv, &tcb->timer);
2569 
2570 			/*  If round trip timer isn't running, start it.
2571 			 *  measure the longest packet only in case the
2572 			 *  transmission time dominates RTT
2573 			 */
2574 			if(tcb->rtt_timer.state != TcptimerON)
2575 			if(ssize == tcb->mss) {
2576 				tcpgo(tpriv, &tcb->rtt_timer);
2577 				tcb->rttseq = tcb->snd.ptr;
2578 			}
2579 		}
2580 
2581 		tpriv->stats[OutSegs]++;
2582 
2583 		/* put off the next keep alive */
2584 		tcpgo(tpriv, &tcb->katimer);
2585 
2586 		switch(version){
2587 		case V4:
2588 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2589 				/* a negative return means no route */
2590 				localclose(s, "no route");
2591 			}
2592 			break;
2593 		case V6:
2594 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2595 				/* a negative return means no route */
2596 				localclose(s, "no route");
2597 			}
2598 			break;
2599 		default:
2600 			panic("tcpoutput2: version %d", version);
2601 		}
2602 		if((msgs%4) == 1){
2603 			qunlock(s);
2604 			sched();
2605 			qlock(s);
2606 		}
2607 	}
2608 }
2609 
2610 /*
2611  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2612  */
2613 void
2614 tcpsendka(Conv *s)
2615 {
2616 	Tcp seg;
2617 	Tcpctl *tcb;
2618 	Block *hbp,*dbp;
2619 
2620 	tcb = (Tcpctl*)s->ptcl;
2621 
2622 	dbp = nil;
2623 	seg.urg = 0;
2624 	seg.source = s->lport;
2625 	seg.dest = s->rport;
2626 	seg.flags = ACK|PSH;
2627 	seg.mss = 0;
2628 	seg.ws = 0;
2629 	if(tcpporthogdefense)
2630 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2631 	else
2632 		seg.seq = tcb->snd.una-1;
2633 	seg.ack = tcb->rcv.nxt;
2634 	tcb->rcv.una = 0;
2635 	seg.wnd = tcb->rcv.wnd;
2636 	if(tcb->state == Finwait2){
2637 		seg.flags |= FIN;
2638 	} else {
2639 		dbp = allocb(1);
2640 		dbp->wp++;
2641 	}
2642 
2643 	if(isv4(s->raddr)) {
2644 		/* Build header, link data and compute cksum */
2645 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2646 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2647 		if(hbp == nil) {
2648 			freeblist(dbp);
2649 			return;
2650 		}
2651 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2652 	}
2653 	else {
2654 		/* Build header, link data and compute cksum */
2655 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2656 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2657 		if(hbp == nil) {
2658 			freeblist(dbp);
2659 			return;
2660 		}
2661 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2662 	}
2663 }
2664 
2665 /*
2666  *  set connection to time out after 12 minutes
2667  */
2668 void
2669 tcpsetkacounter(Tcpctl *tcb)
2670 {
2671 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2672 	if(tcb->kacounter < 3)
2673 		tcb->kacounter = 3;
2674 }
2675 
2676 /*
2677  *  if we've timed out, close the connection
2678  *  otherwise, send a keepalive and restart the timer
2679  */
2680 void
2681 tcpkeepalive(void *v)
2682 {
2683 	Tcpctl *tcb;
2684 	Conv *s;
2685 
2686 	s = v;
2687 	tcb = (Tcpctl*)s->ptcl;
2688 	if(waserror()){
2689 		qunlock(s);
2690 		nexterror();
2691 	}
2692 	qlock(s);
2693 	if(tcb->state != Closed){
2694 		if(--(tcb->kacounter) <= 0) {
2695 			localclose(s, Etimedout);
2696 		} else {
2697 			tcpsendka(s);
2698 			tcpgo(s->p->priv, &tcb->katimer);
2699 		}
2700 	}
2701 	qunlock(s);
2702 	poperror();
2703 }
2704 
2705 /*
2706  *  start keepalive timer
2707  */
2708 char*
2709 tcpstartka(Conv *s, char **f, int n)
2710 {
2711 	Tcpctl *tcb;
2712 	int x;
2713 
2714 	tcb = (Tcpctl*)s->ptcl;
2715 	if(tcb->state != Established)
2716 		return "connection must be in Establised state";
2717 	if(n > 1){
2718 		x = atoi(f[1]);
2719 		if(x >= MSPTICK)
2720 			tcb->katimer.start = x/MSPTICK;
2721 	}
2722 	tcpsetkacounter(tcb);
2723 	tcpgo(s->p->priv, &tcb->katimer);
2724 
2725 	return nil;
2726 }
2727 
2728 /*
2729  *  turn checksums on/off
2730  */
2731 char*
2732 tcpsetchecksum(Conv *s, char **f, int)
2733 {
2734 	Tcpctl *tcb;
2735 
2736 	tcb = (Tcpctl*)s->ptcl;
2737 	tcb->nochecksum = !atoi(f[1]);
2738 
2739 	return nil;
2740 }
2741 
2742 void
2743 tcprxmit(Conv *s)
2744 {
2745 	Tcpctl *tcb;
2746 
2747 	tcb = (Tcpctl*)s->ptcl;
2748 
2749 	tcb->flags |= RETRAN|FORCE;
2750 	tcb->snd.ptr = tcb->snd.una;
2751 
2752 	/*
2753 	 *  We should be halving the slow start threshhold (down to one
2754 	 *  mss) but leaving it at mss seems to work well enough
2755 	 */
2756  	tcb->ssthresh = tcb->mss;
2757 
2758 	/*
2759 	 *  pull window down to a single packet
2760 	 */
2761 	tcb->cwind = tcb->mss;
2762 	tcpoutput(s);
2763 }
2764 
2765 void
2766 tcptimeout(void *arg)
2767 {
2768 	Conv *s;
2769 	Tcpctl *tcb;
2770 	int maxback;
2771 	Tcppriv *tpriv;
2772 
2773 	s = (Conv*)arg;
2774 	tpriv = s->p->priv;
2775 	tcb = (Tcpctl*)s->ptcl;
2776 
2777 	if(waserror()){
2778 		qunlock(s);
2779 		nexterror();
2780 	}
2781 	qlock(s);
2782 	switch(tcb->state){
2783 	default:
2784 		tcb->backoff++;
2785 		if(tcb->state == Syn_sent)
2786 			maxback = MAXBACKMS/2;
2787 		else
2788 			maxback = MAXBACKMS;
2789 		tcb->backedoff += tcb->timer.start * MSPTICK;
2790 		if(tcb->backedoff >= maxback) {
2791 			localclose(s, Etimedout);
2792 			break;
2793 		}
2794 		netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
2795 		tcpsettimer(tcb);
2796 		tcprxmit(s);
2797 		tpriv->stats[RetransTimeouts]++;
2798 		tcb->snd.dupacks = 0;
2799 		break;
2800 	case Time_wait:
2801 		localclose(s, nil);
2802 		break;
2803 	case Closed:
2804 		break;
2805 	}
2806 	qunlock(s);
2807 	poperror();
2808 }
2809 
2810 int
2811 inwindow(Tcpctl *tcb, int seq)
2812 {
2813 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2814 }
2815 
2816 /*
2817  *  set up state for a received SYN (or SYN ACK) packet
2818  */
2819 void
2820 procsyn(Conv *s, Tcp *seg)
2821 {
2822 	Tcpctl *tcb;
2823 
2824 	tcb = (Tcpctl*)s->ptcl;
2825 	tcb->flags |= FORCE;
2826 
2827 	tcb->rcv.nxt = seg->seq + 1;
2828 	tcb->rcv.urg = tcb->rcv.nxt;
2829 	tcb->irs = seg->seq;
2830 
2831 	/* our sending max segment size cannot be bigger than what he asked for */
2832 	if(seg->mss != 0 && seg->mss < tcb->mss)
2833 		tcb->mss = seg->mss;
2834 
2835 	/* the congestion window always starts out as a single segment */
2836 	tcb->snd.wnd = seg->wnd;
2837 	tcb->cwind = tcb->mss;
2838 }
2839 
2840 int
2841 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2842 {
2843 	Reseq *rp, *rp1;
2844 	int i, rqlen, qmax;
2845 
2846 	rp = malloc(sizeof(Reseq));
2847 	if(rp == nil){
2848 		freeblist(bp);	/* bp always consumed by add_reseq */
2849 		return 0;
2850 	}
2851 
2852 	rp->seg = *seg;
2853 	rp->bp = bp;
2854 	rp->length = length;
2855 
2856 	/* Place on reassembly list sorting by starting seq number */
2857 	rp1 = tcb->reseq;
2858 	if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2859 		rp->next = rp1;
2860 		tcb->reseq = rp;
2861 		if(rp->next != nil)
2862 			tpriv->stats[OutOfOrder]++;
2863 		return 0;
2864 	}
2865 
2866 	rqlen = 0;
2867 	for(i = 0;; i++) {
2868 		rqlen += rp1->length;
2869 		if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2870 			rp->next = rp1->next;
2871 			rp1->next = rp;
2872 			if(rp->next != nil)
2873 				tpriv->stats[OutOfOrder]++;
2874 			break;
2875 		}
2876 		rp1 = rp1->next;
2877 	}
2878 	qmax = QMAX<<tcb->rcv.scale;
2879 	if(rqlen > qmax){
2880 		print("resequence queue > window: %d > %d\n", rqlen, qmax);
2881 		i = 0;
2882 	  	for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2883 	  		print("%#lux %#lux %#ux\n", rp1->seg.seq,
2884 	  			rp1->seg.ack, rp1->seg.flags);
2885 			if(i++ > 10){
2886 				print("...\n");
2887 				break;
2888 			}
2889 		}
2890 
2891 		// delete entire reassembly queue; wait for retransmit.
2892 		// - should we be smarter and only delete the tail?
2893 		for(rp = tcb->reseq; rp != nil; rp = rp1){
2894 			rp1 = rp->next;
2895 			freeblist(rp->bp);
2896 			free(rp);
2897 		}
2898 		tcb->reseq = nil;
2899 
2900 	  	return -1;
2901 	}
2902 	return 0;
2903 }
2904 
2905 void
2906 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2907 {
2908 	Reseq *rp;
2909 
2910 	rp = tcb->reseq;
2911 	if(rp == nil)
2912 		return;
2913 
2914 	tcb->reseq = rp->next;
2915 
2916 	*seg = rp->seg;
2917 	*bp = rp->bp;
2918 	*length = rp->length;
2919 
2920 	free(rp);
2921 }
2922 
2923 int
2924 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2925 {
2926 	ushort len;
2927 	uchar accept;
2928 	int dupcnt, excess;
2929 
2930 	accept = 0;
2931 	len = *length;
2932 	if(seg->flags & SYN)
2933 		len++;
2934 	if(seg->flags & FIN)
2935 		len++;
2936 
2937 	if(tcb->rcv.wnd == 0) {
2938 		if(len == 0 && seg->seq == tcb->rcv.nxt)
2939 			return 0;
2940 	}
2941 	else {
2942 		/* Some part of the segment should be in the window */
2943 		if(inwindow(tcb,seg->seq))
2944 			accept++;
2945 		else
2946 		if(len != 0) {
2947 			if(inwindow(tcb, seg->seq+len-1) ||
2948 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2949 				accept++;
2950 		}
2951 	}
2952 	if(!accept) {
2953 		freeblist(*bp);
2954 		return -1;
2955 	}
2956 	dupcnt = tcb->rcv.nxt - seg->seq;
2957 	if(dupcnt > 0){
2958 		tcb->rerecv += dupcnt;
2959 		if(seg->flags & SYN){
2960 			seg->flags &= ~SYN;
2961 			seg->seq++;
2962 
2963 			if(seg->urg > 1)
2964 				seg->urg--;
2965 			else
2966 				seg->flags &= ~URG;
2967 			dupcnt--;
2968 		}
2969 		if(dupcnt > 0){
2970 			pullblock(bp, (ushort)dupcnt);
2971 			seg->seq += dupcnt;
2972 			*length -= dupcnt;
2973 
2974 			if(seg->urg > dupcnt)
2975 				seg->urg -= dupcnt;
2976 			else {
2977 				seg->flags &= ~URG;
2978 				seg->urg = 0;
2979 			}
2980 		}
2981 	}
2982 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
2983 	if(excess > 0) {
2984 		tcb->rerecv += excess;
2985 		*length -= excess;
2986 		*bp = trimblock(*bp, 0, *length);
2987 		if(*bp == nil)
2988 			panic("presotto is a boofhead");
2989 		seg->flags &= ~FIN;
2990 	}
2991 	return 0;
2992 }
2993 
2994 void
2995 tcpadvise(Proto *tcp, Block *bp, char *msg)
2996 {
2997 	Tcp4hdr *h4;
2998 	Tcp6hdr *h6;
2999 	Tcpctl *tcb;
3000 	uchar source[IPaddrlen];
3001 	uchar dest[IPaddrlen];
3002 	ushort psource, pdest;
3003 	Conv *s, **p;
3004 
3005 	h4 = (Tcp4hdr*)(bp->rp);
3006 	h6 = (Tcp6hdr*)(bp->rp);
3007 
3008 	if((h4->vihl&0xF0)==IP_VER4) {
3009 		v4tov6(dest, h4->tcpdst);
3010 		v4tov6(source, h4->tcpsrc);
3011 		psource = nhgets(h4->tcpsport);
3012 		pdest = nhgets(h4->tcpdport);
3013 	}
3014 	else {
3015 		ipmove(dest, h6->tcpdst);
3016 		ipmove(source, h6->tcpsrc);
3017 		psource = nhgets(h6->tcpsport);
3018 		pdest = nhgets(h6->tcpdport);
3019 	}
3020 
3021 	/* Look for a connection */
3022 	qlock(tcp);
3023 	for(p = tcp->conv; *p; p++) {
3024 		s = *p;
3025 		tcb = (Tcpctl*)s->ptcl;
3026 		if(s->rport == pdest)
3027 		if(s->lport == psource)
3028 		if(tcb->state != Closed)
3029 		if(ipcmp(s->raddr, dest) == 0)
3030 		if(ipcmp(s->laddr, source) == 0){
3031 			qlock(s);
3032 			qunlock(tcp);
3033 			switch(tcb->state){
3034 			case Syn_sent:
3035 				localclose(s, msg);
3036 				break;
3037 			}
3038 			qunlock(s);
3039 			freeblist(bp);
3040 			return;
3041 		}
3042 	}
3043 	qunlock(tcp);
3044 	freeblist(bp);
3045 }
3046 
3047 static char*
3048 tcpporthogdefensectl(char *val)
3049 {
3050 	if(strcmp(val, "on") == 0)
3051 		tcpporthogdefense = 1;
3052 	else if(strcmp(val, "off") == 0)
3053 		tcpporthogdefense = 0;
3054 	else
3055 		return "unknown value for tcpporthogdefense";
3056 	return nil;
3057 }
3058 
3059 /* called with c qlocked */
3060 char*
3061 tcpctl(Conv* c, char** f, int n)
3062 {
3063 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3064 		return tcphangup(c);
3065 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3066 		return tcpstartka(c, f, n);
3067 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3068 		return tcpsetchecksum(c, f, n);
3069 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3070 		return tcpporthogdefensectl(f[1]);
3071 	return "unknown control request";
3072 }
3073 
3074 int
3075 tcpstats(Proto *tcp, char *buf, int len)
3076 {
3077 	Tcppriv *priv;
3078 	char *p, *e;
3079 	int i;
3080 
3081 	priv = tcp->priv;
3082 	p = buf;
3083 	e = p+len;
3084 	for(i = 0; i < Nstats; i++)
3085 		p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
3086 	return p - buf;
3087 }
3088 
3089 /*
3090  *  garbage collect any stale conversations:
3091  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3092  *	- Finwait2 after 5 minutes
3093  *
3094  *  this is called whenever we run out of channels.  Both checks are
3095  *  of questionable validity so we try to use them only when we're
3096  *  up against the wall.
3097  */
3098 int
3099 tcpgc(Proto *tcp)
3100 {
3101 	Conv *c, **pp, **ep;
3102 	int n;
3103 	Tcpctl *tcb;
3104 
3105 
3106 	n = 0;
3107 	ep = &tcp->conv[tcp->nc];
3108 	for(pp = tcp->conv; pp < ep; pp++) {
3109 		c = *pp;
3110 		if(c == nil)
3111 			break;
3112 		if(!canqlock(c))
3113 			continue;
3114 		tcb = (Tcpctl*)c->ptcl;
3115 		switch(tcb->state){
3116 		case Syn_received:
3117 			if(NOW - tcb->time > 5000){
3118 				localclose(c, "timed out");
3119 				n++;
3120 			}
3121 			break;
3122 		case Finwait2:
3123 			if(NOW - tcb->time > 5*60*1000){
3124 				localclose(c, "timed out");
3125 				n++;
3126 			}
3127 			break;
3128 		}
3129 		qunlock(c);
3130 	}
3131 	return n;
3132 }
3133 
3134 void
3135 tcpsettimer(Tcpctl *tcb)
3136 {
3137 	int x;
3138 
3139 	/* round trip dependency */
3140 	x = backoff(tcb->backoff) *
3141 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3142 
3143 	/* bounded twixt 1/2 and 64 seconds */
3144 	if(x < 500/MSPTICK)
3145 		x = 500/MSPTICK;
3146 	else if(x > (64000/MSPTICK))
3147 		x = 64000/MSPTICK;
3148 	tcb->timer.start = x;
3149 }
3150 
3151 void
3152 tcpinit(Fs *fs)
3153 {
3154 	Proto *tcp;
3155 	Tcppriv *tpriv;
3156 
3157 	tcp = smalloc(sizeof(Proto));
3158 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3159 	tcp->name = "tcp";
3160 	tcp->connect = tcpconnect;
3161 	tcp->announce = tcpannounce;
3162 	tcp->ctl = tcpctl;
3163 	tcp->state = tcpstate;
3164 	tcp->create = tcpcreate;
3165 	tcp->close = tcpclose;
3166 	tcp->rcv = tcpiput;
3167 	tcp->advise = tcpadvise;
3168 	tcp->stats = tcpstats;
3169 	tcp->inuse = tcpinuse;
3170 	tcp->gc = tcpgc;
3171 	tcp->ipproto = IP_TCPPROTO;
3172 	tcp->nc = scalednconv();
3173 	tcp->ptclsize = sizeof(Tcpctl);
3174 	tpriv->stats[MaxConn] = tcp->nc;
3175 
3176 	Fsproto(fs, tcp);
3177 }
3178 
3179 void
3180 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3181 {
3182 	if(rcvscale){
3183 		tcb->rcv.scale = rcvscale & 0xff;
3184 		tcb->snd.scale = sndscale & 0xff;
3185 		tcb->window = QMAX<<tcb->snd.scale;
3186 		qsetlimit(s->rq, tcb->window);
3187 	} else {
3188 		tcb->rcv.scale = 0;
3189 		tcb->snd.scale = 0;
3190 		tcb->window = QMAX;
3191 		qsetlimit(s->rq, tcb->window);
3192 	}
3193 }
3194