xref: /inferno-os/os/ip/tcp.c (revision e34c0431c44a07ee598e31d67e77d663413649fa)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  Protocol constants: header geometry, timer states and intervals,
 *  TCP header flag bits, option codes, per-connection flag bits and
 *  connection states.
 */
10 enum
11 {
12 	QMAX		= 64*1024-1,	/* default receive window and read queue limit */
13 	IP_TCPPROTO	= 6,
14 
	/* v4: offsets and sizes within a Tcp4hdr */
15 	TCP4_IPLEN	= 8,		/* offset at which ptclcsum starts checksumming */
16 	TCP4_PHDRSIZE	= 12,		/* bytes of pseudo header included in the checksum */
17 	TCP4_HDRSIZE	= 20,		/* TCP header without options */
18 	TCP4_TCBPHDRSZ	= 40,		/* bytes of prototype header copied by htontcp4 */
19 	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,	/* bytes in front of the TCP header */
20 
	/* v6: offsets and sizes within a Tcp6hdr */
21 	TCP6_IPLEN	= 0,
22 	TCP6_PHDRSIZE	= 40,
23 	TCP6_HDRSIZE	= 20,
24 	TCP6_TCBPHDRSZ	= 60,
25 	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,
26 
	/* values of Tcptimer.state */
27 	TcptimerOFF	= 0,
28 	TcptimerON	= 1,
29 	TcptimerDONE	= 2,
30 	MAX_TIME 	= (1<<20),	/* Forever */
31 	TCP_ACK		= 50,		/* Timed ack sequence in ms */
32 	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */
33 
	/* flag bits in the TCP header */
34 	URG		= 0x20,		/* Data marked urgent */
35 	ACK		= 0x10,		/* Acknowledge is valid */
36 	PSH		= 0x08,		/* Whole data pipe is pushed */
37 	RST		= 0x04,		/* Reset connection */
38 	SYN		= 0x02,		/* Pkt. is synchronise */
39 	FIN		= 0x01,		/* Start close down */
40 
	/* option kinds and the length byte written with each option */
41 	EOLOPT		= 0,
42 	NOOPOPT		= 1,
43 	MSSOPT		= 2,
44 	MSS_LENGTH	= 4,		/* Length in bytes of the MSS option */
45 	WSOPT		= 3,
46 	WS_LENGTH	= 3,		/* Length in bytes of the window scale option */
47 	MSL2		= 10,
48 	MSPTICK		= 50,		/* Milliseconds per timer tick */
49 	DEF_MSS		= 1460,		/* Default maximum segment size */
50 	DEF_MSS6	= 1280,		/* Default maximum segment size (min) for v6 */
51 	DEF_RTT		= 500,		/* Default round trip */
52 	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
53 	TCP_LISTEN	= 0,		/* Listen connection */
54 	TCP_CONNECT	= 1,		/* Outgoing connection */
55 	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */
56 
57 	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */
58 
	/* flag bits; FORCE, CLONE and ACTIVE are or'ed into Tcpctl.flags below */
59 	FORCE		= 1,
60 	CLONE		= 2,
61 	RETRAN		= 4,
62 	ACTIVE		= 8,
63 	SYNACK		= 16,
64 
65 	LOGAGAIN	= 3,
66 	LOGDGAIN	= 2,
67 
68 	Closed		= 0,		/* Connection states */
69 	Listen,
70 	Syn_sent,
71 	Syn_received,
72 	Established,
73 	Finwait1,
74 	Finwait2,
75 	Close_wait,
76 	Closing,
77 	Last_ack,
78 	Time_wait,
79 
80 	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
81 	NLHT		= 256,		/* hash table size, must be a power of 2 */
82 	LHTMASK		= NLHT-1,
83 
84 	HaveWS		= 1<<8,		/* or'ed into a scale value: a window scale option is present */
85 };
86 
87 /*
 *  Printable names of the connection states, indexed by state value
 *  (tcpstate prints tcpstates[s->state]).  Must correspond to the
 *  connection state enumeration above.
 */
88 char *tcpstates[] =
89 {
90 	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
91 	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
92 	"Closing", 	"Last_ack", 	"Time_wait"
93 };
94 
/*
 *  One countdown timer.  Running timers are chained through next/prev
 *  onto Tcppriv.timers by timerstate() and decremented once per tick
 *  by tcpackproc().
 */
95 typedef struct Tcptimer Tcptimer;
96 struct Tcptimer
97 {
98 	Tcptimer	*next;		/* list of running timers */
99 	Tcptimer	*prev;
100 	Tcptimer	*readynext;	/* list of timers that expired in the current tick */
101 	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
102 	int	start;			/* value loaded into count by tcpgo(); 0 means never start */
103 	int	count;			/* ticks left; the timer expires when this reaches 0 */
104 	void	(*func)(void*);		/* called by tcpackproc() on expiry, if not nil */
105 	void	*arg;			/* argument passed to func */
106 };
107 
108 /*
109  *  v4 and v6 pseudo headers used for
110  *  checksumming tcp
111  */
112 typedef struct Tcp4hdr Tcp4hdr;
113 struct Tcp4hdr
114 {
115 	uchar	vihl;		/* Version and header length */
116 	uchar	tos;		/* Type of service */
117 	uchar	length[2];	/* packet length */
118 	uchar	id[2];		/* Identification */
119 	uchar	frag[2];	/* Fragment information */
	/* the checksum computed in htontcp4 starts here (offset TCP4_IPLEN) */
120 	uchar	Unused;
121 	uchar	proto;
122 	uchar	tcplen[2];
123 	uchar	tcpsrc[4];
124 	uchar	tcpdst[4];
125 	uchar	tcpsport[2];
126 	uchar	tcpdport[2];
127 	uchar	tcpseq[4];
128 	uchar	tcpack[4];
129 	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
130 	uchar	tcpwin[2];
131 	uchar	tcpcksum[2];
132 	uchar	tcpurg[2];
133 	/* Options segment */
134 	uchar	tcpopt[1];
135 };
136 
/*
 *  v6 header followed by the TCP header.  htontcp6() temporarily
 *  overwrites vcf, ploadlen, proto and ttl with pseudo header values
 *  while it computes the checksum and then restores them.
 */
137 typedef struct Tcp6hdr Tcp6hdr;
138 struct Tcp6hdr
139 {
140 	uchar	vcf[4];
141 	uchar	ploadlen[2];
142 	uchar	proto;
143 	uchar	ttl;
144 	uchar	tcpsrc[IPaddrlen];
145 	uchar	tcpdst[IPaddrlen];
146 	uchar	tcpsport[2];
147 	uchar	tcpdport[2];
148 	uchar	tcpseq[4];
149 	uchar	tcpack[4];
150 	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
151 	uchar	tcpwin[2];
152 	uchar	tcpcksum[2];
153 	uchar	tcpurg[2];
154 	/* Options segment */
155 	uchar	tcpopt[1];
156 };
157 
158 /*
159  *  This represents the control info
160  *  for a single packet.  It is filled in from
161  *  a packet by ntohtcp{4,6}() and written into
162  *  a packet by htontcp{4,6}().
163  */
164 typedef struct Tcp Tcp;
165 struct	Tcp
166 {
167 	ushort	source;	/* source port */
168 	ushort	dest;	/* destination port */
169 	ulong	seq;
170 	ulong	ack;
171 	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
172 	ushort	ws;	/* window scale option (if not zero) */
173 	ulong	wnd;
174 	ushort	urg;
175 	ushort	mss;	/* max segment size option (if not zero) */
176 	ushort	len;	/* size of data */
177 };
178 
179 /*
180  *  This header is malloc'd to thread together fragments
181  *  waiting to be coalesced.  The list hangs off Tcpctl.reseq;
 *  localclose() frees every entry and its Block list.
182  */
183 typedef struct Reseq Reseq;
184 struct Reseq
185 {
186 	Reseq	*next;
187 	Tcp	seg;
188 	Block	*bp;
189 	ushort	length;
190 };
191 
192 /*
193  *  Per-conversation TCP control block, reached through Conv.ptcl.
 *  The qlock in the Conv locks this structure.
194  */
195 typedef struct Tcpctl Tcpctl;
196 struct Tcpctl
197 {
198 	uchar	state;			/* Connection state */
199 	uchar	type;			/* Listening or active connection */
200 	uchar	code;			/* Icmp code */
201 	struct {
202 		ulong	una;		/* Unacked data pointer */
203 		ulong	nxt;		/* Next sequence expected */
204 		ulong	ptr;		/* Data pointer */
205 		ulong	wnd;		/* Tcp send window */
206 		ulong	urg;		/* Urgent data pointer */
207 		ulong	wl2;
208 		int	scale;		/* how much to right shift window in xmitted packets */
209 		/* to implement tahoe and reno TCP */
210 		ulong	dupacks;	/* number of duplicate acks rcvd */
211 		int	recovery;	/* loss recovery flag */
212 		ulong	rxt;		/* right window marker for recovery */
213 	} snd;
214 	struct {
215 		ulong	nxt;		/* Receive pointer to next uchar slot */
216 		ulong	wnd;		/* Receive window incoming */
217 		ulong	urg;		/* Urgent pointer */
218 		int	blocked;	/* set by tcprcvwin() when the window closes to 0 */
219 		int	una;		/* unacked data segs */
220 		int	scale;		/* how much to left shift window in rcved packets */
221 	} rcv;
222 	ulong	iss;			/* Initial sequence number */
223 	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
224 	ulong	cwind;			/* Congestion window */
225 	int	scale;			/* desired snd.scale */
226 	ushort	ssthresh;		/* Slow start threshold */
227 	int	resent;			/* Bytes just resent */
228 	int	irs;			/* Initial received sequence */
229 	ushort	mss;			/* Maximum segment size */
230 	int	rerecv;			/* Overlap of data re-received */
231 	ulong	window;			/* Receive window */
232 	uchar	backoff;		/* Exponential backoff counter */
233 	int	backedoff;		/* ms we've backed off for rexmits */
234 	uchar	flags;			/* State flags */
235 	Reseq	*reseq;			/* Resequencing queue */
236 	Tcptimer	timer;			/* Activity timer */
237 	Tcptimer	acktimer;		/* Acknowledge timer */
238 	Tcptimer	rtt_timer;		/* Round trip timer */
239 	Tcptimer	katimer;		/* keep alive timer */
240 	ulong	rttseq;			/* Round trip sequence */
241 	int	srtt;			/* Smoothed round trip, stored shifted left by LOGAGAIN */
242 	int	mdev;			/* Mean deviation of round trip */
243 	int	kacounter;		/* count down for keep alive */
244 	uint	sndsyntime;		/* time syn sent */
245 	ulong	time;			/* time Finwait2 or Syn_received was sent */
246 	int	nochecksum;		/* non-zero means don't send checksums */
247 	int	flgcnt;			/* number of flags in the sequence (SYN,FIN) */
248 
249 	union {
250 		Tcp4hdr	tcp4hdr;
251 		Tcp6hdr	tcp6hdr;
252 	} protohdr;		/* prototype header */
253 };
254 
255 /*
256  *  New calls are put in limbo rather than having a conversation structure
257  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
258  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
259  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260  *
261  *  In particular they aren't on a listener's queue so that they don't figure
262  *  in the input queue limit.
263  *
264  *  If 1/2 of a T3 was attacking with SYN packets, we'd have a permanent queue
265  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
266  *  there is no hashing of this list.
267  */
268 typedef struct Limbo Limbo;
269 struct Limbo
270 {
271 	Limbo	*next;
272 
273 	uchar	laddr[IPaddrlen];
274 	uchar	raddr[IPaddrlen];
275 	ushort	lport;
276 	ushort	rport;
277 	ulong	irs;		/* initial received sequence */
278 	ulong	iss;		/* initial sent sequence */
279 	ushort	mss;		/* mss from the other end */
280 	ushort	rcvscale;	/* how much to scale rcvd windows */
281 	ushort	sndscale;	/* how much to scale sent windows */
282 	ulong	lastsend;	/* last time we sent a synack */
283 	uchar	version;	/* v4 or v6 */
284 	uchar	rexmits;	/* number of retransmissions */
285 };
286 
/* tunable defaults; inittcpctl() derives srtt and timer.start from tcp_irtt */
287 int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time (ms) */
288 ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
289 
/* indices into Tcppriv.stats[]; statnames[] holds the matching names */
290 enum {
291 	/* MIB stats */
292 	MaxConn,
293 	ActiveOpens,
294 	PassiveOpens,
295 	EstabResets,
296 	CurrEstab,
297 	InSegs,
298 	OutSegs,
299 	RetransSegs,
300 	RetransTimeouts,
301 	InErrs,
302 	OutRsts,
303 
304 	/* non-MIB stats */
305 	CsumErrs,
306 	HlenErrs,
307 	LenErrs,
308 	OutOfOrder,
309 
310 	Nstats		/* number of counters, not a counter itself */
311 };
312 
/* name of each statistic, indexed by the stats enumeration above */
313 static char *statnames[] =
314 {
315 [MaxConn]	"MaxConn",
316 [ActiveOpens]	"ActiveOpens",
317 [PassiveOpens]	"PassiveOpens",
318 [EstabResets]	"EstabResets",
319 [CurrEstab]	"CurrEstab",
320 [InSegs]	"InSegs",
321 [OutSegs]	"OutSegs",
322 [RetransSegs]	"RetransSegs",
323 [RetransTimeouts]	"RetransTimeouts",
324 [InErrs]	"InErrs",
325 [OutRsts]	"OutRsts",
326 [CsumErrs]	"CsumErrs",
327 [HlenErrs]	"HlenErrs",
328 [LenErrs]	"LenErrs",
329 [OutOfOrder]	"OutOfOrder",
330 };
331 
/* private state of the tcp protocol instance, reached through Proto.priv */
332 typedef struct Tcppriv Tcppriv;
333 struct Tcppriv
334 {
335 	/* List of active timers */
336 	QLock 	tl;		/* held while the timer list is walked or changed */
337 	Tcptimer *timers;
338 
339 	/* hash table for matching conversations */
340 	Ipht	ht;
341 
342 	/* calls in limbo waiting for an ACK to our SYN ACK */
343 	int	nlimbo;
344 	Limbo	*lht[NLHT];
345 
346 	/* for keeping track of tcpackproc */
347 	QLock	apl;		/* held by tcpstart() while it starts tcpackproc */
348 	int	ackprocstarted;
349 
350 	ulong	stats[Nstats];
351 };
352 
353 /*
354  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
355  *  solution to hijacked systems staking out ports as a form
356  *  of DoS attack.
357  *
358  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
359  *  that number gets acked by the other end, we shut down the connection.
360  *  Look for tcpporthogdefense in the code.
361  */
362 int tcpporthogdefense = 0;
363 
/* forward declarations */
364 int	addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
365 void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
366 void	localclose(Conv*, char*);
367 void	procsyn(Conv*, Tcp*);
368 void	tcpiput(Proto*, Ipifc*, Block*);
369 void	tcpoutput(Conv*);
370 int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
371 void	tcpstart(Conv*, int);
372 void	tcptimeout(void*);
373 void	tcpsndsyn(Conv*, Tcpctl*);
374 void	tcprcvwin(Conv*);
375 void	tcpacktimer(void*);
376 void	tcpkeepalive(void*);
377 void	tcpsetkacounter(Tcpctl*);
378 void	tcprxmit(Conv*);
379 void	tcpsettimer(Tcpctl*);
380 void	tcpsynackrtt(Conv*);
381 void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
382 
/* limbo handling, private to this file */
383 static void limborexmit(Proto*);
384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
385 
386 void
387 tcpsetstate(Conv *s, uchar newstate)
388 {
389 	Tcpctl *tcb;
390 	uchar oldstate;
391 	Tcppriv *tpriv;
392 
393 	tpriv = s->p->priv;
394 
395 	tcb = (Tcpctl*)s->ptcl;
396 
397 	oldstate = tcb->state;
398 	if(oldstate == newstate)
399 		return;
400 
401 	if(oldstate == Established)
402 		tpriv->stats[CurrEstab]--;
403 	if(newstate == Established)
404 		tpriv->stats[CurrEstab]++;
405 
406 	/**
407 	print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
408 		tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
409 	**/
410 
411 	switch(newstate) {
412 	case Closed:
413 		qclose(s->rq);
414 		qclose(s->wq);
415 		qclose(s->eq);
416 		break;
417 
418 	case Close_wait:		/* Remote closes */
419 		qhangup(s->rq, nil);
420 		break;
421 	}
422 
423 	tcb->state = newstate;
424 
425 	if(oldstate == Syn_sent && newstate != Closed)
426 		Fsconnected(s, nil);
427 }
428 
429 static char*
430 tcpconnect(Conv *c, char **argv, int argc)
431 {
432 	char *e;
433 
434 	e = Fsstdconnect(c, argv, argc);
435 	if(e != nil)
436 		return e;
437 	tcpstart(c, TCP_CONNECT);
438 
439 	return nil;
440 }
441 
442 static int
443 tcpstate(Conv *c, char *state, int n)
444 {
445 	Tcpctl *s;
446 
447 	s = (Tcpctl*)(c->ptcl);
448 
449 	return snprint(state, n,
450 		"%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
451 		tcpstates[s->state],
452 		c->rq ? qlen(c->rq) : 0,
453 		c->wq ? qlen(c->wq) : 0,
454 		s->srtt, s->mdev,
455 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
456 		s->timer.start, s->timer.count, s->rerecv,
457 		s->katimer.start, s->katimer.count);
458 }
459 
460 static int
461 tcpinuse(Conv *c)
462 {
463 	Tcpctl *s;
464 
465 	s = (Tcpctl*)(c->ptcl);
466 	return s->state != Closed;
467 }
468 
469 static char*
470 tcpannounce(Conv *c, char **argv, int argc)
471 {
472 	char *e;
473 
474 	e = Fsstdannounce(c, argv, argc);
475 	if(e != nil)
476 		return e;
477 	tcpstart(c, TCP_LISTEN);
478 	Fsconnected(c, nil);
479 
480 	return nil;
481 }
482 
483 /*
484  *  tcpclose is always called with the q locked
485  */
486 static void
487 tcpclose(Conv *c)
488 {
489 	Tcpctl *tcb;
490 
491 	tcb = (Tcpctl*)c->ptcl;
492 
493 	qhangup(c->rq, nil);
494 	qhangup(c->wq, nil);
495 	qhangup(c->eq, nil);
496 	qflush(c->rq);
497 
498 	switch(tcb->state) {
499 	case Listen:
500 		/*
501 		 *  reset any incoming calls to this listener
502 		 */
503 		Fsconnected(c, "Hangup");
504 
505 		localclose(c, nil);
506 		break;
507 	case Closed:
508 	case Syn_sent:
509 		localclose(c, nil);
510 		break;
511 	case Syn_received:
512 	case Established:
513 		tcb->flgcnt++;
514 		tcb->snd.nxt++;
515 		tcpsetstate(c, Finwait1);
516 		tcpoutput(c);
517 		break;
518 	case Close_wait:
519 		tcb->flgcnt++;
520 		tcb->snd.nxt++;
521 		tcpsetstate(c, Last_ack);
522 		tcpoutput(c);
523 		break;
524 	}
525 }
526 
527 void
528 tcpkick(void *x)
529 {
530 	Conv *s = x;
531 	Tcpctl *tcb;
532 
533 	tcb = (Tcpctl*)s->ptcl;
534 
535 	if(waserror()){
536 		qunlock(s);
537 		nexterror();
538 	}
539 	qlock(s);
540 
541 	switch(tcb->state) {
542 	case Syn_sent:
543 	case Syn_received:
544 	case Established:
545 	case Close_wait:
546 		/*
547 		 * Push data
548 		 */
549 		tcprcvwin(s);
550 		tcpoutput(s);
551 		break;
552 	default:
553 		localclose(s, "Hangup");
554 		break;
555 	}
556 
557 	qunlock(s);
558 	poperror();
559 }
560 
561 void
562 tcprcvwin(Conv *s)				/* Call with tcb locked */
563 {
564 	int w;
565 	Tcpctl *tcb;
566 
567 	tcb = (Tcpctl*)s->ptcl;
568 	w = tcb->window - qlen(s->rq);
569 	if(w < 0)
570 		w = 0;
571 	tcb->rcv.wnd = w;
572 	if(w == 0)
573 		tcb->rcv.blocked = 1;
574 }
575 
576 void
577 tcpacktimer(void *v)
578 {
579 	Tcpctl *tcb;
580 	Conv *s;
581 
582 	s = v;
583 	tcb = (Tcpctl*)s->ptcl;
584 
585 	if(waserror()){
586 		qunlock(s);
587 		nexterror();
588 	}
589 	qlock(s);
590 	if(tcb->state != Closed){
591 		tcb->flags |= FORCE;
592 		tcprcvwin(s);
593 		tcpoutput(s);
594 	}
595 	qunlock(s);
596 	poperror();
597 }
598 
599 static void
600 tcpcreate(Conv *c)
601 {
602 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
603 	c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
604 }
605 
606 static void
607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
608 {
609 	if(newstate != TcptimerON){
610 		if(t->state == TcptimerON){
611 			// unchain
612 			if(priv->timers == t){
613 				priv->timers = t->next;
614 				if(t->prev != nil)
615 					panic("timerstate1");
616 			}
617 			if(t->next)
618 				t->next->prev = t->prev;
619 			if(t->prev)
620 				t->prev->next = t->next;
621 			t->next = t->prev = nil;
622 		}
623 	} else {
624 		if(t->state != TcptimerON){
625 			// chain
626 			if(t->prev != nil || t->next != nil)
627 				panic("timerstate2");
628 			t->prev = nil;
629 			t->next = priv->timers;
630 			if(t->next)
631 				t->next->prev = t;
632 			priv->timers = t;
633 		}
634 	}
635 	t->state = newstate;
636 }
637 
638 void
639 tcpackproc(void *a)
640 {
641 	Tcptimer *t, *tp, *timeo;
642 	Proto *tcp;
643 	Tcppriv *priv;
644 	int loop;
645 
646 	tcp = a;
647 	priv = tcp->priv;
648 
649 	for(;;) {
650 		tsleep(&up->sleep, return0, 0, MSPTICK);
651 
652 		qlock(&priv->tl);
653 		timeo = nil;
654 		loop = 0;
655 		for(t = priv->timers; t != nil; t = tp) {
656 			if(loop++ > 10000)
657 				panic("tcpackproc1");
658 			tp = t->next;
659  			if(t->state == TcptimerON) {
660 				t->count--;
661 				if(t->count == 0) {
662 					timerstate(priv, t, TcptimerDONE);
663 					t->readynext = timeo;
664 					timeo = t;
665 				}
666 			}
667 		}
668 		qunlock(&priv->tl);
669 
670 		loop = 0;
671 		for(t = timeo; t != nil; t = t->readynext) {
672 			if(loop++ > 10000)
673 				panic("tcpackproc2");
674 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
675 				(*t->func)(t->arg);
676 				poperror();
677 			}
678 		}
679 
680 		limborexmit(tcp);
681 	}
682 }
683 
684 void
685 tcpgo(Tcppriv *priv, Tcptimer *t)
686 {
687 	if(t == nil || t->start == 0)
688 		return;
689 
690 	qlock(&priv->tl);
691 	t->count = t->start;
692 	timerstate(priv, t, TcptimerON);
693 	qunlock(&priv->tl);
694 }
695 
696 void
697 tcphalt(Tcppriv *priv, Tcptimer *t)
698 {
699 	if(t == nil)
700 		return;
701 
702 	qlock(&priv->tl);
703 	timerstate(priv, t, TcptimerOFF);
704 	qunlock(&priv->tl);
705 }
706 
/*
 *  Exponential backoff multiplier: 2 to the power n.
 *
 *  The shift count is clamped to 0..30.  Shifting an int by a negative
 *  count, or by enough to reach or pass the sign bit, is undefined in C,
 *  so out of range arguments now give 1 or 1<<30 respectively instead.
 *  Results for n in 0..30 are unchanged.  (Assumes int is at least
 *  32 bits wide.)
 */
int
backoff(int n)
{
	if(n < 0)
		n = 0;
	else if(n > 30)
		n = 30;
	return 1 << n;
}
712 
713 void
714 localclose(Conv *s, char *reason)	/* called with tcb locked */
715 {
716 	Tcpctl *tcb;
717 	Reseq *rp,*rp1;
718 	Tcppriv *tpriv;
719 
720 	tpriv = s->p->priv;
721 	tcb = (Tcpctl*)s->ptcl;
722 
723 	iphtrem(&tpriv->ht, s);
724 
725 	tcphalt(tpriv, &tcb->timer);
726 	tcphalt(tpriv, &tcb->rtt_timer);
727 	tcphalt(tpriv, &tcb->acktimer);
728 	tcphalt(tpriv, &tcb->katimer);
729 
730 	/* Flush reassembly queue; nothing more can arrive */
731 	for(rp = tcb->reseq; rp != nil; rp = rp1) {
732 		rp1 = rp->next;
733 		freeblist(rp->bp);
734 		free(rp);
735 	}
736 	tcb->reseq = nil;
737 
738 	if(tcb->state == Syn_sent)
739 		Fsconnected(s, reason);
740 	if(s->state == Announced)
741 		wakeup(&s->listenr);
742 
743 	qhangup(s->rq, reason);
744 	qhangup(s->wq, reason);
745 
746 	tcpsetstate(s, Closed);
747 }
748 
749 /* mtu (- TCP + IP hdr len) of 1st hop */
750 int
751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
752 {
753 	Ipifc *ifc;
754 	int mtu;
755 
756 	ifc = findipifc(tcp->f, addr, 0);
757 	switch(version){
758 	default:
759 	case V4:
760 		mtu = DEF_MSS;
761 		if(ifc != nil)
762 			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
763 		break;
764 	case V6:
765 		mtu = DEF_MSS6;
766 		if(ifc != nil)
767 			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
768 		break;
769 	}
770 	if(ifc != nil){
771 		if(ifc->mbps > 100)
772 			*scale = HaveWS | 3;
773 		else if(ifc->mbps > 10)
774 			*scale = HaveWS | 1;
775 		else
776 			*scale = HaveWS | 0;
777 	} else
778 		*scale = HaveWS | 0;
779 
780 	return mtu;
781 }
782 
783 void
784 inittcpctl(Conv *s, int mode)
785 {
786 	Tcpctl *tcb;
787 	Tcp4hdr* h4;
788 	Tcp6hdr* h6;
789 	int mss;
790 
791 	tcb = (Tcpctl*)s->ptcl;
792 
793 	memset(tcb, 0, sizeof(Tcpctl));
794 
795 	tcb->ssthresh = 65535;
796 	tcb->srtt = tcp_irtt<<LOGAGAIN;
797 	tcb->mdev = 0;
798 
799 	/* setup timers */
800 	tcb->timer.start = tcp_irtt / MSPTICK;
801 	tcb->timer.func = tcptimeout;
802 	tcb->timer.arg = s;
803 	tcb->rtt_timer.start = MAX_TIME;
804 	tcb->acktimer.start = TCP_ACK / MSPTICK;
805 	tcb->acktimer.func = tcpacktimer;
806 	tcb->acktimer.arg = s;
807 	tcb->katimer.start = DEF_KAT / MSPTICK;
808 	tcb->katimer.func = tcpkeepalive;
809 	tcb->katimer.arg = s;
810 
811 	mss = DEF_MSS;
812 
813 	/* create a prototype(pseudo) header */
814 	if(mode != TCP_LISTEN){
815 		if(ipcmp(s->laddr, IPnoaddr) == 0)
816 			findlocalip(s->p->f, s->laddr, s->raddr);
817 
818 		switch(s->ipversion){
819 		case V4:
820 			h4 = &tcb->protohdr.tcp4hdr;
821 			memset(h4, 0, sizeof(*h4));
822 			h4->proto = IP_TCPPROTO;
823 			hnputs(h4->tcpsport, s->lport);
824 			hnputs(h4->tcpdport, s->rport);
825 			v6tov4(h4->tcpsrc, s->laddr);
826 			v6tov4(h4->tcpdst, s->raddr);
827 			break;
828 		case V6:
829 			h6 = &tcb->protohdr.tcp6hdr;
830 			memset(h6, 0, sizeof(*h6));
831 			h6->proto = IP_TCPPROTO;
832 			hnputs(h6->tcpsport, s->lport);
833 			hnputs(h6->tcpdport, s->rport);
834 			ipmove(h6->tcpsrc, s->laddr);
835 			ipmove(h6->tcpdst, s->raddr);
836 			mss = DEF_MSS6;
837 			break;
838 		default:
839 			panic("inittcpctl: version %d", s->ipversion);
840 		}
841 	}
842 
843 	tcb->mss = tcb->cwind = mss;
844 
845 	/* default is no window scaling */
846 	tcb->window = QMAX;
847 	tcb->rcv.wnd = QMAX;
848 	tcb->rcv.scale = 0;
849 	tcb->snd.scale = 0;
850 	qsetlimit(s->rq, QMAX);
851 }
852 
853 /*
854  *  called with s qlocked
855  */
856 void
857 tcpstart(Conv *s, int mode)
858 {
859 	Tcpctl *tcb;
860 	Tcppriv *tpriv;
861 	char kpname[KNAMELEN];
862 
863 	tpriv = s->p->priv;
864 
865 	if(tpriv->ackprocstarted == 0){
866 		qlock(&tpriv->apl);
867 		if(tpriv->ackprocstarted == 0){
868 			sprint(kpname, "#I%dtcpack", s->p->f->dev);
869 			kproc(kpname, tcpackproc, s->p, 0);
870 			tpriv->ackprocstarted = 1;
871 		}
872 		qunlock(&tpriv->apl);
873 	}
874 
875 	tcb = (Tcpctl*)s->ptcl;
876 
877 	inittcpctl(s, mode);
878 
879 	iphtadd(&tpriv->ht, s);
880 	switch(mode) {
881 	case TCP_LISTEN:
882 		tpriv->stats[PassiveOpens]++;
883 		tcb->flags |= CLONE;
884 		tcpsetstate(s, Listen);
885 		break;
886 
887 	case TCP_CONNECT:
888 		tpriv->stats[ActiveOpens]++;
889 		tcb->flags |= ACTIVE;
890 		tcpsndsyn(s, tcb);
891 		tcpsetstate(s, Syn_sent);
892 		tcpoutput(s);
893 		break;
894 	}
895 }
896 
897 static char*
898 tcpflag(ushort flag)
899 {
900 	static char buf[128];
901 
902 	sprint(buf, "%d", flag>>10);	/* Head len */
903 	if(flag & URG)
904 		strcat(buf, " URG");
905 	if(flag & ACK)
906 		strcat(buf, " ACK");
907 	if(flag & PSH)
908 		strcat(buf, " PSH");
909 	if(flag & RST)
910 		strcat(buf, " RST");
911 	if(flag & SYN)
912 		strcat(buf, " SYN");
913 	if(flag & FIN)
914 		strcat(buf, " FIN");
915 
916 	return buf;
917 }
918 
919 Block *
920 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
921 {
922 	int dlen;
923 	Tcp6hdr *h;
924 	ushort csum;
925 	ushort hdrlen, optpad = 0;
926 	uchar *opt;
927 
928 	hdrlen = TCP6_HDRSIZE;
929 	if(tcph->flags & SYN){
930 		if(tcph->mss)
931 			hdrlen += MSS_LENGTH;
932 		if(tcph->ws)
933 			hdrlen += WS_LENGTH;
934 		optpad = hdrlen & 3;
935 		if(optpad)
936 			optpad = 4 - optpad;
937 		hdrlen += optpad;
938 	}
939 
940 	if(data) {
941 		dlen = blocklen(data);
942 		data = padblock(data, hdrlen + TCP6_PKT);
943 		if(data == nil)
944 			return nil;
945 	}
946 	else {
947 		dlen = 0;
948 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
949 		if(data == nil)
950 			return nil;
951 		data->wp += hdrlen + TCP6_PKT;
952 	}
953 
954 	/* copy in pseudo ip header plus port numbers */
955 	h = (Tcp6hdr *)(data->rp);
956 	memmove(h, ph, TCP6_TCBPHDRSZ);
957 
958 	/* compose pseudo tcp header, do cksum calculation */
959 	hnputl(h->vcf, hdrlen + dlen);
960 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
961 	h->ttl = ph->proto;
962 
963 	/* copy in variable bits */
964 	hnputl(h->tcpseq, tcph->seq);
965 	hnputl(h->tcpack, tcph->ack);
966 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
967 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
968 	hnputs(h->tcpurg, tcph->urg);
969 
970 	if(tcph->flags & SYN){
971 		opt = h->tcpopt;
972 		if(tcph->mss != 0){
973 			*opt++ = MSSOPT;
974 			*opt++ = MSS_LENGTH;
975 			hnputs(opt, tcph->mss);
976 			opt += 2;
977 		}
978 		if(tcph->ws != 0){
979 			*opt++ = WSOPT;
980 			*opt++ = WS_LENGTH;
981 			*opt++ = tcph->ws;
982 		}
983 		while(optpad-- > 0)
984 			*opt++ = NOOPOPT;
985 	}
986 
987 	if(tcb != nil && tcb->nochecksum){
988 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
989 	} else {
990 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
991 		hnputs(h->tcpcksum, csum);
992 	}
993 
994 	/* move from pseudo header back to normal ip header */
995 	memset(h->vcf, 0, 4);
996 	h->vcf[0] = IP_VER6;
997 	hnputs(h->ploadlen, hdrlen+dlen);
998 	h->proto = ph->proto;
999 
1000 	return data;
1001 }
1002 
1003 Block *
1004 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1005 {
1006 	int dlen;
1007 	Tcp4hdr *h;
1008 	ushort csum;
1009 	ushort hdrlen, optpad = 0;
1010 	uchar *opt;
1011 
1012 	hdrlen = TCP4_HDRSIZE;
1013 	if(tcph->flags & SYN){
1014 		if(tcph->mss)
1015 			hdrlen += MSS_LENGTH;
1016 		if(tcph->ws)
1017 			hdrlen += WS_LENGTH;
1018 		optpad = hdrlen & 3;
1019 		if(optpad)
1020 			optpad = 4 - optpad;
1021 		hdrlen += optpad;
1022 	}
1023 
1024 	if(data) {
1025 		dlen = blocklen(data);
1026 		data = padblock(data, hdrlen + TCP4_PKT);
1027 		if(data == nil)
1028 			return nil;
1029 	}
1030 	else {
1031 		dlen = 0;
1032 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1033 		if(data == nil)
1034 			return nil;
1035 		data->wp += hdrlen + TCP4_PKT;
1036 	}
1037 
1038 	/* copy in pseudo ip header plus port numbers */
1039 	h = (Tcp4hdr *)(data->rp);
1040 	memmove(h, ph, TCP4_TCBPHDRSZ);
1041 
1042 	/* copy in variable bits */
1043 	hnputs(h->tcplen, hdrlen + dlen);
1044 	hnputl(h->tcpseq, tcph->seq);
1045 	hnputl(h->tcpack, tcph->ack);
1046 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1047 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1048 	hnputs(h->tcpurg, tcph->urg);
1049 
1050 	if(tcph->flags & SYN){
1051 		opt = h->tcpopt;
1052 		if(tcph->mss != 0){
1053 			*opt++ = MSSOPT;
1054 			*opt++ = MSS_LENGTH;
1055 			hnputs(opt, tcph->mss);
1056 			opt += 2;
1057 		}
1058 		if(tcph->ws != 0){
1059 			*opt++ = WSOPT;
1060 			*opt++ = WS_LENGTH;
1061 			*opt++ = tcph->ws;
1062 		}
1063 		while(optpad-- > 0)
1064 			*opt++ = NOOPOPT;
1065 	}
1066 
1067 	if(tcb != nil && tcb->nochecksum){
1068 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1069 	} else {
1070 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1071 		hnputs(h->tcpcksum, csum);
1072 	}
1073 
1074 	return data;
1075 }
1076 
1077 int
1078 ntohtcp6(Tcp *tcph, Block **bpp)
1079 {
1080 	Tcp6hdr *h;
1081 	uchar *optr;
1082 	ushort hdrlen;
1083 	ushort optlen;
1084 	int n;
1085 
1086 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1087 	if(*bpp == nil)
1088 		return -1;
1089 
1090 	h = (Tcp6hdr *)((*bpp)->rp);
1091 	tcph->source = nhgets(h->tcpsport);
1092 	tcph->dest = nhgets(h->tcpdport);
1093 	tcph->seq = nhgetl(h->tcpseq);
1094 	tcph->ack = nhgetl(h->tcpack);
1095 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1096 	if(hdrlen < TCP6_HDRSIZE) {
1097 		freeblist(*bpp);
1098 		return -1;
1099 	}
1100 
1101 	tcph->flags = h->tcpflag[1];
1102 	tcph->wnd = nhgets(h->tcpwin);
1103 	tcph->urg = nhgets(h->tcpurg);
1104 	tcph->mss = 0;
1105 	tcph->ws = 0;
1106 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1107 
1108 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1109 	if(*bpp == nil)
1110 		return -1;
1111 
1112 	optr = h->tcpopt;
1113 	n = hdrlen - TCP6_HDRSIZE;
1114 	while(n > 0 && *optr != EOLOPT) {
1115 		if(*optr == NOOPOPT) {
1116 			n--;
1117 			optr++;
1118 			continue;
1119 		}
1120 		optlen = optr[1];
1121 		if(optlen < 2 || optlen > n)
1122 			break;
1123 		switch(*optr) {
1124 		case MSSOPT:
1125 			if(optlen == MSS_LENGTH)
1126 				tcph->mss = nhgets(optr+2);
1127 			break;
1128 		case WSOPT:
1129 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1130 				tcph->ws = HaveWS | *(optr+2);
1131 			break;
1132 		}
1133 		n -= optlen;
1134 		optr += optlen;
1135 	}
1136 	return hdrlen;
1137 }
1138 
1139 int
1140 ntohtcp4(Tcp *tcph, Block **bpp)
1141 {
1142 	Tcp4hdr *h;
1143 	uchar *optr;
1144 	ushort hdrlen;
1145 	ushort optlen;
1146 	int n;
1147 
1148 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1149 	if(*bpp == nil)
1150 		return -1;
1151 
1152 	h = (Tcp4hdr *)((*bpp)->rp);
1153 	tcph->source = nhgets(h->tcpsport);
1154 	tcph->dest = nhgets(h->tcpdport);
1155 	tcph->seq = nhgetl(h->tcpseq);
1156 	tcph->ack = nhgetl(h->tcpack);
1157 
1158 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1159 	if(hdrlen < TCP4_HDRSIZE) {
1160 		freeblist(*bpp);
1161 		return -1;
1162 	}
1163 
1164 	tcph->flags = h->tcpflag[1];
1165 	tcph->wnd = nhgets(h->tcpwin);
1166 	tcph->urg = nhgets(h->tcpurg);
1167 	tcph->mss = 0;
1168 	tcph->ws = 0;
1169 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1170 
1171 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1172 	if(*bpp == nil)
1173 		return -1;
1174 
1175 	optr = h->tcpopt;
1176 	n = hdrlen - TCP4_HDRSIZE;
1177 	while(n > 0 && *optr != EOLOPT) {
1178 		if(*optr == NOOPOPT) {
1179 			n--;
1180 			optr++;
1181 			continue;
1182 		}
1183 		optlen = optr[1];
1184 		if(optlen < 2 || optlen > n)
1185 			break;
1186 		switch(*optr) {
1187 		case MSSOPT:
1188 			if(optlen == MSS_LENGTH)
1189 				tcph->mss = nhgets(optr+2);
1190 			break;
1191 		case WSOPT:
1192 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1193 				tcph->ws = HaveWS | *(optr+2);
1194 			break;
1195 		}
1196 		n -= optlen;
1197 		optr += optlen;
1198 	}
1199 	return hdrlen;
1200 }
1201 
1202 /*
1203  *  For outgiing calls, generate an initial sequence
1204  *  number and put a SYN on the send queue
1205  */
1206 void
1207 tcpsndsyn(Conv *s, Tcpctl *tcb)
1208 {
1209 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1210 	tcb->rttseq = tcb->iss;
1211 	tcb->snd.wl2 = tcb->iss;
1212 	tcb->snd.una = tcb->iss;
1213 	tcb->snd.ptr = tcb->rttseq;
1214 	tcb->snd.nxt = tcb->rttseq;
1215 	tcb->flgcnt++;
1216 	tcb->flags |= FORCE;
1217 	tcb->sndsyntime = NOW;
1218 
1219 	/* set desired mss and scale */
1220 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1221 }
1222 
1223 void
1224 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1225 {
1226 	Block *hbp;
1227 	uchar rflags;
1228 	Tcppriv *tpriv;
1229 	Tcp4hdr ph4;
1230 	Tcp6hdr ph6;
1231 
1232 	netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1233 
1234 	tpriv = tcp->priv;
1235 
1236 	if(seg->flags & RST)
1237 		return;
1238 
1239 	/* make pseudo header */
1240 	switch(version) {
1241 	case V4:
1242 		memset(&ph4, 0, sizeof(ph4));
1243 		ph4.vihl = IP_VER4;
1244 		v6tov4(ph4.tcpsrc, dest);
1245 		v6tov4(ph4.tcpdst, source);
1246 		ph4.proto = IP_TCPPROTO;
1247 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1248 		hnputs(ph4.tcpsport, seg->dest);
1249 		hnputs(ph4.tcpdport, seg->source);
1250 		break;
1251 	case V6:
1252 		memset(&ph6, 0, sizeof(ph6));
1253 		ph6.vcf[0] = IP_VER6;
1254 		ipmove(ph6.tcpsrc, dest);
1255 		ipmove(ph6.tcpdst, source);
1256 		ph6.proto = IP_TCPPROTO;
1257 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1258 		hnputs(ph6.tcpsport, seg->dest);
1259 		hnputs(ph6.tcpdport, seg->source);
1260 		break;
1261 	default:
1262 		panic("sndrst: version %d", version);
1263 	}
1264 
1265 	tpriv->stats[OutRsts]++;
1266 	rflags = RST;
1267 
1268 	/* convince the other end that this reset is in band */
1269 	if(seg->flags & ACK) {
1270 		seg->seq = seg->ack;
1271 		seg->ack = 0;
1272 	}
1273 	else {
1274 		rflags |= ACK;
1275 		seg->ack = seg->seq;
1276 		seg->seq = 0;
1277 		if(seg->flags & SYN)
1278 			seg->ack++;
1279 		seg->ack += length;
1280 		if(seg->flags & FIN)
1281 			seg->ack++;
1282 	}
1283 	seg->flags = rflags;
1284 	seg->wnd = 0;
1285 	seg->urg = 0;
1286 	seg->mss = 0;
1287 	seg->ws = 0;
1288 	switch(version) {
1289 	case V4:
1290 		hbp = htontcp4(seg, nil, &ph4, nil);
1291 		if(hbp == nil)
1292 			return;
1293 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1294 		break;
1295 	case V6:
1296 		hbp = htontcp6(seg, nil, &ph6, nil);
1297 		if(hbp == nil)
1298 			return;
1299 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1300 		break;
1301 	default:
1302 		panic("sndrst2: version %d", version);
1303 	}
1304 }
1305 
1306 /*
1307  *  send a reset to the remote side and close the conversation
1308  *  called with s qlocked
 *
 *  when s has a remote address, a RST|ACK segment carrying snd.ptr and
 *  rcv.nxt is built on the conversation's protocol header and passed to
 *  ipoput4/ipoput6.  an error raised while sending is caught and dropped
 *  so that the conversation is still closed with localclose(s, nil).
 *
 *  returns nil, or commonerror() if an error is raised outside the
 *  send attempt.
1309  */
1310 char*
1311 tcphangup(Conv *s)
1312 {
1313 	Tcp seg;
1314 	Tcpctl *tcb;
1315 	Block *hbp;
1316 
1317 	tcb = (Tcpctl*)s->ptcl;
1318 	if(waserror())
1319 		return commonerror();
1320 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1321 		if(!waserror()){
1322 			memset(&seg, 0, sizeof seg);
1323 			seg.flags = RST | ACK;
1324 			seg.ack = tcb->rcv.nxt;
1325 			tcb->rcv.una = 0;	/* this segment acks; restart the count tcpiput keeps in rcv.una */
1326 			seg.seq = tcb->snd.ptr;
1327 			seg.wnd = 0;
1328 			seg.urg = 0;
1329 			seg.mss = 0;
1330 			seg.ws = 0;
1331 			switch(s->ipversion) {
1332 			case V4:
1333 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1334 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
				/* no header block: skip the send, as sndrst and sndsynack do */
				if(hbp != nil)
1335 					ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1336 				break;
1337 			case V6:
1338 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1339 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
				if(hbp != nil)
1340 					ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1341 				break;
1342 			default:
1343 				panic("tcphangup: version %d", s->ipversion);
1344 			}
1345 			poperror();
1346 		}
1347 	}
1348 	localclose(s, nil);
1349 	poperror();
1350 	return nil;
1351 }
1352 
1353 /*
1354  *  (re)send a SYN ACK
 *
 *  builds a SYN|ACK for the limbo entry lp (seq lp->iss, ack lp->irs+1,
 *  window QMAX, mss from tcpmtu) on a freshly made pseudo header and
 *  sends it with ipoput4/ipoput6.  a window scale is offered only if
 *  the other side sent one (lp->rcvscale != 0); what was offered is
 *  recorded in lp->sndscale.
 *
 *  returns 0 after sending (lp->lastsend is set to NOW), or -1 if the
 *  header block could not be built.
1355  */
1356 int
1357 sndsynack(Proto *tcp, Limbo *lp)
1358 {
1359 	Block *hbp;
1360 	Tcp4hdr ph4;
1361 	Tcp6hdr ph6;
1362 	Tcp seg;
1363 	int scale;
1364 
1365 	/* make pseudo header */
1366 	switch(lp->version) {
1367 	case V4:
1368 		memset(&ph4, 0, sizeof(ph4));
1369 		ph4.vihl = IP_VER4;
1370 		v6tov4(ph4.tcpsrc, lp->laddr);
1371 		v6tov4(ph4.tcpdst, lp->raddr);
1372 		ph4.proto = IP_TCPPROTO;
1373 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1374 		hnputs(ph4.tcpsport, lp->lport);
1375 		hnputs(ph4.tcpdport, lp->rport);
1376 		break;
1377 	case V6:
1378 		memset(&ph6, 0, sizeof(ph6));
1379 		ph6.vcf[0] = IP_VER6;
1380 		ipmove(ph6.tcpsrc, lp->laddr);
1381 		ipmove(ph6.tcpdst, lp->raddr);
1382 		ph6.proto = IP_TCPPROTO;
1383 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1384 		hnputs(ph6.tcpsport, lp->lport);
1385 		hnputs(ph6.tcpdport, lp->rport);
1386 		break;
1387 	default:
1388 		panic("sndsynack: version %d", lp->version);
1389 	}
1390 
1391 	seg.seq = lp->iss;
1392 	seg.ack = lp->irs+1;
1393 	seg.flags = SYN|ACK;
1394 	seg.urg = 0;
1395 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);	/* also fills in scale */
1396 	seg.wnd = QMAX;
1397 
1398 	/* if the other side set scale, we should too */
1399 	if(lp->rcvscale){
1400 		seg.ws = scale;
1401 		lp->sndscale = scale;
1402 	} else {
1403 		seg.ws = 0;
1404 		lp->sndscale = 0;
1405 	}
1406 
1407 	switch(lp->version) {
1408 	case V4:
1409 		hbp = htontcp4(&seg, nil, &ph4, nil);
1410 		if(hbp == nil)
1411 			return -1;
1412 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1413 		break;
1414 	case V6:
1415 		hbp = htontcp6(&seg, nil, &ph6, nil);
1416 		if(hbp == nil)
1417 			return -1;
1418 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1419 		break;
1420 	default:
1421 		panic("sndsynack2: version %d", lp->version);
1422 	}
1423 	lp->lastsend = NOW;	/* limborexmit times the next retransmit from here */
1424 	return 0;
1425 }
1426 
/* index into the limbo hash table (lht): last two bytes of IP address a plus port p, masked by LHTMASK */
1427 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1428 
1429 /*
1430  *  put a call into limbo and respond with a SYN ACK
1431  *
1432  *  called with proto locked
 *
 *  source/dest and seg describe the incoming SYN.  if the same call
 *  (ports, version and both addresses) is already in limbo only its
 *  irs is refreshed; otherwise an entry is allocated, or, once
 *  Maxlimbo entries exist, the oldest entry of the same hash bucket
 *  is recycled, and it is appended to the bucket.  the entry is
 *  dropped again if the SYN ACK cannot be sent.
1433  */
1434 static void
1435 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1436 {
1437 	Limbo *lp, **l;
1438 	Tcppriv *tpriv;
1439 	int h;
1440 
1441 	tpriv = s->p->priv;
1442 	h = hashipa(source, seg->source);
1443 
	/* on exit l is either the link to a matching entry or the nil link at the tail */
1444 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1445 		lp = *l;
1446 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1447 			continue;
1448 		if(ipcmp(lp->raddr, source) != 0)
1449 			continue;
1450 		if(ipcmp(lp->laddr, dest) != 0)
1451 			continue;
1452 
1453 		/* each new SYN restarts the retransmits */
1454 		lp->irs = seg->seq;
1455 		break;
1456 	}
1457 	lp = *l;
1458 	if(lp == nil){
1459 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
			/* table full: reuse the entry at the head of this bucket */
1460 			lp = tpriv->lht[h];
1461 			tpriv->lht[h] = lp->next;
1462 			lp->next = nil;
			/*
			 *  if the head was also the tail, l points at lp->next and
			 *  linking through it would make lp point at itself and
			 *  fall out of the table; link from the (now empty) bucket.
			 */
			if(l == &lp->next)
				l = &tpriv->lht[h];
			/* this is a new call; don't inherit the old one's rexmit count */
			lp->rexmits = 0;
1463 		} else {
1464 			lp = malloc(sizeof(*lp));
1465 			if(lp == nil)
1466 				return;
1467 			tpriv->nlimbo++;
1468 		}
1469 		*l = lp;
1470 		lp->version = version;
1471 		ipmove(lp->laddr, dest);
1472 		ipmove(lp->raddr, source);
1473 		lp->lport = seg->dest;
1474 		lp->rport = seg->source;
1475 		lp->mss = seg->mss;
1476 		lp->rcvscale = seg->ws;
1477 		lp->irs = seg->seq;
1478 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1479 	}
1480 
	/* couldn't answer: unlink the entry (l still points at its link) and free it */
1481 	if(sndsynack(s->p, lp) < 0){
1482 		*l = lp->next;
1483 		tpriv->nlimbo--;
1484 		free(lp);
1485 	}
1486 }
1487 
1488 /*
1489  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
 *
 *  walks every limbo entry once.  an entry is due when
 *  (rexmits+1)*SYNACK_RXTIMER ms have passed since its last send;
 *  a due entry is dropped once its rexmit count passes 5 or if the
 *  SYN ACK cannot be sent.  does nothing if canqlock(tcp) fails.
1490  */
1491 static void
1492 limborexmit(Proto *tcp)
1493 {
1494 	Tcppriv *tpriv;
1495 	Limbo **l, *lp;
1496 	int h;
1497 	int seen;
1498 	ulong now;
1499 
1500 	tpriv = tcp->priv;
1501 
1502 	if(!canqlock(tcp))
1503 		return;
1504 	seen = 0;
1505 	now = NOW;
1506 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
		/*
		 *  l is the link to the current entry: removing an entry
		 *  makes *l the next one, otherwise l must be stepped on.
		 */
1507 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1508 			lp = *l;
1509 			seen++;
1510 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER){
				/* not due yet; move on to the next entry */
				l = &lp->next;
1511 				continue;
			}
1512 
1513 			/* time it out once it has been due more than 5 times */
1514 			if(++(lp->rexmits) > 5){
1515 				tpriv->nlimbo--;
1516 				*l = lp->next;
1517 				free(lp);
1518 				continue;
1519 			}
1520 
1521 			/* if we're being attacked, don't bother resending SYN ACK's */
1522 			if(tpriv->nlimbo > 100){
				l = &lp->next;
1523 				continue;
			}
1524 
1525 			if(sndsynack(tcp, lp) < 0){
1526 				tpriv->nlimbo--;
1527 				*l = lp->next;
1528 				free(lp);
1529 				continue;
1530 			}
1531 
1532 			l = &lp->next;
1533 		}
1534 	}
1535 	qunlock(tcp);
1536 }
1537 
1538 /*
1539  *  lookup call in limbo.  if found, throw it out.
1540  *
1541  *  called with proto locked
 *
 *  segp is the received RST, src/dst its addresses.  the entry is
 *  matched on ports, version and both addresses, and is removed only
 *  when the RST's sequence number is lp->irs+1; a matching entry with
 *  any other sequence number is left in place.
1542  */
1543 static void
1544 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1545 {
1546 	Limbo *lp, **l;
1547 	int h;
1548 	Tcppriv *tpriv;
1549 
1550 	tpriv = s->p->priv;
1551 
1552 	/* find a call in limbo */
1553 	h = hashipa(src, segp->source);
1554 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1555 		lp = *l;
1556 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1557 			continue;
1558 		if(ipcmp(lp->laddr, dst) != 0)
1559 			continue;
1560 		if(ipcmp(lp->raddr, src) != 0)
1561 			continue;
1562 
1563 		/* RST can only follow the SYN */
1564 		if(segp->seq == lp->irs+1){
1565 			tpriv->nlimbo--;
1566 			*l = lp->next;
1567 			free(lp);
1568 		}
		/* the first matching entry ends the search whether or not it was removed */
1569 		break;
1570 	}
1571 }
1572 
1573 /*
1574  *  come here when we finally get an ACK to our SYN-ACK.
1575  *  lookup call in limbo.  if found, create a new conversation
1576  *
1577  *  called with proto locked
 *
 *  s is the listening conversation, segp the received segment and
 *  src/dst its addresses.  returns nil if the segment is not a pure
 *  ACK (SYN set or ACK clear), if no limbo entry matches, if the
 *  sequence/ack numbers are not irs+1/iss+1, or if Fsnewcall fails.
 *  otherwise the limbo entry is consumed and the new conversation is
 *  returned, its control block copied from the listener's, set to
 *  Established and added to the conversation hash table.
1578  */
1579 static Conv*
1580 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1581 {
1582 	Conv *new;
1583 	Tcpctl *tcb;
1584 	Tcppriv *tpriv;
1585 	Tcp4hdr *h4;
1586 	Tcp6hdr *h6;
1587 	Limbo *lp, **l;
1588 	int h;
1589 
1590 	/* unless it's just an ack, it can't be someone coming out of limbo */
1591 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1592 		return nil;
1593 
1594 	tpriv = s->p->priv;
1595 
1596 	/* find a call in limbo */
1597 	h = hashipa(src, segp->source);
1598 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1599 		netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d",
1600 			src, segp->source, lp->raddr, lp->rport,
1601 			dst, segp->dest, lp->laddr, lp->lport,
1602 			version, lp->version
1603  		);
1604 
1605 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1606 			continue;
1607 		if(ipcmp(lp->laddr, dst) != 0)
1608 			continue;
1609 		if(ipcmp(lp->raddr, src) != 0)
1610 			continue;
1611 
1612 		/* we're assuming no data with the initial SYN */
1613 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1614 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux",
1615 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
			/* wrong numbers: leave the entry in limbo and report no match */
1616 			lp = nil;
1617 		} else {
			/* take the entry out of limbo; we own lp from here on */
1618 			tpriv->nlimbo--;
1619 			*l = lp->next;
1620 		}
1621 		break;
1622 	}
1623 	if(lp == nil)
1624 		return nil;
1625 
1626 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1627 	if(new == nil){
		/* lp is already off the limbo list; free it or it leaks */
		free(lp);
1628 		return nil;
	}
1629 
	/* start from a copy of the listener's control block, then make it our own */
1630 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1631 	tcb = (Tcpctl*)new->ptcl;
1632 	tcb->flags &= ~CLONE;
1633 	tcb->timer.arg = new;
1634 	tcb->timer.state = TcptimerOFF;
1635 	tcb->acktimer.arg = new;
1636 	tcb->acktimer.state = TcptimerOFF;
1637 	tcb->katimer.arg = new;
1638 	tcb->katimer.state = TcptimerOFF;
1639 	tcb->rtt_timer.arg = new;
1640 	tcb->rtt_timer.state = TcptimerOFF;
1641 
	/* receive side: the peer's SYN used sequence number irs */
1642 	tcb->irs = lp->irs;
1643 	tcb->rcv.nxt = tcb->irs+1;
1644 	tcb->rcv.urg = tcb->rcv.nxt;
1645 
	/* send side: our SYN used iss and has just been acked */
1646 	tcb->iss = lp->iss;
1647 	tcb->rttseq = tcb->iss;
1648 	tcb->snd.wl2 = tcb->iss;
1649 	tcb->snd.una = tcb->iss+1;
1650 	tcb->snd.ptr = tcb->iss+1;
1651 	tcb->snd.nxt = tcb->iss+1;
1652 	tcb->flgcnt = 0;
1653 	tcb->flags |= SYNACK;
1654 
1655 	/* our sending max segment size cannot be bigger than what he asked for */
1656 	if(lp->mss != 0 && lp->mss < tcb->mss)
1657 		tcb->mss = lp->mss;
1658 
1659 	/* window scaling */
1660 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1661 
1662 	/* the congestion window always starts out as a single segment */
1663 	tcb->snd.wnd = segp->wnd;
1664 	tcb->cwind = tcb->mss;
1665 
1666 	/* set initial round trip time */
1667 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1668 	tcpsynackrtt(new);
1669 
1670 	free(lp);
1671 
1672 	/* set up proto header */
1673 	switch(version){
1674 	case V4:
1675 		h4 = &tcb->protohdr.tcp4hdr;
1676 		memset(h4, 0, sizeof(*h4));
1677 		h4->proto = IP_TCPPROTO;
1678 		hnputs(h4->tcpsport, new->lport);
1679 		hnputs(h4->tcpdport, new->rport);
1680 		v6tov4(h4->tcpsrc, dst);
1681 		v6tov4(h4->tcpdst, src);
1682 		break;
1683 	case V6:
1684 		h6 = &tcb->protohdr.tcp6hdr;
1685 		memset(h6, 0, sizeof(*h6));
1686 		h6->proto = IP_TCPPROTO;
1687 		hnputs(h6->tcpsport, new->lport);
1688 		hnputs(h6->tcpdport, new->rport);
1689 		ipmove(h6->tcpsrc, dst);
1690 		ipmove(h6->tcpdst, src);
1691 		break;
1692 	default:
1693 		panic("tcpincoming: version %d", new->ipversion);
1694 	}
1695 
1696 	tcpsetstate(new, Established);
1697 
1698 	iphtadd(&tpriv->ht, new);
1699 
1700 	return new;
1701 }
1702 
/*
 *  return 1 if x lies in the closed range [low, high], else 0.
 *  when low > high the range is taken to wrap around, so x is
 *  inside if it is at or above low or at or below high.
 */
1703 int
1704 seq_within(ulong x, ulong low, ulong high)
1705 {
1706 	if(low <= high){
1707 		if(low <= x && x <= high)
1708 			return 1;
1709 	}
1710 	else {
		/* wrapped range */
1711 		if(x >= low || x <= high)
1712 			return 1;
1713 	}
1714 	return 0;
1715 }
1716 
/*
 *  return non-zero if x comes before y.  the difference x-y is
 *  examined as a signed int rather than comparing the values directly.
 */
1717 int
1718 seq_lt(ulong x, ulong y)
1719 {
1720 	return (int)(x-y) < 0;
1721 }
1722 
/*
 *  return non-zero if x comes before y or equals it, judged by the
 *  sign of x-y taken as a signed int.
 */
1723 int
1724 seq_le(ulong x, ulong y)
1725 {
1726 	return (int)(x-y) <= 0;
1727 }
1728 
/*
 *  return non-zero if x comes after y, judged by the sign of x-y
 *  taken as a signed int.
 */
1729 int
1730 seq_gt(ulong x, ulong y)
1731 {
1732 	return (int)(x-y) > 0;
1733 }
1734 
/*
 *  return non-zero if x comes after y or equals it, judged by the
 *  sign of x-y taken as a signed int.
 */
1735 int
1736 seq_ge(ulong x, ulong y)
1737 {
1738 	return (int)(x-y) >= 0;
1739 }
1740 
1741 /*
1742  *  use the time between the first SYN and its ack as the
1743  *  initial round trip time
 *
 *  the elapsed time since tcb->sndsyntime seeds srtt (shifted left by
 *  LOGAGAIN) and mdev (shifted left by LOGDGAIN), and the round trip
 *  timer is halted.
1744  */
1745 void
1746 tcpsynackrtt(Conv *s)
1747 {
1748 	Tcpctl *tcb;
1749 	int delta;
1750 	Tcppriv *tpriv;
1751 
1752 	tcb = (Tcpctl*)s->ptcl;
1753 	tpriv = s->p->priv;
1754 
1755 	delta = NOW - tcb->sndsyntime;
1756 	tcb->srtt = delta<<LOGAGAIN;
1757 	tcb->mdev = delta<<LOGDGAIN;
1758 
1759 	/* halt round trip timer */
1760 	tcphalt(tpriv, &tcb->rtt_timer);
1761 }
1762 
/*
 *  process the acknowledgement carried by seg for conversation s:
 *  count duplicate acks (starting a fast retransmit after
 *  TCPREXMTTHRESH of them), update the send window, grow the
 *  congestion window, feed the round trip estimate, discard the
 *  acked data from s->wq and advance snd.una.
 */
1763 void
1764 update(Conv *s, Tcp *seg)
1765 {
1766 	int rtt, delta;
1767 	Tcpctl *tcb;
1768 	ulong acked;
1769 	ulong expand;
1770 	Tcppriv *tpriv;
1771 
1772 	tpriv = s->p->priv;
1773 	tcb = (Tcpctl*)s->ptcl;
1774 
1775 	/* the ack is beyond anything we have sent (snd.nxt): force output and ignore it */
1776 	if(seq_gt(seg->ack, tcb->snd.nxt)) {
1777 		tcb->flags |= FORCE;
1778 		return;
1779 	}
1780 
1781 	/* added by Dong Lin for fast retransmission */
1782 	if(seg->ack == tcb->snd.una
1783 	&& tcb->snd.una != tcb->snd.nxt
1784 	&& seg->len == 0
1785 	&& seg->wnd == tcb->snd.wnd) {
1786 
1787 		/* this is a pure ack w/o window update */
1788 		netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
1789 			tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1790 
1791 		if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1792 			/*
1793 			 *  tahoe tcp rxt the packet, half sshthresh,
1794  			 *  and set cwnd to one packet
			 *  (only the recovery marks are set here; the rest is
			 *  presumably done by tcprxmit -- confirm there)
1795 			 */
1796 			tcb->snd.recovery = 1;
1797 			tcb->snd.rxt = tcb->snd.nxt;
1798 			netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1799 			tcprxmit(s);
1800 		} else {
1801 			/* do reno tcp here. */
1802 		}
1803 	}
1804 
1805 	/*
1806 	 *  update window
	 *  (taken from a newer ack than the last one used, snd.wl2,
	 *  or from the same ack if it offers a bigger window)
1807 	 */
1808 	if(seq_gt(seg->ack, tcb->snd.wl2)
1809 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1810 		tcb->snd.wnd = seg->wnd;
1811 		tcb->snd.wl2 = seg->ack;
1812 	}
1813 
	/* nothing new is acknowledged: stop here */
1814 	if(!seq_gt(seg->ack, tcb->snd.una)){
1815 		/*
1816 		 *  don't let us hangup if sending into a closed window and
1817 		 *  we're still getting acks
1818 		 */
1819 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1820 			tcb->backedoff = MAXBACKMS/4;
1821 		}
1822 		return;
1823 	}
1824 
1825 	/*
1826 	 *  any positive ack turns off fast rxt,
1827 	 *  (should we do new-reno on partial acks?)
	 *  while recovering, only an ack at or past snd.rxt ends recovery
1828 	 */
1829 	if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1830 		tcb->snd.dupacks = 0;
1831 		tcb->snd.recovery = 0;
1832 	} else
1833 		netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
1834 
1835 	/* Compute the new send window size */
1836 	acked = seg->ack - tcb->snd.una;
1837 
1838 	/*
	 *  avoid slow start and timers for SYN acks:
	 *  the first new ack (SYNACK not yet set) is taken to cover our
	 *  SYN, so one is taken off both acked and flgcnt
	 */
1839 	if((tcb->flags & SYNACK) == 0) {
1840 		tcb->flags |= SYNACK;
1841 		acked--;
1842 		tcb->flgcnt--;
1843 		goto done;
1844 	}
1845 
1846 	/* slow start as long as we're not recovering from lost packets */
1847 	if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1848 		if(tcb->cwind < tcb->ssthresh) {
			/* below ssthresh: grow by what was acked, at most one mss */
1849 			expand = tcb->mss;
1850 			if(acked < expand)
1851 				expand = acked;
1852 		}
1853 		else
1854 			expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1855 
		/* never let cwind wrap around or exceed the offered window */
1856 		if(tcb->cwind + expand < tcb->cwind)
1857 			expand = tcb->snd.wnd - tcb->cwind;
1858 		if(tcb->cwind + expand > tcb->snd.wnd)
1859 			expand = tcb->snd.wnd - tcb->cwind;
1860 		tcb->cwind += expand;
1861 	}
1862 
1863 	/* Adjust the timers according to the round trip time */
1864 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1865 		tcphalt(tpriv, &tcb->rtt_timer);
		/* a sample taken while retransmitting is not used */
1866 		if((tcb->flags&RETRAN) == 0) {
1867 			tcb->backoff = 0;
1868 			tcb->backedoff = 0;
			/* ticks elapsed on the rtt timer, converted to ms below */
1869 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1870 			if(rtt == 0)
1871 				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
1872 			rtt *= MSPTICK;
1873 			if(tcb->srtt == 0) {
1874 				tcb->srtt = rtt << LOGAGAIN;
1875 				tcb->mdev = rtt << LOGDGAIN;
1876 			} else {
				/* srtt and mdev are kept scaled by 2^LOGAGAIN and 2^LOGDGAIN */
1877 				delta = rtt - (tcb->srtt>>LOGAGAIN);
1878 				tcb->srtt += delta;
1879 				if(tcb->srtt <= 0)
1880 					tcb->srtt = 1;
1881 
1882 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1883 				tcb->mdev += delta;
1884 				if(tcb->mdev <= 0)
1885 					tcb->mdev = 1;
1886 			}
1887 			tcpsettimer(tcb);
1888 		}
1889 	}
1890 
1891 done:
	/*
	 *  drop the acked bytes from the write queue; if the queue held
	 *  fewer than were acked, charge the difference to flgcnt
	 *  (presumably our FIN -- confirm)
	 */
1892 	if(qdiscard(s->wq, acked) < acked)
1893 		tcb->flgcnt--;
1894 
1895 	tcb->snd.una = seg->ack;
1896 	if(seq_gt(seg->ack, tcb->snd.urg))
1897 		tcb->snd.urg = seg->ack;
1898 
	/* keep the retransmit timer running only while something is unacked */
1899 	if(tcb->snd.una != tcb->snd.nxt)
1900 		tcpgo(tpriv, &tcb->timer);
1901 	else
1902 		tcphalt(tpriv, &tcb->timer);
1903 
1904 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1905 		tcb->snd.ptr = tcb->snd.una;
1906 
1907 	tcb->flags &= ~RETRAN;
1908 	tcb->backoff = 0;
1909 	tcb->backedoff = 0;
1910 }
1911 
/*
 *  process one incoming TCP segment held in bp; IPv4 or IPv6 is
 *  chosen from the version nibble of the first header byte.
 *
 *  the checksum is verified, the header is converted into seg and
 *  the block is trimmed to the length the datagram claims.  the
 *  conversation is then looked up with iphtlook(); if there is none
 *  a reset is sent.  for a listener, RSTs and new SYNs are handled
 *  through the limbo table and a completing ACK is turned into a new
 *  conversation by tcpincoming().  everything else runs the state
 *  machine below with the conversation qlocked, normally ending in
 *  tcpoutput().
 */
1912 void
1913 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1914 {
1915 	Tcp seg;
1916 	Tcp4hdr *h4;
1917 	Tcp6hdr *h6;
1918 	int hdrlen;
1919 	Tcpctl *tcb;
1920 	ushort length;
1921 	uchar source[IPaddrlen], dest[IPaddrlen];
1922 	Conv *s;
1923 	Fs *f;
1924 	Tcppriv *tpriv;
1925 	uchar version;
1926 
1927 	f = tcp->f;
1928 	tpriv = tcp->priv;
1929 
1930 	tpriv->stats[InSegs]++;
1931 
	/* both views of the same bytes; only the one matching the version is used */
1932 	h4 = (Tcp4hdr*)(bp->rp);
1933 	h6 = (Tcp6hdr*)(bp->rp);
1934 
1935 	if((h4->vihl&0xF0)==IP_VER4) {
1936 		version = V4;
1937 		length = nhgets(h4->length);
1938 		v4tov6(dest, h4->tcpdst);
1939 		v4tov6(source, h4->tcpsrc);
1940 
		/* set up the fields the checksum covers; skipped if Btcpck is set or the checksum field is zero */
1941 		h4->Unused = 0;
1942 		hnputs(h4->tcplen, length-TCP4_PKT);
1943 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1944 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1945 			tpriv->stats[CsumErrs]++;
1946 			tpriv->stats[InErrs]++;
1947 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1948 			freeblist(bp);
1949 			return;
1950 		}
1951 
1952 		hdrlen = ntohtcp4(&seg, &bp);
1953 		if(hdrlen < 0){
1954 			tpriv->stats[HlenErrs]++;
1955 			tpriv->stats[InErrs]++;
1956 			netlog(f, Logtcp, "bad tcp hdr len\n");
1957 			return;
1958 		}
1959 
1960 		/* trim the packet to the size claimed by the datagram */
1961 		length -= hdrlen+TCP4_PKT;
1962 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1963 		if(bp == nil){
1964 			tpriv->stats[LenErrs]++;
1965 			tpriv->stats[InErrs]++;
1966 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
1967 			return;
1968 		}
1969 	}
1970 	else {
1971 		int ttl = h6->ttl;
1972 		int proto = h6->proto;
1973 
1974 		version = V6;
1975 		length = nhgets(h6->ploadlen);
1976 		ipmove(dest, h6->tcpdst);
1977 		ipmove(source, h6->tcpsrc);
1978 
		/*
		 *  rearrange the header for the checksum: the length goes
		 *  into vcf and the protocol into the ttl slot; ttl, proto
		 *  and ploadlen are put back afterwards
		 */
1979 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1980 		h6->ttl = proto;
1981 		hnputl(h6->vcf, length);
1982 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1983 			ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) {
1984 			tpriv->stats[CsumErrs]++;
1985 			tpriv->stats[InErrs]++;
1986 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1987 			freeblist(bp);
1988 			return;
1989 		}
1990 		h6->ttl = ttl;
1991 		h6->proto = proto;
1992 		hnputs(h6->ploadlen, length);
1993 
1994 		hdrlen = ntohtcp6(&seg, &bp);
1995 		if(hdrlen < 0){
1996 			tpriv->stats[HlenErrs]++;
1997 			tpriv->stats[InErrs]++;
1998 			netlog(f, Logtcp, "bad tcp hdr len\n");
1999 			return;
2000 		}
2001 
2002 		/* trim the packet to the size claimed by the datagram */
2003 		length -= hdrlen;
2004 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2005 		if(bp == nil){
2006 			tpriv->stats[LenErrs]++;
2007 			tpriv->stats[InErrs]++;
2008 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2009 			return;
2010 		}
2011 	}
2012 
2013 	/* lock protocol while searching for a conversation */
2014 	qlock(tcp);
2015 
2016 	/* Look for a matching conversation */
2017 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2018 	if(s == nil){
2019 		netlog(f, Logtcp, "iphtlook failed");
	/* also reached from below when a listener has no matching call in limbo */
2020 reset:
2021 		qunlock(tcp);
2022 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2023 		freeblist(bp);
2024 		return;
2025 	}
2026 
2027 	/* if it's a listener, look for the right flags and get a new conv */
2028 	tcb = (Tcpctl*)s->ptcl;
2029 	if(tcb->state == Listen){
2030 		if(seg.flags & RST){
2031 			limborst(s, &seg, source, dest, version);
2032 			qunlock(tcp);
2033 			freeblist(bp);
2034 			return;
2035 		}
2036 
2037 		/* if this is a new SYN, put the call into limbo */
2038 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2039 			limbo(s, source, dest, &seg, version);
2040 			qunlock(tcp);
2041 			freeblist(bp);
2042 			return;
2043 		}
2044 
2045 		/*
2046 		 *  if there's a matching call in limbo, tcpincoming will
2047 		 *  return a new conversation for it, already Established
2048 		 */
2049 		s = tcpincoming(s, &seg, source, dest, version);
2050 		if(s == nil)
2051 			goto reset;
2052 	}
2053 
2054 	/* The rest of the input state machine is run with the control block
2055 	 * locked and implements the state machine directly out of the RFC.
2056 	 * Out-of-band data is ignored - it was always a bad idea.
2057 	 */
2058 	tcb = (Tcpctl*)s->ptcl;
2059 	if(waserror()){
2060 		qunlock(s);
2061 		nexterror();
2062 	}
	/* take the conversation lock before letting go of the protocol lock */
2063 	qlock(s);
2064 	qunlock(tcp);
2065 
2066 	/* fix up window */
2067 	seg.wnd <<= tcb->rcv.scale;
2068 
2069 	/* every input packet in puts off the keep alive time out */
2070 	tcpsetkacounter(tcb);
2071 
2072 	switch(tcb->state) {
2073 	case Closed:
2074 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2075 		goto raise;
2076 	case Syn_sent:
		/* an ack must cover our SYN and nothing beyond what we sent */
2077 		if(seg.flags & ACK) {
2078 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2079 				sndrst(tcp, source, dest, length, &seg, version,
2080 					 "bad seq in Syn_sent");
2081 				goto raise;
2082 			}
2083 		}
2084 		if(seg.flags & RST) {
2085 			if(seg.flags & ACK)
2086 				localclose(s, Econrefused);
2087 			goto raise;
2088 		}
2089 
2090 		if(seg.flags & SYN) {
2091 			procsyn(s, &seg);
2092 			if(seg.flags & ACK){
2093 				update(s, &seg);
2094 				tcpsynackrtt(s);
2095 				tcpsetstate(s, Established);
2096 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2097 			}
2098 			else {
2099 				tcb->time = NOW;
2100 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2101 			}
2102 
			/* data or a FIN came with the SYN: carry on with the code after the switch */
2103 			if(length != 0 || (seg.flags & FIN))
2104 				break;
2105 
2106 			freeblist(bp);
2107 			goto output;
2108 		}
2109 		else
2110 			freeblist(bp);
2111 
2112 		qunlock(s);
2113 		poperror();
2114 		return;
2115 	case Syn_received:
2116 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2117 		if(seg.flags & ACK)
2118 			tcpsynackrtt(s);
2119 		break;
2120 	}
2121 
2122 	/*
2123 	 *  One DOS attack is to open connections to us and then forget about them,
2124 	 *  thereby tying up a conv at no long term cost to the attacker.
2125 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2126 	 *  corresponding code in tcpsendka().
2127 	 */
2128 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2129 		if(tcpporthogdefense
2130 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2131 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2132 				source, seg.source, dest, seg.dest, seg.flags,
2133 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2134 			localclose(s, "stateless hog");
2135 		}
2136 	}
2137 
2138 	/* Cut the data to fit the receive window */
2139 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2140 		netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
		/* nothing usable in the segment; its ack is still processed */
2141 		update(s, &seg);
2142 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2143 			tcphalt(tpriv, &tcb->rtt_timer);
2144 			tcphalt(tpriv, &tcb->acktimer);
2145 			tcphalt(tpriv, &tcb->katimer);
2146 			tcpsetstate(s, Time_wait);
2147 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2148 			tcpgo(tpriv, &tcb->timer);
2149 		}
2150 		if(!(seg.flags & RST)) {
2151 			tcb->flags |= FORCE;
2152 			goto output;
2153 		}
2154 		qunlock(s);
2155 		poperror();
2156 		return;
2157 	}
2158 
2159 	/* Cannot accept so answer with a rst */
2160 	if(length && tcb->state == Closed) {
2161 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2162 		goto raise;
2163 	}
2164 
2165 	/* The segment is beyond the current receive pointer so
2166 	 * queue the data in the resequence queue
2167 	 */
2168 	if(seg.seq != tcb->rcv.nxt)
2169 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2170 		update(s, &seg);
2171 		if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2172 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2173 		tcb->flags |= FORCE;
2174 		goto output;
2175 	}
2176 
2177 	/*
2178 	 *  keep looping till we've processed this packet plus any
2179 	 *  adjacent packets in the resequence queue
2180 	 */
2181 	for(;;) {
2182 		if(seg.flags & RST) {
2183 			if(tcb->state == Established) {
2184 				tpriv->stats[EstabResets]++;
2185 				if(tcb->rcv.nxt != seg.seq)
2186 					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2187 			}
2188 			localclose(s, Econrefused);
2189 			goto raise;
2190 		}
2191 
		/* from here on a segment without an ack is dropped */
2192 		if((seg.flags&ACK) == 0)
2193 			goto raise;
2194 
2195 		switch(tcb->state) {
2196 		case Syn_received:
2197 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2198 				sndrst(tcp, source, dest, length, &seg, version,
2199 					"bad seq in Syn_received");
2200 				goto raise;
2201 			}
2202 			update(s, &seg);
2203 			tcpsetstate(s, Established);	/* falls through: update() is called again below */
2204 		case Established:
2205 		case Close_wait:
2206 			update(s, &seg);
2207 			break;
2208 		case Finwait1:
2209 			update(s, &seg);
			/* everything we sent, flags included, has been acked */
2210 			if(qlen(s->wq)+tcb->flgcnt == 0){
2211 				tcphalt(tpriv, &tcb->rtt_timer);
2212 				tcphalt(tpriv, &tcb->acktimer);
2213 				tcpsetkacounter(tcb);
2214 				tcb->time = NOW;
2215 				tcpsetstate(s, Finwait2);
2216 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2217 				tcpgo(tpriv, &tcb->katimer);
2218 			}
2219 			break;
2220 		case Finwait2:
2221 			update(s, &seg);
2222 			break;
2223 		case Closing:
2224 			update(s, &seg);
2225 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2226 				tcphalt(tpriv, &tcb->rtt_timer);
2227 				tcphalt(tpriv, &tcb->acktimer);
2228 				tcphalt(tpriv, &tcb->katimer);
2229 				tcpsetstate(s, Time_wait);
2230 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2231 				tcpgo(tpriv, &tcb->timer);
2232 			}
2233 			break;
2234 		case Last_ack:
2235 			update(s, &seg);
2236 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2237 				localclose(s, nil);
2238 				goto raise;
2239 			}	/* otherwise falls through to Time_wait */
2240 		case Time_wait:
2241 			tcb->flags |= FORCE;
2242 			if(tcb->timer.state != TcptimerON)
2243 				tcpgo(tpriv, &tcb->timer);
2244 		}
2245 
		/* keep rcv.urg from falling behind rcv.nxt; urgent data itself gets no special handling */
2246 		if((seg.flags&URG) && seg.urg) {
2247 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2248 				tcb->rcv.urg = seg.urg + seg.seq;
2249 				pullblock(&bp, seg.urg);
2250 			}
2251 		}
2252 		else
2253 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2254 			tcb->rcv.urg = tcb->rcv.nxt;
2255 
2256 		if(length == 0) {
2257 			if(bp != nil)
2258 				freeblist(bp);
2259 		}
2260 		else {
2261 			switch(tcb->state){
2262 			default:
2263 				/* Ignore segment text */
2264 				if(bp != nil)
2265 					freeblist(bp);
2266 				break;
2267 
2268 			case Syn_received:
2269 			case Established:
2270 			case Finwait1:
2271 				/* If we still have some data place on
2272 				 * receive queue
2273 				 */
2274 				if(bp) {
2275 					bp = packblock(bp);
2276 					if(bp == nil)
2277 						panic("tcp packblock");
2278 					qpassnolim(s->rq, bp);
2279 					bp = nil;
2280 
2281 					/*
2282 					 *  Force an ack every 2 data messages.  This is
2283 					 *  a hack for rob to make his home system run
2284 					 *  faster.
2285 					 *
2286 					 *  this also keeps the standard TCP congestion
2287 					 *  control working since it needs an ack every
2288 					 *  2 max segs worth.  This is not quite that,
2289 					 *  but under a real stream is equivalent since
2290 					 *  every packet has a max seg in it.
2291 					 */
2292 					if(++(tcb->rcv.una) >= 2)
2293 						tcb->flags |= FORCE;
2294 				}
2295 				tcb->rcv.nxt += length;
2296 
2297 				/*
2298 				 *  update our rcv window
2299 				 */
2300 				tcprcvwin(s);
2301 
2302 				/*
2303 				 *  turn on the acktimer if there's something
2304 				 *  to ack
2305 				 */
2306 				if(tcb->acktimer.state != TcptimerON)
2307 					tcpgo(tpriv, &tcb->acktimer);
2308 
2309 				break;
2310 			case Finwait2:
2311 				/* no process to read the data, send a reset */
2312 				if(bp != nil)
2313 					freeblist(bp);
2314 				sndrst(tcp, source, dest, length, &seg, version,
2315 					"send to Finwait2");
2316 				qunlock(s);
2317 				poperror();
2318 				return;
2319 			}
2320 		}
2321 
		/* a FIN takes one sequence number and always gets an immediate ack */
2322 		if(seg.flags & FIN) {
2323 			tcb->flags |= FORCE;
2324 
2325 			switch(tcb->state) {
2326 			case Syn_received:
2327 			case Established:
2328 				tcb->rcv.nxt++;
2329 				tcpsetstate(s, Close_wait);
2330 				break;
2331 			case Finwait1:
2332 				tcb->rcv.nxt++;
				/* our FIN is already acked: go straight to Time_wait, else Closing */
2333 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2334 					tcphalt(tpriv, &tcb->rtt_timer);
2335 					tcphalt(tpriv, &tcb->acktimer);
2336 					tcphalt(tpriv, &tcb->katimer);
2337 					tcpsetstate(s, Time_wait);
2338 					tcb->timer.start = MSL2*(1000/MSPTICK);
2339 					tcpgo(tpriv, &tcb->timer);
2340 				}
2341 				else
2342 					tcpsetstate(s, Closing);
2343 				break;
2344 			case Finwait2:
2345 				tcb->rcv.nxt++;
2346 				tcphalt(tpriv, &tcb->rtt_timer);
2347 				tcphalt(tpriv, &tcb->acktimer);
2348 				tcphalt(tpriv, &tcb->katimer);
2349 				tcpsetstate(s, Time_wait);
2350 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2351 				tcpgo(tpriv, &tcb->timer);
2352 				break;
2353 			case Close_wait:
2354 			case Closing:
2355 			case Last_ack:
2356 				break;
2357 			case Time_wait:
2358 				tcpgo(tpriv, &tcb->timer);
2359 				break;
2360 			}
2361 		}
2362 
2363 		/*
2364 		 *  get next adjacent segment from the resequence queue.
2365 		 *  dump/trim any overlapping segments
2366 		 */
2367 		for(;;) {
2368 			if(tcb->reseq == nil)
2369 				goto output;
2370 
			/* the head of the queue still lies beyond rcv.nxt: a gap remains */
2371 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2372 				goto output;
2373 
2374 			getreseq(tcb, &seg, &bp, &length);
2375 
			/* a segment that survives trimming goes round the outer loop */
2376 			if(tcptrim(tcb, &seg, &bp, &length) == 0)
2377 				break;
2378 		}
2379 	}
2380 output:
2381 	tcpoutput(s);
2382 	qunlock(s);
2383 	poperror();
2384 	return;
	/* drop the segment: release the lock and the block, then kick the conversation */
2385 raise:
2386 	qunlock(s);
2387 	poperror();
2388 	freeblist(bp);
2389 	tcpkick(s);
2390 }
2391 
2392 /*
2393  *  always enters and exits with the s locked.  We drop
2394  *  the lock to ipoput the packet so some care has to be
2395  *  taken by callers.
2396  */
2397 void
2398 tcpoutput(Conv *s)
2399 {
2400 	Tcp seg;
2401 	int msgs;
2402 	Tcpctl *tcb;
2403 	Block *hbp, *bp;
2404 	int sndcnt, n;
2405 	ulong ssize, dsize, usable, sent;
2406 	Fs *f;
2407 	Tcppriv *tpriv;
2408 	uchar version;
2409 
2410 	f = s->p->f;
2411 	tpriv = s->p->priv;
2412 	version = s->ipversion;
2413 
2414 	for(msgs = 0; msgs < 100; msgs++) {
2415 		tcb = (Tcpctl*)s->ptcl;
2416 
2417 		switch(tcb->state) {
2418 		case Listen:
2419 		case Closed:
2420 		case Finwait2:
2421 			return;
2422 		}
2423 
2424 		/* force an ack when a window has opened up */
2425 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2426 			tcb->rcv.blocked = 0;
2427 			tcb->flags |= FORCE;
2428 		}
2429 
2430 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2431 		sent = tcb->snd.ptr - tcb->snd.una;
2432 
2433 		/* Don't send anything else until our SYN has been acked */
2434 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2435 			break;
2436 
2437 		/* Compute usable segment based on offered window and limit
2438 		 * window probes to one
2439 		 */
2440 		if(tcb->snd.wnd == 0){
2441 			if(sent != 0) {
2442 				if((tcb->flags&FORCE) == 0)
2443 					break;
2444 //				tcb->snd.ptr = tcb->snd.una;
2445 			}
2446 			usable = 1;
2447 		}
2448 		else {
2449 			usable = tcb->cwind;
2450 			if(tcb->snd.wnd < usable)
2451 				usable = tcb->snd.wnd;
2452 			usable -= sent;
2453 		}
2454 		ssize = sndcnt-sent;
2455 		if(ssize && usable < 2)
2456 			netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2457 				tcb->snd.wnd, tcb->cwind);
2458 		if(usable < ssize)
2459 			ssize = usable;
2460 		if(tcb->mss < ssize)
2461 			ssize = tcb->mss;
2462 		dsize = ssize;
2463 		seg.urg = 0;
2464 
2465 		if(ssize == 0)
2466 		if((tcb->flags&FORCE) == 0)
2467 			break;
2468 
2469 		tcb->flags &= ~FORCE;
2470 		tcprcvwin(s);
2471 
2472 		/* By default we will generate an ack */
2473 		tcphalt(tpriv, &tcb->acktimer);
2474 		tcb->rcv.una = 0;
2475 		seg.source = s->lport;
2476 		seg.dest = s->rport;
2477 		seg.flags = ACK;
2478 		seg.mss = 0;
2479 		seg.ws = 0;
2480 		switch(tcb->state){
2481 		case Syn_sent:
2482 			seg.flags = 0;
2483 			if(tcb->snd.ptr == tcb->iss){
2484 				seg.flags |= SYN;
2485 				dsize--;
2486 				seg.mss = tcb->mss;
2487 				seg.ws = tcb->scale;
2488 			}
2489 			break;
2490 		case Syn_received:
2491 			/*
2492 			 *  don't send any data with a SYN/ACK packet
2493 			 *  because Linux rejects the packet in its
2494 			 *  attempt to solve the SYN attack problem
2495 			 */
2496 			if(tcb->snd.ptr == tcb->iss){
2497 				seg.flags |= SYN;
2498 				dsize = 0;
2499 				ssize = 1;
2500 				seg.mss = tcb->mss;
2501 				seg.ws = tcb->scale;
2502 			}
2503 			break;
2504 		}
2505 		seg.seq = tcb->snd.ptr;
2506 		seg.ack = tcb->rcv.nxt;
2507 		seg.wnd = tcb->rcv.wnd;
2508 
2509 		/* Pull out data to send */
2510 		bp = nil;
2511 		if(dsize != 0) {
2512 			bp = qcopy(s->wq, dsize, sent);
2513 			if(BLEN(bp) != dsize) {
2514 				seg.flags |= FIN;
2515 				dsize--;
2516 			}
2517 		}
2518 
2519 		if(sent+dsize == sndcnt)
2520 			seg.flags |= PSH;
2521 
2522 		/* keep track of balance of resent data */
2523 		if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2524 			n = tcb->snd.nxt - tcb->snd.ptr;
2525 			if(ssize < n)
2526 				n = ssize;
2527 			tcb->resent += n;
2528 			netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
2529 				s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2530 			tpriv->stats[RetransSegs]++;
2531 		}
2532 
2533 		tcb->snd.ptr += ssize;
2534 
2535 		/* Pull up the send pointer so we can accept acks
2536 		 * for this window
2537 		 */
2538 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2539 			tcb->snd.nxt = tcb->snd.ptr;
2540 
2541 		/* Build header, link data and compute cksum */
2542 		switch(version){
2543 		case V4:
2544 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2545 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2546 			if(hbp == nil) {
2547 				freeblist(bp);
2548 				return;
2549 			}
2550 			break;
2551 		case V6:
2552 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2553 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2554 			if(hbp == nil) {
2555 				freeblist(bp);
2556 				return;
2557 			}
2558 			break;
2559 		default:
2560 			hbp = nil;	/* to suppress a warning */
2561 			panic("tcpoutput: version %d", version);
2562 		}
2563 
2564 		/* Start the transmission timers if there is new data and we
2565 		 * expect acknowledges
2566 		 */
2567 		if(ssize != 0){
2568 			if(tcb->timer.state != TcptimerON)
2569 				tcpgo(tpriv, &tcb->timer);
2570 
2571 			/*  If round trip timer isn't running, start it.
2572 			 *  measure the longest packet only in case the
2573 			 *  transmission time dominates RTT
2574 			 */
2575 			if(tcb->rtt_timer.state != TcptimerON)
2576 			if(ssize == tcb->mss) {
2577 				tcpgo(tpriv, &tcb->rtt_timer);
2578 				tcb->rttseq = tcb->snd.ptr;
2579 			}
2580 		}
2581 
2582 		tpriv->stats[OutSegs]++;
2583 
2584 		/* put off the next keep alive */
2585 		tcpgo(tpriv, &tcb->katimer);
2586 
2587 		switch(version){
2588 		case V4:
2589 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2590 				/* a negative return means no route */
2591 				localclose(s, "no route");
2592 			}
2593 			break;
2594 		case V6:
2595 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2596 				/* a negative return means no route */
2597 				localclose(s, "no route");
2598 			}
2599 			break;
2600 		default:
2601 			panic("tcpoutput2: version %d", version);
2602 		}
2603 		if((msgs%4) == 1){
2604 			qunlock(s);
2605 			sched();
2606 			qlock(s);
2607 		}
2608 	}
2609 }
2610 
2611 /*
2612  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2613  */
2614 void
2615 tcpsendka(Conv *s)
2616 {
2617 	Tcp seg;
2618 	Tcpctl *tcb;
2619 	Block *hbp,*dbp;
2620 
2621 	tcb = (Tcpctl*)s->ptcl;
2622 
2623 	dbp = nil;
2624 	seg.urg = 0;
2625 	seg.source = s->lport;
2626 	seg.dest = s->rport;
2627 	seg.flags = ACK|PSH;
2628 	seg.mss = 0;
2629 	seg.ws = 0;
2630 	if(tcpporthogdefense)
2631 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2632 	else
2633 		seg.seq = tcb->snd.una-1;
2634 	seg.ack = tcb->rcv.nxt;
2635 	tcb->rcv.una = 0;
2636 	seg.wnd = tcb->rcv.wnd;
2637 	if(tcb->state == Finwait2){
2638 		seg.flags |= FIN;
2639 	} else {
2640 		dbp = allocb(1);
2641 		dbp->wp++;
2642 	}
2643 
2644 	if(isv4(s->raddr)) {
2645 		/* Build header, link data and compute cksum */
2646 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2647 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2648 		if(hbp == nil) {
2649 			freeblist(dbp);
2650 			return;
2651 		}
2652 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2653 	}
2654 	else {
2655 		/* Build header, link data and compute cksum */
2656 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2657 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2658 		if(hbp == nil) {
2659 			freeblist(dbp);
2660 			return;
2661 		}
2662 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2663 	}
2664 }
2665 
2666 /*
2667  *  set connection to time out after 12 minutes
2668  */
2669 void
2670 tcpsetkacounter(Tcpctl *tcb)
2671 {
2672 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2673 	if(tcb->kacounter < 3)
2674 		tcb->kacounter = 3;
2675 }
2676 
2677 /*
2678  *  if we've timed out, close the connection
2679  *  otherwise, send a keepalive and restart the timer
2680  */
2681 void
2682 tcpkeepalive(void *v)
2683 {
2684 	Tcpctl *tcb;
2685 	Conv *s;
2686 
2687 	s = v;
2688 	tcb = (Tcpctl*)s->ptcl;
2689 	if(waserror()){
2690 		qunlock(s);
2691 		nexterror();
2692 	}
2693 	qlock(s);
2694 	if(tcb->state != Closed){
2695 		if(--(tcb->kacounter) <= 0) {
2696 			localclose(s, Etimedout);
2697 		} else {
2698 			tcpsendka(s);
2699 			tcpgo(s->p->priv, &tcb->katimer);
2700 		}
2701 	}
2702 	qunlock(s);
2703 	poperror();
2704 }
2705 
2706 /*
2707  *  start keepalive timer
2708  */
2709 char*
2710 tcpstartka(Conv *s, char **f, int n)
2711 {
2712 	Tcpctl *tcb;
2713 	int x;
2714 
2715 	tcb = (Tcpctl*)s->ptcl;
2716 	if(tcb->state != Established)
2717 		return "connection must be in Establised state";
2718 	if(n > 1){
2719 		x = atoi(f[1]);
2720 		if(x >= MSPTICK)
2721 			tcb->katimer.start = x/MSPTICK;
2722 	}
2723 	tcpsetkacounter(tcb);
2724 	tcpgo(s->p->priv, &tcb->katimer);
2725 
2726 	return nil;
2727 }
2728 
2729 /*
2730  *  turn checksums on/off
2731  */
2732 char*
2733 tcpsetchecksum(Conv *s, char **f, int)
2734 {
2735 	Tcpctl *tcb;
2736 
2737 	tcb = (Tcpctl*)s->ptcl;
2738 	tcb->nochecksum = !atoi(f[1]);
2739 
2740 	return nil;
2741 }
2742 
2743 void
2744 tcprxmit(Conv *s)
2745 {
2746 	Tcpctl *tcb;
2747 
2748 	tcb = (Tcpctl*)s->ptcl;
2749 
2750 	tcb->flags |= RETRAN|FORCE;
2751 	tcb->snd.ptr = tcb->snd.una;
2752 
2753 	/*
2754 	 *  We should be halving the slow start threshhold (down to one
2755 	 *  mss) but leaving it at mss seems to work well enough
2756 	 */
2757  	tcb->ssthresh = tcb->mss;
2758 
2759 	/*
2760 	 *  pull window down to a single packet
2761 	 */
2762 	tcb->cwind = tcb->mss;
2763 	tcpoutput(s);
2764 }
2765 
2766 void
2767 tcptimeout(void *arg)
2768 {
2769 	Conv *s;
2770 	Tcpctl *tcb;
2771 	int maxback;
2772 	Tcppriv *tpriv;
2773 
2774 	s = (Conv*)arg;
2775 	tpriv = s->p->priv;
2776 	tcb = (Tcpctl*)s->ptcl;
2777 
2778 	if(waserror()){
2779 		qunlock(s);
2780 		nexterror();
2781 	}
2782 	qlock(s);
2783 	switch(tcb->state){
2784 	default:
2785 		tcb->backoff++;
2786 		if(tcb->state == Syn_sent)
2787 			maxback = MAXBACKMS/2;
2788 		else
2789 			maxback = MAXBACKMS;
2790 		tcb->backedoff += tcb->timer.start * MSPTICK;
2791 		if(tcb->backedoff >= maxback) {
2792 			localclose(s, Etimedout);
2793 			break;
2794 		}
2795 		netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
2796 		tcpsettimer(tcb);
2797 		tcprxmit(s);
2798 		tpriv->stats[RetransTimeouts]++;
2799 		tcb->snd.dupacks = 0;
2800 		break;
2801 	case Time_wait:
2802 		localclose(s, nil);
2803 		break;
2804 	case Closed:
2805 		break;
2806 	}
2807 	qunlock(s);
2808 	poperror();
2809 }
2810 
2811 int
2812 inwindow(Tcpctl *tcb, int seq)
2813 {
2814 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2815 }
2816 
2817 /*
2818  *  set up state for a received SYN (or SYN ACK) packet
2819  */
2820 void
2821 procsyn(Conv *s, Tcp *seg)
2822 {
2823 	Tcpctl *tcb;
2824 
2825 	tcb = (Tcpctl*)s->ptcl;
2826 	tcb->flags |= FORCE;
2827 
2828 	tcb->rcv.nxt = seg->seq + 1;
2829 	tcb->rcv.urg = tcb->rcv.nxt;
2830 	tcb->irs = seg->seq;
2831 
2832 	/* our sending max segment size cannot be bigger than what he asked for */
2833 	if(seg->mss != 0 && seg->mss < tcb->mss)
2834 		tcb->mss = seg->mss;
2835 
2836 	/* the congestion window always starts out as a single segment */
2837 	tcb->snd.wnd = seg->wnd;
2838 	tcb->cwind = tcb->mss;
2839 }
2840 
2841 int
2842 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2843 {
2844 	Reseq *rp, *rp1;
2845 	int i, rqlen, qmax;
2846 
2847 	rp = malloc(sizeof(Reseq));
2848 	if(rp == nil){
2849 		freeblist(bp);	/* bp always consumed by add_reseq */
2850 		return 0;
2851 	}
2852 
2853 	rp->seg = *seg;
2854 	rp->bp = bp;
2855 	rp->length = length;
2856 
2857 	/* Place on reassembly list sorting by starting seq number */
2858 	rp1 = tcb->reseq;
2859 	if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2860 		rp->next = rp1;
2861 		tcb->reseq = rp;
2862 		if(rp->next != nil)
2863 			tpriv->stats[OutOfOrder]++;
2864 		return 0;
2865 	}
2866 
2867 	rqlen = 0;
2868 	for(i = 0;; i++) {
2869 		rqlen += rp1->length;
2870 		if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2871 			rp->next = rp1->next;
2872 			rp1->next = rp;
2873 			if(rp->next != nil)
2874 				tpriv->stats[OutOfOrder]++;
2875 			break;
2876 		}
2877 		rp1 = rp1->next;
2878 	}
2879 	qmax = QMAX<<tcb->rcv.scale;
2880 	if(rqlen > qmax){
2881 		print("resequence queue > window: %d > %d\n", rqlen, qmax);
2882 		i = 0;
2883 	  	for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2884 	  		print("%#lux %#lux %#ux\n", rp1->seg.seq,
2885 	  			rp1->seg.ack, rp1->seg.flags);
2886 			if(i++ > 10){
2887 				print("...\n");
2888 				break;
2889 			}
2890 		}
2891 
2892 		// delete entire reassembly queue; wait for retransmit.
2893 		// - should we be smarter and only delete the tail?
2894 		for(rp = tcb->reseq; rp != nil; rp = rp1){
2895 			rp1 = rp->next;
2896 			freeblist(rp->bp);
2897 			free(rp);
2898 		}
2899 		tcb->reseq = nil;
2900 
2901 	  	return -1;
2902 	}
2903 	return 0;
2904 }
2905 
2906 void
2907 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2908 {
2909 	Reseq *rp;
2910 
2911 	rp = tcb->reseq;
2912 	if(rp == nil)
2913 		return;
2914 
2915 	tcb->reseq = rp->next;
2916 
2917 	*seg = rp->seg;
2918 	*bp = rp->bp;
2919 	*length = rp->length;
2920 
2921 	free(rp);
2922 }
2923 
2924 int
2925 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2926 {
2927 	ushort len;
2928 	uchar accept;
2929 	int dupcnt, excess;
2930 
2931 	accept = 0;
2932 	len = *length;
2933 	if(seg->flags & SYN)
2934 		len++;
2935 	if(seg->flags & FIN)
2936 		len++;
2937 
2938 	if(tcb->rcv.wnd == 0) {
2939 		if(len == 0 && seg->seq == tcb->rcv.nxt)
2940 			return 0;
2941 	}
2942 	else {
2943 		/* Some part of the segment should be in the window */
2944 		if(inwindow(tcb,seg->seq))
2945 			accept++;
2946 		else
2947 		if(len != 0) {
2948 			if(inwindow(tcb, seg->seq+len-1) ||
2949 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2950 				accept++;
2951 		}
2952 	}
2953 	if(!accept) {
2954 		freeblist(*bp);
2955 		return -1;
2956 	}
2957 	dupcnt = tcb->rcv.nxt - seg->seq;
2958 	if(dupcnt > 0){
2959 		tcb->rerecv += dupcnt;
2960 		if(seg->flags & SYN){
2961 			seg->flags &= ~SYN;
2962 			seg->seq++;
2963 
2964 			if(seg->urg > 1)
2965 				seg->urg--;
2966 			else
2967 				seg->flags &= ~URG;
2968 			dupcnt--;
2969 		}
2970 		if(dupcnt > 0){
2971 			pullblock(bp, (ushort)dupcnt);
2972 			seg->seq += dupcnt;
2973 			*length -= dupcnt;
2974 
2975 			if(seg->urg > dupcnt)
2976 				seg->urg -= dupcnt;
2977 			else {
2978 				seg->flags &= ~URG;
2979 				seg->urg = 0;
2980 			}
2981 		}
2982 	}
2983 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
2984 	if(excess > 0) {
2985 		tcb->rerecv += excess;
2986 		*length -= excess;
2987 		*bp = trimblock(*bp, 0, *length);
2988 		if(*bp == nil)
2989 			panic("presotto is a boofhead");
2990 		seg->flags &= ~FIN;
2991 	}
2992 	return 0;
2993 }
2994 
2995 void
2996 tcpadvise(Proto *tcp, Block *bp, char *msg)
2997 {
2998 	Tcp4hdr *h4;
2999 	Tcp6hdr *h6;
3000 	Tcpctl *tcb;
3001 	uchar source[IPaddrlen];
3002 	uchar dest[IPaddrlen];
3003 	ushort psource, pdest;
3004 	Conv *s, **p;
3005 
3006 	h4 = (Tcp4hdr*)(bp->rp);
3007 	h6 = (Tcp6hdr*)(bp->rp);
3008 
3009 	if((h4->vihl&0xF0)==IP_VER4) {
3010 		v4tov6(dest, h4->tcpdst);
3011 		v4tov6(source, h4->tcpsrc);
3012 		psource = nhgets(h4->tcpsport);
3013 		pdest = nhgets(h4->tcpdport);
3014 	}
3015 	else {
3016 		ipmove(dest, h6->tcpdst);
3017 		ipmove(source, h6->tcpsrc);
3018 		psource = nhgets(h6->tcpsport);
3019 		pdest = nhgets(h6->tcpdport);
3020 	}
3021 
3022 	/* Look for a connection */
3023 	qlock(tcp);
3024 	for(p = tcp->conv; *p; p++) {
3025 		s = *p;
3026 		tcb = (Tcpctl*)s->ptcl;
3027 		if(s->rport == pdest)
3028 		if(s->lport == psource)
3029 		if(tcb->state != Closed)
3030 		if(ipcmp(s->raddr, dest) == 0)
3031 		if(ipcmp(s->laddr, source) == 0){
3032 			qlock(s);
3033 			qunlock(tcp);
3034 			switch(tcb->state){
3035 			case Syn_sent:
3036 				localclose(s, msg);
3037 				break;
3038 			}
3039 			qunlock(s);
3040 			freeblist(bp);
3041 			return;
3042 		}
3043 	}
3044 	qunlock(tcp);
3045 	freeblist(bp);
3046 }
3047 
3048 static char*
3049 tcpporthogdefensectl(char *val)
3050 {
3051 	if(strcmp(val, "on") == 0)
3052 		tcpporthogdefense = 1;
3053 	else if(strcmp(val, "off") == 0)
3054 		tcpporthogdefense = 0;
3055 	else
3056 		return "unknown value for tcpporthogdefense";
3057 	return nil;
3058 }
3059 
3060 /* called with c qlocked */
3061 char*
3062 tcpctl(Conv* c, char** f, int n)
3063 {
3064 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3065 		return tcphangup(c);
3066 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3067 		return tcpstartka(c, f, n);
3068 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3069 		return tcpsetchecksum(c, f, n);
3070 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3071 		return tcpporthogdefensectl(f[1]);
3072 	return "unknown control request";
3073 }
3074 
3075 int
3076 tcpstats(Proto *tcp, char *buf, int len)
3077 {
3078 	Tcppriv *priv;
3079 	char *p, *e;
3080 	int i;
3081 
3082 	priv = tcp->priv;
3083 	p = buf;
3084 	e = p+len;
3085 	for(i = 0; i < Nstats; i++)
3086 		p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
3087 	return p - buf;
3088 }
3089 
3090 /*
3091  *  garbage collect any stale conversations:
3092  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3093  *	- Finwait2 after 5 minutes
3094  *
3095  *  this is called whenever we run out of channels.  Both checks are
3096  *  of questionable validity so we try to use them only when we're
3097  *  up against the wall.
3098  */
3099 int
3100 tcpgc(Proto *tcp)
3101 {
3102 	Conv *c, **pp, **ep;
3103 	int n;
3104 	Tcpctl *tcb;
3105 
3106 
3107 	n = 0;
3108 	ep = &tcp->conv[tcp->nc];
3109 	for(pp = tcp->conv; pp < ep; pp++) {
3110 		c = *pp;
3111 		if(c == nil)
3112 			break;
3113 		if(!canqlock(c))
3114 			continue;
3115 		tcb = (Tcpctl*)c->ptcl;
3116 		switch(tcb->state){
3117 		case Syn_received:
3118 			if(NOW - tcb->time > 5000){
3119 				localclose(c, "timed out");
3120 				n++;
3121 			}
3122 			break;
3123 		case Finwait2:
3124 			if(NOW - tcb->time > 5*60*1000){
3125 				localclose(c, "timed out");
3126 				n++;
3127 			}
3128 			break;
3129 		}
3130 		qunlock(c);
3131 	}
3132 	return n;
3133 }
3134 
3135 void
3136 tcpsettimer(Tcpctl *tcb)
3137 {
3138 	int x;
3139 
3140 	/* round trip dependency */
3141 	x = backoff(tcb->backoff) *
3142 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3143 
3144 	/* bounded twixt 1/2 and 64 seconds */
3145 	if(x < 500/MSPTICK)
3146 		x = 500/MSPTICK;
3147 	else if(x > (64000/MSPTICK))
3148 		x = 64000/MSPTICK;
3149 	tcb->timer.start = x;
3150 }
3151 
3152 void
3153 tcpinit(Fs *fs)
3154 {
3155 	Proto *tcp;
3156 	Tcppriv *tpriv;
3157 
3158 	tcp = smalloc(sizeof(Proto));
3159 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3160 	tcp->name = "tcp";
3161 	tcp->connect = tcpconnect;
3162 	tcp->announce = tcpannounce;
3163 	tcp->ctl = tcpctl;
3164 	tcp->state = tcpstate;
3165 	tcp->create = tcpcreate;
3166 	tcp->close = tcpclose;
3167 	tcp->rcv = tcpiput;
3168 	tcp->advise = tcpadvise;
3169 	tcp->stats = tcpstats;
3170 	tcp->inuse = tcpinuse;
3171 	tcp->gc = tcpgc;
3172 	tcp->ipproto = IP_TCPPROTO;
3173 	tcp->nc = scalednconv();
3174 	tcp->ptclsize = sizeof(Tcpctl);
3175 	tpriv->stats[MaxConn] = tcp->nc;
3176 
3177 	Fsproto(fs, tcp);
3178 }
3179 
3180 void
3181 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3182 {
3183 	if(rcvscale){
3184 		tcb->rcv.scale = rcvscale & 0xff;
3185 		tcb->snd.scale = sndscale & 0xff;
3186 		tcb->window = QMAX<<tcb->snd.scale;
3187 		qsetlimit(s->rq, tcb->window);
3188 	} else {
3189 		tcb->rcv.scale = 0;
3190 		tcb->snd.scale = 0;
3191 		tcb->window = QMAX;
3192 		qsetlimit(s->rq, tcb->window);
3193 	}
3194 }
3195