xref: /plan9/sys/src/9/ip/tcp.c (revision 53048e26979f9cc5fe8b4606606b02e27f9cfc5c)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  Constants used throughout the TCP implementation: queue sizes,
 *  header geometry, timer parameters, header flag bits, option
 *  codes, tcb flag bits and connection states.
 */
enum
{
	QMAX		= 64*1024-1,	/* default receive window and queue limit, bytes */
	IP_TCPPROTO	= 6,		/* stored in the proto field of the headers */

	/* geometry of Tcp4hdr */
	TCP4_IPLEN	= 8,		/* offset at which the checksum starts */
	TCP4_PHDRSIZE	= 12,		/* pseudo header bytes covered by the checksum */
	TCP4_HDRSIZE	= 20,		/* tcp header without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes copied out of a prototype header */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,	/* bytes in front of the tcp header */

	/* geometry of Tcp6hdr */
	TCP6_IPLEN	= 0,		/* offset at which the checksum starts */
	TCP6_PHDRSIZE	= 40,		/* pseudo header bytes covered by the checksum */
	TCP6_HDRSIZE	= 20,		/* tcp header without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes copied out of a prototype header */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,	/* bytes in front of the tcp header */

	/* Tcptimer.state */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* timer start value meaning "forever" */
	TCP_ACK		= 50,		/* delayed ack timer, ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* flag bits of the tcp header */
	URG		= 0x20,		/* urgent pointer is valid */
	ACK		= 0x10,		/* acknowledgement is valid */
	PSH		= 0x08,		/* push */
	RST		= 0x04,		/* reset connection */
	SYN		= 0x02,		/* synchronise sequence numbers */
	FIN		= 0x01,		/* start close down */

	/* tcp option kinds and their lengths */
	EOLOPT		= 0,		/* end of option list */
	NOOPOPT		= 1,		/* one byte of padding */
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* length of the maximum segment size option */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* length of the window scale option */
	MSL2		= 10,		/* presumably twice the max segment lifetime; not used in the code visible here */
	MSPTICK		= 50,		/* milliseconds per timer tick */
	DEF_MSS		= 1460,		/* default maximum segment size */
	DEF_MSS6	= 1280,		/* default (minimum) maximum segment size for v6 */
	DEF_RTT		= 500,		/* default round trip time, ms */
	DEF_KAT		= 120000,	/* default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* tcpstart mode: listening connection */
	TCP_CONNECT	= 1,		/* tcpstart mode: outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits kept in Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,		/* srtt is stored shifted left by this much */
	LOGDGAIN	= 2,		/* presumably the matching shift for mdev -- confirm */

	/* connection states; tcpstates[] must list them in this order */
	Closed		= 0,
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum calls in limbo waiting for a response to our SYN ACK */
	NLHT		= 256,		/* limbo hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into a window scale value: option is present */
};
86 
/*
 *  Printable names of the connection states, indexed by state
 *  value.  The order must correspond to the enumeration above.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
94 
/*
 *  A countdown timer.  Running timers (state TcptimerON) are kept on
 *  a doubly linked list and decremented once per tick by tcpackproc;
 *  when count reaches zero func is called with arg.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* list of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of timers that expired in the current tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* value loaded into count when the timer is started, ticks */
	int	count;			/* ticks left */
	void	(*func)(void*);		/* expiry routine */
	void	*arg;			/* argument for func */
};
107 
108 /*
109  *  v4 and v6 pseudo headers used for
110  *  checksuming tcp
111  */
112 typedef struct Tcp4hdr Tcp4hdr;
113 struct Tcp4hdr
114 {
115 	uchar	vihl;		/* Version and header length */
116 	uchar	tos;		/* Type of service */
117 	uchar	length[2];	/* packet length */
118 	uchar	id[2];		/* Identification */
119 	uchar	frag[2];	/* Fragment information */
120 	uchar	Unused;
121 	uchar	proto;
122 	uchar	tcplen[2];
123 	uchar	tcpsrc[4];
124 	uchar	tcpdst[4];
125 	uchar	tcpsport[2];
126 	uchar	tcpdport[2];
127 	uchar	tcpseq[4];
128 	uchar	tcpack[4];
129 	uchar	tcpflag[2];
130 	uchar	tcpwin[2];
131 	uchar	tcpcksum[2];
132 	uchar	tcpurg[2];
133 	/* Options segment */
134 	uchar	tcpopt[1];
135 };
136 
137 typedef struct Tcp6hdr Tcp6hdr;
138 struct Tcp6hdr
139 {
140 	uchar	vcf[4];
141 	uchar	ploadlen[2];
142 	uchar	proto;
143 	uchar	ttl;
144 	uchar	tcpsrc[IPaddrlen];
145 	uchar	tcpdst[IPaddrlen];
146 	uchar	tcpsport[2];
147 	uchar	tcpdport[2];
148 	uchar	tcpseq[4];
149 	uchar	tcpack[4];
150 	uchar	tcpflag[2];
151 	uchar	tcpwin[2];
152 	uchar	tcpcksum[2];
153 	uchar	tcpurg[2];
154 	/* Options segment */
155 	uchar	tcpopt[1];
156 };
157 
158 /*
159  *  this represents the control info
160  *  for a single packet.  It is derived from
161  *  a packet in ntohtcp{4,6}() and stuck into
162  *  a packet in htontcp{4,6}().
163  */
164 typedef struct Tcp Tcp;
165 struct	Tcp
166 {
167 	ushort	source;
168 	ushort	dest;
169 	ulong	seq;
170 	ulong	ack;
171 	uchar	flags;
172 	ushort	ws;	/* window scale option (if not zero) */
173 	ulong	wnd;
174 	ushort	urg;
175 	ushort	mss;	/* max segment size option (if not zero) */
176 	ushort	len;	/* size of data */
177 };
178 
179 /*
180  *  this header is malloc'd to thread together fragments
181  *  waiting to be coalesced
182  */
183 typedef struct Reseq Reseq;
184 struct Reseq
185 {
186 	Reseq	*next;
187 	Tcp	seg;
188 	Block	*bp;
189 	ushort	length;
190 };
191 
192 /*
193  *  the qlock in the Conv locks this structure
194  */
195 typedef struct Tcpctl Tcpctl;
196 struct Tcpctl
197 {
198 	uchar	state;			/* Connection state */
199 	uchar	type;			/* Listening or active connection */
200 	uchar	code;			/* Icmp code */
201 	struct {
202 		ulong	una;		/* Unacked data pointer */
203 		ulong	nxt;		/* Next sequence expected */
204 		ulong	ptr;		/* Data pointer */
205 		ulong	wnd;		/* Tcp send window */
206 		ulong	urg;		/* Urgent data pointer */
207 		ulong	wl2;
208 		int	scale;		/* how much to right shift window in xmitted packets */
209 		/* to implement tahoe and reno TCP */
210 		ulong	dupacks;	/* number of duplicate acks rcvd */
211 		int	recovery;	/* loss recovery flag */
212 		ulong	rxt;		/* right window marker for recovery */
213 	} snd;
214 	struct {
215 		ulong	nxt;		/* Receive pointer to next uchar slot */
216 		ulong	wnd;		/* Receive window incoming */
217 		ulong	urg;		/* Urgent pointer */
218 		int	blocked;
219 		int	una;		/* unacked data segs */
220 		int	scale;		/* how much to left shift window in rcved packets */
221 	} rcv;
222 	ulong	iss;			/* Initial sequence number */
223 	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
224 	ulong	cwind;			/* Congestion window */
225 	int	scale;			/* desired snd.scale */
226 	ushort	ssthresh;		/* Slow start threshold */
227 	int	resent;			/* Bytes just resent */
228 	int	irs;			/* Initial received squence */
229 	ushort	mss;			/* Mean segment size */
230 	int	rerecv;			/* Overlap of data rerecevived */
231 	ulong	window;			/* Recevive window */
232 	uchar	backoff;		/* Exponential backoff counter */
233 	int	backedoff;		/* ms we've backed off for rexmits */
234 	uchar	flags;			/* State flags */
235 	Reseq	*reseq;			/* Resequencing queue */
236 	Tcptimer	timer;			/* Activity timer */
237 	Tcptimer	acktimer;		/* Acknowledge timer */
238 	Tcptimer	rtt_timer;		/* Round trip timer */
239 	Tcptimer	katimer;		/* keep alive timer */
240 	ulong	rttseq;			/* Round trip sequence */
241 	int	srtt;			/* Shortened round trip */
242 	int	mdev;			/* Mean deviation of round trip */
243 	int	kacounter;		/* count down for keep alive */
244 	uint	sndsyntime;		/* time syn sent */
245 	ulong	time;			/* time Finwait2 or Syn_received was sent */
246 	int	nochecksum;		/* non-zero means don't send checksums */
247 	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */
248 
249 	union {
250 		Tcp4hdr	tcp4hdr;
251 		Tcp6hdr	tcp6hdr;
252 	} protohdr;		/* prototype header */
253 };
254 
255 /*
256  *  New calls are put in limbo rather than having a conversation structure
257  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
258  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
259  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260  *
261  *  In particular they aren't on a listener's queue so that they don't figure
262  *  in the input queue limit.
263  *
264  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
265  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
266  *  there is no hashing of this list.
267  */
268 typedef struct Limbo Limbo;
269 struct Limbo
270 {
271 	Limbo	*next;
272 
273 	uchar	laddr[IPaddrlen];
274 	uchar	raddr[IPaddrlen];
275 	ushort	lport;
276 	ushort	rport;
277 	ulong	irs;		/* initial received sequence */
278 	ulong	iss;		/* initial sent sequence */
279 	ushort	mss;		/* mss from the other end */
280 	ushort	rcvscale;	/* how much to scale rcvd windows */
281 	ushort	sndscale;	/* how much to scale sent windows */
282 	ulong	lastsend;	/* last time we sent a synack */
283 	uchar	version;	/* v4 or v6 */
284 	uchar	rexmits;	/* number of retransmissions */
285 };
286 
287 int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
288 ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
289 
/*
 *  Indices into the per-protocol statistics array;
 *  statnames[] gives the printable name of each.
 */
enum {
	/* MIB stats */
	MaxConn = 0,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats		/* number of counters */
};
312 
313 static char *statnames[] =
314 {
315 [MaxConn]	"MaxConn",
316 [ActiveOpens]	"ActiveOpens",
317 [PassiveOpens]	"PassiveOpens",
318 [EstabResets]	"EstabResets",
319 [CurrEstab]	"CurrEstab",
320 [InSegs]	"InSegs",
321 [OutSegs]	"OutSegs",
322 [RetransSegs]	"RetransSegs",
323 [RetransTimeouts]	"RetransTimeouts",
324 [InErrs]	"InErrs",
325 [OutRsts]	"OutRsts",
326 [CsumErrs]	"CsumErrs",
327 [HlenErrs]	"HlenErrs",
328 [LenErrs]	"LenErrs",
329 [OutOfOrder]	"OutOfOrder",
330 };
331 
332 typedef struct Tcppriv Tcppriv;
333 struct Tcppriv
334 {
335 	/* List of active timers */
336 	QLock 	tl;
337 	Tcptimer *timers;
338 
339 	/* hash table for matching conversations */
340 	Ipht	ht;
341 
342 	/* calls in limbo waiting for an ACK to our SYN ACK */
343 	int	nlimbo;
344 	Limbo	*lht[NLHT];
345 
346 	/* for keeping track of tcpackproc */
347 	QLock	apl;
348 	int	ackprocstarted;
349 
350 	ulong	stats[Nstats];
351 };
352 
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.
 *  If that number gets acked by the other end, we shut down the
 *  connection.  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
363 
364 int	addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
365 void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
366 void	localclose(Conv*, char*);
367 void	procsyn(Conv*, Tcp*);
368 void	tcpiput(Proto*, Ipifc*, Block*);
369 void	tcpoutput(Conv*);
370 int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
371 void	tcpstart(Conv*, int);
372 void	tcptimeout(void*);
373 void	tcpsndsyn(Conv*, Tcpctl*);
374 void	tcprcvwin(Conv*);
375 void	tcpacktimer(void*);
376 void	tcpkeepalive(void*);
377 void	tcpsetkacounter(Tcpctl*);
378 void	tcprxmit(Conv*);
379 void	tcpsettimer(Tcpctl*);
380 void	tcpsynackrtt(Conv*);
381 void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
382 
383 static void limborexmit(Proto*);
384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
385 
386 void
387 tcpsetstate(Conv *s, uchar newstate)
388 {
389 	Tcpctl *tcb;
390 	uchar oldstate;
391 	Tcppriv *tpriv;
392 
393 	tpriv = s->p->priv;
394 
395 	tcb = (Tcpctl*)s->ptcl;
396 
397 	oldstate = tcb->state;
398 	if(oldstate == newstate)
399 		return;
400 
401 	if(oldstate == Established)
402 		tpriv->stats[CurrEstab]--;
403 	if(newstate == Established)
404 		tpriv->stats[CurrEstab]++;
405 
406 	/**
407 	print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
408 		tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
409 	**/
410 
411 	switch(newstate) {
412 	case Closed:
413 		qclose(s->rq);
414 		qclose(s->wq);
415 		qclose(s->eq);
416 		break;
417 
418 	case Close_wait:		/* Remote closes */
419 		qhangup(s->rq, nil);
420 		break;
421 	}
422 
423 	tcb->state = newstate;
424 
425 	if(oldstate == Syn_sent && newstate != Closed)
426 		Fsconnected(s, nil);
427 }
428 
429 static char*
430 tcpconnect(Conv *c, char **argv, int argc)
431 {
432 	char *e;
433 
434 	e = Fsstdconnect(c, argv, argc);
435 	if(e != nil)
436 		return e;
437 	tcpstart(c, TCP_CONNECT);
438 
439 	return nil;
440 }
441 
442 static int
443 tcpstate(Conv *c, char *state, int n)
444 {
445 	Tcpctl *s;
446 
447 	s = (Tcpctl*)(c->ptcl);
448 
449 	return snprint(state, n,
450 		"%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
451 		tcpstates[s->state],
452 		c->rq ? qlen(c->rq) : 0,
453 		c->wq ? qlen(c->wq) : 0,
454 		s->srtt, s->mdev,
455 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
456 		s->timer.start, s->timer.count, s->rerecv,
457 		s->katimer.start, s->katimer.count);
458 }
459 
460 static int
461 tcpinuse(Conv *c)
462 {
463 	Tcpctl *s;
464 
465 	s = (Tcpctl*)(c->ptcl);
466 	return s->state != Closed;
467 }
468 
469 static char*
470 tcpannounce(Conv *c, char **argv, int argc)
471 {
472 	char *e;
473 
474 	e = Fsstdannounce(c, argv, argc);
475 	if(e != nil)
476 		return e;
477 	tcpstart(c, TCP_LISTEN);
478 	Fsconnected(c, nil);
479 
480 	return nil;
481 }
482 
483 /*
484  *  tcpclose is always called with the q locked
485  */
486 static void
487 tcpclose(Conv *c)
488 {
489 	Tcpctl *tcb;
490 
491 	tcb = (Tcpctl*)c->ptcl;
492 
493 	qhangup(c->rq, nil);
494 	qhangup(c->wq, nil);
495 	qhangup(c->eq, nil);
496 	qflush(c->rq);
497 
498 	switch(tcb->state) {
499 	case Listen:
500 		/*
501 		 *  reset any incoming calls to this listener
502 		 */
503 		Fsconnected(c, "Hangup");
504 
505 		localclose(c, nil);
506 		break;
507 	case Closed:
508 	case Syn_sent:
509 		localclose(c, nil);
510 		break;
511 	case Syn_received:
512 	case Established:
513 		tcb->flgcnt++;
514 		tcb->snd.nxt++;
515 		tcpsetstate(c, Finwait1);
516 		tcpoutput(c);
517 		break;
518 	case Close_wait:
519 		tcb->flgcnt++;
520 		tcb->snd.nxt++;
521 		tcpsetstate(c, Last_ack);
522 		tcpoutput(c);
523 		break;
524 	}
525 }
526 
527 void
528 tcpkick(void *x)
529 {
530 	Conv *s = x;
531 	Tcpctl *tcb;
532 
533 	tcb = (Tcpctl*)s->ptcl;
534 
535 	if(waserror()){
536 		qunlock(s);
537 		nexterror();
538 	}
539 	qlock(s);
540 
541 	switch(tcb->state) {
542 	case Syn_sent:
543 	case Syn_received:
544 	case Established:
545 	case Close_wait:
546 		/*
547 		 * Push data
548 		 */
549 		tcprcvwin(s);
550 		tcpoutput(s);
551 		break;
552 	default:
553 		localclose(s, "Hangup");
554 		break;
555 	}
556 
557 	qunlock(s);
558 	poperror();
559 }
560 
561 void
562 tcprcvwin(Conv *s)				/* Call with tcb locked */
563 {
564 	int w;
565 	Tcpctl *tcb;
566 
567 	tcb = (Tcpctl*)s->ptcl;
568 	w = tcb->window - qlen(s->rq);
569 	if(w < 0)
570 		w = 0;
571 	tcb->rcv.wnd = w;
572 	if(w == 0)
573 		tcb->rcv.blocked = 1;
574 }
575 
576 void
577 tcpacktimer(void *v)
578 {
579 	Tcpctl *tcb;
580 	Conv *s;
581 
582 	s = v;
583 	tcb = (Tcpctl*)s->ptcl;
584 
585 	if(waserror()){
586 		qunlock(s);
587 		nexterror();
588 	}
589 	qlock(s);
590 	if(tcb->state != Closed){
591 		tcb->flags |= FORCE;
592 		tcprcvwin(s);
593 		tcpoutput(s);
594 	}
595 	qunlock(s);
596 	poperror();
597 }
598 
599 static void
600 tcpcreate(Conv *c)
601 {
602 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
603 	c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
604 }
605 
606 static void
607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
608 {
609 	if(newstate != TcptimerON){
610 		if(t->state == TcptimerON){
611 			/* unchain */
612 			if(priv->timers == t){
613 				priv->timers = t->next;
614 				if(t->prev != nil)
615 					panic("timerstate1");
616 			}
617 			if(t->next)
618 				t->next->prev = t->prev;
619 			if(t->prev)
620 				t->prev->next = t->next;
621 			t->next = t->prev = nil;
622 		}
623 	} else {
624 		if(t->state != TcptimerON){
625 			/* chain */
626 			if(t->prev != nil || t->next != nil)
627 				panic("timerstate2");
628 			t->prev = nil;
629 			t->next = priv->timers;
630 			if(t->next)
631 				t->next->prev = t;
632 			priv->timers = t;
633 		}
634 	}
635 	t->state = newstate;
636 }
637 
638 void
639 tcpackproc(void *a)
640 {
641 	Tcptimer *t, *tp, *timeo;
642 	Proto *tcp;
643 	Tcppriv *priv;
644 	int loop;
645 
646 	tcp = a;
647 	priv = tcp->priv;
648 
649 	for(;;) {
650 		tsleep(&up->sleep, return0, 0, MSPTICK);
651 
652 		qlock(&priv->tl);
653 		timeo = nil;
654 		loop = 0;
655 		for(t = priv->timers; t != nil; t = tp) {
656 			if(loop++ > 10000)
657 				panic("tcpackproc1");
658 			tp = t->next;
659  			if(t->state == TcptimerON) {
660 				t->count--;
661 				if(t->count == 0) {
662 					timerstate(priv, t, TcptimerDONE);
663 					t->readynext = timeo;
664 					timeo = t;
665 				}
666 			}
667 		}
668 		qunlock(&priv->tl);
669 
670 		loop = 0;
671 		for(t = timeo; t != nil; t = t->readynext) {
672 			if(loop++ > 10000)
673 				panic("tcpackproc2");
674 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
675 				(*t->func)(t->arg);
676 				poperror();
677 			}
678 		}
679 
680 		limborexmit(tcp);
681 	}
682 }
683 
684 void
685 tcpgo(Tcppriv *priv, Tcptimer *t)
686 {
687 	if(t == nil || t->start == 0)
688 		return;
689 
690 	qlock(&priv->tl);
691 	t->count = t->start;
692 	timerstate(priv, t, TcptimerON);
693 	qunlock(&priv->tl);
694 }
695 
696 void
697 tcphalt(Tcppriv *priv, Tcptimer *t)
698 {
699 	if(t == nil)
700 		return;
701 
702 	qlock(&priv->tl);
703 	timerstate(priv, t, TcptimerOFF);
704 	qunlock(&priv->tl);
705 }
706 
/*
 *  Exponential backoff multiplier: 2 to the power n.
 *
 *  The exponent is clamped so the shift is always defined: a
 *  negative n yields 1, and an n too large for a positive int
 *  yields the largest power of two an int can hold.  (Shifting by
 *  a negative amount or by the width of the type is undefined.)
 *  For 0 <= n <= 30 the result is unchanged.
 */
int
backoff(int n)
{
	int maxshift;

	maxshift = 8*(int)sizeof(int) - 2;	/* keeps 1<<n positive */
	if(n < 0)
		n = 0;
	else if(n > maxshift)
		n = maxshift;
	return 1 << n;
}
712 
713 void
714 localclose(Conv *s, char *reason)	/* called with tcb locked */
715 {
716 	Tcpctl *tcb;
717 	Reseq *rp,*rp1;
718 	Tcppriv *tpriv;
719 
720 	tpriv = s->p->priv;
721 	tcb = (Tcpctl*)s->ptcl;
722 
723 	iphtrem(&tpriv->ht, s);
724 
725 	tcphalt(tpriv, &tcb->timer);
726 	tcphalt(tpriv, &tcb->rtt_timer);
727 	tcphalt(tpriv, &tcb->acktimer);
728 	tcphalt(tpriv, &tcb->katimer);
729 
730 	/* Flush reassembly queue; nothing more can arrive */
731 	for(rp = tcb->reseq; rp != nil; rp = rp1) {
732 		rp1 = rp->next;
733 		freeblist(rp->bp);
734 		free(rp);
735 	}
736 	tcb->reseq = nil;
737 
738 	if(tcb->state == Syn_sent)
739 		Fsconnected(s, reason);
740 	if(s->state == Announced)
741 		wakeup(&s->listenr);
742 
743 	qhangup(s->rq, reason);
744 	qhangup(s->wq, reason);
745 
746 	tcpsetstate(s, Closed);
747 }
748 
749 /* mtu (- TCP + IP hdr len) of 1st hop */
750 int
751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
752 {
753 	Ipifc *ifc;
754 	int mtu;
755 
756 	ifc = findipifc(tcp->f, addr, 0);
757 	switch(version){
758 	default:
759 	case V4:
760 		mtu = DEF_MSS;
761 		if(ifc != nil)
762 			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
763 		break;
764 	case V6:
765 		mtu = DEF_MSS6;
766 		if(ifc != nil)
767 			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
768 		break;
769 	}
770 	if(ifc != nil){
771 		if(ifc->mbps > 1000)
772 			*scale = HaveWS | 4;
773 		else if(ifc->mbps > 100)
774 			*scale = HaveWS | 3;
775 		else if(ifc->mbps > 10)
776 			*scale = HaveWS | 1;
777 		else
778 			*scale = HaveWS | 0;
779 	} else
780 		*scale = HaveWS | 0;
781 
782 	return mtu;
783 }
784 
785 void
786 inittcpctl(Conv *s, int mode)
787 {
788 	Tcpctl *tcb;
789 	Tcp4hdr* h4;
790 	Tcp6hdr* h6;
791 	int mss;
792 
793 	tcb = (Tcpctl*)s->ptcl;
794 
795 	memset(tcb, 0, sizeof(Tcpctl));
796 
797 	tcb->ssthresh = 65535;
798 	tcb->srtt = tcp_irtt<<LOGAGAIN;
799 	tcb->mdev = 0;
800 
801 	/* setup timers */
802 	tcb->timer.start = tcp_irtt / MSPTICK;
803 	tcb->timer.func = tcptimeout;
804 	tcb->timer.arg = s;
805 	tcb->rtt_timer.start = MAX_TIME;
806 	tcb->acktimer.start = TCP_ACK / MSPTICK;
807 	tcb->acktimer.func = tcpacktimer;
808 	tcb->acktimer.arg = s;
809 	tcb->katimer.start = DEF_KAT / MSPTICK;
810 	tcb->katimer.func = tcpkeepalive;
811 	tcb->katimer.arg = s;
812 
813 	mss = DEF_MSS;
814 
815 	/* create a prototype(pseudo) header */
816 	if(mode != TCP_LISTEN){
817 		if(ipcmp(s->laddr, IPnoaddr) == 0)
818 			findlocalip(s->p->f, s->laddr, s->raddr);
819 
820 		switch(s->ipversion){
821 		case V4:
822 			h4 = &tcb->protohdr.tcp4hdr;
823 			memset(h4, 0, sizeof(*h4));
824 			h4->proto = IP_TCPPROTO;
825 			hnputs(h4->tcpsport, s->lport);
826 			hnputs(h4->tcpdport, s->rport);
827 			v6tov4(h4->tcpsrc, s->laddr);
828 			v6tov4(h4->tcpdst, s->raddr);
829 			break;
830 		case V6:
831 			h6 = &tcb->protohdr.tcp6hdr;
832 			memset(h6, 0, sizeof(*h6));
833 			h6->proto = IP_TCPPROTO;
834 			hnputs(h6->tcpsport, s->lport);
835 			hnputs(h6->tcpdport, s->rport);
836 			ipmove(h6->tcpsrc, s->laddr);
837 			ipmove(h6->tcpdst, s->raddr);
838 			mss = DEF_MSS6;
839 			break;
840 		default:
841 			panic("inittcpctl: version %d", s->ipversion);
842 		}
843 	}
844 
845 	tcb->mss = tcb->cwind = mss;
846 
847 	/* default is no window scaling */
848 	tcb->window = QMAX;
849 	tcb->rcv.wnd = QMAX;
850 	tcb->rcv.scale = 0;
851 	tcb->snd.scale = 0;
852 	qsetlimit(s->rq, QMAX);
853 }
854 
855 /*
856  *  called with s qlocked
857  */
858 void
859 tcpstart(Conv *s, int mode)
860 {
861 	Tcpctl *tcb;
862 	Tcppriv *tpriv;
863 	char kpname[KNAMELEN];
864 
865 	tpriv = s->p->priv;
866 
867 	if(tpriv->ackprocstarted == 0){
868 		qlock(&tpriv->apl);
869 		if(tpriv->ackprocstarted == 0){
870 			sprint(kpname, "#I%dtcpack", s->p->f->dev);
871 			kproc(kpname, tcpackproc, s->p);
872 			tpriv->ackprocstarted = 1;
873 		}
874 		qunlock(&tpriv->apl);
875 	}
876 
877 	tcb = (Tcpctl*)s->ptcl;
878 
879 	inittcpctl(s, mode);
880 
881 	iphtadd(&tpriv->ht, s);
882 	switch(mode) {
883 	case TCP_LISTEN:
884 		tpriv->stats[PassiveOpens]++;
885 		tcb->flags |= CLONE;
886 		tcpsetstate(s, Listen);
887 		break;
888 
889 	case TCP_CONNECT:
890 		tpriv->stats[ActiveOpens]++;
891 		tcb->flags |= ACTIVE;
892 		tcpsndsyn(s, tcb);
893 		tcpsetstate(s, Syn_sent);
894 		tcpoutput(s);
895 		break;
896 	}
897 }
898 
899 static char*
900 tcpflag(ushort flag)
901 {
902 	static char buf[128];
903 
904 	sprint(buf, "%d", flag>>10);	/* Head len */
905 	if(flag & URG)
906 		strcat(buf, " URG");
907 	if(flag & ACK)
908 		strcat(buf, " ACK");
909 	if(flag & PSH)
910 		strcat(buf, " PSH");
911 	if(flag & RST)
912 		strcat(buf, " RST");
913 	if(flag & SYN)
914 		strcat(buf, " SYN");
915 	if(flag & FIN)
916 		strcat(buf, " FIN");
917 
918 	return buf;
919 }
920 
921 Block *
922 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
923 {
924 	int dlen;
925 	Tcp6hdr *h;
926 	ushort csum;
927 	ushort hdrlen, optpad = 0;
928 	uchar *opt;
929 
930 	hdrlen = TCP6_HDRSIZE;
931 	if(tcph->flags & SYN){
932 		if(tcph->mss)
933 			hdrlen += MSS_LENGTH;
934 		if(tcph->ws)
935 			hdrlen += WS_LENGTH;
936 		optpad = hdrlen & 3;
937 		if(optpad)
938 			optpad = 4 - optpad;
939 		hdrlen += optpad;
940 	}
941 
942 	if(data) {
943 		dlen = blocklen(data);
944 		data = padblock(data, hdrlen + TCP6_PKT);
945 		if(data == nil)
946 			return nil;
947 	}
948 	else {
949 		dlen = 0;
950 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
951 		if(data == nil)
952 			return nil;
953 		data->wp += hdrlen + TCP6_PKT;
954 	}
955 
956 	/* copy in pseudo ip header plus port numbers */
957 	h = (Tcp6hdr *)(data->rp);
958 	memmove(h, ph, TCP6_TCBPHDRSZ);
959 
960 	/* compose pseudo tcp header, do cksum calculation */
961 	hnputl(h->vcf, hdrlen + dlen);
962 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
963 	h->ttl = ph->proto;
964 
965 	/* copy in variable bits */
966 	hnputl(h->tcpseq, tcph->seq);
967 	hnputl(h->tcpack, tcph->ack);
968 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
969 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
970 	hnputs(h->tcpurg, tcph->urg);
971 
972 	if(tcph->flags & SYN){
973 		opt = h->tcpopt;
974 		if(tcph->mss != 0){
975 			*opt++ = MSSOPT;
976 			*opt++ = MSS_LENGTH;
977 			hnputs(opt, tcph->mss);
978 			opt += 2;
979 		}
980 		if(tcph->ws != 0){
981 			*opt++ = WSOPT;
982 			*opt++ = WS_LENGTH;
983 			*opt++ = tcph->ws;
984 		}
985 		while(optpad-- > 0)
986 			*opt++ = NOOPOPT;
987 	}
988 
989 	if(tcb != nil && tcb->nochecksum){
990 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
991 	} else {
992 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
993 		hnputs(h->tcpcksum, csum);
994 	}
995 
996 	/* move from pseudo header back to normal ip header */
997 	memset(h->vcf, 0, 4);
998 	h->vcf[0] = IP_VER6;
999 	hnputs(h->ploadlen, hdrlen+dlen);
1000 	h->proto = ph->proto;
1001 
1002 	return data;
1003 }
1004 
1005 Block *
1006 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1007 {
1008 	int dlen;
1009 	Tcp4hdr *h;
1010 	ushort csum;
1011 	ushort hdrlen, optpad = 0;
1012 	uchar *opt;
1013 
1014 	hdrlen = TCP4_HDRSIZE;
1015 	if(tcph->flags & SYN){
1016 		if(tcph->mss)
1017 			hdrlen += MSS_LENGTH;
1018 		if(tcph->ws)
1019 			hdrlen += WS_LENGTH;
1020 		optpad = hdrlen & 3;
1021 		if(optpad)
1022 			optpad = 4 - optpad;
1023 		hdrlen += optpad;
1024 	}
1025 
1026 	if(data) {
1027 		dlen = blocklen(data);
1028 		data = padblock(data, hdrlen + TCP4_PKT);
1029 		if(data == nil)
1030 			return nil;
1031 	}
1032 	else {
1033 		dlen = 0;
1034 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1035 		if(data == nil)
1036 			return nil;
1037 		data->wp += hdrlen + TCP4_PKT;
1038 	}
1039 
1040 	/* copy in pseudo ip header plus port numbers */
1041 	h = (Tcp4hdr *)(data->rp);
1042 	memmove(h, ph, TCP4_TCBPHDRSZ);
1043 
1044 	/* copy in variable bits */
1045 	hnputs(h->tcplen, hdrlen + dlen);
1046 	hnputl(h->tcpseq, tcph->seq);
1047 	hnputl(h->tcpack, tcph->ack);
1048 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1049 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1050 	hnputs(h->tcpurg, tcph->urg);
1051 
1052 	if(tcph->flags & SYN){
1053 		opt = h->tcpopt;
1054 		if(tcph->mss != 0){
1055 			*opt++ = MSSOPT;
1056 			*opt++ = MSS_LENGTH;
1057 			hnputs(opt, tcph->mss);
1058 			opt += 2;
1059 		}
1060 		if(tcph->ws != 0){
1061 			*opt++ = WSOPT;
1062 			*opt++ = WS_LENGTH;
1063 			*opt++ = tcph->ws;
1064 		}
1065 		while(optpad-- > 0)
1066 			*opt++ = NOOPOPT;
1067 	}
1068 
1069 	if(tcb != nil && tcb->nochecksum){
1070 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1071 	} else {
1072 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1073 		hnputs(h->tcpcksum, csum);
1074 	}
1075 
1076 	return data;
1077 }
1078 
1079 int
1080 ntohtcp6(Tcp *tcph, Block **bpp)
1081 {
1082 	Tcp6hdr *h;
1083 	uchar *optr;
1084 	ushort hdrlen;
1085 	ushort optlen;
1086 	int n;
1087 
1088 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1089 	if(*bpp == nil)
1090 		return -1;
1091 
1092 	h = (Tcp6hdr *)((*bpp)->rp);
1093 	tcph->source = nhgets(h->tcpsport);
1094 	tcph->dest = nhgets(h->tcpdport);
1095 	tcph->seq = nhgetl(h->tcpseq);
1096 	tcph->ack = nhgetl(h->tcpack);
1097 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1098 	if(hdrlen < TCP6_HDRSIZE) {
1099 		freeblist(*bpp);
1100 		return -1;
1101 	}
1102 
1103 	tcph->flags = h->tcpflag[1];
1104 	tcph->wnd = nhgets(h->tcpwin);
1105 	tcph->urg = nhgets(h->tcpurg);
1106 	tcph->mss = 0;
1107 	tcph->ws = 0;
1108 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1109 
1110 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1111 	if(*bpp == nil)
1112 		return -1;
1113 
1114 	optr = h->tcpopt;
1115 	n = hdrlen - TCP6_HDRSIZE;
1116 	while(n > 0 && *optr != EOLOPT) {
1117 		if(*optr == NOOPOPT) {
1118 			n--;
1119 			optr++;
1120 			continue;
1121 		}
1122 		optlen = optr[1];
1123 		if(optlen < 2 || optlen > n)
1124 			break;
1125 		switch(*optr) {
1126 		case MSSOPT:
1127 			if(optlen == MSS_LENGTH)
1128 				tcph->mss = nhgets(optr+2);
1129 			break;
1130 		case WSOPT:
1131 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1132 				tcph->ws = HaveWS | *(optr+2);
1133 			break;
1134 		}
1135 		n -= optlen;
1136 		optr += optlen;
1137 	}
1138 	return hdrlen;
1139 }
1140 
1141 int
1142 ntohtcp4(Tcp *tcph, Block **bpp)
1143 {
1144 	Tcp4hdr *h;
1145 	uchar *optr;
1146 	ushort hdrlen;
1147 	ushort optlen;
1148 	int n;
1149 
1150 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1151 	if(*bpp == nil)
1152 		return -1;
1153 
1154 	h = (Tcp4hdr *)((*bpp)->rp);
1155 	tcph->source = nhgets(h->tcpsport);
1156 	tcph->dest = nhgets(h->tcpdport);
1157 	tcph->seq = nhgetl(h->tcpseq);
1158 	tcph->ack = nhgetl(h->tcpack);
1159 
1160 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1161 	if(hdrlen < TCP4_HDRSIZE) {
1162 		freeblist(*bpp);
1163 		return -1;
1164 	}
1165 
1166 	tcph->flags = h->tcpflag[1];
1167 	tcph->wnd = nhgets(h->tcpwin);
1168 	tcph->urg = nhgets(h->tcpurg);
1169 	tcph->mss = 0;
1170 	tcph->ws = 0;
1171 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1172 
1173 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1174 	if(*bpp == nil)
1175 		return -1;
1176 
1177 	optr = h->tcpopt;
1178 	n = hdrlen - TCP4_HDRSIZE;
1179 	while(n > 0 && *optr != EOLOPT) {
1180 		if(*optr == NOOPOPT) {
1181 			n--;
1182 			optr++;
1183 			continue;
1184 		}
1185 		optlen = optr[1];
1186 		if(optlen < 2 || optlen > n)
1187 			break;
1188 		switch(*optr) {
1189 		case MSSOPT:
1190 			if(optlen == MSS_LENGTH)
1191 				tcph->mss = nhgets(optr+2);
1192 			break;
1193 		case WSOPT:
1194 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1195 				tcph->ws = HaveWS | *(optr+2);
1196 			break;
1197 		}
1198 		n -= optlen;
1199 		optr += optlen;
1200 	}
1201 	return hdrlen;
1202 }
1203 
1204 /*
1205  *  For outgiing calls, generate an initial sequence
1206  *  number and put a SYN on the send queue
1207  */
1208 void
1209 tcpsndsyn(Conv *s, Tcpctl *tcb)
1210 {
1211 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1212 	tcb->rttseq = tcb->iss;
1213 	tcb->snd.wl2 = tcb->iss;
1214 	tcb->snd.una = tcb->iss;
1215 	tcb->snd.ptr = tcb->rttseq;
1216 	tcb->snd.nxt = tcb->rttseq;
1217 	tcb->flgcnt++;
1218 	tcb->flags |= FORCE;
1219 	tcb->sndsyntime = NOW;
1220 
1221 	/* set desired mss and scale */
1222 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1223 }
1224 
1225 void
1226 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1227 {
1228 	Block *hbp;
1229 	uchar rflags;
1230 	Tcppriv *tpriv;
1231 	Tcp4hdr ph4;
1232 	Tcp6hdr ph6;
1233 
1234 	netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1235 
1236 	tpriv = tcp->priv;
1237 
1238 	if(seg->flags & RST)
1239 		return;
1240 
1241 	/* make pseudo header */
1242 	switch(version) {
1243 	case V4:
1244 		memset(&ph4, 0, sizeof(ph4));
1245 		ph4.vihl = IP_VER4;
1246 		v6tov4(ph4.tcpsrc, dest);
1247 		v6tov4(ph4.tcpdst, source);
1248 		ph4.proto = IP_TCPPROTO;
1249 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1250 		hnputs(ph4.tcpsport, seg->dest);
1251 		hnputs(ph4.tcpdport, seg->source);
1252 		break;
1253 	case V6:
1254 		memset(&ph6, 0, sizeof(ph6));
1255 		ph6.vcf[0] = IP_VER6;
1256 		ipmove(ph6.tcpsrc, dest);
1257 		ipmove(ph6.tcpdst, source);
1258 		ph6.proto = IP_TCPPROTO;
1259 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1260 		hnputs(ph6.tcpsport, seg->dest);
1261 		hnputs(ph6.tcpdport, seg->source);
1262 		break;
1263 	default:
1264 		panic("sndrst: version %d", version);
1265 	}
1266 
1267 	tpriv->stats[OutRsts]++;
1268 	rflags = RST;
1269 
1270 	/* convince the other end that this reset is in band */
1271 	if(seg->flags & ACK) {
1272 		seg->seq = seg->ack;
1273 		seg->ack = 0;
1274 	}
1275 	else {
1276 		rflags |= ACK;
1277 		seg->ack = seg->seq;
1278 		seg->seq = 0;
1279 		if(seg->flags & SYN)
1280 			seg->ack++;
1281 		seg->ack += length;
1282 		if(seg->flags & FIN)
1283 			seg->ack++;
1284 	}
1285 	seg->flags = rflags;
1286 	seg->wnd = 0;
1287 	seg->urg = 0;
1288 	seg->mss = 0;
1289 	seg->ws = 0;
1290 	switch(version) {
1291 	case V4:
1292 		hbp = htontcp4(seg, nil, &ph4, nil);
1293 		if(hbp == nil)
1294 			return;
1295 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1296 		break;
1297 	case V6:
1298 		hbp = htontcp6(seg, nil, &ph6, nil);
1299 		if(hbp == nil)
1300 			return;
1301 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1302 		break;
1303 	default:
1304 		panic("sndrst2: version %d", version);
1305 	}
1306 }
1307 
1308 /*
1309  *  send a reset to the remote side and close the conversation
1310  *  called with s qlocked
1311  */
1312 char*
1313 tcphangup(Conv *s)
1314 {
1315 	Tcp seg;
1316 	Tcpctl *tcb;
1317 	Block *hbp;
1318 
1319 	tcb = (Tcpctl*)s->ptcl;
1320 	if(waserror())
1321 		return commonerror();
1322 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1323 		if(!waserror()){
1324 			seg.flags = RST | ACK;
1325 			seg.ack = tcb->rcv.nxt;
1326 			tcb->rcv.una = 0;
1327 			seg.seq = tcb->snd.ptr;
1328 			seg.wnd = 0;
1329 			seg.urg = 0;
1330 			seg.mss = 0;
1331 			seg.ws = 0;
1332 			switch(s->ipversion) {
1333 			case V4:
1334 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1335 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1336 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1337 				break;
1338 			case V6:
1339 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1340 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1341 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1342 				break;
1343 			default:
1344 				panic("tcphangup: version %d", s->ipversion);
1345 			}
1346 			poperror();
1347 		}
1348 	}
1349 	localclose(s, nil);
1350 	poperror();
1351 	return nil;
1352 }
1353 
1354 /*
1355  *  (re)send a SYN ACK
1356  */
1357 int
1358 sndsynack(Proto *tcp, Limbo *lp)
1359 {
1360 	Block *hbp;
1361 	Tcp4hdr ph4;
1362 	Tcp6hdr ph6;
1363 	Tcp seg;
1364 	int scale;
1365 
1366 	/* make pseudo header */
1367 	switch(lp->version) {
1368 	case V4:
1369 		memset(&ph4, 0, sizeof(ph4));
1370 		ph4.vihl = IP_VER4;
1371 		v6tov4(ph4.tcpsrc, lp->laddr);
1372 		v6tov4(ph4.tcpdst, lp->raddr);
1373 		ph4.proto = IP_TCPPROTO;
1374 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1375 		hnputs(ph4.tcpsport, lp->lport);
1376 		hnputs(ph4.tcpdport, lp->rport);
1377 		break;
1378 	case V6:
1379 		memset(&ph6, 0, sizeof(ph6));
1380 		ph6.vcf[0] = IP_VER6;
1381 		ipmove(ph6.tcpsrc, lp->laddr);
1382 		ipmove(ph6.tcpdst, lp->raddr);
1383 		ph6.proto = IP_TCPPROTO;
1384 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1385 		hnputs(ph6.tcpsport, lp->lport);
1386 		hnputs(ph6.tcpdport, lp->rport);
1387 		break;
1388 	default:
1389 		panic("sndrst: version %d", lp->version);
1390 	}
1391 
1392 	seg.seq = lp->iss;
1393 	seg.ack = lp->irs+1;
1394 	seg.flags = SYN|ACK;
1395 	seg.urg = 0;
1396 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1397 	seg.wnd = QMAX;
1398 
1399 	/* if the other side set scale, we should too */
1400 	if(lp->rcvscale){
1401 		seg.ws = scale;
1402 		lp->sndscale = scale;
1403 	} else {
1404 		seg.ws = 0;
1405 		lp->sndscale = 0;
1406 	}
1407 
1408 	switch(lp->version) {
1409 	case V4:
1410 		hbp = htontcp4(&seg, nil, &ph4, nil);
1411 		if(hbp == nil)
1412 			return -1;
1413 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1414 		break;
1415 	case V6:
1416 		hbp = htontcp6(&seg, nil, &ph6, nil);
1417 		if(hbp == nil)
1418 			return -1;
1419 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1420 		break;
1421 	default:
1422 		panic("sndsnack: version %d", lp->version);
1423 	}
1424 	lp->lastsend = NOW;
1425 	return 0;
1426 }
1427 
/*
 *  index into the limbo hash table for address a and port p:
 *  the last two bytes of the address plus the port, masked by LHTMASK.
 *  both arguments are parenthesized so any expression may be passed.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1429 
1430 /*
1431  *  put a call into limbo and respond with a SYN ACK
1432  *
1433  *  called with proto locked
1434  */
1435 static void
1436 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1437 {
1438 	Limbo *lp, **l;
1439 	Tcppriv *tpriv;
1440 	int h;
1441 
1442 	tpriv = s->p->priv;
1443 	h = hashipa(source, seg->source);
1444 
1445 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1446 		lp = *l;
1447 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1448 			continue;
1449 		if(ipcmp(lp->raddr, source) != 0)
1450 			continue;
1451 		if(ipcmp(lp->laddr, dest) != 0)
1452 			continue;
1453 
1454 		/* each new SYN restarts the retransmits */
1455 		lp->irs = seg->seq;
1456 		break;
1457 	}
1458 	lp = *l;
1459 	if(lp == nil){
1460 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1461 			lp = tpriv->lht[h];
1462 			tpriv->lht[h] = lp->next;
1463 			lp->next = nil;
1464 		} else {
1465 			lp = malloc(sizeof(*lp));
1466 			if(lp == nil)
1467 				return;
1468 			tpriv->nlimbo++;
1469 		}
1470 		*l = lp;
1471 		lp->version = version;
1472 		ipmove(lp->laddr, dest);
1473 		ipmove(lp->raddr, source);
1474 		lp->lport = seg->dest;
1475 		lp->rport = seg->source;
1476 		lp->mss = seg->mss;
1477 		lp->rcvscale = seg->ws;
1478 		lp->irs = seg->seq;
1479 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1480 	}
1481 
1482 	if(sndsynack(s->p, lp) < 0){
1483 		*l = lp->next;
1484 		tpriv->nlimbo--;
1485 		free(lp);
1486 	}
1487 }
1488 
1489 /*
1490  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1491  */
1492 static void
1493 limborexmit(Proto *tcp)
1494 {
1495 	Tcppriv *tpriv;
1496 	Limbo **l, *lp;
1497 	int h;
1498 	int seen;
1499 	ulong now;
1500 
1501 	tpriv = tcp->priv;
1502 
1503 	if(!canqlock(tcp))
1504 		return;
1505 	seen = 0;
1506 	now = NOW;
1507 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1508 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1509 			lp = *l;
1510 			seen++;
1511 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1512 				continue;
1513 
1514 			/* time it out after 1 second */
1515 			if(++(lp->rexmits) > 5){
1516 				tpriv->nlimbo--;
1517 				*l = lp->next;
1518 				free(lp);
1519 				continue;
1520 			}
1521 
1522 			/* if we're being attacked, don't bother resending SYN ACK's */
1523 			if(tpriv->nlimbo > 100)
1524 				continue;
1525 
1526 			if(sndsynack(tcp, lp) < 0){
1527 				tpriv->nlimbo--;
1528 				*l = lp->next;
1529 				free(lp);
1530 				continue;
1531 			}
1532 
1533 			l = &lp->next;
1534 		}
1535 	}
1536 	qunlock(tcp);
1537 }
1538 
1539 /*
1540  *  lookup call in limbo.  if found, throw it out.
1541  *
1542  *  called with proto locked
1543  */
1544 static void
1545 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1546 {
1547 	Limbo *lp, **l;
1548 	int h;
1549 	Tcppriv *tpriv;
1550 
1551 	tpriv = s->p->priv;
1552 
1553 	/* find a call in limbo */
1554 	h = hashipa(src, segp->source);
1555 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1556 		lp = *l;
1557 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1558 			continue;
1559 		if(ipcmp(lp->laddr, dst) != 0)
1560 			continue;
1561 		if(ipcmp(lp->raddr, src) != 0)
1562 			continue;
1563 
1564 		/* RST can only follow the SYN */
1565 		if(segp->seq == lp->irs+1){
1566 			tpriv->nlimbo--;
1567 			*l = lp->next;
1568 			free(lp);
1569 		}
1570 		break;
1571 	}
1572 }
1573 
1574 /*
1575  *  come here when we finally get an ACK to our SYN-ACK.
1576  *  lookup call in limbo.  if found, create a new conversation
1577  *
1578  *  called with proto locked
1579  */
1580 static Conv*
1581 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1582 {
1583 	Conv *new;
1584 	Tcpctl *tcb;
1585 	Tcppriv *tpriv;
1586 	Tcp4hdr *h4;
1587 	Tcp6hdr *h6;
1588 	Limbo *lp, **l;
1589 	int h;
1590 
1591 	/* unless it's just an ack, it can't be someone coming out of limbo */
1592 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1593 		return nil;
1594 
1595 	tpriv = s->p->priv;
1596 
1597 	/* find a call in limbo */
1598 	h = hashipa(src, segp->source);
1599 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1600 		netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d",
1601 			src, segp->source, lp->raddr, lp->rport,
1602 			dst, segp->dest, lp->laddr, lp->lport,
1603 			version, lp->version
1604  		);
1605 
1606 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1607 			continue;
1608 		if(ipcmp(lp->laddr, dst) != 0)
1609 			continue;
1610 		if(ipcmp(lp->raddr, src) != 0)
1611 			continue;
1612 
1613 		/* we're assuming no data with the initial SYN */
1614 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1615 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux",
1616 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1617 			lp = nil;
1618 		} else {
1619 			tpriv->nlimbo--;
1620 			*l = lp->next;
1621 		}
1622 		break;
1623 	}
1624 	if(lp == nil)
1625 		return nil;
1626 
1627 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1628 	if(new == nil)
1629 		return nil;
1630 
1631 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1632 	tcb = (Tcpctl*)new->ptcl;
1633 	tcb->flags &= ~CLONE;
1634 	tcb->timer.arg = new;
1635 	tcb->timer.state = TcptimerOFF;
1636 	tcb->acktimer.arg = new;
1637 	tcb->acktimer.state = TcptimerOFF;
1638 	tcb->katimer.arg = new;
1639 	tcb->katimer.state = TcptimerOFF;
1640 	tcb->rtt_timer.arg = new;
1641 	tcb->rtt_timer.state = TcptimerOFF;
1642 
1643 	tcb->irs = lp->irs;
1644 	tcb->rcv.nxt = tcb->irs+1;
1645 	tcb->rcv.urg = tcb->rcv.nxt;
1646 
1647 	tcb->iss = lp->iss;
1648 	tcb->rttseq = tcb->iss;
1649 	tcb->snd.wl2 = tcb->iss;
1650 	tcb->snd.una = tcb->iss+1;
1651 	tcb->snd.ptr = tcb->iss+1;
1652 	tcb->snd.nxt = tcb->iss+1;
1653 	tcb->flgcnt = 0;
1654 	tcb->flags |= SYNACK;
1655 
1656 	/* our sending max segment size cannot be bigger than what he asked for */
1657 	if(lp->mss != 0 && lp->mss < tcb->mss)
1658 		tcb->mss = lp->mss;
1659 
1660 	/* window scaling */
1661 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1662 
1663 	/* the congestion window always starts out as a single segment */
1664 	tcb->snd.wnd = segp->wnd;
1665 	tcb->cwind = tcb->mss;
1666 
1667 	/* set initial round trip time */
1668 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1669 	tcpsynackrtt(new);
1670 
1671 	free(lp);
1672 
1673 	/* set up proto header */
1674 	switch(version){
1675 	case V4:
1676 		h4 = &tcb->protohdr.tcp4hdr;
1677 		memset(h4, 0, sizeof(*h4));
1678 		h4->proto = IP_TCPPROTO;
1679 		hnputs(h4->tcpsport, new->lport);
1680 		hnputs(h4->tcpdport, new->rport);
1681 		v6tov4(h4->tcpsrc, dst);
1682 		v6tov4(h4->tcpdst, src);
1683 		break;
1684 	case V6:
1685 		h6 = &tcb->protohdr.tcp6hdr;
1686 		memset(h6, 0, sizeof(*h6));
1687 		h6->proto = IP_TCPPROTO;
1688 		hnputs(h6->tcpsport, new->lport);
1689 		hnputs(h6->tcpdport, new->rport);
1690 		ipmove(h6->tcpsrc, dst);
1691 		ipmove(h6->tcpdst, src);
1692 		break;
1693 	default:
1694 		panic("tcpincoming: version %d", new->ipversion);
1695 	}
1696 
1697 	tcpsetstate(new, Established);
1698 
1699 	iphtadd(&tpriv->ht, new);
1700 
1701 	return new;
1702 }
1703 
/*
 *  return 1 if x lies in the closed interval [low, high], else 0.
 *  when low > high the interval is taken to wrap around the top of
 *  the number space.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return x >= low || x <= high;
	return low <= x && x <= high;
}
1717 
/* true if the difference x-y, taken as a signed int, is negative */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1723 
/* true if the difference x-y, taken as a signed int, is zero or negative */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1729 
/* true if the difference x-y, taken as a signed int, is positive */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1735 
/* true if the difference x-y, taken as a signed int, is zero or positive */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1741 
1742 /*
1743  *  use the time between the first SYN and it's ack as the
1744  *  initial round trip time
1745  */
1746 void
1747 tcpsynackrtt(Conv *s)
1748 {
1749 	Tcpctl *tcb;
1750 	int delta;
1751 	Tcppriv *tpriv;
1752 
1753 	tcb = (Tcpctl*)s->ptcl;
1754 	tpriv = s->p->priv;
1755 
1756 	delta = NOW - tcb->sndsyntime;
1757 	tcb->srtt = delta<<LOGAGAIN;
1758 	tcb->mdev = delta<<LOGDGAIN;
1759 
1760 	/* halt round trip timer */
1761 	tcphalt(tpriv, &tcb->rtt_timer);
1762 }
1763 
/*
 *  process the ack and window fields of an incoming segment for
 *  conversation s: count duplicate acks and start a fast retransmit,
 *  update the send window, grow the congestion window, take a round
 *  trip time sample, discard acked data from the write queue and
 *  advance snd.una.
 */
1764 void
1765 update(Conv *s, Tcp *seg)
1766 {
1767 	int rtt, delta;
1768 	Tcpctl *tcb;
1769 	ulong acked;
1770 	ulong expand;
1771 	Tcppriv *tpriv;
1772 
1773 	tpriv = s->p->priv;
1774 	tcb = (Tcpctl*)s->ptcl;
1775 
1776 	/* the ack is beyond snd.nxt, i.e. for something not yet sent: just force output */
1777 	if(seq_gt(seg->ack, tcb->snd.nxt)) {
1778 		tcb->flags |= FORCE;
1779 		return;
1780 	}
1781 
1782 	/* added by Dong Lin for fast retransmission */
1783 	if(seg->ack == tcb->snd.una
1784 	&& tcb->snd.una != tcb->snd.nxt
1785 	&& seg->len == 0
1786 	&& seg->wnd == tcb->snd.wnd) {
1787 
1788 		/* this is a pure ack w/o window update */
1789 		netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
1790 			tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1791 
1792 		if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1793 			/*
1794 			 *  tahoe tcp rxt the packet, half sshthresh,
1795  			 *  and set cwnd to one packet
1796 			 */
1797 			tcb->snd.recovery = 1;
			/* recovery lasts until everything sent so far has been acked */
1798 			tcb->snd.rxt = tcb->snd.nxt;
1799 			netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1800 			tcprxmit(s);
1801 		} else {
1802 			/* do reno tcp here. */
1803 		}
1804 	}
1805 
1806 	/*
1807 	 *  update window
	 *  (only for a newer ack, or the same ack offering a larger window)
1808 	 */
1809 	if(seq_gt(seg->ack, tcb->snd.wl2)
1810 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1811 		tcb->snd.wnd = seg->wnd;
1812 		tcb->snd.wl2 = seg->ack;
1813 	}
1814 
	/* nothing new is acked: the rest of this routine does not apply */
1815 	if(!seq_gt(seg->ack, tcb->snd.una)){
1816 		/*
1817 		 *  don't let us hangup if sending into a closed window and
1818 		 *  we're still getting acks
1819 		 */
1820 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1821 			tcb->backedoff = MAXBACKMS/4;
1822 		}
1823 		return;
1824 	}
1825 
1826 	/*
1827 	 *  any positive ack turns off fast rxt,
1828 	 *  (should we do new-reno on partial acks?)
1829 	 */
1830 	if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1831 		tcb->snd.dupacks = 0;
1832 		tcb->snd.recovery = 0;
1833 	} else
1834 		netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
1835 
1836 	/* Compute the new send window size */
1837 	acked = seg->ack - tcb->snd.una;
1838 
1839 	/* avoid slow start and timers for SYN acks */
1840 	if((tcb->flags & SYNACK) == 0) {
1841 		tcb->flags |= SYNACK;
		/* one acked sequence number was the SYN flag, not queued data */
1842 		acked--;
1843 		tcb->flgcnt--;
1844 		goto done;
1845 	}
1846 
1847 	/* slow start as long as we're not recovering from lost packets */
1848 	if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		/* below ssthresh grow by what was acked, at most one mss; above it by mss*mss/cwind */
1849 		if(tcb->cwind < tcb->ssthresh) {
1850 			expand = tcb->mss;
1851 			if(acked < expand)
1852 				expand = acked;
1853 		}
1854 		else
1855 			expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1856 
		/* on arithmetic wraparound, or growth past the send window, stop at snd.wnd */
1857 		if(tcb->cwind + expand < tcb->cwind)
1858 			expand = tcb->snd.wnd - tcb->cwind;
1859 		if(tcb->cwind + expand > tcb->snd.wnd)
1860 			expand = tcb->snd.wnd - tcb->cwind;
1861 		tcb->cwind += expand;
1862 	}
1863 
1864 	/* Adjust the timers according to the round trip time */
1865 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1866 		tcphalt(tpriv, &tcb->rtt_timer);
		/* no sample is taken while RETRAN is set */
1867 		if((tcb->flags&RETRAN) == 0) {
1868 			tcb->backoff = 0;
1869 			tcb->backedoff = 0;
			/* ticks used by the timer — assumes count runs down from start; TODO confirm */
1870 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1871 			if(rtt == 0)
1872 				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
			/* ticks to milliseconds */
1873 			rtt *= MSPTICK;
1874 			if(tcb->srtt == 0) {
				/* first sample: srtt and mdev are kept scaled by 2^LOGAGAIN and 2^LOGDGAIN */
1875 				tcb->srtt = rtt << LOGAGAIN;
1876 				tcb->mdev = rtt << LOGDGAIN;
1877 			} else {
1878 				delta = rtt - (tcb->srtt>>LOGAGAIN);
1879 				tcb->srtt += delta;
1880 				if(tcb->srtt <= 0)
1881 					tcb->srtt = 1;
1882 
1883 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1884 				tcb->mdev += delta;
1885 				if(tcb->mdev <= 0)
1886 					tcb->mdev = 1;
1887 			}
1888 			tcpsettimer(tcb);
1889 		}
1890 	}
1891 
1892 done:
	/* fewer bytes queued than acked: presumably the ack also covered a flag (e.g. FIN) — confirm */
1893 	if(qdiscard(s->wq, acked) < acked)
1894 		tcb->flgcnt--;
1895 
1896 	tcb->snd.una = seg->ack;
1897 	if(seq_gt(seg->ack, tcb->snd.urg))
1898 		tcb->snd.urg = seg->ack;
1899 
	/* keep the timer running only while something is still unacked */
1900 	if(tcb->snd.una != tcb->snd.nxt)
1901 		tcpgo(tpriv, &tcb->timer);
1902 	else
1903 		tcphalt(tpriv, &tcb->timer);
1904 
	/* never leave the send pointer behind acked data */
1905 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1906 		tcb->snd.ptr = tcb->snd.una;
1907 
1908 	tcb->flags &= ~RETRAN;
1909 	tcb->backoff = 0;
1910 	tcb->backedoff = 0;
1911 }
1912 
/*
 *  process one received TCP segment.  bp starts with the v4 or v6
 *  header the checksum is computed over.  the segment is checksummed,
 *  its header is parsed into seg, and it is matched to a conversation
 *  (or, on a listener, to a call in limbo).  segments that match
 *  nothing are answered with a reset; the rest go through the
 *  per-state processing below.  bp is consumed on every path.
 */
1913 void
1914 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1915 {
1916 	Tcp seg;
1917 	Tcp4hdr *h4;
1918 	Tcp6hdr *h6;
1919 	int hdrlen;
1920 	Tcpctl *tcb;
1921 	ushort length;
1922 	uchar source[IPaddrlen], dest[IPaddrlen];
1923 	Conv *s;
1924 	Fs *f;
1925 	Tcppriv *tpriv;
1926 	uchar version;
1927 
1928 	f = tcp->f;
1929 	tpriv = tcp->priv;
1930 
1931 	tpriv->stats[InSegs]++;
1932 
	/* both views of the same bytes; the version nibble decides which applies */
1933 	h4 = (Tcp4hdr*)(bp->rp);
1934 	h6 = (Tcp6hdr*)(bp->rp);
1935 
1936 	if((h4->vihl&0xF0)==IP_VER4) {
1937 		version = V4;
1938 		length = nhgets(h4->length);
1939 		v4tov6(dest, h4->tcpdst);
1940 		v4tov6(source, h4->tcpsrc);
1941 
1942 		h4->Unused = 0;
1943 		hnputs(h4->tcplen, length-TCP4_PKT);
		/* checksum skipped when Btcpck is set (presumably verified earlier — confirm) or the field is zero */
1944 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1945 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1946 			tpriv->stats[CsumErrs]++;
1947 			tpriv->stats[InErrs]++;
1948 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1949 			freeblist(bp);
1950 			return;
1951 		}
1952 
1953 		hdrlen = ntohtcp4(&seg, &bp);
1954 		if(hdrlen < 0){
1955 			tpriv->stats[HlenErrs]++;
1956 			tpriv->stats[InErrs]++;
1957 			netlog(f, Logtcp, "bad tcp hdr len\n");
1958 			return;
1959 		}
1960 
1961 		/* trim the packet to the size claimed by the datagram */
1962 		length -= hdrlen+TCP4_PKT;
1963 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1964 		if(bp == nil){
1965 			tpriv->stats[LenErrs]++;
1966 			tpriv->stats[InErrs]++;
1967 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
1968 			return;
1969 		}
1970 	}
1971 	else {
1972 		int ttl = h6->ttl;
1973 		int proto = h6->proto;
1974 
1975 		version = V6;
1976 		length = nhgets(h6->ploadlen);
1977 		ipmove(dest, h6->tcpdst);
1978 		ipmove(source, h6->tcpsrc);
1979 
		/* header fields are overwritten for the checksum; ttl, proto and ploadlen are restored after */
1980 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1981 		h6->ttl = proto;
1982 		hnputl(h6->vcf, length);
1983 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1984 			ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) {
1985 			tpriv->stats[CsumErrs]++;
1986 			tpriv->stats[InErrs]++;
1987 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1988 			freeblist(bp);
1989 			return;
1990 		}
1991 		h6->ttl = ttl;
1992 		h6->proto = proto;
1993 		hnputs(h6->ploadlen, length);
1994 
1995 		hdrlen = ntohtcp6(&seg, &bp);
1996 		if(hdrlen < 0){
1997 			tpriv->stats[HlenErrs]++;
1998 			tpriv->stats[InErrs]++;
1999 			netlog(f, Logtcp, "bad tcp hdr len\n");
2000 			return;
2001 		}
2002 
2003 		/* trim the packet to the size claimed by the datagram */
2004 		length -= hdrlen;
2005 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2006 		if(bp == nil){
2007 			tpriv->stats[LenErrs]++;
2008 			tpriv->stats[InErrs]++;
2009 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2010 			return;
2011 		}
2012 	}
2013 
	/* from here on length is the number of data bytes in the segment */
2014 	/* lock protocol while searching for a conversation */
2015 	qlock(tcp);
2016 
2017 	/* Look for a matching conversation */
2018 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2019 	if(s == nil){
2020 		netlog(f, Logtcp, "iphtlook failed");
		/* also reached by goto when a listener has no matching call in limbo */
2021 reset:
2022 		qunlock(tcp);
2023 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2024 		freeblist(bp);
2025 		return;
2026 	}
2027 
2028 	/* if it's a listener, look for the right flags and get a new conv */
2029 	tcb = (Tcpctl*)s->ptcl;
2030 	if(tcb->state == Listen){
2031 		if(seg.flags & RST){
2032 			limborst(s, &seg, source, dest, version);
2033 			qunlock(tcp);
2034 			freeblist(bp);
2035 			return;
2036 		}
2037 
2038 		/* if this is a new SYN, put the call into limbo */
2039 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2040 			limbo(s, source, dest, &seg, version);
2041 			qunlock(tcp);
2042 			freeblist(bp);
2043 			return;
2044 		}
2045 
2046 		/*
2047 		 *  if there's a matching call in limbo, tcpincoming will
2048 		 *  return it in state Syn_received
2049 		 */
		/* NOTE(review): tcpincoming in this file sets the new conversation to Established, not Syn_received */
2050 		s = tcpincoming(s, &seg, source, dest, version);
2051 		if(s == nil)
2052 			goto reset;
2053 	}
2054 
2055 	/* The rest of the input state machine is run with the control block
2056 	 * locked and implements the state machine directly out of the RFC.
2057 	 * Out-of-band data is ignored - it was always a bad idea.
2058 	 */
2059 	tcb = (Tcpctl*)s->ptcl;
	/* if an error is raised below, release the conversation lock before passing it on */
2060 	if(waserror()){
2061 		qunlock(s);
2062 		nexterror();
2063 	}
	/* take the conversation lock before letting go of the protocol lock */
2064 	qlock(s);
2065 	qunlock(tcp);
2066 
2067 	/* fix up window */
2068 	seg.wnd <<= tcb->rcv.scale;
2069 
2070 	/* every input packet in puts off the keep alive time out */
2071 	tcpsetkacounter(tcb);
2072 
2073 	switch(tcb->state) {
2074 	case Closed:
2075 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2076 		goto raise;
2077 	case Syn_sent:
2078 		if(seg.flags & ACK) {
2079 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2080 				sndrst(tcp, source, dest, length, &seg, version,
2081 					 "bad seq in Syn_sent");
2082 				goto raise;
2083 			}
2084 		}
2085 		if(seg.flags & RST) {
			/* only a reset that also carries an ack closes the conversation */
2086 			if(seg.flags & ACK)
2087 				localclose(s, Econrefused);
2088 			goto raise;
2089 		}
2090 
2091 		if(seg.flags & SYN) {
2092 			procsyn(s, &seg);
2093 			if(seg.flags & ACK){
2094 				update(s, &seg);
2095 				tcpsynackrtt(s);
2096 				tcpsetstate(s, Established);
2097 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2098 			}
2099 			else {
2100 				tcb->time = NOW;
2101 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2102 			}
2103 
			/* data or FIN came with the SYN: carry on with the processing after this switch */
2104 			if(length != 0 || (seg.flags & FIN))
2105 				break;
2106 
2107 			freeblist(bp);
2108 			goto output;
2109 		}
2110 		else
2111 			freeblist(bp);
2112 
2113 		qunlock(s);
2114 		poperror();
2115 		return;
2116 	case Syn_received:
2117 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2118 		if(seg.flags & ACK)
2119 			tcpsynackrtt(s);
2120 		break;
2121 	}
2122 
2123 	/*
2124 	 *  One DOS attack is to open connections to us and then forget about them,
2125 	 *  thereby tying up a conv at no long term cost to the attacker.
2126 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2127 	 *  corresponding code in tcpsendka().
2128 	 */
2129 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2130 		if(tcpporthogdefense
2131 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2132 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2133 				source, seg.source, dest, seg.dest, seg.flags,
2134 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2135 			localclose(s, "stateless hog");
2136 		}
2137 	}
2138 
2139 	/* Cut the data to fit the receive window */
2140 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2141 		netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
		/* the segment's data is unusable, but its ack is still processed */
2142 		update(s, &seg);
2143 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2144 			tcphalt(tpriv, &tcb->rtt_timer);
2145 			tcphalt(tpriv, &tcb->acktimer);
2146 			tcphalt(tpriv, &tcb->katimer);
2147 			tcpsetstate(s, Time_wait);
2148 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2149 			tcpgo(tpriv, &tcb->timer);
2150 		}
2151 		if(!(seg.flags & RST)) {
2152 			tcb->flags |= FORCE;
2153 			goto output;
2154 		}
2155 		qunlock(s);
2156 		poperror();
2157 		return;
2158 	}
2159 
2160 	/* Cannot accept so answer with a rst */
2161 	if(length && tcb->state == Closed) {
2162 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2163 		goto raise;
2164 	}
2165 
2166 	/* The segment is beyond the current receive pointer so
2167 	 * queue the data in the resequence queue
2168 	 */
2169 	if(seg.seq != tcb->rcv.nxt)
2170 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2171 		update(s, &seg);
2172 		if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2173 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2174 		tcb->flags |= FORCE;
2175 		goto output;
2176 	}
2177 
2178 	/*
2179 	 *  keep looping till we've processed this packet plus any
2180 	 *  adjacent packets in the resequence queue
2181 	 */
2182 	for(;;) {
2183 		if(seg.flags & RST) {
2184 			if(tcb->state == Established) {
2185 				tpriv->stats[EstabResets]++;
2186 				if(tcb->rcv.nxt != seg.seq)
2187 					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2188 			}
2189 			localclose(s, Econrefused);
2190 			goto raise;
2191 		}
2192 
		/* everything below requires an ack */
2193 		if((seg.flags&ACK) == 0)
2194 			goto raise;
2195 
2196 		switch(tcb->state) {
2197 		case Syn_received:
2198 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2199 				sndrst(tcp, source, dest, length, &seg, version,
2200 					"bad seq in Syn_received");
2201 				goto raise;
2202 			}
2203 			update(s, &seg);
2204 			tcpsetstate(s, Established);
			/* no break: falls into Established, where update is called a second time */
2205 		case Established:
2206 		case Close_wait:
2207 			update(s, &seg);
2208 			break;
2209 		case Finwait1:
2210 			update(s, &seg);
			/* nothing left unacked: move on to Finwait2 */
2211 			if(qlen(s->wq)+tcb->flgcnt == 0){
2212 				tcphalt(tpriv, &tcb->rtt_timer);
2213 				tcphalt(tpriv, &tcb->acktimer);
2214 				tcpsetkacounter(tcb);
2215 				tcb->time = NOW;
2216 				tcpsetstate(s, Finwait2);
2217 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2218 				tcpgo(tpriv, &tcb->katimer);
2219 			}
2220 			break;
2221 		case Finwait2:
2222 			update(s, &seg);
2223 			break;
2224 		case Closing:
2225 			update(s, &seg);
2226 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2227 				tcphalt(tpriv, &tcb->rtt_timer);
2228 				tcphalt(tpriv, &tcb->acktimer);
2229 				tcphalt(tpriv, &tcb->katimer);
2230 				tcpsetstate(s, Time_wait);
2231 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2232 				tcpgo(tpriv, &tcb->timer);
2233 			}
2234 			break;
2235 		case Last_ack:
2236 			update(s, &seg);
2237 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2238 				localclose(s, nil);
2239 				goto raise;
2240 			}
			/* no break: when something is still unacked, falls into Time_wait */
2241 		case Time_wait:
2242 			tcb->flags |= FORCE;
2243 			if(tcb->timer.state != TcptimerON)
2244 				tcpgo(tpriv, &tcb->timer);
2245 		}
2246 
2247 		if((seg.flags&URG) && seg.urg) {
2248 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2249 				tcb->rcv.urg = seg.urg + seg.seq;
2250 				pullblock(&bp, seg.urg);
2251 			}
2252 		}
2253 		else
2254 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2255 			tcb->rcv.urg = tcb->rcv.nxt;
2256 
2257 		if(length == 0) {
2258 			if(bp != nil)
2259 				freeblist(bp);
2260 		}
2261 		else {
2262 			switch(tcb->state){
2263 			default:
2264 				/* Ignore segment text */
2265 				if(bp != nil)
2266 					freeblist(bp);
2267 				break;
2268 
2269 			case Syn_received:
2270 			case Established:
2271 			case Finwait1:
2272 				/* If we still have some data place on
2273 				 * receive queue
2274 				 */
2275 				if(bp) {
2276 					bp = packblock(bp);
2277 					if(bp == nil)
2278 						panic("tcp packblock");
2279 					qpassnolim(s->rq, bp);
2280 					bp = nil;
2281 
2282 					/*
2283 					 *  Force an ack every 2 data messages.  This is
2284 					 *  a hack for rob to make his home system run
2285 					 *  faster.
2286 					 *
2287 					 *  this also keeps the standard TCP congestion
2288 					 *  control working since it needs an ack every
2289 					 *  2 max segs worth.  This is not quite that,
2290 					 *  but under a real stream is equivalent since
2291 					 *  every packet has a max seg in it.
2292 					 */
2293 					if(++(tcb->rcv.una) >= 2)
2294 						tcb->flags |= FORCE;
2295 				}
2296 				tcb->rcv.nxt += length;
2297 
2298 				/*
2299 				 *  update our rcv window
2300 				 */
2301 				tcprcvwin(s);
2302 
2303 				/*
2304 				 *  turn on the acktimer if there's something
2305 				 *  to ack
2306 				 */
2307 				if(tcb->acktimer.state != TcptimerON)
2308 					tcpgo(tpriv, &tcb->acktimer);
2309 
2310 				break;
2311 			case Finwait2:
2312 				/* no process to read the data, send a reset */
2313 				if(bp != nil)
2314 					freeblist(bp);
2315 				sndrst(tcp, source, dest, length, &seg, version,
2316 					"send to Finwait2");
2317 				qunlock(s);
2318 				poperror();
2319 				return;
2320 			}
2321 		}
2322 
2323 		if(seg.flags & FIN) {
2324 			tcb->flags |= FORCE;
2325 
			/* the FIN takes one sequence number, hence rcv.nxt++ in the states that accept it */
2326 			switch(tcb->state) {
2327 			case Syn_received:
2328 			case Established:
2329 				tcb->rcv.nxt++;
2330 				tcpsetstate(s, Close_wait);
2331 				break;
2332 			case Finwait1:
2333 				tcb->rcv.nxt++;
2334 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2335 					tcphalt(tpriv, &tcb->rtt_timer);
2336 					tcphalt(tpriv, &tcb->acktimer);
2337 					tcphalt(tpriv, &tcb->katimer);
2338 					tcpsetstate(s, Time_wait);
2339 					tcb->timer.start = MSL2*(1000/MSPTICK);
2340 					tcpgo(tpriv, &tcb->timer);
2341 				}
2342 				else
2343 					tcpsetstate(s, Closing);
2344 				break;
2345 			case Finwait2:
2346 				tcb->rcv.nxt++;
2347 				tcphalt(tpriv, &tcb->rtt_timer);
2348 				tcphalt(tpriv, &tcb->acktimer);
2349 				tcphalt(tpriv, &tcb->katimer);
2350 				tcpsetstate(s, Time_wait);
2351 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2352 				tcpgo(tpriv, &tcb->timer);
2353 				break;
2354 			case Close_wait:
2355 			case Closing:
2356 			case Last_ack:
2357 				break;
2358 			case Time_wait:
2359 				tcpgo(tpriv, &tcb->timer);
2360 				break;
2361 			}
2362 		}
2363 
2364 		/*
2365 		 *  get next adjacent segment from the resequence queue.
2366 		 *  dump/trim any overlapping segments
2367 		 */
2368 		for(;;) {
2369 			if(tcb->reseq == nil)
2370 				goto output;
2371 
			/* the next queued segment starts beyond rcv.nxt: there is still a gap */
2372 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2373 				goto output;
2374 
2375 			getreseq(tcb, &seg, &bp, &length);
2376 
2377 			if(tcptrim(tcb, &seg, &bp, &length) == 0)
2378 				break;
2379 		}
2380 	}
	/* normal exit: send whatever is due, then unlock */
2381 output:
2382 	tcpoutput(s);
2383 	qunlock(s);
2384 	poperror();
2385 	return;
	/* drop exit: unlock, free the segment and kick the conversation */
2386 raise:
2387 	qunlock(s);
2388 	poperror();
2389 	freeblist(bp);
2390 	tcpkick(s);
2391 }
2392 
2393 /*
2394  *  always enters and exits with the s locked.  We drop
2395  *  the lock to ipoput the packet so some care has to be
2396  *  taken by callers.
2397  */
2398 void
2399 tcpoutput(Conv *s)
2400 {
2401 	Tcp seg;
2402 	int msgs;
2403 	Tcpctl *tcb;
2404 	Block *hbp, *bp;
2405 	int sndcnt, n;
2406 	ulong ssize, dsize, usable, sent;
2407 	Fs *f;
2408 	Tcppriv *tpriv;
2409 	uchar version;
2410 
2411 	f = s->p->f;
2412 	tpriv = s->p->priv;
2413 	version = s->ipversion;
2414 
2415 	for(msgs = 0; msgs < 100; msgs++) {
2416 		tcb = (Tcpctl*)s->ptcl;
2417 
2418 		switch(tcb->state) {
2419 		case Listen:
2420 		case Closed:
2421 		case Finwait2:
2422 			return;
2423 		}
2424 
2425 		/* force an ack when a window has opened up */
2426 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2427 			tcb->rcv.blocked = 0;
2428 			tcb->flags |= FORCE;
2429 		}
2430 
2431 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2432 		sent = tcb->snd.ptr - tcb->snd.una;
2433 
2434 		/* Don't send anything else until our SYN has been acked */
2435 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2436 			break;
2437 
2438 		/* Compute usable segment based on offered window and limit
2439 		 * window probes to one
2440 		 */
2441 		if(tcb->snd.wnd == 0){
2442 			if(sent != 0) {
2443 				if((tcb->flags&FORCE) == 0)
2444 					break;
2445 //				tcb->snd.ptr = tcb->snd.una;
2446 			}
2447 			usable = 1;
2448 		}
2449 		else {
2450 			usable = tcb->cwind;
2451 			if(tcb->snd.wnd < usable)
2452 				usable = tcb->snd.wnd;
2453 			usable -= sent;
2454 		}
2455 		ssize = sndcnt-sent;
2456 		if(ssize && usable < 2)
2457 			netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2458 				tcb->snd.wnd, tcb->cwind);
2459 		if(usable < ssize)
2460 			ssize = usable;
2461 		if(tcb->mss < ssize)
2462 			ssize = tcb->mss;
2463 		dsize = ssize;
2464 		seg.urg = 0;
2465 
2466 		if(ssize == 0)
2467 		if((tcb->flags&FORCE) == 0)
2468 			break;
2469 
2470 		tcb->flags &= ~FORCE;
2471 		tcprcvwin(s);
2472 
2473 		/* By default we will generate an ack */
2474 		tcphalt(tpriv, &tcb->acktimer);
2475 		tcb->rcv.una = 0;
2476 		seg.source = s->lport;
2477 		seg.dest = s->rport;
2478 		seg.flags = ACK;
2479 		seg.mss = 0;
2480 		seg.ws = 0;
2481 		switch(tcb->state){
2482 		case Syn_sent:
2483 			seg.flags = 0;
2484 			if(tcb->snd.ptr == tcb->iss){
2485 				seg.flags |= SYN;
2486 				dsize--;
2487 				seg.mss = tcb->mss;
2488 				seg.ws = tcb->scale;
2489 			}
2490 			break;
2491 		case Syn_received:
2492 			/*
2493 			 *  don't send any data with a SYN/ACK packet
2494 			 *  because Linux rejects the packet in its
2495 			 *  attempt to solve the SYN attack problem
2496 			 */
2497 			if(tcb->snd.ptr == tcb->iss){
2498 				seg.flags |= SYN;
2499 				dsize = 0;
2500 				ssize = 1;
2501 				seg.mss = tcb->mss;
2502 				seg.ws = tcb->scale;
2503 			}
2504 			break;
2505 		}
2506 		seg.seq = tcb->snd.ptr;
2507 		seg.ack = tcb->rcv.nxt;
2508 		seg.wnd = tcb->rcv.wnd;
2509 
2510 		/* Pull out data to send */
2511 		bp = nil;
2512 		if(dsize != 0) {
2513 			bp = qcopy(s->wq, dsize, sent);
2514 			if(BLEN(bp) != dsize) {
2515 				seg.flags |= FIN;
2516 				dsize--;
2517 			}
2518 		}
2519 
2520 		if(sent+dsize == sndcnt)
2521 			seg.flags |= PSH;
2522 
2523 		/* keep track of balance of resent data */
2524 		if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2525 			n = tcb->snd.nxt - tcb->snd.ptr;
2526 			if(ssize < n)
2527 				n = ssize;
2528 			tcb->resent += n;
2529 			netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
2530 				s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2531 			tpriv->stats[RetransSegs]++;
2532 		}
2533 
2534 		tcb->snd.ptr += ssize;
2535 
2536 		/* Pull up the send pointer so we can accept acks
2537 		 * for this window
2538 		 */
2539 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2540 			tcb->snd.nxt = tcb->snd.ptr;
2541 
2542 		/* Build header, link data and compute cksum */
2543 		switch(version){
2544 		case V4:
2545 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2546 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2547 			if(hbp == nil) {
2548 				freeblist(bp);
2549 				return;
2550 			}
2551 			break;
2552 		case V6:
2553 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2554 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2555 			if(hbp == nil) {
2556 				freeblist(bp);
2557 				return;
2558 			}
2559 			break;
2560 		default:
2561 			hbp = nil;	/* to suppress a warning */
2562 			panic("tcpoutput: version %d", version);
2563 		}
2564 
2565 		/* Start the transmission timers if there is new data and we
2566 		 * expect acknowledges
2567 		 */
2568 		if(ssize != 0){
2569 			if(tcb->timer.state != TcptimerON)
2570 				tcpgo(tpriv, &tcb->timer);
2571 
2572 			/*  If round trip timer isn't running, start it.
2573 			 *  measure the longest packet only in case the
2574 			 *  transmission time dominates RTT
2575 			 */
2576 			if(tcb->rtt_timer.state != TcptimerON)
2577 			if(ssize == tcb->mss) {
2578 				tcpgo(tpriv, &tcb->rtt_timer);
2579 				tcb->rttseq = tcb->snd.ptr;
2580 			}
2581 		}
2582 
2583 		tpriv->stats[OutSegs]++;
2584 
2585 		/* put off the next keep alive */
2586 		tcpgo(tpriv, &tcb->katimer);
2587 
2588 		switch(version){
2589 		case V4:
2590 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2591 				/* a negative return means no route */
2592 				localclose(s, "no route");
2593 			}
2594 			break;
2595 		case V6:
2596 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2597 				/* a negative return means no route */
2598 				localclose(s, "no route");
2599 			}
2600 			break;
2601 		default:
2602 			panic("tcpoutput2: version %d", version);
2603 		}
2604 		if((msgs%4) == 1){
2605 			qunlock(s);
2606 			sched();
2607 			qlock(s);
2608 		}
2609 	}
2610 }
2611 
2612 /*
2613  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2614  */
2615 void
2616 tcpsendka(Conv *s)
2617 {
2618 	Tcp seg;
2619 	Tcpctl *tcb;
2620 	Block *hbp,*dbp;
2621 
2622 	tcb = (Tcpctl*)s->ptcl;
2623 
2624 	dbp = nil;
2625 	seg.urg = 0;
2626 	seg.source = s->lport;
2627 	seg.dest = s->rport;
2628 	seg.flags = ACK|PSH;
2629 	seg.mss = 0;
2630 	seg.ws = 0;
2631 	if(tcpporthogdefense)
2632 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2633 	else
2634 		seg.seq = tcb->snd.una-1;
2635 	seg.ack = tcb->rcv.nxt;
2636 	tcb->rcv.una = 0;
2637 	seg.wnd = tcb->rcv.wnd;
2638 	if(tcb->state == Finwait2){
2639 		seg.flags |= FIN;
2640 	} else {
2641 		dbp = allocb(1);
2642 		dbp->wp++;
2643 	}
2644 
2645 	if(isv4(s->raddr)) {
2646 		/* Build header, link data and compute cksum */
2647 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2648 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2649 		if(hbp == nil) {
2650 			freeblist(dbp);
2651 			return;
2652 		}
2653 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2654 	}
2655 	else {
2656 		/* Build header, link data and compute cksum */
2657 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2658 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2659 		if(hbp == nil) {
2660 			freeblist(dbp);
2661 			return;
2662 		}
2663 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2664 	}
2665 }
2666 
2667 /*
2668  *  set connection to time out after 12 minutes
2669  */
2670 void
2671 tcpsetkacounter(Tcpctl *tcb)
2672 {
2673 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2674 	if(tcb->kacounter < 3)
2675 		tcb->kacounter = 3;
2676 }
2677 
2678 /*
2679  *  if we've timed out, close the connection
2680  *  otherwise, send a keepalive and restart the timer
2681  */
2682 void
2683 tcpkeepalive(void *v)
2684 {
2685 	Tcpctl *tcb;
2686 	Conv *s;
2687 
2688 	s = v;
2689 	tcb = (Tcpctl*)s->ptcl;
2690 	if(waserror()){
2691 		qunlock(s);
2692 		nexterror();
2693 	}
2694 	qlock(s);
2695 	if(tcb->state != Closed){
2696 		if(--(tcb->kacounter) <= 0) {
2697 			localclose(s, Etimedout);
2698 		} else {
2699 			tcpsendka(s);
2700 			tcpgo(s->p->priv, &tcb->katimer);
2701 		}
2702 	}
2703 	qunlock(s);
2704 	poperror();
2705 }
2706 
2707 /*
2708  *  start keepalive timer
2709  */
2710 char*
2711 tcpstartka(Conv *s, char **f, int n)
2712 {
2713 	Tcpctl *tcb;
2714 	int x;
2715 
2716 	tcb = (Tcpctl*)s->ptcl;
2717 	if(tcb->state != Established)
2718 		return "connection must be in Establised state";
2719 	if(n > 1){
2720 		x = atoi(f[1]);
2721 		if(x >= MSPTICK)
2722 			tcb->katimer.start = x/MSPTICK;
2723 	}
2724 	tcpsetkacounter(tcb);
2725 	tcpgo(s->p->priv, &tcb->katimer);
2726 
2727 	return nil;
2728 }
2729 
2730 /*
2731  *  turn checksums on/off
2732  */
2733 char*
2734 tcpsetchecksum(Conv *s, char **f, int)
2735 {
2736 	Tcpctl *tcb;
2737 
2738 	tcb = (Tcpctl*)s->ptcl;
2739 	tcb->nochecksum = !atoi(f[1]);
2740 
2741 	return nil;
2742 }
2743 
2744 void
2745 tcprxmit(Conv *s)
2746 {
2747 	Tcpctl *tcb;
2748 
2749 	tcb = (Tcpctl*)s->ptcl;
2750 
2751 	tcb->flags |= RETRAN|FORCE;
2752 	tcb->snd.ptr = tcb->snd.una;
2753 
2754 	/*
2755 	 *  We should be halving the slow start threshhold (down to one
2756 	 *  mss) but leaving it at mss seems to work well enough
2757 	 */
2758  	tcb->ssthresh = tcb->mss;
2759 
2760 	/*
2761 	 *  pull window down to a single packet
2762 	 */
2763 	tcb->cwind = tcb->mss;
2764 	tcpoutput(s);
2765 }
2766 
2767 void
2768 tcptimeout(void *arg)
2769 {
2770 	Conv *s;
2771 	Tcpctl *tcb;
2772 	int maxback;
2773 	Tcppriv *tpriv;
2774 
2775 	s = (Conv*)arg;
2776 	tpriv = s->p->priv;
2777 	tcb = (Tcpctl*)s->ptcl;
2778 
2779 	if(waserror()){
2780 		qunlock(s);
2781 		nexterror();
2782 	}
2783 	qlock(s);
2784 	switch(tcb->state){
2785 	default:
2786 		tcb->backoff++;
2787 		if(tcb->state == Syn_sent)
2788 			maxback = MAXBACKMS/2;
2789 		else
2790 			maxback = MAXBACKMS;
2791 		tcb->backedoff += tcb->timer.start * MSPTICK;
2792 		if(tcb->backedoff >= maxback) {
2793 			localclose(s, Etimedout);
2794 			break;
2795 		}
2796 		netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
2797 		tcpsettimer(tcb);
2798 		tcprxmit(s);
2799 		tpriv->stats[RetransTimeouts]++;
2800 		tcb->snd.dupacks = 0;
2801 		break;
2802 	case Time_wait:
2803 		localclose(s, nil);
2804 		break;
2805 	case Closed:
2806 		break;
2807 	}
2808 	qunlock(s);
2809 	poperror();
2810 }
2811 
2812 int
2813 inwindow(Tcpctl *tcb, int seq)
2814 {
2815 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2816 }
2817 
2818 /*
2819  *  set up state for a received SYN (or SYN ACK) packet
2820  */
2821 void
2822 procsyn(Conv *s, Tcp *seg)
2823 {
2824 	Tcpctl *tcb;
2825 
2826 	tcb = (Tcpctl*)s->ptcl;
2827 	tcb->flags |= FORCE;
2828 
2829 	tcb->rcv.nxt = seg->seq + 1;
2830 	tcb->rcv.urg = tcb->rcv.nxt;
2831 	tcb->irs = seg->seq;
2832 
2833 	/* our sending max segment size cannot be bigger than what he asked for */
2834 	if(seg->mss != 0 && seg->mss < tcb->mss)
2835 		tcb->mss = seg->mss;
2836 
2837 	/* the congestion window always starts out as a single segment */
2838 	tcb->snd.wnd = seg->wnd;
2839 	tcb->cwind = tcb->mss;
2840 }
2841 
2842 int
2843 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2844 {
2845 	Reseq *rp, *rp1;
2846 	int i, rqlen, qmax;
2847 
2848 	rp = malloc(sizeof(Reseq));
2849 	if(rp == nil){
2850 		freeblist(bp);	/* bp always consumed by add_reseq */
2851 		return 0;
2852 	}
2853 
2854 	rp->seg = *seg;
2855 	rp->bp = bp;
2856 	rp->length = length;
2857 
2858 	/* Place on reassembly list sorting by starting seq number */
2859 	rp1 = tcb->reseq;
2860 	if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2861 		rp->next = rp1;
2862 		tcb->reseq = rp;
2863 		if(rp->next != nil)
2864 			tpriv->stats[OutOfOrder]++;
2865 		return 0;
2866 	}
2867 
2868 	rqlen = 0;
2869 	for(i = 0;; i++) {
2870 		rqlen += rp1->length;
2871 		if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2872 			rp->next = rp1->next;
2873 			rp1->next = rp;
2874 			if(rp->next != nil)
2875 				tpriv->stats[OutOfOrder]++;
2876 			break;
2877 		}
2878 		rp1 = rp1->next;
2879 	}
2880 	qmax = QMAX<<tcb->rcv.scale;
2881 	if(rqlen > qmax){
2882 		print("resequence queue > window: %d > %d\n", rqlen, qmax);
2883 		i = 0;
2884 	  	for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2885 	  		print("%#lux %#lux %#ux\n", rp1->seg.seq,
2886 	  			rp1->seg.ack, rp1->seg.flags);
2887 			if(i++ > 10){
2888 				print("...\n");
2889 				break;
2890 			}
2891 		}
2892 
2893 		/*
2894 		 * delete entire reassembly queue; wait for retransmit.
2895 		 * - should we be smarter and only delete the tail?
2896 		 */
2897 		for(rp = tcb->reseq; rp != nil; rp = rp1){
2898 			rp1 = rp->next;
2899 			freeblist(rp->bp);
2900 			free(rp);
2901 		}
2902 		tcb->reseq = nil;
2903 
2904 	  	return -1;
2905 	}
2906 	return 0;
2907 }
2908 
2909 void
2910 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2911 {
2912 	Reseq *rp;
2913 
2914 	rp = tcb->reseq;
2915 	if(rp == nil)
2916 		return;
2917 
2918 	tcb->reseq = rp->next;
2919 
2920 	*seg = rp->seg;
2921 	*bp = rp->bp;
2922 	*length = rp->length;
2923 
2924 	free(rp);
2925 }
2926 
2927 int
2928 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2929 {
2930 	ushort len;
2931 	uchar accept;
2932 	int dupcnt, excess;
2933 
2934 	accept = 0;
2935 	len = *length;
2936 	if(seg->flags & SYN)
2937 		len++;
2938 	if(seg->flags & FIN)
2939 		len++;
2940 
2941 	if(tcb->rcv.wnd == 0) {
2942 		if(len == 0 && seg->seq == tcb->rcv.nxt)
2943 			return 0;
2944 	}
2945 	else {
2946 		/* Some part of the segment should be in the window */
2947 		if(inwindow(tcb,seg->seq))
2948 			accept++;
2949 		else
2950 		if(len != 0) {
2951 			if(inwindow(tcb, seg->seq+len-1) ||
2952 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2953 				accept++;
2954 		}
2955 	}
2956 	if(!accept) {
2957 		freeblist(*bp);
2958 		return -1;
2959 	}
2960 	dupcnt = tcb->rcv.nxt - seg->seq;
2961 	if(dupcnt > 0){
2962 		tcb->rerecv += dupcnt;
2963 		if(seg->flags & SYN){
2964 			seg->flags &= ~SYN;
2965 			seg->seq++;
2966 
2967 			if(seg->urg > 1)
2968 				seg->urg--;
2969 			else
2970 				seg->flags &= ~URG;
2971 			dupcnt--;
2972 		}
2973 		if(dupcnt > 0){
2974 			pullblock(bp, (ushort)dupcnt);
2975 			seg->seq += dupcnt;
2976 			*length -= dupcnt;
2977 
2978 			if(seg->urg > dupcnt)
2979 				seg->urg -= dupcnt;
2980 			else {
2981 				seg->flags &= ~URG;
2982 				seg->urg = 0;
2983 			}
2984 		}
2985 	}
2986 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
2987 	if(excess > 0) {
2988 		tcb->rerecv += excess;
2989 		*length -= excess;
2990 		*bp = trimblock(*bp, 0, *length);
2991 		if(*bp == nil)
2992 			panic("presotto is a boofhead");
2993 		seg->flags &= ~FIN;
2994 	}
2995 	return 0;
2996 }
2997 
2998 void
2999 tcpadvise(Proto *tcp, Block *bp, char *msg)
3000 {
3001 	Tcp4hdr *h4;
3002 	Tcp6hdr *h6;
3003 	Tcpctl *tcb;
3004 	uchar source[IPaddrlen];
3005 	uchar dest[IPaddrlen];
3006 	ushort psource, pdest;
3007 	Conv *s, **p;
3008 
3009 	h4 = (Tcp4hdr*)(bp->rp);
3010 	h6 = (Tcp6hdr*)(bp->rp);
3011 
3012 	if((h4->vihl&0xF0)==IP_VER4) {
3013 		v4tov6(dest, h4->tcpdst);
3014 		v4tov6(source, h4->tcpsrc);
3015 		psource = nhgets(h4->tcpsport);
3016 		pdest = nhgets(h4->tcpdport);
3017 	}
3018 	else {
3019 		ipmove(dest, h6->tcpdst);
3020 		ipmove(source, h6->tcpsrc);
3021 		psource = nhgets(h6->tcpsport);
3022 		pdest = nhgets(h6->tcpdport);
3023 	}
3024 
3025 	/* Look for a connection */
3026 	qlock(tcp);
3027 	for(p = tcp->conv; *p; p++) {
3028 		s = *p;
3029 		tcb = (Tcpctl*)s->ptcl;
3030 		if(s->rport == pdest)
3031 		if(s->lport == psource)
3032 		if(tcb->state != Closed)
3033 		if(ipcmp(s->raddr, dest) == 0)
3034 		if(ipcmp(s->laddr, source) == 0){
3035 			qlock(s);
3036 			qunlock(tcp);
3037 			switch(tcb->state){
3038 			case Syn_sent:
3039 				localclose(s, msg);
3040 				break;
3041 			}
3042 			qunlock(s);
3043 			freeblist(bp);
3044 			return;
3045 		}
3046 	}
3047 	qunlock(tcp);
3048 	freeblist(bp);
3049 }
3050 
3051 static char*
3052 tcpporthogdefensectl(char *val)
3053 {
3054 	if(strcmp(val, "on") == 0)
3055 		tcpporthogdefense = 1;
3056 	else if(strcmp(val, "off") == 0)
3057 		tcpporthogdefense = 0;
3058 	else
3059 		return "unknown value for tcpporthogdefense";
3060 	return nil;
3061 }
3062 
3063 /* called with c qlocked */
3064 char*
3065 tcpctl(Conv* c, char** f, int n)
3066 {
3067 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3068 		return tcphangup(c);
3069 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3070 		return tcpstartka(c, f, n);
3071 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3072 		return tcpsetchecksum(c, f, n);
3073 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3074 		return tcpporthogdefensectl(f[1]);
3075 	return "unknown control request";
3076 }
3077 
3078 int
3079 tcpstats(Proto *tcp, char *buf, int len)
3080 {
3081 	Tcppriv *priv;
3082 	char *p, *e;
3083 	int i;
3084 
3085 	priv = tcp->priv;
3086 	p = buf;
3087 	e = p+len;
3088 	for(i = 0; i < Nstats; i++)
3089 		p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
3090 	return p - buf;
3091 }
3092 
3093 /*
3094  *  garbage collect any stale conversations:
3095  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3096  *	- Finwait2 after 5 minutes
3097  *
3098  *  this is called whenever we run out of channels.  Both checks are
3099  *  of questionable validity so we try to use them only when we're
3100  *  up against the wall.
3101  */
3102 int
3103 tcpgc(Proto *tcp)
3104 {
3105 	Conv *c, **pp, **ep;
3106 	int n;
3107 	Tcpctl *tcb;
3108 
3109 
3110 	n = 0;
3111 	ep = &tcp->conv[tcp->nc];
3112 	for(pp = tcp->conv; pp < ep; pp++) {
3113 		c = *pp;
3114 		if(c == nil)
3115 			break;
3116 		if(!canqlock(c))
3117 			continue;
3118 		tcb = (Tcpctl*)c->ptcl;
3119 		switch(tcb->state){
3120 		case Syn_received:
3121 			if(NOW - tcb->time > 5000){
3122 				localclose(c, "timed out");
3123 				n++;
3124 			}
3125 			break;
3126 		case Finwait2:
3127 			if(NOW - tcb->time > 5*60*1000){
3128 				localclose(c, "timed out");
3129 				n++;
3130 			}
3131 			break;
3132 		}
3133 		qunlock(c);
3134 	}
3135 	return n;
3136 }
3137 
3138 void
3139 tcpsettimer(Tcpctl *tcb)
3140 {
3141 	int x;
3142 
3143 	/* round trip dependency */
3144 	x = backoff(tcb->backoff) *
3145 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3146 
3147 	/* bounded twixt 1/2 and 64 seconds */
3148 	if(x < 500/MSPTICK)
3149 		x = 500/MSPTICK;
3150 	else if(x > (64000/MSPTICK))
3151 		x = 64000/MSPTICK;
3152 	tcb->timer.start = x;
3153 }
3154 
3155 void
3156 tcpinit(Fs *fs)
3157 {
3158 	Proto *tcp;
3159 	Tcppriv *tpriv;
3160 
3161 	tcp = smalloc(sizeof(Proto));
3162 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3163 	tcp->name = "tcp";
3164 	tcp->connect = tcpconnect;
3165 	tcp->announce = tcpannounce;
3166 	tcp->ctl = tcpctl;
3167 	tcp->state = tcpstate;
3168 	tcp->create = tcpcreate;
3169 	tcp->close = tcpclose;
3170 	tcp->rcv = tcpiput;
3171 	tcp->advise = tcpadvise;
3172 	tcp->stats = tcpstats;
3173 	tcp->inuse = tcpinuse;
3174 	tcp->gc = tcpgc;
3175 	tcp->ipproto = IP_TCPPROTO;
3176 	tcp->nc = scalednconv();
3177 	tcp->ptclsize = sizeof(Tcpctl);
3178 	tpriv->stats[MaxConn] = tcp->nc;
3179 
3180 	Fsproto(fs, tcp);
3181 }
3182 
3183 void
3184 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3185 {
3186 	if(rcvscale){
3187 		tcb->rcv.scale = rcvscale & 0xff;
3188 		tcb->snd.scale = sndscale & 0xff;
3189 		tcb->window = QMAX<<tcb->snd.scale;
3190 		qsetlimit(s->rq, tcb->window);
3191 	} else {
3192 		tcb->rcv.scale = 0;
3193 		tcb->snd.scale = 0;
3194 		tcb->window = QMAX;
3195 		qsetlimit(s->rq, tcb->window);
3196 	}
3197 }
3198