xref: /plan9/sys/src/9/ip/tcp.c (revision 3e87cd09ffd62dbb30f89f86caa5a8aa696a53bb)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  protocol-wide constants: header geometry, timer parameters,
 *  header flag bits, option codes, tcb flag bits, connection
 *  states and limbo/window-scale limits.
 */
enum
{
	QMAX		= 64*1024-1,	/* queue limit given to qopen; initial ssthresh */
	IP_TCPPROTO	= 6,		/* protocol number stored in the prototype headers */

	TCP4_IPLEN	= 8,		/* bytes ahead of the v4 pseudo header (checksum offset) */
	TCP4_PHDRSIZE	= 12,		/* v4 pseudo header */
	TCP4_HDRSIZE	= 20,		/* tcp header without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes of prototype header copied by htontcp4 */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	TCP6_IPLEN	= 0,		/* v6 pseudo header starts at the front of the block */
	TCP6_PHDRSIZE	= 40,		/* v6 pseudo header */
	TCP6_HDRSIZE	= 20,		/* tcp header without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes of prototype header copied by htontcp6 */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	TcptimerOFF	= 0,		/* Tcptimer.state values */
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	EOLOPT		= 0,		/* option codes and lengths */
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	FORCE		= 1,		/* state flag bits (or'd into Tcpctl.flags) */
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,		/* srtt is kept scaled by 2^LOGAGAIN */
	LOGDGAIN	= 2,

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb * 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 * NOTE(review): both scales below are 4, i.e. 64kb * 2⁴ = 1mb;
	 * the 2⁵ figure above looks stale — confirm.
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};
92 
/*
 *  printable names of the connection states, indexed by state
 *  (Closed = 0 through Time_wait); must stay in step with the
 *  state enumeration above.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
100 
/*
 *  a countdown timer.  running timers are kept on the doubly
 *  linked list Tcppriv.timers by timerstate(); tcpackproc
 *  decrements count once per MSPTICK and calls func(arg) when it
 *  reaches zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* links on Tcppriv.timers while state is TcptimerON */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* chain of timers that expired on this tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* ticks loaded into count by tcpgo; 0 means never start */
	int	count;			/* ticks left */
	void	(*func)(void*);		/* called on expiry, if not nil */
	void	*arg;			/* argument for func */
};
113 
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
/*
 *  v4 layout.  the first TCP4_IPLEN (8) bytes are ip header fields;
 *  Unused through tcpdst are the TCP4_PHDRSIZE (12) byte pseudo
 *  header; the tcp header proper starts at tcpsport.  field order
 *  and sizes are the wire format and must not change.
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;
	uchar	proto;		/* IP_TCPPROTO */
	uchar	tcplen[2];	/* tcp header + data length, for the checksum */
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length (top bits) and flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
142 
/*
 *  v6 layout: the ip header fields vcf through tcpdst double as the
 *  TCP6_PHDRSIZE byte pseudo header while the checksum is computed
 *  (see htontcp6); the tcp header proper starts at tcpsport.
 *  field order and sizes are the wire format and must not change.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];		/* version, class, flow; tcp length while checksumming */
	uchar	ploadlen[2];	/* payload length */
	uchar	proto;		/* IP_TCPPROTO */
	uchar	ttl;		/* holds the protocol number while checksumming */
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length (top bits) and flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
163 
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().  All values are
 *  in host byte order.
 */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;	/* source port */
	ushort	dest;	/* destination port */
	ulong	seq;	/* sequence number */
	ulong	ack;	/* acknowledgement number */
	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
	uchar	update;	/* cleared by ntohtcp{4,6}() */
	ushort	ws;	/* window scale option */
	ulong	wnd;	/* prescaled window */
	ushort	urg;	/* urgent pointer */
	ushort	mss;	/* max segment size option (if not zero) */
	ushort	len;	/* size of data */
};
185 
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;		/* next entry on Tcpctl.reseq */
	Tcp	seg;		/* parsed header of the held segment */
	Block	*bp;		/* its data */
	ushort	length;		/* length recorded for the segment */
};
198 
/*
 *  per-conversation tcp control block, reached through Conv.ptcl.
 *  the qlock in the Conv locks this structure
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		uint	scale;		/* how much to right shift window */
					/* in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		ulong	partialack;
		int	recovery;	/* loss recovery flag */
		int	retransmit;	/* retransmit 1 packet @ una flag */
		int	rto;
		ulong	rxt;		/* right window marker for recovery */
					/* "recover" rfc3782 */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	wsnt;		/* Last wptr sent.  important to */
					/* track for large bdp */
		ulong	wptr;		/* right edge of the receive window (rcv.nxt + rcv.wnd) */
		ulong	urg;		/* Urgent pointer */
		ulong	ackptr;		/* last acked sequence */
		int	blocked;	/* set by tcprcvwin when the window has all but closed */
		uint	scale;		/* how much to left shift window in */
					/* rcv'd packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	ulong	cwind;			/* Congestion window */
	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
	uint	scale;			/* desired snd.scale */
	ulong	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data rereceived */
	ulong	window;			/* Our receive window (queue) */
	uint	qscale;			/* Log2 of our receive window (queue) */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags */
	Reseq	*reseq;			/* Resequencing queue */
	int	nreseq;			/* entries on reseq */
	int	reseqlen;		/* total length held on reseq */
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	ulong	timeuna;		/* snd.una when time was set */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (SYN,FIN) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};
274 
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking with SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;		/* next call in the same Tcppriv.lht chain */

	uchar	laddr[IPaddrlen];	/* local address */
	uchar	raddr[IPaddrlen];	/* remote address */
	ushort	lport;		/* local port */
	ushort	rport;		/* remote port */
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};
306 
/*
 *  seeds Tcpctl.srtt and the retransmit timer in inittcpctl();
 *  DEF_RTT is documented as the default round trip.
 */
int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
308 
/*
 *  indices into Tcppriv.stats[] and statnames[].
 *  Nstats must remain last: it sizes both arrays.
 */
enum {
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats
};
345 
/*
 *  printable name of each statistic, indexed by the enumeration
 *  above.  entries carry explicit designators, so the order they
 *  are written in (e.g. OutOfOrder ahead of Resequenced) does not
 *  have to match the enumeration.
 */
static char *statnames[Nstats] =
{
[MaxConn]	"MaxConn",
[Mss]		"MaxSegment",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransSegsSent]	"RetransSegsSent",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
[Resequenced]	"Resequenced",
[ReseqBytelim]	"ReseqBytelim",
[ReseqPktlim]	"ReseqPktlim",
[Delayack]	"Delayack",
[Wopenack]	"Wopenack",

[Recovery]	"Recovery",
[RecoveryDone]	"RecoveryDone",
[RecoveryRTO]	"RecoveryRTO",

[RecoveryNoSeq]	"RecoveryNoSeq",
[RecoveryCwind]	"RecoveryCwind",
[RecoveryPA]	"RecoveryPA",
};
379 
/*
 *  per-protocol-instance private state, reached through Proto.priv
 */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* held while the timer list is walked or changed */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;		/* serialises the one-time start in tcpstart() */
	int	ackprocstarted;

	uvlong	stats[Nstats];	/* counters, indexed by the stats enumeration */
};
400 
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
411 
/*
 *  forward declarations for file-local routines that are
 *  used before they are defined
 */
static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
static	int	dumpreseq(Tcpctl*);
static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
static	void	limborexmit(Proto*);
static	void	localclose(Conv*, char*);
static	void	procsyn(Conv*, Tcp*);
static	void	tcpacktimer(void*);
static	void	tcpiput(Proto*, Ipifc*, Block*);
static	void	tcpkeepalive(void*);
static	void	tcpoutput(Conv*);
static	void	tcprcvwin(Conv*);
static	void	tcprxmit(Conv*);
static	void	tcpsetkacounter(Tcpctl*);
static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
static	void	tcpsettimer(Tcpctl*);
static	void	tcpsndsyn(Conv*, Tcpctl*);
static	void	tcpstart(Conv*, int);
static	void	tcpsynackrtt(Conv*);
static	void	tcptimeout(void*);
static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
433 
434 static void
435 tcpsetstate(Conv *s, uchar newstate)
436 {
437 	Tcpctl *tcb;
438 	uchar oldstate;
439 	Tcppriv *tpriv;
440 
441 	tpriv = s->p->priv;
442 
443 	tcb = (Tcpctl*)s->ptcl;
444 
445 	oldstate = tcb->state;
446 	if(oldstate == newstate)
447 		return;
448 
449 	if(oldstate == Established)
450 		tpriv->stats[CurrEstab]--;
451 	if(newstate == Established)
452 		tpriv->stats[CurrEstab]++;
453 
454 	switch(newstate) {
455 	case Closed:
456 		qclose(s->rq);
457 		qclose(s->wq);
458 		qclose(s->eq);
459 		break;
460 
461 	case Close_wait:		/* Remote closes */
462 		qhangup(s->rq, nil);
463 		break;
464 	}
465 
466 	tcb->state = newstate;
467 
468 	if(oldstate == Syn_sent && newstate != Closed)
469 		Fsconnected(s, nil);
470 }
471 
472 static char*
473 tcpconnect(Conv *c, char **argv, int argc)
474 {
475 	char *e;
476 	Tcpctl *tcb;
477 
478 	tcb = (Tcpctl*)(c->ptcl);
479 	if(tcb->state != Closed)
480 		return Econinuse;
481 
482 	e = Fsstdconnect(c, argv, argc);
483 	if(e != nil)
484 		return e;
485 	tcpstart(c, TCP_CONNECT);
486 
487 	return nil;
488 }
489 
490 static int
491 tcpstate(Conv *c, char *state, int n)
492 {
493 	Tcpctl *s;
494 
495 	s = (Tcpctl*)(c->ptcl);
496 
497 	return snprint(state, n,
498 		"%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
499 		"swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
500 		"timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
501 		tcpstates[s->state],
502 		c->rq ? qlen(c->rq) : 0,
503 		c->wq ? qlen(c->wq) : 0,
504 		s->nreseq, s->reseqlen,
505 		s->srtt, s->mdev, s->ssthresh,
506 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
507 		s->qscale,
508 		s->timer.start, s->timer.count, s->rerecv,
509 		s->katimer.start, s->katimer.count);
510 }
511 
512 static int
513 tcpinuse(Conv *c)
514 {
515 	Tcpctl *s;
516 
517 	s = (Tcpctl*)(c->ptcl);
518 	return s->state != Closed;
519 }
520 
521 static char*
522 tcpannounce(Conv *c, char **argv, int argc)
523 {
524 	char *e;
525 	Tcpctl *tcb;
526 
527 	tcb = (Tcpctl*)(c->ptcl);
528 	if(tcb->state != Closed)
529 		return Econinuse;
530 
531 	e = Fsstdannounce(c, argv, argc);
532 	if(e != nil)
533 		return e;
534 	tcpstart(c, TCP_LISTEN);
535 	Fsconnected(c, nil);
536 
537 	return nil;
538 }
539 
540 /*
541  *  tcpclose is always called with the q locked
542  */
543 static void
544 tcpclose(Conv *c)
545 {
546 	Tcpctl *tcb;
547 
548 	tcb = (Tcpctl*)c->ptcl;
549 
550 	qhangup(c->rq, nil);
551 	qhangup(c->wq, nil);
552 	qhangup(c->eq, nil);
553 	qflush(c->rq);
554 
555 	switch(tcb->state) {
556 	case Listen:
557 		/*
558 		 *  reset any incoming calls to this listener
559 		 */
560 		Fsconnected(c, "Hangup");
561 
562 		localclose(c, nil);
563 		break;
564 	case Closed:
565 	case Syn_sent:
566 		localclose(c, nil);
567 		break;
568 	case Syn_received:
569 	case Established:
570 		tcb->flgcnt++;
571 		tcb->snd.nxt++;
572 		tcpsetstate(c, Finwait1);
573 		tcpoutput(c);
574 		break;
575 	case Close_wait:
576 		tcb->flgcnt++;
577 		tcb->snd.nxt++;
578 		tcpsetstate(c, Last_ack);
579 		tcpoutput(c);
580 		break;
581 	}
582 }
583 
584 static void
585 tcpkick(void *x)
586 {
587 	Conv *s = x;
588 	Tcpctl *tcb;
589 
590 	tcb = (Tcpctl*)s->ptcl;
591 
592 	if(waserror()){
593 		qunlock(s);
594 		nexterror();
595 	}
596 	qlock(s);
597 
598 	switch(tcb->state) {
599 	case Syn_sent:
600 	case Syn_received:
601 	case Established:
602 	case Close_wait:
603 		/*
604 		 * Push data
605 		 */
606 		tcpoutput(s);
607 		break;
608 	default:
609 		localclose(s, "Hangup");
610 		break;
611 	}
612 
613 	qunlock(s);
614 	poperror();
615 }
616 
/* forward declaration: tcprcvwin() below uses seq_lt before it is defined */
static int seq_lt(ulong, ulong);
618 
619 static void
620 tcprcvwin(Conv *s)				/* Call with tcb locked */
621 {
622 	int w;
623 	Tcpctl *tcb;
624 
625 	tcb = (Tcpctl*)s->ptcl;
626 	w = tcb->window - qlen(s->rq);
627 	if(w < 0)
628 		w = 0;
629 	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
630 	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
631 		w = tcb->rcv.wptr - tcb->rcv.nxt;
632 	if(w != tcb->rcv.wnd)
633 	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
634 		tcb->rcv.blocked = 1;
635 		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
636 			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
637 	}
638 	tcb->rcv.wnd = w;
639 	tcb->rcv.wptr = tcb->rcv.nxt + w;
640 }
641 
642 static void
643 tcpacktimer(void *v)
644 {
645 	Tcpctl *tcb;
646 	Conv *s;
647 
648 	s = v;
649 	tcb = (Tcpctl*)s->ptcl;
650 
651 	if(waserror()){
652 		qunlock(s);
653 		nexterror();
654 	}
655 	qlock(s);
656 	if(tcb->state != Closed){
657 		tcb->flags |= FORCE;
658 		tcpoutput(s);
659 	}
660 	qunlock(s);
661 	poperror();
662 }
663 
664 static void
665 tcpcongestion(Tcpctl *tcb)
666 {
667 	ulong inflight;
668 
669 	inflight = tcb->snd.nxt - tcb->snd.una;
670 	if(inflight > tcb->cwind)
671 		inflight = tcb->cwind;
672 	tcb->ssthresh = inflight / 2;
673 	if(tcb->ssthresh < 2*tcb->mss)
674 		tcb->ssthresh = 2*tcb->mss;
675 }
676 
/*
 *  rfc 3465 limit L on the slow start increase per ack, in units of
 *  mss (tcpabcincr uses L*mss).  rfc 3465 allows values from 1
 *  (conservative) up to 2 inclusive; 2 is the aggressive setting.
 */
enum {
	L	= 2,	/* aggressive slow start; legal values ∈ [1, 2] */
};
680 
681 static void
682 tcpabcincr(Tcpctl *tcb, uint acked)
683 {
684 	uint limit;
685 
686 	tcb->abcbytes += acked;
687 	if(tcb->cwind < tcb->ssthresh){
688 		/* slow start */
689 		if(tcb->snd.rto)
690 			limit = tcb->mss;
691 		else
692 			limit = L*tcb->mss;
693 		tcb->cwind += MIN(tcb->abcbytes, limit);
694 		tcb->abcbytes = 0;
695 	} else {
696 		tcb->snd.rto = 0;
697 		/* avoidance */
698 		if(tcb->abcbytes >= tcb->cwind){
699 			tcb->abcbytes -= tcb->cwind;
700 			tcb->cwind += tcb->mss;
701 		}
702 	}
703 }
704 
705 static void
706 tcpcreate(Conv *c)
707 {
708 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
709 	c->wq = qopen(QMAX, Qkick, tcpkick, c);
710 }
711 
712 static void
713 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
714 {
715 	if(newstate != TcptimerON){
716 		if(t->state == TcptimerON){
717 			/* unchain */
718 			if(priv->timers == t){
719 				priv->timers = t->next;
720 				if(t->prev != nil)
721 					panic("timerstate1");
722 			}
723 			if(t->next)
724 				t->next->prev = t->prev;
725 			if(t->prev)
726 				t->prev->next = t->next;
727 			t->next = t->prev = nil;
728 		}
729 	} else {
730 		if(t->state != TcptimerON){
731 			/* chain */
732 			if(t->prev != nil || t->next != nil)
733 				panic("timerstate2");
734 			t->prev = nil;
735 			t->next = priv->timers;
736 			if(t->next)
737 				t->next->prev = t;
738 			priv->timers = t;
739 		}
740 	}
741 	t->state = newstate;
742 }
743 
744 static void
745 tcpackproc(void *a)
746 {
747 	Tcptimer *t, *tp, *timeo;
748 	Proto *tcp;
749 	Tcppriv *priv;
750 	int loop;
751 
752 	tcp = a;
753 	priv = tcp->priv;
754 
755 	for(;;) {
756 		tsleep(&up->sleep, return0, 0, MSPTICK);
757 
758 		qlock(&priv->tl);
759 		timeo = nil;
760 		loop = 0;
761 		for(t = priv->timers; t != nil; t = tp) {
762 			if(loop++ > 10000)
763 				panic("tcpackproc1");
764 			tp = t->next;
765  			if(t->state == TcptimerON) {
766 				t->count--;
767 				if(t->count == 0) {
768 					timerstate(priv, t, TcptimerDONE);
769 					t->readynext = timeo;
770 					timeo = t;
771 				}
772 			}
773 		}
774 		qunlock(&priv->tl);
775 
776 		loop = 0;
777 		for(t = timeo; t != nil; t = t->readynext) {
778 			if(loop++ > 10000)
779 				panic("tcpackproc2");
780 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
781 				(*t->func)(t->arg);
782 				poperror();
783 			}
784 		}
785 
786 		limborexmit(tcp);
787 	}
788 }
789 
790 static void
791 tcpgo(Tcppriv *priv, Tcptimer *t)
792 {
793 	if(t == nil || t->start == 0)
794 		return;
795 
796 	qlock(&priv->tl);
797 	t->count = t->start;
798 	timerstate(priv, t, TcptimerON);
799 	qunlock(&priv->tl);
800 }
801 
802 static void
803 tcphalt(Tcppriv *priv, Tcptimer *t)
804 {
805 	if(t == nil)
806 		return;
807 
808 	qlock(&priv->tl);
809 	timerstate(priv, t, TcptimerOFF);
810 	qunlock(&priv->tl);
811 }
812 
/*
 *  exponential backoff multiplier: 2ⁿ for backoff count n.
 *
 *  n is clamped to [0, 30] so the shift is always defined and the
 *  result stays a positive int; shifting by a negative count or
 *  into the sign bit is undefined.  results for 0 ≤ n ≤ 30 are
 *  unchanged.
 */
static int
backoff(int n)
{
	if(n < 0)
		n = 0;
	else if(n > 30)
		n = 30;		/* 1<<30 is the largest power of 2 a 32-bit int holds */
	return 1 << n;
}
818 
819 static void
820 localclose(Conv *s, char *reason)	/* called with tcb locked */
821 {
822 	Tcpctl *tcb;
823 	Tcppriv *tpriv;
824 
825 	tpriv = s->p->priv;
826 	tcb = (Tcpctl*)s->ptcl;
827 
828 	iphtrem(&tpriv->ht, s);
829 
830 	tcphalt(tpriv, &tcb->timer);
831 	tcphalt(tpriv, &tcb->rtt_timer);
832 	tcphalt(tpriv, &tcb->acktimer);
833 	tcphalt(tpriv, &tcb->katimer);
834 
835 	/* Flush reassembly queue; nothing more can arrive */
836 	dumpreseq(tcb);
837 
838 	if(tcb->state == Syn_sent)
839 		Fsconnected(s, reason);
840 	if(s->state == Announced)
841 		wakeup(&s->listenr);
842 
843 	qhangup(s->rq, reason);
844 	qhangup(s->wq, reason);
845 
846 	tcpsetstate(s, Closed);
847 }
848 
849 /* mtu (- TCP + IP hdr len) of 1st hop */
850 static int
851 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
852 {
853 	Ipifc *ifc;
854 	int mtu;
855 
856 	ifc = findipifc(tcp->f, addr, 0);
857 	switch(version){
858 	default:
859 	case V4:
860 		mtu = DEF_MSS;
861 		if(ifc != nil)
862 			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
863 		break;
864 	case V6:
865 		mtu = DEF_MSS6;
866 		if(ifc != nil)
867 			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
868 		break;
869 	}
870 	/*
871 	 * set the ws.  it doesn't commit us to anything.
872 	 * ws is the ultimate limit to the bandwidth-delay product.
873 	 */
874 	*scale = Defadvscale;
875 
876 	return mtu;
877 }
878 
879 static void
880 inittcpctl(Conv *s, int mode)
881 {
882 	Tcpctl *tcb;
883 	Tcp4hdr* h4;
884 	Tcp6hdr* h6;
885 	Tcppriv *tpriv;
886 	int mss;
887 
888 	tcb = (Tcpctl*)s->ptcl;
889 
890 	memset(tcb, 0, sizeof(Tcpctl));
891 
892 	tcb->ssthresh = QMAX;			/* reset by tcpsetscale() */
893 	tcb->srtt = tcp_irtt<<LOGAGAIN;
894 	tcb->mdev = 0;
895 
896 	/* setup timers */
897 	tcb->timer.start = tcp_irtt / MSPTICK;
898 	tcb->timer.func = tcptimeout;
899 	tcb->timer.arg = s;
900 	tcb->rtt_timer.start = MAX_TIME;
901 	tcb->acktimer.start = TCP_ACK / MSPTICK;
902 	tcb->acktimer.func = tcpacktimer;
903 	tcb->acktimer.arg = s;
904 	tcb->katimer.start = DEF_KAT / MSPTICK;
905 	tcb->katimer.func = tcpkeepalive;
906 	tcb->katimer.arg = s;
907 
908 	mss = DEF_MSS;
909 
910 	/* create a prototype(pseudo) header */
911 	if(mode != TCP_LISTEN){
912 		if(ipcmp(s->laddr, IPnoaddr) == 0)
913 			findlocalip(s->p->f, s->laddr, s->raddr);
914 
915 		switch(s->ipversion){
916 		case V4:
917 			h4 = &tcb->protohdr.tcp4hdr;
918 			memset(h4, 0, sizeof(*h4));
919 			h4->proto = IP_TCPPROTO;
920 			hnputs(h4->tcpsport, s->lport);
921 			hnputs(h4->tcpdport, s->rport);
922 			v6tov4(h4->tcpsrc, s->laddr);
923 			v6tov4(h4->tcpdst, s->raddr);
924 			break;
925 		case V6:
926 			h6 = &tcb->protohdr.tcp6hdr;
927 			memset(h6, 0, sizeof(*h6));
928 			h6->proto = IP_TCPPROTO;
929 			hnputs(h6->tcpsport, s->lport);
930 			hnputs(h6->tcpdport, s->rport);
931 			ipmove(h6->tcpsrc, s->laddr);
932 			ipmove(h6->tcpdst, s->raddr);
933 			mss = DEF_MSS6;
934 			break;
935 		default:
936 			panic("inittcpctl: version %d", s->ipversion);
937 		}
938 	}
939 
940 	tcb->mss = tcb->cwind = mss;
941 	tcb->abcbytes = 0;
942 	tpriv = s->p->priv;
943 	tpriv->stats[Mss] = tcb->mss;
944 
945 	/* default is no window scaling */
946 	tcpsetscale(s, tcb, 0, 0);
947 }
948 
949 /*
950  *  called with s qlocked
951  */
952 static void
953 tcpstart(Conv *s, int mode)
954 {
955 	Tcpctl *tcb;
956 	Tcppriv *tpriv;
957 	char kpname[KNAMELEN];
958 
959 	tpriv = s->p->priv;
960 
961 	if(tpriv->ackprocstarted == 0){
962 		qlock(&tpriv->apl);
963 		if(tpriv->ackprocstarted == 0){
964 			snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
965 			kproc(kpname, tcpackproc, s->p);
966 			tpriv->ackprocstarted = 1;
967 		}
968 		qunlock(&tpriv->apl);
969 	}
970 
971 	tcb = (Tcpctl*)s->ptcl;
972 
973 	inittcpctl(s, mode);
974 
975 	iphtadd(&tpriv->ht, s);
976 	switch(mode) {
977 	case TCP_LISTEN:
978 		tpriv->stats[PassiveOpens]++;
979 		tcb->flags |= CLONE;
980 		tcpsetstate(s, Listen);
981 		break;
982 
983 	case TCP_CONNECT:
984 		tpriv->stats[ActiveOpens]++;
985 		tcb->flags |= ACTIVE;
986 		tcpsndsyn(s, tcb);
987 		tcpsetstate(s, Syn_sent);
988 		tcpoutput(s);
989 		break;
990 	}
991 }
992 
993 static char*
994 tcpflag(char *buf, char *e, ushort flag)
995 {
996 	char *p;
997 
998 	p = seprint(buf, e, "%d", flag>>10);	/* Head len */
999 	if(flag & URG)
1000 		p = seprint(p, e, " URG");
1001 	if(flag & ACK)
1002 		p = seprint(p, e, " ACK");
1003 	if(flag & PSH)
1004 		p = seprint(p, e, " PSH");
1005 	if(flag & RST)
1006 		p = seprint(p, e, " RST");
1007 	if(flag & SYN)
1008 		p = seprint(p, e, " SYN");
1009 	if(flag & FIN)
1010 		p = seprint(p, e, " FIN");
1011 	USED(p);
1012 	return buf;
1013 }
1014 
/*
 *  build an outgoing v6 segment.
 *
 *  tcph holds the per-segment values, data the payload (nil for a
 *  bare header, in which case a block is allocated), ph the
 *  prototype header to copy from, and tcb the control block (may be
 *  nil; then the window is not scaled and the checksum is always
 *  computed).  returns the block with the ip+tcp header in front of
 *  the data, or nil if padblock/allocb return nil.
 *
 *  NOTE(review): the window scale option is sent here only when
 *  tcph->ws is non-zero, while htontcp4 always sends it — confirm
 *  the difference is intended.
 */
static Block*
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp6hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* options are only sent on SYN segments; pad them to a multiple of 4 */
	hdrlen = TCP6_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(tcph->ws)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	/* make room for the headers in front of the data */
	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP6_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP6_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *)(data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	/* vcf temporarily carries the tcp length and ttl the protocol number */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen<<10 puts the length, in 4-byte words, in the top 4 bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	/* write the options in the same order they were sized above */
	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if(tcph->ws != 0){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen+dlen);
	h->proto = ph->proto;

	return data;
}
1098 
/*
 *  build an outgoing v4 segment.
 *
 *  tcph holds the per-segment values, data the payload (nil for a
 *  bare header, in which case a block is allocated), ph the
 *  prototype header to copy from, and tcb the control block (may be
 *  nil; then the window is not scaled and the checksum is always
 *  computed).  returns the block with the ip+tcp header in front of
 *  the data, or nil if padblock/allocb return nil.
 *
 *  NOTE(review): the window scale option is sent on every SYN
 *  here, while htontcp6 sends it only when tcph->ws is non-zero —
 *  confirm the difference is intended.
 */
static Block*
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp4hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* options are only sent on SYN segments; pad them to a multiple of 4 */
	hdrlen = TCP4_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(1)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	/* make room for the headers in front of the data */
	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP4_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP4_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *)(data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen<<10 puts the length, in 4-byte words, in the top 4 bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	/* write the options in the same order they were sized above */
	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		/* always offer.  rfc1323 §2.2 */
		if(1){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	/* the checksum starts TCP4_IPLEN bytes in, at the pseudo header */
	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	return data;
}
1173 
1174 static int
1175 ntohtcp6(Tcp *tcph, Block **bpp)
1176 {
1177 	Tcp6hdr *h;
1178 	uchar *optr;
1179 	ushort hdrlen;
1180 	ushort optlen;
1181 	int n;
1182 
1183 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1184 	if(*bpp == nil)
1185 		return -1;
1186 
1187 	h = (Tcp6hdr *)((*bpp)->rp);
1188 	tcph->source = nhgets(h->tcpsport);
1189 	tcph->dest = nhgets(h->tcpdport);
1190 	tcph->seq = nhgetl(h->tcpseq);
1191 	tcph->ack = nhgetl(h->tcpack);
1192 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1193 	if(hdrlen < TCP6_HDRSIZE) {
1194 		freeblist(*bpp);
1195 		return -1;
1196 	}
1197 
1198 	tcph->flags = h->tcpflag[1];
1199 	tcph->wnd = nhgets(h->tcpwin);
1200 	tcph->urg = nhgets(h->tcpurg);
1201 	tcph->mss = 0;
1202 	tcph->ws = 0;
1203 	tcph->update = 0;
1204 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1205 
1206 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1207 	if(*bpp == nil)
1208 		return -1;
1209 
1210 	optr = h->tcpopt;
1211 	n = hdrlen - TCP6_HDRSIZE;
1212 	while(n > 0 && *optr != EOLOPT) {
1213 		if(*optr == NOOPOPT) {
1214 			n--;
1215 			optr++;
1216 			continue;
1217 		}
1218 		optlen = optr[1];
1219 		if(optlen < 2 || optlen > n)
1220 			break;
1221 		switch(*optr) {
1222 		case MSSOPT:
1223 			if(optlen == MSS_LENGTH)
1224 				tcph->mss = nhgets(optr+2);
1225 			break;
1226 		case WSOPT:
1227 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1228 				tcph->ws = *(optr+2);
1229 			break;
1230 		}
1231 		n -= optlen;
1232 		optr += optlen;
1233 	}
1234 	return hdrlen;
1235 }
1236 
1237 static int
1238 ntohtcp4(Tcp *tcph, Block **bpp)
1239 {
1240 	Tcp4hdr *h;
1241 	uchar *optr;
1242 	ushort hdrlen;
1243 	ushort optlen;
1244 	int n;
1245 
1246 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1247 	if(*bpp == nil)
1248 		return -1;
1249 
1250 	h = (Tcp4hdr *)((*bpp)->rp);
1251 	tcph->source = nhgets(h->tcpsport);
1252 	tcph->dest = nhgets(h->tcpdport);
1253 	tcph->seq = nhgetl(h->tcpseq);
1254 	tcph->ack = nhgetl(h->tcpack);
1255 
1256 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1257 	if(hdrlen < TCP4_HDRSIZE) {
1258 		freeblist(*bpp);
1259 		return -1;
1260 	}
1261 
1262 	tcph->flags = h->tcpflag[1];
1263 	tcph->wnd = nhgets(h->tcpwin);
1264 	tcph->urg = nhgets(h->tcpurg);
1265 	tcph->mss = 0;
1266 	tcph->ws = 0;
1267 	tcph->update = 0;
1268 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1269 
1270 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1271 	if(*bpp == nil)
1272 		return -1;
1273 
1274 	optr = h->tcpopt;
1275 	n = hdrlen - TCP4_HDRSIZE;
1276 	while(n > 0 && *optr != EOLOPT) {
1277 		if(*optr == NOOPOPT) {
1278 			n--;
1279 			optr++;
1280 			continue;
1281 		}
1282 		optlen = optr[1];
1283 		if(optlen < 2 || optlen > n)
1284 			break;
1285 		switch(*optr) {
1286 		case MSSOPT:
1287 			if(optlen == MSS_LENGTH)
1288 				tcph->mss = nhgets(optr+2);
1289 			break;
1290 		case WSOPT:
1291 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1292 				tcph->ws = *(optr+2);
1293 			break;
1294 		}
1295 		n -= optlen;
1296 		optr += optlen;
1297 	}
1298 	return hdrlen;
1299 }
1300 
1301 /*
1302  *  For outgoing calls, generate an initial sequence
1303  *  number and put a SYN on the send queue
1304  */
1305 static void
1306 tcpsndsyn(Conv *s, Tcpctl *tcb)
1307 {
1308 	Tcppriv *tpriv;
1309 
1310 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1311 	tcb->rttseq = tcb->iss;
1312 	tcb->snd.wl2 = tcb->iss;
1313 	tcb->snd.una = tcb->iss;
1314 	tcb->snd.rxt = tcb->iss;
1315 	tcb->snd.ptr = tcb->rttseq;
1316 	tcb->snd.nxt = tcb->rttseq;
1317 	tcb->flgcnt++;
1318 	tcb->flags |= FORCE;
1319 	tcb->sndsyntime = NOW;
1320 
1321 	/* set desired mss and scale */
1322 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1323 	tpriv = s->p->priv;
1324 	tpriv->stats[Mss] = tcb->mss;
1325 }
1326 
1327 void
1328 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1329 {
1330 	Block *hbp;
1331 	uchar rflags;
1332 	Tcppriv *tpriv;
1333 	Tcp4hdr ph4;
1334 	Tcp6hdr ph6;
1335 
1336 	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1337 
1338 	tpriv = tcp->priv;
1339 
1340 	if(seg->flags & RST)
1341 		return;
1342 
1343 	/* make pseudo header */
1344 	switch(version) {
1345 	case V4:
1346 		memset(&ph4, 0, sizeof(ph4));
1347 		ph4.vihl = IP_VER4;
1348 		v6tov4(ph4.tcpsrc, dest);
1349 		v6tov4(ph4.tcpdst, source);
1350 		ph4.proto = IP_TCPPROTO;
1351 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1352 		hnputs(ph4.tcpsport, seg->dest);
1353 		hnputs(ph4.tcpdport, seg->source);
1354 		break;
1355 	case V6:
1356 		memset(&ph6, 0, sizeof(ph6));
1357 		ph6.vcf[0] = IP_VER6;
1358 		ipmove(ph6.tcpsrc, dest);
1359 		ipmove(ph6.tcpdst, source);
1360 		ph6.proto = IP_TCPPROTO;
1361 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1362 		hnputs(ph6.tcpsport, seg->dest);
1363 		hnputs(ph6.tcpdport, seg->source);
1364 		break;
1365 	default:
1366 		panic("sndrst: version %d", version);
1367 	}
1368 
1369 	tpriv->stats[OutRsts]++;
1370 	rflags = RST;
1371 
1372 	/* convince the other end that this reset is in band */
1373 	if(seg->flags & ACK) {
1374 		seg->seq = seg->ack;
1375 		seg->ack = 0;
1376 	}
1377 	else {
1378 		rflags |= ACK;
1379 		seg->ack = seg->seq;
1380 		seg->seq = 0;
1381 		if(seg->flags & SYN)
1382 			seg->ack++;
1383 		seg->ack += length;
1384 		if(seg->flags & FIN)
1385 			seg->ack++;
1386 	}
1387 	seg->flags = rflags;
1388 	seg->wnd = 0;
1389 	seg->urg = 0;
1390 	seg->mss = 0;
1391 	seg->ws = 0;
1392 	switch(version) {
1393 	case V4:
1394 		hbp = htontcp4(seg, nil, &ph4, nil);
1395 		if(hbp == nil)
1396 			return;
1397 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1398 		break;
1399 	case V6:
1400 		hbp = htontcp6(seg, nil, &ph6, nil);
1401 		if(hbp == nil)
1402 			return;
1403 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1404 		break;
1405 	default:
1406 		panic("sndrst2: version %d", version);
1407 	}
1408 }
1409 
1410 /*
1411  *  send a reset to the remote side and close the conversation
1412  *  called with s qlocked
1413  */
1414 static char*
1415 tcphangup(Conv *s)
1416 {
1417 	Tcp seg;
1418 	Tcpctl *tcb;
1419 	Block *hbp;
1420 
1421 	tcb = (Tcpctl*)s->ptcl;
1422 	if(waserror())
1423 		return commonerror();
1424 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1425 		if(!waserror()){
1426 			memset(&seg, 0, sizeof seg);
1427 			seg.flags = RST | ACK;
1428 			seg.ack = tcb->rcv.nxt;
1429 			tcb->rcv.ackptr = seg.ack;
1430 			seg.seq = tcb->snd.ptr;
1431 			seg.wnd = 0;
1432 			seg.urg = 0;
1433 			seg.mss = 0;
1434 			seg.ws = 0;
1435 			switch(s->ipversion) {
1436 			case V4:
1437 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1438 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1439 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1440 				break;
1441 			case V6:
1442 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1443 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1444 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1445 				break;
1446 			default:
1447 				panic("tcphangup: version %d", s->ipversion);
1448 			}
1449 			poperror();
1450 		}
1451 	}
1452 	localclose(s, nil);
1453 	poperror();
1454 	return nil;
1455 }
1456 
1457 /*
1458  *  (re)send a SYN ACK
1459  */
1460 static int
1461 sndsynack(Proto *tcp, Limbo *lp)
1462 {
1463 	Block *hbp;
1464 	Tcp4hdr ph4;
1465 	Tcp6hdr ph6;
1466 	Tcp seg;
1467 	uint scale;
1468 
1469 	/* make pseudo header */
1470 	switch(lp->version) {
1471 	case V4:
1472 		memset(&ph4, 0, sizeof(ph4));
1473 		ph4.vihl = IP_VER4;
1474 		v6tov4(ph4.tcpsrc, lp->laddr);
1475 		v6tov4(ph4.tcpdst, lp->raddr);
1476 		ph4.proto = IP_TCPPROTO;
1477 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1478 		hnputs(ph4.tcpsport, lp->lport);
1479 		hnputs(ph4.tcpdport, lp->rport);
1480 		break;
1481 	case V6:
1482 		memset(&ph6, 0, sizeof(ph6));
1483 		ph6.vcf[0] = IP_VER6;
1484 		ipmove(ph6.tcpsrc, lp->laddr);
1485 		ipmove(ph6.tcpdst, lp->raddr);
1486 		ph6.proto = IP_TCPPROTO;
1487 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1488 		hnputs(ph6.tcpsport, lp->lport);
1489 		hnputs(ph6.tcpdport, lp->rport);
1490 		break;
1491 	default:
1492 		panic("sndrst: version %d", lp->version);
1493 	}
1494 
1495 	memset(&seg, 0, sizeof seg);
1496 	seg.seq = lp->iss;
1497 	seg.ack = lp->irs+1;
1498 	seg.flags = SYN|ACK;
1499 	seg.urg = 0;
1500 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1501 	seg.wnd = QMAX;
1502 
1503 	/* if the other side set scale, we should too */
1504 	if(lp->rcvscale){
1505 		seg.ws = scale;
1506 		lp->sndscale = scale;
1507 	} else {
1508 		seg.ws = 0;
1509 		lp->sndscale = 0;
1510 	}
1511 
1512 	switch(lp->version) {
1513 	case V4:
1514 		hbp = htontcp4(&seg, nil, &ph4, nil);
1515 		if(hbp == nil)
1516 			return -1;
1517 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1518 		break;
1519 	case V6:
1520 		hbp = htontcp6(&seg, nil, &ph6, nil);
1521 		if(hbp == nil)
1522 			return -1;
1523 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1524 		break;
1525 	default:
1526 		panic("sndsnack: version %d", lp->version);
1527 	}
1528 	lp->lastsend = NOW;
1529 	return 0;
1530 }
1531 
/*
 *  hash for the limbo table: last two bytes of the address a plus
 *  the port p, masked to the table size.  p is parenthesized so an
 *  argument expression is added as a whole.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1533 
1534 /*
1535  *  put a call into limbo and respond with a SYN ACK
1536  *
1537  *  called with proto locked
1538  */
1539 static void
1540 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1541 {
1542 	Limbo *lp, **l;
1543 	Tcppriv *tpriv;
1544 	int h;
1545 
1546 	tpriv = s->p->priv;
1547 	h = hashipa(source, seg->source);
1548 
1549 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1550 		lp = *l;
1551 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1552 			continue;
1553 		if(ipcmp(lp->raddr, source) != 0)
1554 			continue;
1555 		if(ipcmp(lp->laddr, dest) != 0)
1556 			continue;
1557 
1558 		/* each new SYN restarts the retransmits */
1559 		lp->irs = seg->seq;
1560 		break;
1561 	}
1562 	lp = *l;
1563 	if(lp == nil){
1564 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1565 			lp = tpriv->lht[h];
1566 			tpriv->lht[h] = lp->next;
1567 			lp->next = nil;
1568 		} else {
1569 			lp = malloc(sizeof(*lp));
1570 			if(lp == nil)
1571 				return;
1572 			tpriv->nlimbo++;
1573 		}
1574 		*l = lp;
1575 		lp->version = version;
1576 		ipmove(lp->laddr, dest);
1577 		ipmove(lp->raddr, source);
1578 		lp->lport = seg->dest;
1579 		lp->rport = seg->source;
1580 		lp->mss = seg->mss;
1581 		lp->rcvscale = seg->ws;
1582 		lp->irs = seg->seq;
1583 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1584 	}
1585 
1586 	if(sndsynack(s->p, lp) < 0){
1587 		*l = lp->next;
1588 		tpriv->nlimbo--;
1589 		free(lp);
1590 	}
1591 }
1592 
1593 /*
1594  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1595  */
1596 static void
1597 limborexmit(Proto *tcp)
1598 {
1599 	Tcppriv *tpriv;
1600 	Limbo **l, *lp;
1601 	int h;
1602 	int seen;
1603 	ulong now;
1604 
1605 	tpriv = tcp->priv;
1606 
1607 	if(!canqlock(tcp))
1608 		return;
1609 	seen = 0;
1610 	now = NOW;
1611 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1612 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1613 			lp = *l;
1614 			seen++;
1615 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1616 				continue;
1617 
1618 			/* time it out after 1 second */
1619 			if(++(lp->rexmits) > 5){
1620 				tpriv->nlimbo--;
1621 				*l = lp->next;
1622 				free(lp);
1623 				continue;
1624 			}
1625 
1626 			/* if we're being attacked, don't bother resending SYN ACK's */
1627 			if(tpriv->nlimbo > 100)
1628 				continue;
1629 
1630 			if(sndsynack(tcp, lp) < 0){
1631 				tpriv->nlimbo--;
1632 				*l = lp->next;
1633 				free(lp);
1634 				continue;
1635 			}
1636 
1637 			l = &lp->next;
1638 		}
1639 	}
1640 	qunlock(tcp);
1641 }
1642 
1643 /*
1644  *  lookup call in limbo.  if found, throw it out.
1645  *
1646  *  called with proto locked
1647  */
1648 static void
1649 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1650 {
1651 	Limbo *lp, **l;
1652 	int h;
1653 	Tcppriv *tpriv;
1654 
1655 	tpriv = s->p->priv;
1656 
1657 	/* find a call in limbo */
1658 	h = hashipa(src, segp->source);
1659 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1660 		lp = *l;
1661 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1662 			continue;
1663 		if(ipcmp(lp->laddr, dst) != 0)
1664 			continue;
1665 		if(ipcmp(lp->raddr, src) != 0)
1666 			continue;
1667 
1668 		/* RST can only follow the SYN */
1669 		if(segp->seq == lp->irs+1){
1670 			tpriv->nlimbo--;
1671 			*l = lp->next;
1672 			free(lp);
1673 		}
1674 		break;
1675 	}
1676 }
1677 
1678 static void
1679 initialwindow(Tcpctl *tcb)
1680 {
1681 	/* RFC 3390 initial window */
1682 	if(tcb->mss < 1095)
1683 		tcb->cwind = 4*tcb->mss;
1684 	else if(tcb->mss < 2190)
1685 		tcb->cwind = 2*2190;
1686 	else
1687 		tcb->cwind = 2*tcb->mss;
1688 }
1689 
1690 /*
1691  *  come here when we finally get an ACK to our SYN-ACK.
1692  *  lookup call in limbo.  if found, create a new conversation
1693  *
1694  *  called with proto locked
1695  */
1696 static Conv*
1697 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1698 {
1699 	Conv *new;
1700 	Tcpctl *tcb;
1701 	Tcppriv *tpriv;
1702 	Tcp4hdr *h4;
1703 	Tcp6hdr *h6;
1704 	Limbo *lp, **l;
1705 	int h;
1706 
1707 	/* unless it's just an ack, it can't be someone coming out of limbo */
1708 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1709 		return nil;
1710 
1711 	tpriv = s->p->priv;
1712 
1713 	/* find a call in limbo */
1714 	h = hashipa(src, segp->source);
1715 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1716 		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1717 			src, segp->source, lp->raddr, lp->rport,
1718 			dst, segp->dest, lp->laddr, lp->lport,
1719 			version, lp->version
1720  		);
1721 
1722 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1723 			continue;
1724 		if(ipcmp(lp->laddr, dst) != 0)
1725 			continue;
1726 		if(ipcmp(lp->raddr, src) != 0)
1727 			continue;
1728 
1729 		/* we're assuming no data with the initial SYN */
1730 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1731 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1732 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1733 			lp = nil;
1734 		} else {
1735 			tpriv->nlimbo--;
1736 			*l = lp->next;
1737 		}
1738 		break;
1739 	}
1740 	if(lp == nil)
1741 		return nil;
1742 
1743 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1744 	if(new == nil)
1745 		return nil;
1746 
1747 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1748 	tcb = (Tcpctl*)new->ptcl;
1749 	tcb->flags &= ~CLONE;
1750 	tcb->timer.arg = new;
1751 	tcb->timer.state = TcptimerOFF;
1752 	tcb->acktimer.arg = new;
1753 	tcb->acktimer.state = TcptimerOFF;
1754 	tcb->katimer.arg = new;
1755 	tcb->katimer.state = TcptimerOFF;
1756 	tcb->rtt_timer.arg = new;
1757 	tcb->rtt_timer.state = TcptimerOFF;
1758 
1759 	tcb->irs = lp->irs;
1760 	tcb->rcv.nxt = tcb->irs+1;
1761 	tcb->rcv.wptr = tcb->rcv.nxt;
1762 	tcb->rcv.wsnt = 0;
1763 	tcb->rcv.urg = tcb->rcv.nxt;
1764 
1765 	tcb->iss = lp->iss;
1766 	tcb->rttseq = tcb->iss;
1767 	tcb->snd.wl2 = tcb->iss;
1768 	tcb->snd.una = tcb->iss+1;
1769 	tcb->snd.ptr = tcb->iss+1;
1770 	tcb->snd.nxt = tcb->iss+1;
1771 	tcb->snd.rxt = tcb->iss+1;
1772 	tcb->flgcnt = 0;
1773 	tcb->flags |= SYNACK;
1774 
1775 	/* our sending max segment size cannot be bigger than what he asked for */
1776 	if(lp->mss != 0 && lp->mss < tcb->mss) {
1777 		tcb->mss = lp->mss;
1778 		tpriv->stats[Mss] = tcb->mss;
1779 	}
1780 
1781 	/* window scaling */
1782 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1783 
1784 	/* congestion window */
1785 	tcb->snd.wnd = segp->wnd;
1786 	initialwindow(tcb);
1787 
1788 	/* set initial round trip time */
1789 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1790 	tcpsynackrtt(new);
1791 
1792 	free(lp);
1793 
1794 	/* set up proto header */
1795 	switch(version){
1796 	case V4:
1797 		h4 = &tcb->protohdr.tcp4hdr;
1798 		memset(h4, 0, sizeof(*h4));
1799 		h4->proto = IP_TCPPROTO;
1800 		hnputs(h4->tcpsport, new->lport);
1801 		hnputs(h4->tcpdport, new->rport);
1802 		v6tov4(h4->tcpsrc, dst);
1803 		v6tov4(h4->tcpdst, src);
1804 		break;
1805 	case V6:
1806 		h6 = &tcb->protohdr.tcp6hdr;
1807 		memset(h6, 0, sizeof(*h6));
1808 		h6->proto = IP_TCPPROTO;
1809 		hnputs(h6->tcpsport, new->lport);
1810 		hnputs(h6->tcpdport, new->rport);
1811 		ipmove(h6->tcpsrc, dst);
1812 		ipmove(h6->tcpdst, src);
1813 		break;
1814 	default:
1815 		panic("tcpincoming: version %d", new->ipversion);
1816 	}
1817 
1818 	tcpsetstate(new, Established);
1819 
1820 	iphtadd(&tpriv->ht, new);
1821 
1822 	return new;
1823 }
1824 
/*
 *  Is x in the closed interval [low, high] of sequence space?
 *  When low is numerically above high the interval is taken to
 *  wrap around the top of the number range.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return x >= low || x <= high;
	return low <= x && x <= high;
}
1838 
/* x precedes y: the difference x-y, viewed as a signed int, is negative */
static int
seq_lt(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d < 0;
}
1844 
/* x precedes or equals y: the signed difference x-y is not positive */
static int
seq_le(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d <= 0;
}
1850 
/* x follows y: the signed difference x-y is positive */
static int
seq_gt(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d > 0;
}
1856 
/* x follows or equals y: the signed difference x-y is not negative */
static int
seq_ge(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d >= 0;
}
1862 
1863 /*
1864  *  use the time between the first SYN and it's ack as the
1865  *  initial round trip time
1866  */
1867 static void
1868 tcpsynackrtt(Conv *s)
1869 {
1870 	Tcpctl *tcb;
1871 	int delta;
1872 	Tcppriv *tpriv;
1873 
1874 	tcb = (Tcpctl*)s->ptcl;
1875 	tpriv = s->p->priv;
1876 
1877 	delta = NOW - tcb->sndsyntime;
1878 	tcb->srtt = delta<<LOGAGAIN;
1879 	tcb->mdev = delta<<LOGDGAIN;
1880 
1881 	/* halt round trip timer */
1882 	tcphalt(tpriv, &tcb->rtt_timer);
1883 }
1884 
/*
 *  Apply the acknowledgement and window advertised by seg to the
 *  send side of conversation s.  In order, this:
 *	- detects zero-window updates and duplicate acks and enters
 *	  or continues fast retransmit/recovery;
 *	- updates the send window;
 *	- if the ack advances snd.una, adjusts the congestion window,
 *	  takes a round trip sample, discards the acknowledged bytes
 *	  from the write queue and restarts or stops the retransmit
 *	  timer.
 *  seg->update marks a segment that has already been through here,
 *  so a second call for the same segment does nothing.
 */
1885 static void
1886 update(Conv *s, Tcp *seg)
1887 {
1888 	int rtt, delta;
1889 	Tcpctl *tcb;
1890 	ulong acked;
1891 	Tcppriv *tpriv;
1892 
	/* process each segment's ack only once */
1893 	if(seg->update)
1894 		return;
1895 	seg->update = 1;
1896 
1897 	tpriv = s->p->priv;
1898 	tcb = (Tcpctl*)s->ptcl;
1899 
1900 	/* catch zero-window updates, update window & recover */
1901 	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
1902 	    seq_lt(seg->ack, tcb->snd.ptr)){
1903 		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1904 			seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
1905 		tcb->snd.wnd = seg->wnd;
1906 		goto recovery;
1907 	}
1908 
1909 	/* newreno fast retransmit */
	/* a duplicate ack: same ack as snd.una while data is still outstanding */
1910 	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
1911 	    ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
1912 recovery:
1913 		if(tcb->snd.recovery){
			/* already recovering: just inflate the window by one segment */
1914 			tpriv->stats[RecoveryCwind]++;
1915 			tcb->cwind += tcb->mss;
1916 		}else if(seq_le(tcb->snd.rxt, seg->ack)){
			/* enter recovery; snd.rxt records where recovery will be complete */
1917 			tpriv->stats[Recovery]++;
1918 			tcb->abcbytes = 0;
1919 			tcb->snd.recovery = 1;
1920 			tcb->snd.partialack = 0;
1921 			tcb->snd.rxt = tcb->snd.nxt;
1922 			tcpcongestion(tcb);
1923 			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1924 			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1925 				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1926 			tcprxmit(s);
1927 		}else{
1928 			tpriv->stats[RecoveryNoSeq]++;
1929 			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1930 				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1931 			/* don't enter fast retransmit, don't change ssthresh */
1932 		}
1933 	}else if(tcb->snd.recovery){
1934 		tpriv->stats[RecoveryCwind]++;
1935 		tcb->cwind += tcb->mss;
1936 	}
1937 
1938 	/*
1939 	 *  update window
1940 	 */
1941 	if(seq_gt(seg->ack, tcb->snd.wl2)
1942 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1943 		/* clear dupack if we advance wl2 */
1944 		if(tcb->snd.wl2 != seg->ack)
1945 			tcb->snd.dupacks = 0;
1946 		tcb->snd.wnd = seg->wnd;
1947 		tcb->snd.wl2 = seg->ack;
1948 	}
1949 
	/* nothing new acknowledged: the rest of the function does not apply */
1950 	if(!seq_gt(seg->ack, tcb->snd.una)){
1951 		/*
1952 		 *  don't let us hangup if sending into a closed window and
1953 		 *  we're still getting acks
1954 		 */
1955 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1956 			tcb->backedoff = MAXBACKMS/4;
1957 		return;
1958 	}
1959 
1960 	/* Compute the new send window size */
1961 	acked = seg->ack - tcb->snd.una;
1962 
1963 	/* avoid slow start and timers for SYN acks */
1964 	if((tcb->flags & SYNACK) == 0) {
		/*
		 *  first ack of our SYN: one acked sequence number and one
		 *  pending flag are taken off here -- presumably the SYN,
		 *  which is not data on the write queue (confirm).
		 */
1965 		tcb->flags |= SYNACK;
1966 		acked--;
1967 		tcb->flgcnt--;
1968 		goto done;
1969 	}
1970 
1971 	/*
1972 	 * congestion control
1973 	 */
1974 	if(tcb->snd.recovery){
1975 		if(seq_ge(seg->ack, tcb->snd.rxt)){
1976 			/* recovery finished; deflate window */
1977 			tpriv->stats[RecoveryDone]++;
1978 			tcb->snd.dupacks = 0;
1979 			tcb->snd.recovery = 0;
1980 			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
1981 			if(tcb->ssthresh < tcb->cwind)
1982 				tcb->cwind = tcb->ssthresh;
1983 			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
1984 				tcb->cwind, tcb->ssthresh);
1985 		} else {
1986 			/* partial ack; we lost more than one segment */
1987 			tpriv->stats[RecoveryPA]++;
1988 			if(tcb->cwind > acked)
1989 				tcb->cwind -= acked;
1990 			else{
1991 				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
1992 				tcb->cwind = tcb->mss;
1993 			}
1994 			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
1995 				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
1996 
1997 			if(acked >= tcb->mss)
1998 				tcb->cwind += tcb->mss;
1999 			tcb->snd.partialack++;
2000 		}
2001 	} else
2002 		tcpabcincr(tcb, acked);
2003 
2004 	/* Adjust the timers according to the round trip time */
2005 	/* TODO: fix sloppy treatment of overflow cases here. */
2006 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2007 		tcphalt(tpriv, &tcb->rtt_timer);
		/* no sample is taken while RETRAN is set */
2008 		if((tcb->flags&RETRAN) == 0) {
2009 			tcb->backoff = 0;
2010 			tcb->backedoff = 0;
			/* sample is start-count, scaled by MSPTICK (ms per timer tick) */
2011 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2012 			if(rtt == 0)
2013 				rtt = 1; /* else all close sys's will rexmit in 0 time */
2014 			rtt *= MSPTICK;
			/* srtt and mdev are kept shifted left by LOGAGAIN and LOGDGAIN */
2015 			if(tcb->srtt == 0) {
2016 				tcb->srtt = rtt << LOGAGAIN;
2017 				tcb->mdev = rtt << LOGDGAIN;
2018 			} else {
2019 				delta = rtt - (tcb->srtt>>LOGAGAIN);
2020 				tcb->srtt += delta;
2021 				if(tcb->srtt <= 0)
2022 					tcb->srtt = 1;
2023 
2024 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2025 				tcb->mdev += delta;
2026 				if(tcb->mdev <= 0)
2027 					tcb->mdev = 1;
2028 			}
2029 			tcpsettimer(tcb);
2030 		}
2031 	}
2032 
2033 done:
	/*
	 *  drop the acknowledged bytes from the write queue.  if the
	 *  queue held fewer bytes than were acked, one pending flag is
	 *  counted as acknowledged too -- presumably a FIN (confirm).
	 */
2034 	if(qdiscard(s->wq, acked) < acked)
2035 		tcb->flgcnt--;
2036 	tcb->snd.una = seg->ack;
2037 
2038 	/* newreno fast recovery */
2039 	if(tcb->snd.recovery)
2040 		tcprxmit(s);
2041 
2042 	if(seq_gt(seg->ack, tcb->snd.urg))
2043 		tcb->snd.urg = seg->ack;
2044 
	/* data still outstanding: rearm the retransmit timer; otherwise stop it */
2045 	if(tcb->snd.una != tcb->snd.nxt){
2046 		/* `impatient' variant */
		/* during recovery the timer is only rearmed on the first partial ack */
2047 		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2048 			tcb->time = NOW;
2049 			tcb->timeuna = tcb->snd.una;
2050 			tcpgo(tpriv, &tcb->timer);
2051 		}
2052 	} else
2053 		tcphalt(tpriv, &tcb->timer);
2054 
	/* never leave the send pointer behind what has been acknowledged */
2055 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2056 		tcb->snd.ptr = tcb->snd.una;
2057 
2058 	if(!tcb->snd.recovery)
2059 		tcb->flags &= ~RETRAN;
2060 	tcb->backoff = 0;
2061 	tcb->backedoff = 0;
2062 }
2063 
2064 static void
2065 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2066 {
2067 	Tcp seg;
2068 	Tcp4hdr *h4;
2069 	Tcp6hdr *h6;
2070 	int hdrlen;
2071 	Tcpctl *tcb;
2072 	ushort length, csum;
2073 	uchar source[IPaddrlen], dest[IPaddrlen];
2074 	Conv *s;
2075 	Fs *f;
2076 	Tcppriv *tpriv;
2077 	uchar version;
2078 
2079 	f = tcp->f;
2080 	tpriv = tcp->priv;
2081 
2082 	tpriv->stats[InSegs]++;
2083 
2084 	h4 = (Tcp4hdr*)(bp->rp);
2085 	h6 = (Tcp6hdr*)(bp->rp);
2086 
2087 	if((h4->vihl&0xF0)==IP_VER4) {
2088 		version = V4;
2089 		length = nhgets(h4->length);
2090 		v4tov6(dest, h4->tcpdst);
2091 		v4tov6(source, h4->tcpsrc);
2092 
2093 		h4->Unused = 0;
2094 		hnputs(h4->tcplen, length-TCP4_PKT);
2095 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2096 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2097 			tpriv->stats[CsumErrs]++;
2098 			tpriv->stats[InErrs]++;
2099 			netlog(f, Logtcp, "bad tcp proto cksum\n");
2100 			freeblist(bp);
2101 			return;
2102 		}
2103 
2104 		hdrlen = ntohtcp4(&seg, &bp);
2105 		if(hdrlen < 0){
2106 			tpriv->stats[HlenErrs]++;
2107 			tpriv->stats[InErrs]++;
2108 			netlog(f, Logtcp, "bad tcp hdr len\n");
2109 			return;
2110 		}
2111 
2112 		/* trim the packet to the size claimed by the datagram */
2113 		length -= hdrlen+TCP4_PKT;
2114 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2115 		if(bp == nil){
2116 			tpriv->stats[LenErrs]++;
2117 			tpriv->stats[InErrs]++;
2118 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2119 			return;
2120 		}
2121 	}
2122 	else {
2123 		int ttl = h6->ttl;
2124 		int proto = h6->proto;
2125 
2126 		version = V6;
2127 		length = nhgets(h6->ploadlen);
2128 		ipmove(dest, h6->tcpdst);
2129 		ipmove(source, h6->tcpsrc);
2130 
2131 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2132 		h6->ttl = proto;
2133 		hnputl(h6->vcf, length);
2134 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2135 		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2136 			tpriv->stats[CsumErrs]++;
2137 			tpriv->stats[InErrs]++;
2138 			netlog(f, Logtcp,
2139 			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2140 				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2141 			freeblist(bp);
2142 			return;
2143 		}
2144 		h6->ttl = ttl;
2145 		h6->proto = proto;
2146 		hnputs(h6->ploadlen, length);
2147 
2148 		hdrlen = ntohtcp6(&seg, &bp);
2149 		if(hdrlen < 0){
2150 			tpriv->stats[HlenErrs]++;
2151 			tpriv->stats[InErrs]++;
2152 			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2153 			return;
2154 		}
2155 
2156 		/* trim the packet to the size claimed by the datagram */
2157 		length -= hdrlen;
2158 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2159 		if(bp == nil){
2160 			tpriv->stats[LenErrs]++;
2161 			tpriv->stats[InErrs]++;
2162 			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2163 			return;
2164 		}
2165 	}
2166 
2167 	/* lock protocol while searching for a conversation */
2168 	qlock(tcp);
2169 
2170 	/* Look for a matching conversation */
2171 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2172 	if(s == nil){
2173 		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2174 			source, seg.source, dest, seg.dest);
2175 reset:
2176 		qunlock(tcp);
2177 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2178 		freeblist(bp);
2179 		return;
2180 	}
2181 
2182 	/* if it's a listener, look for the right flags and get a new conv */
2183 	tcb = (Tcpctl*)s->ptcl;
2184 	if(tcb->state == Listen){
2185 		if(seg.flags & RST){
2186 			limborst(s, &seg, source, dest, version);
2187 			qunlock(tcp);
2188 			freeblist(bp);
2189 			return;
2190 		}
2191 
2192 		/* if this is a new SYN, put the call into limbo */
2193 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2194 			limbo(s, source, dest, &seg, version);
2195 			qunlock(tcp);
2196 			freeblist(bp);
2197 			return;
2198 		}
2199 
2200 		/*
2201 		 *  if there's a matching call in limbo, tcpincoming will
2202 		 *  return it in state Syn_received
2203 		 */
2204 		s = tcpincoming(s, &seg, source, dest, version);
2205 		if(s == nil)
2206 			goto reset;
2207 	}
2208 
2209 	/* The rest of the input state machine is run with the control block
2210 	 * locked and implements the state machine directly out of the RFC.
2211 	 * Out-of-band data is ignored - it was always a bad idea.
2212 	 */
2213 	tcb = (Tcpctl*)s->ptcl;
2214 	if(waserror()){
2215 		qunlock(s);
2216 		nexterror();
2217 	}
2218 	qlock(s);
2219 	qunlock(tcp);
2220 
2221 	/* fix up window */
2222 	seg.wnd <<= tcb->rcv.scale;
2223 
2224 	/* every input packet in puts off the keep alive time out */
2225 	tcpsetkacounter(tcb);
2226 
2227 	switch(tcb->state) {
2228 	case Closed:
2229 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2230 		goto raise;
2231 	case Syn_sent:
2232 		if(seg.flags & ACK) {
2233 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2234 				sndrst(tcp, source, dest, length, &seg, version,
2235 					 "bad seq in Syn_sent");
2236 				goto raise;
2237 			}
2238 		}
2239 		if(seg.flags & RST) {
2240 			if(seg.flags & ACK)
2241 				localclose(s, Econrefused);
2242 			goto raise;
2243 		}
2244 
2245 		if(seg.flags & SYN) {
2246 			procsyn(s, &seg);
2247 			if(seg.flags & ACK){
2248 				update(s, &seg);
2249 				tcpsynackrtt(s);
2250 				tcpsetstate(s, Established);
2251 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2252 			}
2253 			else {
2254 				tcb->time = NOW;
2255 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2256 			}
2257 
2258 			if(length != 0 || (seg.flags & FIN))
2259 				break;
2260 
2261 			freeblist(bp);
2262 			goto output;
2263 		}
2264 		else
2265 			freeblist(bp);
2266 
2267 		qunlock(s);
2268 		poperror();
2269 		return;
2270 	case Syn_received:
2271 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2272 		if(seg.flags & ACK)
2273 			tcpsynackrtt(s);
2274 		break;
2275 	}
2276 
2277 	/*
2278 	 *  One DOS attack is to open connections to us and then forget about them,
2279 	 *  thereby tying up a conv at no long term cost to the attacker.
2280 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2281 	 *  corresponding code in tcpsendka().
2282 	 */
2283 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2284 		if(tcpporthogdefense
2285 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2286 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2287 				source, seg.source, dest, seg.dest, seg.flags,
2288 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2289 			localclose(s, "stateless hog");
2290 		}
2291 	}
2292 
2293 	/* Cut the data to fit the receive window */
2294 	tcprcvwin(s);
2295 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2296 		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2297 		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
2298 			"%lud-%lud l %d from %I\n", seg.seq,
2299 			seg.seq + length - 1, tcb->rcv.nxt,
2300 			tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
2301 		update(s, &seg);
2302 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2303 			tcphalt(tpriv, &tcb->rtt_timer);
2304 			tcphalt(tpriv, &tcb->acktimer);
2305 			tcphalt(tpriv, &tcb->katimer);
2306 			tcpsetstate(s, Time_wait);
2307 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2308 			tcpgo(tpriv, &tcb->timer);
2309 		}
2310 		if(!(seg.flags & RST)) {
2311 			tcb->flags |= FORCE;
2312 			goto output;
2313 		}
2314 		qunlock(s);
2315 		poperror();
2316 		return;
2317 	}
2318 
2319 	/* Cannot accept so answer with a rst */
2320 	if(length && tcb->state == Closed) {
2321 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2322 		goto raise;
2323 	}
2324 
2325 	/* The segment is beyond the current receive pointer so
2326 	 * queue the data in the resequence queue
2327 	 */
2328 	if(seg.seq != tcb->rcv.nxt)
2329 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2330 		update(s, &seg);
2331 		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2332 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
2333 				s->laddr, s->lport);
2334 		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
2335 		goto output;
2336 	}
2337 
2338 	if(tcb->nreseq > 0)
2339 		tcb->flags |= FORCE; /* filled hole in seq. space; RFC 5681 §3.2 */
2340 
2341 	/*
2342 	 *  keep looping till we've processed this packet plus any
2343 	 *  adjacent packets in the resequence queue
2344 	 */
2345 	for(;;) {
2346 		if(seg.flags & RST) {
2347 			if(tcb->state == Established) {
2348 				tpriv->stats[EstabResets]++;
2349 				if(tcb->rcv.nxt != seg.seq)
2350 					print("out of order RST rcvd: %I.%d -> "
2351 						"%I.%d, rcv.nxt %lux seq %lux\n",
2352 						s->raddr, s->rport, s->laddr,
2353 						s->lport, tcb->rcv.nxt, seg.seq);
2354 			}
2355 			localclose(s, Econrefused);
2356 			goto raise;
2357 		}
2358 
2359 		if((seg.flags&ACK) == 0)
2360 			goto raise;
2361 
2362 		switch(tcb->state) {
2363 		case Syn_received:
2364 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2365 				sndrst(tcp, source, dest, length, &seg, version,
2366 					"bad seq in Syn_received");
2367 				goto raise;
2368 			}
2369 			update(s, &seg);
2370 			tcpsetstate(s, Established);
2371 		case Established:
2372 		case Close_wait:
2373 			update(s, &seg);
2374 			break;
2375 		case Finwait1:
2376 			update(s, &seg);
2377 			if(qlen(s->wq)+tcb->flgcnt == 0){
2378 				tcphalt(tpriv, &tcb->rtt_timer);
2379 				tcphalt(tpriv, &tcb->acktimer);
2380 				tcpsetkacounter(tcb);
2381 				tcb->time = NOW;
2382 				tcpsetstate(s, Finwait2);
2383 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2384 				tcpgo(tpriv, &tcb->katimer);
2385 			}
2386 			break;
2387 		case Finwait2:
2388 			update(s, &seg);
2389 			break;
2390 		case Closing:
2391 			update(s, &seg);
2392 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2393 				tcphalt(tpriv, &tcb->rtt_timer);
2394 				tcphalt(tpriv, &tcb->acktimer);
2395 				tcphalt(tpriv, &tcb->katimer);
2396 				tcpsetstate(s, Time_wait);
2397 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2398 				tcpgo(tpriv, &tcb->timer);
2399 			}
2400 			break;
2401 		case Last_ack:
2402 			update(s, &seg);
2403 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2404 				localclose(s, nil);
2405 				goto raise;
2406 			}
2407 		case Time_wait:
2408 			tcb->flags |= FORCE;
2409 			if(tcb->timer.state != TcptimerON)
2410 				tcpgo(tpriv, &tcb->timer);
2411 		}
2412 
2413 		if((seg.flags&URG) && seg.urg) {
2414 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2415 				tcb->rcv.urg = seg.urg + seg.seq;
2416 				pullblock(&bp, seg.urg);
2417 			}
2418 		}
2419 		else
2420 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2421 			tcb->rcv.urg = tcb->rcv.nxt;
2422 
2423 		if(length == 0) {
2424 			if(bp != nil)
2425 				freeblist(bp);
2426 		}
2427 		else {
2428 			switch(tcb->state){
2429 			default:
2430 				/* Ignore segment text */
2431 				if(bp != nil)
2432 					freeblist(bp);
2433 				break;
2434 
2435 			case Syn_received:
2436 			case Established:
2437 			case Finwait1:
2438 				/* If we still have some data place on
2439 				 * receive queue
2440 				 */
2441 				if(bp) {
2442 					bp = packblock(bp);
2443 					if(bp == nil)
2444 						panic("tcp packblock");
2445 					qpassnolim(s->rq, bp);
2446 					bp = nil;
2447 				}
2448 				tcb->rcv.nxt += length;
2449 
2450 				/*
2451 				 *  turn on the acktimer if there's something
2452 				 *  to ack
2453 				 */
2454 				if(tcb->acktimer.state != TcptimerON)
2455 					tcpgo(tpriv, &tcb->acktimer);
2456 
2457 				break;
2458 			case Finwait2:
2459 				/* no process to read the data, send a reset */
2460 				if(bp != nil)
2461 					freeblist(bp);
2462 				sndrst(tcp, source, dest, length, &seg, version,
2463 					"send to Finwait2");
2464 				qunlock(s);
2465 				poperror();
2466 				return;
2467 			}
2468 		}
2469 
2470 		if(seg.flags & FIN) {
2471 			tcb->flags |= FORCE;
2472 
2473 			switch(tcb->state) {
2474 			case Syn_received:
2475 			case Established:
2476 				tcb->rcv.nxt++;
2477 				tcpsetstate(s, Close_wait);
2478 				break;
2479 			case Finwait1:
2480 				tcb->rcv.nxt++;
2481 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2482 					tcphalt(tpriv, &tcb->rtt_timer);
2483 					tcphalt(tpriv, &tcb->acktimer);
2484 					tcphalt(tpriv, &tcb->katimer);
2485 					tcpsetstate(s, Time_wait);
2486 					tcb->timer.start = MSL2*(1000/MSPTICK);
2487 					tcpgo(tpriv, &tcb->timer);
2488 				}
2489 				else
2490 					tcpsetstate(s, Closing);
2491 				break;
2492 			case Finwait2:
2493 				tcb->rcv.nxt++;
2494 				tcphalt(tpriv, &tcb->rtt_timer);
2495 				tcphalt(tpriv, &tcb->acktimer);
2496 				tcphalt(tpriv, &tcb->katimer);
2497 				tcpsetstate(s, Time_wait);
2498 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2499 				tcpgo(tpriv, &tcb->timer);
2500 				break;
2501 			case Close_wait:
2502 			case Closing:
2503 			case Last_ack:
2504 				break;
2505 			case Time_wait:
2506 				tcpgo(tpriv, &tcb->timer);
2507 				break;
2508 			}
2509 		}
2510 
2511 		/*
2512 		 *  get next adjacent segment from the resequence queue.
2513 		 *  dump/trim any overlapping segments
2514 		 */
2515 		for(;;) {
2516 			if(tcb->reseq == nil)
2517 				goto output;
2518 
2519 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2520 				goto output;
2521 
2522 			getreseq(tcb, &seg, &bp, &length);
2523 
2524 			tcprcvwin(s);
2525 			if(tcptrim(tcb, &seg, &bp, &length) == 0){
2526 				tcb->flags |= FORCE;
2527 				break;
2528 			}
2529 		}
2530 	}
2531 output:
2532 	tcpoutput(s);
2533 	qunlock(s);
2534 	poperror();
2535 	return;
2536 raise:
2537 	qunlock(s);
2538 	poperror();
2539 	freeblist(bp);
2540 	tcpkick(s);
2541 }
2542 
2543 /*
2544  *  always enters and exits with the s locked.  We drop
2545  *  the lock to ipoput the packet so some care has to be
2546  *  taken by callers.
2547  */
2548 static void
2549 tcpoutput(Conv *s)
2550 {
2551 	Tcp seg;
2552 	uint msgs;
2553 	Tcpctl *tcb;
2554 	Block *hbp, *bp;
2555 	int sndcnt;
2556 	ulong ssize, dsize, sent;
2557 	Fs *f;
2558 	Tcppriv *tpriv;
2559 	uchar version;
2560 
2561 	f = s->p->f;
2562 	tpriv = s->p->priv;
2563 	version = s->ipversion;
2564 
2565 	tcb = (Tcpctl*)s->ptcl;
2566 
2567 	/* force ack every 2*mss */
2568 	if((tcb->flags & FORCE) == 0 &&
2569 	    tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2570 		tpriv->stats[Delayack]++;
2571 		tcb->flags |= FORCE;
2572 	}
2573 
2574 	/* force ack if window opening */
2575 	if((tcb->flags & FORCE) == 0){
2576 		tcprcvwin(s);
2577 		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2578 			tpriv->stats[Wopenack]++;
2579 			tcb->flags |= FORCE;
2580 		}
2581 	}
2582 
2583 	for(msgs = 0; msgs < 100; msgs++) {
2584 		switch(tcb->state) {
2585 		case Listen:
2586 		case Closed:
2587 		case Finwait2:
2588 			return;
2589 		}
2590 
2591 		/* Don't send anything else until our SYN has been acked */
2592 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2593 			break;
2594 
2595 		/* force an ack when a window has opened up */
2596 		tcprcvwin(s);
2597 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2598 			tcb->rcv.blocked = 0;
2599 			tcb->flags |= FORCE;
2600 		}
2601 
2602 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2603 		sent = tcb->snd.ptr - tcb->snd.una;
2604 		ssize = sndcnt;
2605 		if(tcb->snd.wnd == 0){
2606 			/* zero window probe */
2607 			if(sent > 0 && !(tcb->flags & FORCE))
2608 				break;	/* already probing, rto re-probes */
2609 			if(ssize < sent)
2610 				ssize = 0;
2611 			else{
2612 				ssize -= sent;
2613 				if(ssize > 0)
2614 					ssize = 1;
2615 			}
2616 		} else {
2617 			/* calculate usable segment size */
2618 			if(ssize > tcb->cwind)
2619 				ssize = tcb->cwind;
2620 			if(ssize > tcb->snd.wnd)
2621 				ssize = tcb->snd.wnd;
2622 
2623 			if(ssize < sent)
2624 				ssize = 0;
2625 			else {
2626 				ssize -= sent;
2627 				if(ssize > tcb->mss)
2628 					ssize = tcb->mss;
2629 			}
2630 		}
2631 
2632 		dsize = ssize;
2633 		seg.urg = 0;
2634 
2635 		if(!(tcb->flags & FORCE))
2636 			if(ssize == 0 ||
2637 			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
2638 			    sent > TCPREXMTTHRESH * tcb->mss)
2639 				break;
2640 
2641 		tcb->flags &= ~FORCE;
2642 
2643 		/* By default we will generate an ack */
2644 		tcphalt(tpriv, &tcb->acktimer);
2645 		seg.source = s->lport;
2646 		seg.dest = s->rport;
2647 		seg.flags = ACK;
2648 		seg.mss = 0;
2649 		seg.ws = 0;
2650 		seg.update = 0;
2651 		switch(tcb->state){
2652 		case Syn_sent:
2653 			seg.flags = 0;
2654 			if(tcb->snd.ptr == tcb->iss){
2655 				seg.flags |= SYN;
2656 				dsize--;
2657 				seg.mss = tcb->mss;
2658 				seg.ws = tcb->scale;
2659 			}
2660 			break;
2661 		case Syn_received:
2662 			/*
2663 			 *  don't send any data with a SYN/ACK packet
2664 			 *  because Linux rejects the packet in its
2665 			 *  attempt to solve the SYN attack problem
2666 			 */
2667 			if(tcb->snd.ptr == tcb->iss){
2668 				seg.flags |= SYN;
2669 				dsize = 0;
2670 				ssize = 1;
2671 				seg.mss = tcb->mss;
2672 				seg.ws = tcb->scale;
2673 			}
2674 			break;
2675 		}
2676 		seg.seq = tcb->snd.ptr;
2677 		seg.ack = tcb->rcv.nxt;
2678 		seg.wnd = tcb->rcv.wnd;
2679 
2680 		/* Pull out data to send */
2681 		bp = nil;
2682 		if(dsize != 0) {
2683 			bp = qcopy(s->wq, dsize, sent);
2684 			if(BLEN(bp) != dsize) {
2685 				seg.flags |= FIN;
2686 				dsize--;
2687 			}
2688 		}
2689 
2690 		if(sent+dsize == sndcnt && dsize)
2691 			seg.flags |= PSH;
2692 
2693 		tcb->snd.ptr += ssize;
2694 
2695 		/* Pull up the send pointer so we can accept acks
2696 		 * for this window
2697 		 */
2698 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2699 			tcb->snd.nxt = tcb->snd.ptr;
2700 
2701 		/* Build header, link data and compute cksum */
2702 		switch(version){
2703 		case V4:
2704 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2705 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2706 			if(hbp == nil) {
2707 				freeblist(bp);
2708 				return;
2709 			}
2710 			break;
2711 		case V6:
2712 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2713 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2714 			if(hbp == nil) {
2715 				freeblist(bp);
2716 				return;
2717 			}
2718 			break;
2719 		default:
2720 			hbp = nil;	/* to suppress a warning */
2721 			panic("tcpoutput: version %d", version);
2722 		}
2723 
2724 		/* Start the transmission timers if there is new data and we
2725 		 * expect acknowledges
2726 		 */
2727 		if(ssize != 0){
2728 			if(tcb->timer.state != TcptimerON){
2729 				tcb->time = NOW;
2730 				tcb->timeuna = tcb->snd.una;
2731 				tcpgo(tpriv, &tcb->timer);
2732 			}
2733 
2734 			/*  If round trip timer isn't running, start it.
2735 			 *  measure the longest packet only in case the
2736 			 *  transmission time dominates RTT
2737 			 */
2738 			if(tcb->snd.retransmit == 0)
2739 			if(tcb->rtt_timer.state != TcptimerON)
2740 			if(ssize == tcb->mss) {
2741 				tcpgo(tpriv, &tcb->rtt_timer);
2742 				tcb->rttseq = tcb->snd.ptr;
2743 			}
2744 		}
2745 
2746 		tpriv->stats[OutSegs]++;
2747 		if(tcb->snd.retransmit)
2748 			tpriv->stats[RetransSegsSent]++;
2749 		tcb->rcv.ackptr = seg.ack;
2750 		tcb->rcv.wsnt = tcb->rcv.wptr;
2751 
2752 		/* put off the next keep alive */
2753 		tcpgo(tpriv, &tcb->katimer);
2754 
2755 		switch(version){
2756 		case V4:
2757 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2758 				/* a negative return means no route */
2759 				localclose(s, "no route");
2760 			}
2761 			break;
2762 		case V6:
2763 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2764 				/* a negative return means no route */
2765 				localclose(s, "no route");
2766 			}
2767 			break;
2768 		default:
2769 			panic("tcpoutput2: version %d", version);
2770 		}
2771 		if((msgs%4) == 3){
2772 			qunlock(s);
2773 			qlock(s);
2774 		}
2775 	}
2776 }
2777 
2778 /*
2779  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2780  */
2781 static void
2782 tcpsendka(Conv *s)
2783 {
2784 	Tcp seg;
2785 	Tcpctl *tcb;
2786 	Block *hbp,*dbp;
2787 
2788 	tcb = (Tcpctl*)s->ptcl;
2789 
2790 	dbp = nil;
2791 	memset(&seg, 0, sizeof seg);
2792 	seg.urg = 0;
2793 	seg.source = s->lport;
2794 	seg.dest = s->rport;
2795 	seg.flags = ACK|PSH;
2796 	seg.mss = 0;
2797 	seg.ws = 0;
2798 	if(tcpporthogdefense)
2799 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2800 	else
2801 		seg.seq = tcb->snd.una-1;
2802 	seg.ack = tcb->rcv.nxt;
2803 	tcb->rcv.ackptr = seg.ack;
2804 	tcprcvwin(s);
2805 	seg.wnd = tcb->rcv.wnd;
2806 	if(tcb->state == Finwait2){
2807 		seg.flags |= FIN;
2808 	} else {
2809 		dbp = allocb(1);
2810 		dbp->wp++;
2811 	}
2812 
2813 	if(isv4(s->raddr)) {
2814 		/* Build header, link data and compute cksum */
2815 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2816 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2817 		if(hbp == nil) {
2818 			freeblist(dbp);
2819 			return;
2820 		}
2821 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2822 	}
2823 	else {
2824 		/* Build header, link data and compute cksum */
2825 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2826 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2827 		if(hbp == nil) {
2828 			freeblist(dbp);
2829 			return;
2830 		}
2831 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2832 	}
2833 }
2834 
2835 /*
2836  *  set connection to time out after 12 minutes
2837  */
2838 static void
2839 tcpsetkacounter(Tcpctl *tcb)
2840 {
2841 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2842 	if(tcb->kacounter < 3)
2843 		tcb->kacounter = 3;
2844 }
2845 
2846 /*
2847  *  if we've timed out, close the connection
2848  *  otherwise, send a keepalive and restart the timer
2849  */
2850 static void
2851 tcpkeepalive(void *v)
2852 {
2853 	Tcpctl *tcb;
2854 	Conv *s;
2855 
2856 	s = v;
2857 	tcb = (Tcpctl*)s->ptcl;
2858 	if(waserror()){
2859 		qunlock(s);
2860 		nexterror();
2861 	}
2862 	qlock(s);
2863 	if(tcb->state != Closed){
2864 		if(--(tcb->kacounter) <= 0) {
2865 			localclose(s, Etimedout);
2866 		} else {
2867 			tcpsendka(s);
2868 			tcpgo(s->p->priv, &tcb->katimer);
2869 		}
2870 	}
2871 	qunlock(s);
2872 	poperror();
2873 }
2874 
2875 /*
2876  *  start keepalive timer
2877  */
2878 static char*
2879 tcpstartka(Conv *s, char **f, int n)
2880 {
2881 	Tcpctl *tcb;
2882 	int x;
2883 
2884 	tcb = (Tcpctl*)s->ptcl;
2885 	if(tcb->state != Established)
2886 		return "connection must be in Establised state";
2887 	if(n > 1){
2888 		x = atoi(f[1]);
2889 		if(x >= MSPTICK)
2890 			tcb->katimer.start = x/MSPTICK;
2891 	}
2892 	tcpsetkacounter(tcb);
2893 	tcpgo(s->p->priv, &tcb->katimer);
2894 
2895 	return nil;
2896 }
2897 
2898 /*
2899  *  turn checksums on/off
2900  */
2901 static char*
2902 tcpsetchecksum(Conv *s, char **f, int)
2903 {
2904 	Tcpctl *tcb;
2905 
2906 	tcb = (Tcpctl*)s->ptcl;
2907 	tcb->nochecksum = !atoi(f[1]);
2908 
2909 	return nil;
2910 }
2911 
2912 /*
2913  *  retransmit (at most) one segment at snd.una.
2914  *  preserve cwind & snd.ptr
2915  */
2916 static void
2917 tcprxmit(Conv *s)
2918 {
2919 	Tcpctl *tcb;
2920 	Tcppriv *tpriv;
2921 	ulong tcwind, tptr;
2922 
2923 	tcb = (Tcpctl*)s->ptcl;
2924 	tcb->flags |= RETRAN|FORCE;
2925 
2926 	tptr = tcb->snd.ptr;
2927 	tcwind = tcb->cwind;
2928 	tcb->snd.ptr = tcb->snd.una;
2929 	tcb->cwind = tcb->mss;
2930 	tcb->snd.retransmit = 1;
2931 	tcpoutput(s);
2932 	tcb->snd.retransmit = 0;
2933 	tcb->cwind = tcwind;
2934 	tcb->snd.ptr = tptr;
2935 
2936 	tpriv = s->p->priv;
2937 	tpriv->stats[RetransSegs]++;
2938 }
2939 
2940 /*
2941  *  TODO: RFC 4138 F-RTO
2942  */
2943 static void
2944 tcptimeout(void *arg)
2945 {
2946 	Conv *s;
2947 	Tcpctl *tcb;
2948 	int maxback;
2949 	Tcppriv *tpriv;
2950 
2951 	s = (Conv*)arg;
2952 	tpriv = s->p->priv;
2953 	tcb = (Tcpctl*)s->ptcl;
2954 
2955 	if(waserror()){
2956 		qunlock(s);
2957 		nexterror();
2958 	}
2959 	qlock(s);
2960 	switch(tcb->state){
2961 	default:
2962 		tcb->backoff++;
2963 		if(tcb->state == Syn_sent)
2964 			maxback = MAXBACKMS/2;
2965 		else
2966 			maxback = MAXBACKMS;
2967 		tcb->backedoff += tcb->timer.start * MSPTICK;
2968 		if(tcb->backedoff >= maxback) {
2969 			localclose(s, Etimedout);
2970 			break;
2971 		}
2972 		netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2973 			tcb->srtt, tcb->mdev, NOW - tcb->time,
2974 			tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2975 			tcpstates[s->state]);
2976 		tcpsettimer(tcb);
2977 		if(tcb->snd.rto == 0)
2978 			tcpcongestion(tcb);
2979 		tcprxmit(s);
2980 		tcb->snd.ptr = tcb->snd.una;
2981 		tcb->cwind = tcb->mss;
2982 		tcb->snd.rto = 1;
2983 		tpriv->stats[RetransTimeouts]++;
2984 
2985 		if(tcb->snd.recovery){
2986 			tcb->snd.dupacks = 0;		/* reno rto */
2987 			tcb->snd.recovery = 0;
2988 			tpriv->stats[RecoveryRTO]++;
2989 			tcb->snd.rxt = tcb->snd.nxt;
2990 			netlog(s->p->f, Logtcpwin,
2991 				"rto recovery rxt @%lud\n", tcb->snd.nxt);
2992 		}
2993 
2994 		tcb->abcbytes = 0;
2995 		break;
2996 	case Time_wait:
2997 		localclose(s, nil);
2998 		break;
2999 	case Closed:
3000 		break;
3001 	}
3002 	qunlock(s);
3003 	poperror();
3004 }
3005 
3006 static int
3007 inwindow(Tcpctl *tcb, int seq)
3008 {
3009 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3010 }
3011 
3012 /*
3013  *  set up state for a received SYN (or SYN ACK) packet
3014  */
3015 static void
3016 procsyn(Conv *s, Tcp *seg)
3017 {
3018 	Tcpctl *tcb;
3019 	Tcppriv *tpriv;
3020 
3021 	tcb = (Tcpctl*)s->ptcl;
3022 	tcb->flags |= FORCE;
3023 
3024 	tcb->rcv.nxt = seg->seq + 1;
3025 	tcb->rcv.wptr = tcb->rcv.nxt;
3026 	tcb->rcv.wsnt = 0;
3027 	tcb->rcv.urg = tcb->rcv.nxt;
3028 	tcb->irs = seg->seq;
3029 
3030 	/* our sending max segment size cannot be bigger than what he asked for */
3031 	if(seg->mss != 0 && seg->mss < tcb->mss) {
3032 		tcb->mss = seg->mss;
3033 		tpriv = s->p->priv;
3034 		tpriv->stats[Mss] = tcb->mss;
3035 	}
3036 
3037 	tcb->snd.wnd = seg->wnd;
3038 	initialwindow(tcb);
3039 }
3040 
3041 static int
3042 dumpreseq(Tcpctl *tcb)
3043 {
3044 	Reseq *r, *next;
3045 
3046 	for(r = tcb->reseq; r != nil; r = next){
3047 		next = r->next;
3048 		freeblist(r->bp);
3049 		free(r);
3050 	}
3051 	tcb->reseq = nil;
3052 	tcb->nreseq = 0;
3053 	tcb->reseqlen = 0;
3054 	return -1;
3055 }
3056 
3057 static void
3058 logreseq(Fs *f, Reseq *r, ulong n)
3059 {
3060 	char *s;
3061 
3062 	for(; r != nil; r = r->next){
3063 		s = nil;
3064 		if(r->next == nil && r->seg.seq != n)
3065 			s = "hole/end";
3066 		else if(r->next == nil)
3067 			s = "end";
3068 		else if(r->seg.seq != n)
3069 			s = "hole";
3070 		if(s != nil)
3071 			netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3072 				n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3073 		n = r->seg.seq + r->seg.len;
3074 	}
3075 }
3076 
3077 static int
3078 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3079 {
3080 	Reseq *rp, **rr;
3081 	int qmax;
3082 
3083 	rp = malloc(sizeof *rp);
3084 	if(rp == nil){
3085 		freeblist(bp);		/* bp always consumed by addreseq */
3086 		return 0;
3087 	}
3088 
3089 	rp->seg = *seg;
3090 	rp->bp = bp;
3091 	rp->length = length;
3092 
3093 	tcb->reseqlen += length;
3094 	tcb->nreseq++;
3095 
3096 	/* Place on reassembly list sorting by starting seq number */
3097 	for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3098 		if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3099 			rp->next = *rr;
3100 			*rr = rp;
3101 			tpriv->stats[Resequenced]++;
3102 			if(rp->next != nil)
3103 				tpriv->stats[OutOfOrder]++;
3104 			break;
3105 		}
3106 
3107 	qmax = tcb->window;
3108 	if(tcb->reseqlen > qmax){
3109 		netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3110 			tcb->reseqlen, qmax, tcb->nreseq);
3111 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3112 		tpriv->stats[ReseqBytelim]++;
3113 		return dumpreseq(tcb);
3114 	}
3115 	qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3116 	if(tcb->nreseq > qmax){
3117 		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3118 			tcb->nreseq, qmax, tcb->reseqlen);
3119 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3120 		tpriv->stats[ReseqPktlim]++;
3121 		return dumpreseq(tcb);
3122 	}
3123 	return 0;
3124 }
3125 
3126 static void
3127 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3128 {
3129 	Reseq *rp;
3130 
3131 	rp = tcb->reseq;
3132 	if(rp == nil)
3133 		return;
3134 
3135 	tcb->reseq = rp->next;
3136 
3137 	*seg = rp->seg;
3138 	*bp = rp->bp;
3139 	*length = rp->length;
3140 
3141 	tcb->nreseq--;
3142 	tcb->reseqlen -= rp->length;
3143 
3144 	free(rp);
3145 }
3146 
3147 static int
3148 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3149 {
3150 	ushort len;
3151 	uchar accept;
3152 	int dupcnt, excess;
3153 
3154 	accept = 0;
3155 	len = *length;
3156 	if(seg->flags & SYN)
3157 		len++;
3158 	if(seg->flags & FIN)
3159 		len++;
3160 
3161 	if(tcb->rcv.wnd == 0) {
3162 		if(len == 0 && seg->seq == tcb->rcv.nxt)
3163 			return 0;
3164 	}
3165 	else {
3166 		/* Some part of the segment should be in the window */
3167 		if(inwindow(tcb,seg->seq))
3168 			accept++;
3169 		else
3170 		if(len != 0) {
3171 			if(inwindow(tcb, seg->seq+len-1) ||
3172 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3173 				accept++;
3174 		}
3175 	}
3176 	if(!accept) {
3177 		freeblist(*bp);
3178 		return -1;
3179 	}
3180 	dupcnt = tcb->rcv.nxt - seg->seq;
3181 	if(dupcnt > 0){
3182 		tcb->rerecv += dupcnt;
3183 		if(seg->flags & SYN){
3184 			seg->flags &= ~SYN;
3185 			seg->seq++;
3186 
3187 			if(seg->urg > 1)
3188 				seg->urg--;
3189 			else
3190 				seg->flags &= ~URG;
3191 			dupcnt--;
3192 		}
3193 		if(dupcnt > 0){
3194 			pullblock(bp, (ushort)dupcnt);
3195 			seg->seq += dupcnt;
3196 			*length -= dupcnt;
3197 
3198 			if(seg->urg > dupcnt)
3199 				seg->urg -= dupcnt;
3200 			else {
3201 				seg->flags &= ~URG;
3202 				seg->urg = 0;
3203 			}
3204 		}
3205 	}
3206 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3207 	if(excess > 0) {
3208 		tcb->rerecv += excess;
3209 		*length -= excess;
3210 		*bp = trimblock(*bp, 0, *length);
3211 		if(*bp == nil)
3212 			panic("presotto is a boofhead");
3213 		seg->flags &= ~FIN;
3214 	}
3215 	return 0;
3216 }
3217 
3218 static void
3219 tcpadvise(Proto *tcp, Block *bp, char *msg)
3220 {
3221 	Tcp4hdr *h4;
3222 	Tcp6hdr *h6;
3223 	Tcpctl *tcb;
3224 	uchar source[IPaddrlen];
3225 	uchar dest[IPaddrlen];
3226 	ushort psource, pdest;
3227 	Conv *s, **p;
3228 
3229 	h4 = (Tcp4hdr*)(bp->rp);
3230 	h6 = (Tcp6hdr*)(bp->rp);
3231 
3232 	if((h4->vihl&0xF0)==IP_VER4) {
3233 		v4tov6(dest, h4->tcpdst);
3234 		v4tov6(source, h4->tcpsrc);
3235 		psource = nhgets(h4->tcpsport);
3236 		pdest = nhgets(h4->tcpdport);
3237 	}
3238 	else {
3239 		ipmove(dest, h6->tcpdst);
3240 		ipmove(source, h6->tcpsrc);
3241 		psource = nhgets(h6->tcpsport);
3242 		pdest = nhgets(h6->tcpdport);
3243 	}
3244 
3245 	/* Look for a connection */
3246 	qlock(tcp);
3247 	for(p = tcp->conv; *p; p++) {
3248 		s = *p;
3249 		tcb = (Tcpctl*)s->ptcl;
3250 		if(s->rport == pdest)
3251 		if(s->lport == psource)
3252 		if(tcb->state != Closed)
3253 		if(ipcmp(s->raddr, dest) == 0)
3254 		if(ipcmp(s->laddr, source) == 0){
3255 			qlock(s);
3256 			qunlock(tcp);
3257 			switch(tcb->state){
3258 			case Syn_sent:
3259 				localclose(s, msg);
3260 				break;
3261 			}
3262 			qunlock(s);
3263 			freeblist(bp);
3264 			return;
3265 		}
3266 	}
3267 	qunlock(tcp);
3268 	freeblist(bp);
3269 }
3270 
3271 static char*
3272 tcpporthogdefensectl(char *val)
3273 {
3274 	if(strcmp(val, "on") == 0)
3275 		tcpporthogdefense = 1;
3276 	else if(strcmp(val, "off") == 0)
3277 		tcpporthogdefense = 0;
3278 	else
3279 		return "unknown value for tcpporthogdefense";
3280 	return nil;
3281 }
3282 
3283 /* called with c qlocked */
3284 static char*
3285 tcpctl(Conv* c, char** f, int n)
3286 {
3287 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3288 		return tcphangup(c);
3289 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3290 		return tcpstartka(c, f, n);
3291 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3292 		return tcpsetchecksum(c, f, n);
3293 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3294 		return tcpporthogdefensectl(f[1]);
3295 	return "unknown control request";
3296 }
3297 
3298 static int
3299 tcpstats(Proto *tcp, char *buf, int len)
3300 {
3301 	Tcppriv *priv;
3302 	char *p, *e;
3303 	int i;
3304 
3305 	priv = tcp->priv;
3306 	p = buf;
3307 	e = p+len;
3308 	for(i = 0; i < Nstats; i++)
3309 		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3310 	return p - buf;
3311 }
3312 
3313 /*
3314  *  garbage collect any stale conversations:
3315  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3316  *	- Finwait2 after 5 minutes
3317  *
3318  *  this is called whenever we run out of channels.  Both checks are
3319  *  of questionable validity so we try to use them only when we're
3320  *  up against the wall.
3321  */
3322 static int
3323 tcpgc(Proto *tcp)
3324 {
3325 	Conv *c, **pp, **ep;
3326 	int n;
3327 	Tcpctl *tcb;
3328 
3329 
3330 	n = 0;
3331 	ep = &tcp->conv[tcp->nc];
3332 	for(pp = tcp->conv; pp < ep; pp++) {
3333 		c = *pp;
3334 		if(c == nil)
3335 			break;
3336 		if(!canqlock(c))
3337 			continue;
3338 		tcb = (Tcpctl*)c->ptcl;
3339 		switch(tcb->state){
3340 		case Syn_received:
3341 			if(NOW - tcb->time > 5000){
3342 				localclose(c, Etimedout);
3343 				n++;
3344 			}
3345 			break;
3346 		case Finwait2:
3347 			if(NOW - tcb->time > 5*60*1000){
3348 				localclose(c, Etimedout);
3349 				n++;
3350 			}
3351 			break;
3352 		}
3353 		qunlock(c);
3354 	}
3355 	return n;
3356 }
3357 
3358 static void
3359 tcpsettimer(Tcpctl *tcb)
3360 {
3361 	int x;
3362 
3363 	/* round trip dependency */
3364 	x = backoff(tcb->backoff) *
3365 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3366 
3367 	/* bounded twixt 0.3 and 64 seconds */
3368 	if(x < 300/MSPTICK)
3369 		x = 300/MSPTICK;
3370 	else if(x > (64000/MSPTICK))
3371 		x = 64000/MSPTICK;
3372 	tcb->timer.start = x;
3373 }
3374 
3375 void
3376 tcpinit(Fs *fs)
3377 {
3378 	Proto *tcp;
3379 	Tcppriv *tpriv;
3380 
3381 	tcp = smalloc(sizeof(Proto));
3382 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3383 	tcp->name = "tcp";
3384 	tcp->connect = tcpconnect;
3385 	tcp->announce = tcpannounce;
3386 	tcp->ctl = tcpctl;
3387 	tcp->state = tcpstate;
3388 	tcp->create = tcpcreate;
3389 	tcp->close = tcpclose;
3390 	tcp->rcv = tcpiput;
3391 	tcp->advise = tcpadvise;
3392 	tcp->stats = tcpstats;
3393 	tcp->inuse = tcpinuse;
3394 	tcp->gc = tcpgc;
3395 	tcp->ipproto = IP_TCPPROTO;
3396 	tcp->nc = scalednconv();
3397 	tcp->ptclsize = sizeof(Tcpctl);
3398 	tpriv->stats[MaxConn] = tcp->nc;
3399 
3400 	Fsproto(fs, tcp);
3401 }
3402 
3403 static void
3404 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3405 {
3406 	/*
3407 	 * guess at reasonable queue sizes.  there's no current way
3408 	 * to know how many nic receive buffers we can safely tie up in the
3409 	 * tcp stack, and we don't adjust our queues to maximize throughput
3410 	 * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
3411 	 * respected, but we still control our own buffer commitment by
3412 	 * keeping a seperate qscale.
3413 	 */
3414 	tcb->rcv.scale = rcvscale & 0xff;
3415 	tcb->snd.scale = sndscale & 0xff;
3416 	tcb->qscale = rcvscale & 0xff;
3417 	if(rcvscale > Maxqscale)
3418 		tcb->qscale = Maxqscale;
3419 
3420 	if(rcvscale != tcb->rcv.scale)
3421 		netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3422 			"qlen %d >> window %ud lport %d\n",
3423 			tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3424 	tcb->window = QMAX << tcb->qscale;
3425 	tcb->ssthresh = tcb->window;
3426 
3427 	/*
3428 	 * it's important to set wq large enough to cover the full
3429 	 * bandwidth-delay product.  it's possible to be in loss
3430 	 * recovery with a big window, and we need to keep sending
3431 	 * into the inflated window.  the difference can be huge
3432 	 * for even modest (70ms) ping times.
3433 	 */
3434 	qsetlimit(s->rq, tcb->window);
3435 	qsetlimit(s->wq, tcb->window);
3436 	tcprcvwin(s);
3437 }
3438