xref: /plan9-contrib/sys/src/9/ip/tcp.c (revision 25fc69938fdecc61cd09e795cbe2d2f72f1082b1)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  Protocol constants: pseudo-header/header sizes, timer
 *  parameters, TCP header flag bits, option codes, tcb flag
 *  bits and connection states.
 */
enum
{
	QMAX		= 64*1024-1,
	IP_TCPPROTO	= 6,

	/* v4: sizes of the pieces of a Tcp4hdr */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* v6: sizes of the pieces of a Tcp6hdr */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* values of Tcptimer.state */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* flag bits in the TCP header */
	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* TCP option kinds and lengths */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits or'ed into Tcpctl.flags (FORCE, CLONE and ACTIVE are set that way in this file) */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,		/* srtt is initialised to tcp_irtt<<LOGAGAIN */
	LOGDGAIN	= 2,

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into a window scale value to mark it as present */
};
86 
/*
 *  Printable names of the connection states, indexed by
 *  state value (Closed .. Time_wait).
 *  Must correspond to the enumeration above.
 */
char *tcpstates[] =
{
	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
	"Closing", 	"Last_ack", 	"Time_wait"
};
94 
/*
 *  A countdown timer.  Running timers are chained through
 *  next/prev on Tcppriv.timers; tcpackproc decrements count
 *  once per MSPTICK and calls func(arg) when it reaches zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* doubly linked list of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of timers that expired in this tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* value count is loaded with by tcpgo; 0 means never start */
	int	count;			/* ticks remaining */
	void	(*func)(void*);		/* called on expiry, may be nil */
	void	*arg;			/* argument passed to func */
};
107 
108 /*
109  *  v4 and v6 pseudo headers used for
110  *  checksuming tcp
111  */
/*
 *  v4 packet layout: TCP4_IPLEN bytes of IP header, then the
 *  TCP4_PHDRSIZE byte pseudo header (Unused .. tcpdst) that is
 *  covered by the checksum, then the TCP header proper.
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;
	uchar	proto;
	uchar	tcplen[2];
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
136 
/*
 *  v6 packet layout.  While the checksum is computed in
 *  htontcp6 the first 8 bytes are used as pseudo header
 *  fields (length in vcf, protocol in ttl) and are restored
 *  to a normal IP header afterwards.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];
	uchar	ploadlen[2];
	uchar	proto;
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
157 
158 /*
159  *  this represents the control info
160  *  for a single packet.  It is derived from
161  *  a packet in ntohtcp{4,6}() and stuck into
162  *  a packet in htontcp{4,6}().
163  */
/* host byte order copy of the fields of one TCP header */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;	/* source port */
	ushort	dest;	/* destination port */
	ulong	seq;	/* sequence number */
	ulong	ack;	/* acknowledgement number */
	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
	ushort	ws;	/* window scale option (if not zero) */
	ulong	wnd;	/* window */
	ushort	urg;	/* urgent pointer */
	ushort	mss;	/* max segment size option (if not zero) */
	ushort	len;	/* size of data */
};
178 
179 /*
180  *  this header is malloc'd to thread together fragments
181  *  waiting to be coalesced
182  */
/* one entry of a connection's resequencing queue (Tcpctl.reseq) */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;		/* next queued segment */
	Tcp	seg;		/* header info of the queued segment */
	Block	*bp;		/* its data; released with freeblist in localclose */
	ushort	length;
};
191 
192 /*
193  *  the qlock in the Conv locks this structure
194  */
/* per-conversation TCP control block, reached through Conv.ptcl */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		int	scale;		/* how much to right shift window in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		int	recovery;	/* loss recovery flag */
		ulong	rxt;		/* right window marker for recovery */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	urg;		/* Urgent pointer */
		int	blocked;
		int	una;		/* unacked data segs */
		int	scale;		/* how much to left shift window in rcved packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
	ulong	cwind;			/* Congestion window */
	int	scale;			/* desired snd.scale */
	ushort	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data re-received */
	ulong	window;			/* Receive window */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags */
	Reseq	*reseq;			/* Resequencing queue */
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Shortened round trip */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};
254 
255 /*
256  *  New calls are put in limbo rather than having a conversation structure
257  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
258  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
259  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260  *
261  *  In particular they aren't on a listener's queue so that they don't figure
262  *  in the input queue limit.
263  *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
265  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
266  *  there is no hashing of this list.
267  */
/* an incoming call that has been sent a SYN ACK but has no Conv yet */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;		/* next entry on the same Tcppriv.lht chain */

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};
286 
/* tunables; tcp_irtt (ms) seeds srtt and the activity timer in inittcpctl */
int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
289 
/* indices into Tcppriv.stats[]; statnames[] below must be kept in step */
enum {
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats		/* number of counters, sizes Tcppriv.stats[] */
};
313 
/* printable name of each counter, indexed by the stats enumeration */
static char *statnames[] =
{
[MaxConn]	"MaxConn",
[Mss]		"MaxSegment",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
};
333 
/* per-protocol private state, reached through Proto.priv */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* held while the timer list is walked or changed */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;		/* serialises starting the kproc in tcpstart */
	int	ackprocstarted;

	uvlong	stats[Nstats];	/* counters, indexed by the stats enumeration */
};
354 
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
365 
/* forward declarations of routines used before they are defined */
int	addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
void	localclose(Conv*, char*);
void	procsyn(Conv*, Tcp*);
void	tcpiput(Proto*, Ipifc*, Block*);
void	tcpoutput(Conv*);
int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
void	tcpstart(Conv*, int);
void	tcptimeout(void*);
void	tcpsndsyn(Conv*, Tcpctl*);
void	tcprcvwin(Conv*);
void	tcpacktimer(void*);
void	tcpkeepalive(void*);
void	tcpsetkacounter(Tcpctl*);
void	tcprxmit(Conv*);
void	tcpsettimer(Tcpctl*);
void	tcpsynackrtt(Conv*);
void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);

/* limbo (half open incoming call) handling, private to this file */
static void limborexmit(Proto*);
static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
387 
388 void
389 tcpsetstate(Conv *s, uchar newstate)
390 {
391 	Tcpctl *tcb;
392 	uchar oldstate;
393 	Tcppriv *tpriv;
394 
395 	tpriv = s->p->priv;
396 
397 	tcb = (Tcpctl*)s->ptcl;
398 
399 	oldstate = tcb->state;
400 	if(oldstate == newstate)
401 		return;
402 
403 	if(oldstate == Established)
404 		tpriv->stats[CurrEstab]--;
405 	if(newstate == Established)
406 		tpriv->stats[CurrEstab]++;
407 
408 	/**
409 	print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
410 		tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
411 	**/
412 
413 	switch(newstate) {
414 	case Closed:
415 		qclose(s->rq);
416 		qclose(s->wq);
417 		qclose(s->eq);
418 		break;
419 
420 	case Close_wait:		/* Remote closes */
421 		qhangup(s->rq, nil);
422 		break;
423 	}
424 
425 	tcb->state = newstate;
426 
427 	if(oldstate == Syn_sent && newstate != Closed)
428 		Fsconnected(s, nil);
429 }
430 
431 static char*
432 tcpconnect(Conv *c, char **argv, int argc)
433 {
434 	char *e;
435 	Tcpctl *tcb;
436 
437 	tcb = (Tcpctl*)(c->ptcl);
438 	if(tcb->state != Closed)
439 		return Econinuse;
440 
441 	e = Fsstdconnect(c, argv, argc);
442 	if(e != nil)
443 		return e;
444 	tcpstart(c, TCP_CONNECT);
445 
446 	return nil;
447 }
448 
449 static int
450 tcpstate(Conv *c, char *state, int n)
451 {
452 	Tcpctl *s;
453 
454 	s = (Tcpctl*)(c->ptcl);
455 
456 	return snprint(state, n,
457 		"%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
458 		tcpstates[s->state],
459 		c->rq ? qlen(c->rq) : 0,
460 		c->wq ? qlen(c->wq) : 0,
461 		s->srtt, s->mdev,
462 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
463 		s->timer.start, s->timer.count, s->rerecv,
464 		s->katimer.start, s->katimer.count);
465 }
466 
467 static int
468 tcpinuse(Conv *c)
469 {
470 	Tcpctl *s;
471 
472 	s = (Tcpctl*)(c->ptcl);
473 	return s->state != Closed;
474 }
475 
476 static char*
477 tcpannounce(Conv *c, char **argv, int argc)
478 {
479 	char *e;
480 	Tcpctl *tcb;
481 
482 	tcb = (Tcpctl*)(c->ptcl);
483 	if(tcb->state != Closed)
484 		return Econinuse;
485 
486 	e = Fsstdannounce(c, argv, argc);
487 	if(e != nil)
488 		return e;
489 	tcpstart(c, TCP_LISTEN);
490 	Fsconnected(c, nil);
491 
492 	return nil;
493 }
494 
495 /*
496  *  tcpclose is always called with the q locked
497  */
498 static void
499 tcpclose(Conv *c)
500 {
501 	Tcpctl *tcb;
502 
503 	tcb = (Tcpctl*)c->ptcl;
504 
505 	qhangup(c->rq, nil);
506 	qhangup(c->wq, nil);
507 	qhangup(c->eq, nil);
508 	qflush(c->rq);
509 
510 	switch(tcb->state) {
511 	case Listen:
512 		/*
513 		 *  reset any incoming calls to this listener
514 		 */
515 		Fsconnected(c, "Hangup");
516 
517 		localclose(c, nil);
518 		break;
519 	case Closed:
520 	case Syn_sent:
521 		localclose(c, nil);
522 		break;
523 	case Syn_received:
524 	case Established:
525 		tcb->flgcnt++;
526 		tcb->snd.nxt++;
527 		tcpsetstate(c, Finwait1);
528 		tcpoutput(c);
529 		break;
530 	case Close_wait:
531 		tcb->flgcnt++;
532 		tcb->snd.nxt++;
533 		tcpsetstate(c, Last_ack);
534 		tcpoutput(c);
535 		break;
536 	}
537 }
538 
539 void
540 tcpkick(void *x)
541 {
542 	Conv *s = x;
543 	Tcpctl *tcb;
544 
545 	tcb = (Tcpctl*)s->ptcl;
546 
547 	if(waserror()){
548 		qunlock(s);
549 		nexterror();
550 	}
551 	qlock(s);
552 
553 	switch(tcb->state) {
554 	case Syn_sent:
555 	case Syn_received:
556 	case Established:
557 	case Close_wait:
558 		/*
559 		 * Push data
560 		 */
561 		tcprcvwin(s);
562 		tcpoutput(s);
563 		break;
564 	default:
565 		localclose(s, "Hangup");
566 		break;
567 	}
568 
569 	qunlock(s);
570 	poperror();
571 }
572 
573 void
574 tcprcvwin(Conv *s)				/* Call with tcb locked */
575 {
576 	int w;
577 	Tcpctl *tcb;
578 
579 	tcb = (Tcpctl*)s->ptcl;
580 	w = tcb->window - qlen(s->rq);
581 	if(w < 0)
582 		w = 0;
583 	if(w == 0)
584 		netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq));
585 	tcb->rcv.wnd = w;
586 	if(w == 0)
587 		tcb->rcv.blocked = 1;
588 }
589 
590 void
591 tcpacktimer(void *v)
592 {
593 	Tcpctl *tcb;
594 	Conv *s;
595 
596 	s = v;
597 	tcb = (Tcpctl*)s->ptcl;
598 
599 	if(waserror()){
600 		qunlock(s);
601 		nexterror();
602 	}
603 	qlock(s);
604 	if(tcb->state != Closed){
605 		tcb->flags |= FORCE;
606 		tcprcvwin(s);
607 		tcpoutput(s);
608 	}
609 	qunlock(s);
610 	poperror();
611 }
612 
/*
 *  Allocate the queues of a new conversation: a coalescing
 *  read queue of QMAX bytes that calls tcpacktimer, and a write
 *  queue half as big again whose kick routine is tcpkick.
 */
static void
tcpcreate(Conv *c)
{
	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
	c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
}
619 
620 static void
621 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
622 {
623 	if(newstate != TcptimerON){
624 		if(t->state == TcptimerON){
625 			/* unchain */
626 			if(priv->timers == t){
627 				priv->timers = t->next;
628 				if(t->prev != nil)
629 					panic("timerstate1");
630 			}
631 			if(t->next)
632 				t->next->prev = t->prev;
633 			if(t->prev)
634 				t->prev->next = t->next;
635 			t->next = t->prev = nil;
636 		}
637 	} else {
638 		if(t->state != TcptimerON){
639 			/* chain */
640 			if(t->prev != nil || t->next != nil)
641 				panic("timerstate2");
642 			t->prev = nil;
643 			t->next = priv->timers;
644 			if(t->next)
645 				t->next->prev = t;
646 			priv->timers = t;
647 		}
648 	}
649 	t->state = newstate;
650 }
651 
652 void
653 tcpackproc(void *a)
654 {
655 	Tcptimer *t, *tp, *timeo;
656 	Proto *tcp;
657 	Tcppriv *priv;
658 	int loop;
659 
660 	tcp = a;
661 	priv = tcp->priv;
662 
663 	for(;;) {
664 		tsleep(&up->sleep, return0, 0, MSPTICK);
665 
666 		qlock(&priv->tl);
667 		timeo = nil;
668 		loop = 0;
669 		for(t = priv->timers; t != nil; t = tp) {
670 			if(loop++ > 10000)
671 				panic("tcpackproc1");
672 			tp = t->next;
673  			if(t->state == TcptimerON) {
674 				t->count--;
675 				if(t->count == 0) {
676 					timerstate(priv, t, TcptimerDONE);
677 					t->readynext = timeo;
678 					timeo = t;
679 				}
680 			}
681 		}
682 		qunlock(&priv->tl);
683 
684 		loop = 0;
685 		for(t = timeo; t != nil; t = t->readynext) {
686 			if(loop++ > 10000)
687 				panic("tcpackproc2");
688 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
689 				(*t->func)(t->arg);
690 				poperror();
691 			}
692 		}
693 
694 		limborexmit(tcp);
695 	}
696 }
697 
698 void
699 tcpgo(Tcppriv *priv, Tcptimer *t)
700 {
701 	if(t == nil || t->start == 0)
702 		return;
703 
704 	qlock(&priv->tl);
705 	t->count = t->start;
706 	timerstate(priv, t, TcptimerON);
707 	qunlock(&priv->tl);
708 }
709 
710 void
711 tcphalt(Tcppriv *priv, Tcptimer *t)
712 {
713 	if(t == nil)
714 		return;
715 
716 	qlock(&priv->tl);
717 	timerstate(priv, t, TcptimerOFF);
718 	qunlock(&priv->tl);
719 }
720 
/*
 *  Exponential backoff multiplier: 2 to the power n.
 *  The exponent is clamped to 0..30 because shifting a signed
 *  int by a negative amount or into the sign bit is undefined;
 *  values inside that range give the same result as before.
 */
int
backoff(int n)
{
	if(n < 0)
		n = 0;
	else if(n > 30)
		n = 30;
	return 1 << n;
}
726 
727 void
728 localclose(Conv *s, char *reason)	/* called with tcb locked */
729 {
730 	Tcpctl *tcb;
731 	Reseq *rp,*rp1;
732 	Tcppriv *tpriv;
733 
734 	tpriv = s->p->priv;
735 	tcb = (Tcpctl*)s->ptcl;
736 
737 	iphtrem(&tpriv->ht, s);
738 
739 	tcphalt(tpriv, &tcb->timer);
740 	tcphalt(tpriv, &tcb->rtt_timer);
741 	tcphalt(tpriv, &tcb->acktimer);
742 	tcphalt(tpriv, &tcb->katimer);
743 
744 	/* Flush reassembly queue; nothing more can arrive */
745 	for(rp = tcb->reseq; rp != nil; rp = rp1) {
746 		rp1 = rp->next;
747 		freeblist(rp->bp);
748 		free(rp);
749 	}
750 	tcb->reseq = nil;
751 
752 	if(tcb->state == Syn_sent)
753 		Fsconnected(s, reason);
754 	if(s->state == Announced)
755 		wakeup(&s->listenr);
756 
757 	qhangup(s->rq, reason);
758 	qhangup(s->wq, reason);
759 
760 	tcpsetstate(s, Closed);
761 }
762 
763 /* mtu (- TCP + IP hdr len) of 1st hop */
764 int
765 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
766 {
767 	Ipifc *ifc;
768 	int mtu;
769 
770 	ifc = findipifc(tcp->f, addr, 0);
771 	switch(version){
772 	default:
773 	case V4:
774 		mtu = DEF_MSS;
775 		if(ifc != nil)
776 			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
777 		break;
778 	case V6:
779 		mtu = DEF_MSS6;
780 		if(ifc != nil)
781 			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
782 		break;
783 	}
784 	if(ifc != nil){
785 		if(ifc->mbps > 1000)
786 			*scale = HaveWS | 4;
787 		else if(ifc->mbps > 100)
788 			*scale = HaveWS | 3;
789 		else if(ifc->mbps > 10)
790 			*scale = HaveWS | 1;
791 		else
792 			*scale = HaveWS | 0;
793 	} else
794 		*scale = HaveWS | 0;
795 
796 	return mtu;
797 }
798 
799 void
800 inittcpctl(Conv *s, int mode)
801 {
802 	Tcpctl *tcb;
803 	Tcp4hdr* h4;
804 	Tcp6hdr* h6;
805 	Tcppriv *tpriv;
806 	int mss;
807 
808 	tcb = (Tcpctl*)s->ptcl;
809 
810 	memset(tcb, 0, sizeof(Tcpctl));
811 
812 	tcb->ssthresh = 65535;
813 	tcb->srtt = tcp_irtt<<LOGAGAIN;
814 	tcb->mdev = 0;
815 
816 	/* setup timers */
817 	tcb->timer.start = tcp_irtt / MSPTICK;
818 	tcb->timer.func = tcptimeout;
819 	tcb->timer.arg = s;
820 	tcb->rtt_timer.start = MAX_TIME;
821 	tcb->acktimer.start = TCP_ACK / MSPTICK;
822 	tcb->acktimer.func = tcpacktimer;
823 	tcb->acktimer.arg = s;
824 	tcb->katimer.start = DEF_KAT / MSPTICK;
825 	tcb->katimer.func = tcpkeepalive;
826 	tcb->katimer.arg = s;
827 
828 	mss = DEF_MSS;
829 
830 	/* create a prototype(pseudo) header */
831 	if(mode != TCP_LISTEN){
832 		if(ipcmp(s->laddr, IPnoaddr) == 0)
833 			findlocalip(s->p->f, s->laddr, s->raddr);
834 
835 		switch(s->ipversion){
836 		case V4:
837 			h4 = &tcb->protohdr.tcp4hdr;
838 			memset(h4, 0, sizeof(*h4));
839 			h4->proto = IP_TCPPROTO;
840 			hnputs(h4->tcpsport, s->lport);
841 			hnputs(h4->tcpdport, s->rport);
842 			v6tov4(h4->tcpsrc, s->laddr);
843 			v6tov4(h4->tcpdst, s->raddr);
844 			break;
845 		case V6:
846 			h6 = &tcb->protohdr.tcp6hdr;
847 			memset(h6, 0, sizeof(*h6));
848 			h6->proto = IP_TCPPROTO;
849 			hnputs(h6->tcpsport, s->lport);
850 			hnputs(h6->tcpdport, s->rport);
851 			ipmove(h6->tcpsrc, s->laddr);
852 			ipmove(h6->tcpdst, s->raddr);
853 			mss = DEF_MSS6;
854 			break;
855 		default:
856 			panic("inittcpctl: version %d", s->ipversion);
857 		}
858 	}
859 
860 	tcb->mss = tcb->cwind = mss;
861 	tpriv = s->p->priv;
862 	tpriv->stats[Mss] = tcb->mss;
863 
864 	/* default is no window scaling */
865 	tcb->window = QMAX;
866 	tcb->rcv.wnd = QMAX;
867 	tcb->rcv.scale = 0;
868 	tcb->snd.scale = 0;
869 	qsetlimit(s->rq, QMAX);
870 }
871 
872 /*
873  *  called with s qlocked
874  */
875 void
876 tcpstart(Conv *s, int mode)
877 {
878 	Tcpctl *tcb;
879 	Tcppriv *tpriv;
880 	char kpname[KNAMELEN];
881 
882 	tpriv = s->p->priv;
883 
884 	if(tpriv->ackprocstarted == 0){
885 		qlock(&tpriv->apl);
886 		if(tpriv->ackprocstarted == 0){
887 			sprint(kpname, "#I%dtcpack", s->p->f->dev);
888 			kproc(kpname, tcpackproc, s->p);
889 			tpriv->ackprocstarted = 1;
890 		}
891 		qunlock(&tpriv->apl);
892 	}
893 
894 	tcb = (Tcpctl*)s->ptcl;
895 
896 	inittcpctl(s, mode);
897 
898 	iphtadd(&tpriv->ht, s);
899 	switch(mode) {
900 	case TCP_LISTEN:
901 		tpriv->stats[PassiveOpens]++;
902 		tcb->flags |= CLONE;
903 		tcpsetstate(s, Listen);
904 		break;
905 
906 	case TCP_CONNECT:
907 		tpriv->stats[ActiveOpens]++;
908 		tcb->flags |= ACTIVE;
909 		tcpsndsyn(s, tcb);
910 		tcpsetstate(s, Syn_sent);
911 		tcpoutput(s);
912 		break;
913 	}
914 }
915 
916 static char*
917 tcpflag(ushort flag)
918 {
919 	static char buf[128];
920 
921 	sprint(buf, "%d", flag>>10);	/* Head len */
922 	if(flag & URG)
923 		strcat(buf, " URG");
924 	if(flag & ACK)
925 		strcat(buf, " ACK");
926 	if(flag & PSH)
927 		strcat(buf, " PSH");
928 	if(flag & RST)
929 		strcat(buf, " RST");
930 	if(flag & SYN)
931 		strcat(buf, " SYN");
932 	if(flag & FIN)
933 		strcat(buf, " FIN");
934 
935 	return buf;
936 }
937 
938 Block *
939 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
940 {
941 	int dlen;
942 	Tcp6hdr *h;
943 	ushort csum;
944 	ushort hdrlen, optpad = 0;
945 	uchar *opt;
946 
947 	hdrlen = TCP6_HDRSIZE;
948 	if(tcph->flags & SYN){
949 		if(tcph->mss)
950 			hdrlen += MSS_LENGTH;
951 		if(tcph->ws)
952 			hdrlen += WS_LENGTH;
953 		optpad = hdrlen & 3;
954 		if(optpad)
955 			optpad = 4 - optpad;
956 		hdrlen += optpad;
957 	}
958 
959 	if(data) {
960 		dlen = blocklen(data);
961 		data = padblock(data, hdrlen + TCP6_PKT);
962 		if(data == nil)
963 			return nil;
964 	}
965 	else {
966 		dlen = 0;
967 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
968 		if(data == nil)
969 			return nil;
970 		data->wp += hdrlen + TCP6_PKT;
971 	}
972 
973 	/* copy in pseudo ip header plus port numbers */
974 	h = (Tcp6hdr *)(data->rp);
975 	memmove(h, ph, TCP6_TCBPHDRSZ);
976 
977 	/* compose pseudo tcp header, do cksum calculation */
978 	hnputl(h->vcf, hdrlen + dlen);
979 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
980 	h->ttl = ph->proto;
981 
982 	/* copy in variable bits */
983 	hnputl(h->tcpseq, tcph->seq);
984 	hnputl(h->tcpack, tcph->ack);
985 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
986 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
987 	hnputs(h->tcpurg, tcph->urg);
988 
989 	if(tcph->flags & SYN){
990 		opt = h->tcpopt;
991 		if(tcph->mss != 0){
992 			*opt++ = MSSOPT;
993 			*opt++ = MSS_LENGTH;
994 			hnputs(opt, tcph->mss);
995 //			print("our outgoing mss %d\n", tcph->mss);
996 			opt += 2;
997 		}
998 		if(tcph->ws != 0){
999 			*opt++ = WSOPT;
1000 			*opt++ = WS_LENGTH;
1001 			*opt++ = tcph->ws;
1002 		}
1003 		while(optpad-- > 0)
1004 			*opt++ = NOOPOPT;
1005 	}
1006 
1007 	if(tcb != nil && tcb->nochecksum){
1008 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1009 	} else {
1010 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1011 		hnputs(h->tcpcksum, csum);
1012 	}
1013 
1014 	/* move from pseudo header back to normal ip header */
1015 	memset(h->vcf, 0, 4);
1016 	h->vcf[0] = IP_VER6;
1017 	hnputs(h->ploadlen, hdrlen+dlen);
1018 	h->proto = ph->proto;
1019 
1020 	return data;
1021 }
1022 
1023 Block *
1024 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1025 {
1026 	int dlen;
1027 	Tcp4hdr *h;
1028 	ushort csum;
1029 	ushort hdrlen, optpad = 0;
1030 	uchar *opt;
1031 
1032 	hdrlen = TCP4_HDRSIZE;
1033 	if(tcph->flags & SYN){
1034 		if(tcph->mss)
1035 			hdrlen += MSS_LENGTH;
1036 		if(tcph->ws)
1037 			hdrlen += WS_LENGTH;
1038 		optpad = hdrlen & 3;
1039 		if(optpad)
1040 			optpad = 4 - optpad;
1041 		hdrlen += optpad;
1042 	}
1043 
1044 	if(data) {
1045 		dlen = blocklen(data);
1046 		data = padblock(data, hdrlen + TCP4_PKT);
1047 		if(data == nil)
1048 			return nil;
1049 	}
1050 	else {
1051 		dlen = 0;
1052 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1053 		if(data == nil)
1054 			return nil;
1055 		data->wp += hdrlen + TCP4_PKT;
1056 	}
1057 
1058 	/* copy in pseudo ip header plus port numbers */
1059 	h = (Tcp4hdr *)(data->rp);
1060 	memmove(h, ph, TCP4_TCBPHDRSZ);
1061 
1062 	/* copy in variable bits */
1063 	hnputs(h->tcplen, hdrlen + dlen);
1064 	hnputl(h->tcpseq, tcph->seq);
1065 	hnputl(h->tcpack, tcph->ack);
1066 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1067 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1068 	hnputs(h->tcpurg, tcph->urg);
1069 
1070 	if(tcph->flags & SYN){
1071 		opt = h->tcpopt;
1072 		if(tcph->mss != 0){
1073 			*opt++ = MSSOPT;
1074 			*opt++ = MSS_LENGTH;
1075 			hnputs(opt, tcph->mss);
1076 			opt += 2;
1077 		}
1078 		if(tcph->ws != 0){
1079 			*opt++ = WSOPT;
1080 			*opt++ = WS_LENGTH;
1081 			*opt++ = tcph->ws;
1082 		}
1083 		while(optpad-- > 0)
1084 			*opt++ = NOOPOPT;
1085 	}
1086 
1087 	if(tcb != nil && tcb->nochecksum){
1088 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1089 	} else {
1090 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1091 		hnputs(h->tcpcksum, csum);
1092 	}
1093 
1094 	return data;
1095 }
1096 
1097 int
1098 ntohtcp6(Tcp *tcph, Block **bpp)
1099 {
1100 	Tcp6hdr *h;
1101 	uchar *optr;
1102 	ushort hdrlen;
1103 	ushort optlen;
1104 	int n;
1105 
1106 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1107 	if(*bpp == nil)
1108 		return -1;
1109 
1110 	h = (Tcp6hdr *)((*bpp)->rp);
1111 	tcph->source = nhgets(h->tcpsport);
1112 	tcph->dest = nhgets(h->tcpdport);
1113 	tcph->seq = nhgetl(h->tcpseq);
1114 	tcph->ack = nhgetl(h->tcpack);
1115 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1116 	if(hdrlen < TCP6_HDRSIZE) {
1117 		freeblist(*bpp);
1118 		return -1;
1119 	}
1120 
1121 	tcph->flags = h->tcpflag[1];
1122 	tcph->wnd = nhgets(h->tcpwin);
1123 	tcph->urg = nhgets(h->tcpurg);
1124 	tcph->mss = 0;
1125 	tcph->ws = 0;
1126 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1127 
1128 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1129 	if(*bpp == nil)
1130 		return -1;
1131 
1132 	optr = h->tcpopt;
1133 	n = hdrlen - TCP6_HDRSIZE;
1134 	while(n > 0 && *optr != EOLOPT) {
1135 		if(*optr == NOOPOPT) {
1136 			n--;
1137 			optr++;
1138 			continue;
1139 		}
1140 		optlen = optr[1];
1141 		if(optlen < 2 || optlen > n)
1142 			break;
1143 		switch(*optr) {
1144 		case MSSOPT:
1145 			if(optlen == MSS_LENGTH)
1146 				tcph->mss = nhgets(optr+2);
1147 			break;
1148 		case WSOPT:
1149 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1150 				tcph->ws = HaveWS | *(optr+2);
1151 			break;
1152 		}
1153 		n -= optlen;
1154 		optr += optlen;
1155 	}
1156 	return hdrlen;
1157 }
1158 
1159 int
1160 ntohtcp4(Tcp *tcph, Block **bpp)
1161 {
1162 	Tcp4hdr *h;
1163 	uchar *optr;
1164 	ushort hdrlen;
1165 	ushort optlen;
1166 	int n;
1167 
1168 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1169 	if(*bpp == nil)
1170 		return -1;
1171 
1172 	h = (Tcp4hdr *)((*bpp)->rp);
1173 	tcph->source = nhgets(h->tcpsport);
1174 	tcph->dest = nhgets(h->tcpdport);
1175 	tcph->seq = nhgetl(h->tcpseq);
1176 	tcph->ack = nhgetl(h->tcpack);
1177 
1178 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1179 	if(hdrlen < TCP4_HDRSIZE) {
1180 		freeblist(*bpp);
1181 		return -1;
1182 	}
1183 
1184 	tcph->flags = h->tcpflag[1];
1185 	tcph->wnd = nhgets(h->tcpwin);
1186 	tcph->urg = nhgets(h->tcpurg);
1187 	tcph->mss = 0;
1188 	tcph->ws = 0;
1189 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1190 
1191 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1192 	if(*bpp == nil)
1193 		return -1;
1194 
1195 	optr = h->tcpopt;
1196 	n = hdrlen - TCP4_HDRSIZE;
1197 	while(n > 0 && *optr != EOLOPT) {
1198 		if(*optr == NOOPOPT) {
1199 			n--;
1200 			optr++;
1201 			continue;
1202 		}
1203 		optlen = optr[1];
1204 		if(optlen < 2 || optlen > n)
1205 			break;
1206 		switch(*optr) {
1207 		case MSSOPT:
1208 			if(optlen == MSS_LENGTH) {
1209 				tcph->mss = nhgets(optr+2);
1210 //				print("new incoming mss %d\n", tcph->mss);
1211 			}
1212 			break;
1213 		case WSOPT:
1214 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1215 				tcph->ws = HaveWS | *(optr+2);
1216 			break;
1217 		}
1218 		n -= optlen;
1219 		optr += optlen;
1220 	}
1221 	return hdrlen;
1222 }
1223 
1224 /*
1225  *  For outgiing calls, generate an initial sequence
1226  *  number and put a SYN on the send queue
1227  */
1228 void
1229 tcpsndsyn(Conv *s, Tcpctl *tcb)
1230 {
1231 	Tcppriv *tpriv;
1232 
1233 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1234 	tcb->rttseq = tcb->iss;
1235 	tcb->snd.wl2 = tcb->iss;
1236 	tcb->snd.una = tcb->iss;
1237 	tcb->snd.ptr = tcb->rttseq;
1238 	tcb->snd.nxt = tcb->rttseq;
1239 	tcb->flgcnt++;
1240 	tcb->flags |= FORCE;
1241 	tcb->sndsyntime = NOW;
1242 
1243 	/* set desired mss and scale */
1244 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1245 	tpriv = s->p->priv;
1246 	tpriv->stats[Mss] = tcb->mss;
1247 }
1248 
1249 void
1250 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1251 {
1252 	Block *hbp;
1253 	uchar rflags;
1254 	Tcppriv *tpriv;
1255 	Tcp4hdr ph4;
1256 	Tcp6hdr ph6;
1257 
1258 	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1259 
1260 	tpriv = tcp->priv;
1261 
1262 	if(seg->flags & RST)
1263 		return;
1264 
1265 	/* make pseudo header */
1266 	switch(version) {
1267 	case V4:
1268 		memset(&ph4, 0, sizeof(ph4));
1269 		ph4.vihl = IP_VER4;
1270 		v6tov4(ph4.tcpsrc, dest);
1271 		v6tov4(ph4.tcpdst, source);
1272 		ph4.proto = IP_TCPPROTO;
1273 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1274 		hnputs(ph4.tcpsport, seg->dest);
1275 		hnputs(ph4.tcpdport, seg->source);
1276 		break;
1277 	case V6:
1278 		memset(&ph6, 0, sizeof(ph6));
1279 		ph6.vcf[0] = IP_VER6;
1280 		ipmove(ph6.tcpsrc, dest);
1281 		ipmove(ph6.tcpdst, source);
1282 		ph6.proto = IP_TCPPROTO;
1283 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1284 		hnputs(ph6.tcpsport, seg->dest);
1285 		hnputs(ph6.tcpdport, seg->source);
1286 		break;
1287 	default:
1288 		panic("sndrst: version %d", version);
1289 	}
1290 
1291 	tpriv->stats[OutRsts]++;
1292 	rflags = RST;
1293 
1294 	/* convince the other end that this reset is in band */
1295 	if(seg->flags & ACK) {
1296 		seg->seq = seg->ack;
1297 		seg->ack = 0;
1298 	}
1299 	else {
1300 		rflags |= ACK;
1301 		seg->ack = seg->seq;
1302 		seg->seq = 0;
1303 		if(seg->flags & SYN)
1304 			seg->ack++;
1305 		seg->ack += length;
1306 		if(seg->flags & FIN)
1307 			seg->ack++;
1308 	}
1309 	seg->flags = rflags;
1310 	seg->wnd = 0;
1311 	seg->urg = 0;
1312 	seg->mss = 0;
1313 	seg->ws = 0;
1314 	switch(version) {
1315 	case V4:
1316 		hbp = htontcp4(seg, nil, &ph4, nil);
1317 		if(hbp == nil)
1318 			return;
1319 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1320 		break;
1321 	case V6:
1322 		hbp = htontcp6(seg, nil, &ph6, nil);
1323 		if(hbp == nil)
1324 			return;
1325 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1326 		break;
1327 	default:
1328 		panic("sndrst2: version %d", version);
1329 	}
1330 }
1331 
1332 /*
1333  *  send a reset to the remote side and close the conversation
1334  *  called with s qlocked
1335  */
1336 char*
1337 tcphangup(Conv *s)
1338 {
1339 	Tcp seg;
1340 	Tcpctl *tcb;
1341 	Block *hbp;
1342 
1343 	tcb = (Tcpctl*)s->ptcl;
1344 	if(waserror())
1345 		return commonerror();
1346 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1347 		if(!waserror()){
1348 			seg.flags = RST | ACK;
1349 			seg.ack = tcb->rcv.nxt;
1350 			tcb->rcv.una = 0;
1351 			seg.seq = tcb->snd.ptr;
1352 			seg.wnd = 0;
1353 			seg.urg = 0;
1354 			seg.mss = 0;
1355 			seg.ws = 0;
1356 			switch(s->ipversion) {
1357 			case V4:
1358 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1359 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1360 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1361 				break;
1362 			case V6:
1363 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1364 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1365 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1366 				break;
1367 			default:
1368 				panic("tcphangup: version %d", s->ipversion);
1369 			}
1370 			poperror();
1371 		}
1372 	}
1373 	localclose(s, nil);
1374 	poperror();
1375 	return nil;
1376 }
1377 
1378 /*
1379  *  (re)send a SYN ACK
1380  */
1381 int
1382 sndsynack(Proto *tcp, Limbo *lp)
1383 {
1384 	Block *hbp;
1385 	Tcp4hdr ph4;
1386 	Tcp6hdr ph6;
1387 	Tcp seg;
1388 	int scale;
1389 
1390 	/* make pseudo header */
1391 	switch(lp->version) {
1392 	case V4:
1393 		memset(&ph4, 0, sizeof(ph4));
1394 		ph4.vihl = IP_VER4;
1395 		v6tov4(ph4.tcpsrc, lp->laddr);
1396 		v6tov4(ph4.tcpdst, lp->raddr);
1397 		ph4.proto = IP_TCPPROTO;
1398 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1399 		hnputs(ph4.tcpsport, lp->lport);
1400 		hnputs(ph4.tcpdport, lp->rport);
1401 		break;
1402 	case V6:
1403 		memset(&ph6, 0, sizeof(ph6));
1404 		ph6.vcf[0] = IP_VER6;
1405 		ipmove(ph6.tcpsrc, lp->laddr);
1406 		ipmove(ph6.tcpdst, lp->raddr);
1407 		ph6.proto = IP_TCPPROTO;
1408 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1409 		hnputs(ph6.tcpsport, lp->lport);
1410 		hnputs(ph6.tcpdport, lp->rport);
1411 		break;
1412 	default:
1413 		panic("sndrst: version %d", lp->version);
1414 	}
1415 
1416 	seg.seq = lp->iss;
1417 	seg.ack = lp->irs+1;
1418 	seg.flags = SYN|ACK;
1419 	seg.urg = 0;
1420 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1421 //	if (seg.mss > lp->mss && lp->mss >= 512)
1422 //		seg.mss = lp->mss;
1423 	seg.wnd = QMAX;
1424 
1425 	/* if the other side set scale, we should too */
1426 	if(lp->rcvscale){
1427 		seg.ws = scale;
1428 		lp->sndscale = scale;
1429 	} else {
1430 		seg.ws = 0;
1431 		lp->sndscale = 0;
1432 	}
1433 
1434 	switch(lp->version) {
1435 	case V4:
1436 		hbp = htontcp4(&seg, nil, &ph4, nil);
1437 		if(hbp == nil)
1438 			return -1;
1439 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1440 		break;
1441 	case V6:
1442 		hbp = htontcp6(&seg, nil, &ph6, nil);
1443 		if(hbp == nil)
1444 			return -1;
1445 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1446 		break;
1447 	default:
1448 		panic("sndsnack: version %d", lp->version);
1449 	}
1450 	lp->lastsend = NOW;
1451 	return 0;
1452 }
1453 
/* limbo hash: last two bytes of the IP address a plus port p, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1455 
1456 /*
1457  *  put a call into limbo and respond with a SYN ACK
1458  *
1459  *  called with proto locked
1460  */
1461 static void
1462 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1463 {
1464 	Limbo *lp, **l;
1465 	Tcppriv *tpriv;
1466 	int h;
1467 
1468 	tpriv = s->p->priv;
1469 	h = hashipa(source, seg->source);
1470 
1471 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1472 		lp = *l;
1473 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1474 			continue;
1475 		if(ipcmp(lp->raddr, source) != 0)
1476 			continue;
1477 		if(ipcmp(lp->laddr, dest) != 0)
1478 			continue;
1479 
1480 		/* each new SYN restarts the retransmits */
1481 		lp->irs = seg->seq;
1482 		break;
1483 	}
1484 	lp = *l;
1485 	if(lp == nil){
1486 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1487 			lp = tpriv->lht[h];
1488 			tpriv->lht[h] = lp->next;
1489 			lp->next = nil;
1490 		} else {
1491 			lp = malloc(sizeof(*lp));
1492 			if(lp == nil)
1493 				return;
1494 			tpriv->nlimbo++;
1495 		}
1496 		*l = lp;
1497 		lp->version = version;
1498 		ipmove(lp->laddr, dest);
1499 		ipmove(lp->raddr, source);
1500 		lp->lport = seg->dest;
1501 		lp->rport = seg->source;
1502 		lp->mss = seg->mss;
1503 		lp->rcvscale = seg->ws;
1504 		lp->irs = seg->seq;
1505 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1506 	}
1507 
1508 	if(sndsynack(s->p, lp) < 0){
1509 		*l = lp->next;
1510 		tpriv->nlimbo--;
1511 		free(lp);
1512 	}
1513 }
1514 
1515 /*
1516  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1517  */
1518 static void
1519 limborexmit(Proto *tcp)
1520 {
1521 	Tcppriv *tpriv;
1522 	Limbo **l, *lp;
1523 	int h;
1524 	int seen;
1525 	ulong now;
1526 
1527 	tpriv = tcp->priv;
1528 
1529 	if(!canqlock(tcp))
1530 		return;
1531 	seen = 0;
1532 	now = NOW;
1533 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1534 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1535 			lp = *l;
1536 			seen++;
1537 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1538 				continue;
1539 
1540 			/* time it out after 1 second */
1541 			if(++(lp->rexmits) > 5){
1542 				tpriv->nlimbo--;
1543 				*l = lp->next;
1544 				free(lp);
1545 				continue;
1546 			}
1547 
1548 			/* if we're being attacked, don't bother resending SYN ACK's */
1549 			if(tpriv->nlimbo > 100)
1550 				continue;
1551 
1552 			if(sndsynack(tcp, lp) < 0){
1553 				tpriv->nlimbo--;
1554 				*l = lp->next;
1555 				free(lp);
1556 				continue;
1557 			}
1558 
1559 			l = &lp->next;
1560 		}
1561 	}
1562 	qunlock(tcp);
1563 }
1564 
1565 /*
1566  *  lookup call in limbo.  if found, throw it out.
1567  *
1568  *  called with proto locked
1569  */
1570 static void
1571 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1572 {
1573 	Limbo *lp, **l;
1574 	int h;
1575 	Tcppriv *tpriv;
1576 
1577 	tpriv = s->p->priv;
1578 
1579 	/* find a call in limbo */
1580 	h = hashipa(src, segp->source);
1581 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1582 		lp = *l;
1583 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1584 			continue;
1585 		if(ipcmp(lp->laddr, dst) != 0)
1586 			continue;
1587 		if(ipcmp(lp->raddr, src) != 0)
1588 			continue;
1589 
1590 		/* RST can only follow the SYN */
1591 		if(segp->seq == lp->irs+1){
1592 			tpriv->nlimbo--;
1593 			*l = lp->next;
1594 			free(lp);
1595 		}
1596 		break;
1597 	}
1598 }
1599 
1600 /*
1601  *  come here when we finally get an ACK to our SYN-ACK.
1602  *  lookup call in limbo.  if found, create a new conversation
1603  *
1604  *  called with proto locked
1605  */
1606 static Conv*
1607 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1608 {
1609 	Conv *new;
1610 	Tcpctl *tcb;
1611 	Tcppriv *tpriv;
1612 	Tcp4hdr *h4;
1613 	Tcp6hdr *h6;
1614 	Limbo *lp, **l;
1615 	int h;
1616 
1617 	/* unless it's just an ack, it can't be someone coming out of limbo */
1618 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1619 		return nil;
1620 
1621 	tpriv = s->p->priv;
1622 
1623 	/* find a call in limbo */
1624 	h = hashipa(src, segp->source);
1625 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1626 		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1627 			src, segp->source, lp->raddr, lp->rport,
1628 			dst, segp->dest, lp->laddr, lp->lport,
1629 			version, lp->version
1630  		);
1631 
1632 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1633 			continue;
1634 		if(ipcmp(lp->laddr, dst) != 0)
1635 			continue;
1636 		if(ipcmp(lp->raddr, src) != 0)
1637 			continue;
1638 
1639 		/* we're assuming no data with the initial SYN */
1640 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1641 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1642 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1643 			lp = nil;
1644 		} else {
1645 			tpriv->nlimbo--;
1646 			*l = lp->next;
1647 		}
1648 		break;
1649 	}
1650 	if(lp == nil)
1651 		return nil;
1652 
1653 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1654 	if(new == nil)
1655 		return nil;
1656 
1657 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1658 	tcb = (Tcpctl*)new->ptcl;
1659 	tcb->flags &= ~CLONE;
1660 	tcb->timer.arg = new;
1661 	tcb->timer.state = TcptimerOFF;
1662 	tcb->acktimer.arg = new;
1663 	tcb->acktimer.state = TcptimerOFF;
1664 	tcb->katimer.arg = new;
1665 	tcb->katimer.state = TcptimerOFF;
1666 	tcb->rtt_timer.arg = new;
1667 	tcb->rtt_timer.state = TcptimerOFF;
1668 
1669 	tcb->irs = lp->irs;
1670 	tcb->rcv.nxt = tcb->irs+1;
1671 	tcb->rcv.urg = tcb->rcv.nxt;
1672 
1673 	tcb->iss = lp->iss;
1674 	tcb->rttseq = tcb->iss;
1675 	tcb->snd.wl2 = tcb->iss;
1676 	tcb->snd.una = tcb->iss+1;
1677 	tcb->snd.ptr = tcb->iss+1;
1678 	tcb->snd.nxt = tcb->iss+1;
1679 	tcb->flgcnt = 0;
1680 	tcb->flags |= SYNACK;
1681 
1682 	/* our sending max segment size cannot be bigger than what he asked for */
1683 	if(lp->mss != 0 && lp->mss < tcb->mss) {
1684 		tcb->mss = lp->mss;
1685 		tpriv->stats[Mss] = tcb->mss;
1686 	}
1687 
1688 	/* window scaling */
1689 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1690 
1691 	/* the congestion window always starts out as a single segment */
1692 	tcb->snd.wnd = segp->wnd;
1693 	tcb->cwind = tcb->mss;
1694 
1695 	/* set initial round trip time */
1696 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1697 	tcpsynackrtt(new);
1698 
1699 	free(lp);
1700 
1701 	/* set up proto header */
1702 	switch(version){
1703 	case V4:
1704 		h4 = &tcb->protohdr.tcp4hdr;
1705 		memset(h4, 0, sizeof(*h4));
1706 		h4->proto = IP_TCPPROTO;
1707 		hnputs(h4->tcpsport, new->lport);
1708 		hnputs(h4->tcpdport, new->rport);
1709 		v6tov4(h4->tcpsrc, dst);
1710 		v6tov4(h4->tcpdst, src);
1711 		break;
1712 	case V6:
1713 		h6 = &tcb->protohdr.tcp6hdr;
1714 		memset(h6, 0, sizeof(*h6));
1715 		h6->proto = IP_TCPPROTO;
1716 		hnputs(h6->tcpsport, new->lport);
1717 		hnputs(h6->tcpdport, new->rport);
1718 		ipmove(h6->tcpsrc, dst);
1719 		ipmove(h6->tcpdst, src);
1720 		break;
1721 	default:
1722 		panic("tcpincoming: version %d", new->ipversion);
1723 	}
1724 
1725 	tcpsetstate(new, Established);
1726 
1727 	iphtadd(&tpriv->ht, new);
1728 
1729 	return new;
1730 }
1731 
/*
 *  return 1 if x lies in the inclusive range [low, high] of
 *  sequence space, 0 otherwise.  when low > high the range is
 *  taken to wrap around through zero.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return x >= low || x <= high;	/* wrapped range */
	return low <= x && x <= high;
}
1745 
/* x is before y in sequence space: the signed difference x-y is negative */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return diff < 0;
}
1751 
/* x is at or before y in sequence space: the signed difference x-y is not positive */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return diff <= 0;
}
1757 
/* x is after y in sequence space: the signed difference x-y is positive */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return !(diff <= 0);
}
1763 
/* x is at or after y in sequence space: the signed difference x-y is not negative */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return !(diff < 0);
}
1769 
1770 /*
1771  *  use the time between the first SYN and it's ack as the
1772  *  initial round trip time
1773  */
1774 void
1775 tcpsynackrtt(Conv *s)
1776 {
1777 	Tcpctl *tcb;
1778 	int delta;
1779 	Tcppriv *tpriv;
1780 
1781 	tcb = (Tcpctl*)s->ptcl;
1782 	tpriv = s->p->priv;
1783 
1784 	delta = NOW - tcb->sndsyntime;
1785 	tcb->srtt = delta<<LOGAGAIN;
1786 	tcb->mdev = delta<<LOGDGAIN;
1787 
1788 	/* halt round trip timer */
1789 	tcphalt(tpriv, &tcb->rtt_timer);
1790 }
1791 
/*
 *  process the acknowledgement carried in seg for conversation s:
 *  count duplicate acks (triggering a fast retransmit at
 *  TCPREXMTTHRESH), update the send window, open the congestion
 *  window, refresh the round trip estimate, discard the acked
 *  bytes from s->wq, advance snd.una and start or stop the
 *  retransmit timer.
 *
 *  returns early, changing little, when the ack is beyond
 *  snd.nxt or acknowledges nothing new.
 */
1792 void
1793 update(Conv *s, Tcp *seg)
1794 {
1795 	int rtt, delta;
1796 	Tcpctl *tcb;
1797 	ulong acked;
1798 	ulong expand;
1799 	Tcppriv *tpriv;
1800 
1801 	tpriv = s->p->priv;
1802 	tcb = (Tcpctl*)s->ptcl;
1803 
1804 	/* the ack is beyond anything we have sent: ignore it, but force output */
1805 	if(seq_gt(seg->ack, tcb->snd.nxt)) {
1806 		tcb->flags |= FORCE;
1807 		return;
1808 	}
1809 
1810 	/* added by Dong Lin for fast retransmission */
	/* a duplicate ack: same ack as before, data outstanding, no data, no window change */
1811 	if(seg->ack == tcb->snd.una
1812 	&& tcb->snd.una != tcb->snd.nxt
1813 	&& seg->len == 0
1814 	&& seg->wnd == tcb->snd.wnd) {
1815 
1816 		/* this is a pure ack w/o window update */
1817 		netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n",
1818 			tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1819 
1820 		if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1821 			/*
1822 			 *  tahoe tcp rxt the packet, half sshthresh,
1823  			 *  and set cwnd to one packet
1824 			 */
			/* remember how far we had sent; recovery ends once that is acked */
1825 			tcb->snd.recovery = 1;
1826 			tcb->snd.rxt = tcb->snd.nxt;
1827 			netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1828 			tcprxmit(s);
1829 		} else {
1830 			/* do reno tcp here. */
1831 		}
1832 	}
1833 
1834 	/*
1835 	 *  update window
	 *  (only from a newer ack, or from the same ack offering a larger window)
1836 	 */
1837 	if(seq_gt(seg->ack, tcb->snd.wl2)
1838 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1839 		tcb->snd.wnd = seg->wnd;
1840 		tcb->snd.wl2 = seg->ack;
1841 	}
1842 
	/* nothing new is acknowledged: stop here */
1843 	if(!seq_gt(seg->ack, tcb->snd.una)){
1844 		/*
1845 		 *  don't let us hangup if sending into a closed window and
1846 		 *  we're still getting acks
1847 		 */
1848 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1849 			tcb->backedoff = MAXBACKMS/4;
1850 		}
1851 		return;
1852 	}
1853 
1854 	/*
1855 	 *  any positive ack turns off fast rxt,
1856 	 *  (should we do new-reno on partial acks?)
1857 	 */
1858 	if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1859 		tcb->snd.dupacks = 0;
1860 		tcb->snd.recovery = 0;
1861 	} else
1862 		netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind);
1863 
1864 	/* Compute the new send window size */
1865 	acked = seg->ack - tcb->snd.una;
1866 
1867 	/* avoid slow start and timers for SYN acks */
	/*
	 *  SYNACK not yet set: this is the first ack of our SYN.  one of
	 *  the acked sequence numbers belongs to the SYN, not to queued
	 *  data, so take it out of acked and out of flgcnt.
	 */
1868 	if((tcb->flags & SYNACK) == 0) {
1869 		tcb->flags |= SYNACK;
1870 		acked--;
1871 		tcb->flgcnt--;
1872 		goto done;
1873 	}
1874 
1875 	/* slow start as long as we're not recovering from lost packets */
1876 	if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		/* below ssthresh grow by the bytes acked, at most one mss; above it by mss*mss/cwind */
1877 		if(tcb->cwind < tcb->ssthresh) {
1878 			expand = tcb->mss;
1879 			if(acked < expand)
1880 				expand = acked;
1881 		}
1882 		else
1883 			expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1884 
		/* guard against wraparound and never grow past the offered window */
1885 		if(tcb->cwind + expand < tcb->cwind)
1886 			expand = tcb->snd.wnd - tcb->cwind;
1887 		if(tcb->cwind + expand > tcb->snd.wnd)
1888 			expand = tcb->snd.wnd - tcb->cwind;
1889 		tcb->cwind += expand;
1890 	}
1891 
1892 	/* Adjust the timers according to the round trip time */
1893 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1894 		tcphalt(tpriv, &tcb->rtt_timer);
		/* a sample taken across a retransmission is not used */
1895 		if((tcb->flags&RETRAN) == 0) {
1896 			tcb->backoff = 0;
1897 			tcb->backedoff = 0;
			/* elapsed timer ticks, converted to ms below */
1898 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1899 			if(rtt == 0)
1900 				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
1901 			rtt *= MSPTICK;
			/* srtt and mdev are kept shifted left by LOGAGAIN and LOGDGAIN */
1902 			if(tcb->srtt == 0) {
1903 				tcb->srtt = rtt << LOGAGAIN;
1904 				tcb->mdev = rtt << LOGDGAIN;
1905 			} else {
1906 				delta = rtt - (tcb->srtt>>LOGAGAIN);
1907 				tcb->srtt += delta;
1908 				if(tcb->srtt <= 0)
1909 					tcb->srtt = 1;
1910 
1911 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1912 				tcb->mdev += delta;
1913 				if(tcb->mdev <= 0)
1914 					tcb->mdev = 1;
1915 			}
1916 			tcpsettimer(tcb);
1917 		}
1918 	}
1919 
1920 done:
	/*
	 *  drop the acked bytes from the write queue.  if the queue held
	 *  fewer bytes than were acked, the remainder is presumably a
	 *  flag (our FIN) that used a sequence number - TODO confirm.
	 */
1921 	if(qdiscard(s->wq, acked) < acked)
1922 		tcb->flgcnt--;
1923 
1924 	tcb->snd.una = seg->ack;
1925 	if(seq_gt(seg->ack, tcb->snd.urg))
1926 		tcb->snd.urg = seg->ack;
1927 
	/* keep the retransmit timer running only while something is unacked */
1928 	if(tcb->snd.una != tcb->snd.nxt)
1929 		tcpgo(tpriv, &tcb->timer);
1930 	else
1931 		tcphalt(tpriv, &tcb->timer);
1932 
1933 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1934 		tcb->snd.ptr = tcb->snd.una;
1935 
	/* forward progress: leave retransmit mode and forget any backoff */
1936 	tcb->flags &= ~RETRAN;
1937 	tcb->backoff = 0;
1938 	tcb->backedoff = 0;
1939 }
1940 
/*
 *  input routine: handle one received TCP segment held in bp
 *  (IPv4 or IPv6, told apart by the version nibble of the header).
 *
 *  verifies the checksum, converts the header into seg, trims the
 *  block to the claimed length, finds the conversation (sending a
 *  RST if there is none), handles listeners via limbo, and then
 *  runs the per-state input processing with the conversation
 *  locked.  bp is consumed on every path.
 */
1941 void
1942 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1943 {
1944 	Tcp seg;
1945 	Tcp4hdr *h4;
1946 	Tcp6hdr *h6;
1947 	int hdrlen;
1948 	Tcpctl *tcb;
1949 	ushort length, csum;
1950 	uchar source[IPaddrlen], dest[IPaddrlen];
1951 	Conv *s;
1952 	Fs *f;
1953 	Tcppriv *tpriv;
1954 	uchar version;
1955 
1956 	f = tcp->f;
1957 	tpriv = tcp->priv;
1958 
1959 	tpriv->stats[InSegs]++;
1960 
	/* the same bytes viewed as either header; the version nibble picks one */
1961 	h4 = (Tcp4hdr*)(bp->rp);
1962 	h6 = (Tcp6hdr*)(bp->rp);
1963 
1964 	if((h4->vihl&0xF0)==IP_VER4) {
1965 		version = V4;
1966 		length = nhgets(h4->length);
1967 		v4tov6(dest, h4->tcpdst);
1968 		v4tov6(source, h4->tcpsrc);
1969 
		/* rewrite header fields before checksumming (presumably into pseudo header form) */
1970 		h4->Unused = 0;
1971 		hnputs(h4->tcplen, length-TCP4_PKT);
		/* skip the check if the block is flagged Btcpck or the checksum field is zero */
1972 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1973 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1974 			tpriv->stats[CsumErrs]++;
1975 			tpriv->stats[InErrs]++;
1976 			netlog(f, Logtcp, "bad tcp proto cksum\n");
1977 			freeblist(bp);
1978 			return;
1979 		}
1980 
		/* NOTE(review): bp is not freed here on failure; presumably ntohtcp4 disposes of it - confirm */
1981 		hdrlen = ntohtcp4(&seg, &bp);
1982 		if(hdrlen < 0){
1983 			tpriv->stats[HlenErrs]++;
1984 			tpriv->stats[InErrs]++;
1985 			netlog(f, Logtcp, "bad tcp hdr len\n");
1986 			return;
1987 		}
1988 
1989 		/* trim the packet to the size claimed by the datagram */
		/* from here on length is the number of data bytes in the segment */
1990 		length -= hdrlen+TCP4_PKT;
1991 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1992 		if(bp == nil){
1993 			tpriv->stats[LenErrs]++;
1994 			tpriv->stats[InErrs]++;
1995 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
1996 			return;
1997 		}
1998 	}
1999 	else {
		/* saved so the header can be restored after the checksum */
2000 		int ttl = h6->ttl;
2001 		int proto = h6->proto;
2002 
2003 		version = V6;
2004 		length = nhgets(h6->ploadlen);
2005 		ipmove(dest, h6->tcpdst);
2006 		ipmove(source, h6->tcpsrc);
2007 
		/* temporarily rearrange the header fields for the checksum */
2008 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2009 		h6->ttl = proto;
2010 		hnputl(h6->vcf, length);
2011 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2012 		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2013 			tpriv->stats[CsumErrs]++;
2014 			tpriv->stats[InErrs]++;
2015 			netlog(f, Logtcp,
2016 			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2017 				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2018 			freeblist(bp);
2019 			return;
2020 		}
		/* put back the fields overwritten above */
2021 		h6->ttl = ttl;
2022 		h6->proto = proto;
2023 		hnputs(h6->ploadlen, length);
2024 
2025 		hdrlen = ntohtcp6(&seg, &bp);
2026 		if(hdrlen < 0){
2027 			tpriv->stats[HlenErrs]++;
2028 			tpriv->stats[InErrs]++;
2029 			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2030 			return;
2031 		}
2032 
2033 		/* trim the packet to the size claimed by the datagram */
2034 		length -= hdrlen;
2035 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2036 		if(bp == nil){
2037 			tpriv->stats[LenErrs]++;
2038 			tpriv->stats[InErrs]++;
2039 			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2040 			return;
2041 		}
2042 	}
2043 
2044 	/* lock protocol while searching for a conversation */
2045 	qlock(tcp);
2046 
2047 	/* Look for a matching conversation */
2048 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2049 	if(s == nil){
2050 		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2051 			source, seg.source, dest, seg.dest);
	/* also reached by goto from the listener code below, with tcp still locked */
2052 reset:
2053 		qunlock(tcp);
2054 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2055 		freeblist(bp);
2056 		return;
2057 	}
2058 
2059 	/* if it's a listener, look for the right flags and get a new conv */
2060 	tcb = (Tcpctl*)s->ptcl;
2061 	if(tcb->state == Listen){
2062 		if(seg.flags & RST){
2063 			limborst(s, &seg, source, dest, version);
2064 			qunlock(tcp);
2065 			freeblist(bp);
2066 			return;
2067 		}
2068 
2069 		/* if this is a new SYN, put the call into limbo */
2070 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2071 			limbo(s, source, dest, &seg, version);
2072 			qunlock(tcp);
2073 			freeblist(bp);
2074 			return;
2075 		}
2076 
2077 		/*
2078 		 *  if there's a matching call in limbo, tcpincoming will
2079 		 *  return a new conversation for it (already set to
2080 		 *  Established); otherwise answer with a reset
2081 		 */
2082 		s = tcpincoming(s, &seg, source, dest, version);
2083 		if(s == nil)
2084 			goto reset;
2085 	}
2086 
2087 	/* The rest of the input state machine is run with the control block
2088 	 * locked and implements the state machine directly out of the RFC.
2089 	 * Out-of-band data is ignored - it was always a bad idea.
2090 	 */
2091 	tcb = (Tcpctl*)s->ptcl;
2092 	if(waserror()){
2093 		qunlock(s);
2094 		nexterror();
2095 	}
	/* take the conversation lock before letting go of the protocol lock */
2096 	qlock(s);
2097 	qunlock(tcp);
2098 
2099 	/* fix up window */
2100 	seg.wnd <<= tcb->rcv.scale;
2101 
2102 	/* every input packet in puts off the keep alive time out */
2103 	tcpsetkacounter(tcb);
2104 
	/* states that are dealt with before the general processing below */
2105 	switch(tcb->state) {
2106 	case Closed:
2107 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2108 		goto raise;
2109 	case Syn_sent:
		/* an ack must cover our SYN and nothing we have not sent */
2110 		if(seg.flags & ACK) {
2111 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2112 				sndrst(tcp, source, dest, length, &seg, version,
2113 					 "bad seq in Syn_sent");
2114 				goto raise;
2115 			}
2116 		}
2117 		if(seg.flags & RST) {
2118 			if(seg.flags & ACK)
2119 				localclose(s, Econrefused);
2120 			goto raise;
2121 		}
2122 
2123 		if(seg.flags & SYN) {
2124 			procsyn(s, &seg);
2125 			if(seg.flags & ACK){
				/* SYN+ACK: our SYN is acknowledged, the connection is up */
2126 				update(s, &seg);
2127 				tcpsynackrtt(s);
2128 				tcpsetstate(s, Established);
2129 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2130 			}
2131 			else {
				/* a SYN without ACK while we are in Syn_sent */
2132 				tcb->time = NOW;
2133 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2134 			}
2135 
			/* data or FIN came with the SYN: go on to the general processing */
2136 			if(length != 0 || (seg.flags & FIN))
2137 				break;
2138 
2139 			freeblist(bp);
2140 			goto output;
2141 		}
2142 		else
2143 			freeblist(bp);
2144 
2145 		qunlock(s);
2146 		poperror();
2147 		return;
2148 	case Syn_received:
2149 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2150 		if(seg.flags & ACK)
2151 			tcpsynackrtt(s);
2152 		break;
2153 	}
2154 
2155 	/*
2156 	 *  One DOS attack is to open connections to us and then forget about them,
2157 	 *  thereby tying up a conv at no long term cost to the attacker.
2158 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2159 	 *  corresponding code in tcpsendka().
2160 	 */
2161 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2162 		if(tcpporthogdefense
2163 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2164 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2165 				source, seg.source, dest, seg.dest, seg.flags,
2166 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2167 			localclose(s, "stateless hog");
2168 		}
2169 	}
2170 
2171 	/* Cut the data to fit the receive window */
2172 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* nothing acceptable in the segment: still use its ack, then ack back unless it was a RST */
2173 		netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud\n",
2174 			seg.seq, seg.seq + length - 1,
2175 			tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1);
2176 		netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
2177 		update(s, &seg);
2178 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2179 			tcphalt(tpriv, &tcb->rtt_timer);
2180 			tcphalt(tpriv, &tcb->acktimer);
2181 			tcphalt(tpriv, &tcb->katimer);
2182 			tcpsetstate(s, Time_wait);
2183 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2184 			tcpgo(tpriv, &tcb->timer);
2185 		}
2186 		if(!(seg.flags & RST)) {
2187 			tcb->flags |= FORCE;
2188 			goto output;
2189 		}
		/* NOTE(review): bp is not freed on this path; presumably tcptrim released it - confirm */
2190 		qunlock(s);
2191 		poperror();
2192 		return;
2193 	}
2194 
2195 	/* Cannot accept so answer with a rst */
2196 	if(length && tcb->state == Closed) {
2197 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2198 		goto raise;
2199 	}
2200 
2201 	/* The segment is beyond the current receive pointer so
2202 	 * queue the data in the resequence queue
2203 	 */
2204 	if(seg.seq != tcb->rcv.nxt)
2205 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2206 		update(s, &seg);
2207 		if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2208 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2209 		tcb->flags |= FORCE;
2210 		goto output;
2211 	}
2212 
2213 	/*
2214 	 *  keep looping till we've processed this packet plus any
2215 	 *  adjacent packets in the resequence queue
2216 	 */
2217 	for(;;) {
2218 		if(seg.flags & RST) {
2219 			if(tcb->state == Established) {
2220 				tpriv->stats[EstabResets]++;
2221 				if(tcb->rcv.nxt != seg.seq)
2222 					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2223 			}
2224 			localclose(s, Econrefused);
2225 			goto raise;
2226 		}
2227 
		/* a segment without ACK is dropped here */
2228 		if((seg.flags&ACK) == 0)
2229 			goto raise;
2230 
		/* ack processing, by state */
2231 		switch(tcb->state) {
2232 		case Syn_received:
2233 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2234 				sndrst(tcp, source, dest, length, &seg, version,
2235 					"bad seq in Syn_received");
2236 				goto raise;
2237 			}
2238 			update(s, &seg);
2239 			tcpsetstate(s, Established);
			/* falls through: update is called a second time with the same segment */
2240 		case Established:
2241 		case Close_wait:
2242 			update(s, &seg);
2243 			break;
2244 		case Finwait1:
2245 			update(s, &seg);
			/* nothing left unacked: move on to Finwait2 and arm katimer for MSL2 seconds */
2246 			if(qlen(s->wq)+tcb->flgcnt == 0){
2247 				tcphalt(tpriv, &tcb->rtt_timer);
2248 				tcphalt(tpriv, &tcb->acktimer);
2249 				tcpsetkacounter(tcb);
2250 				tcb->time = NOW;
2251 				tcpsetstate(s, Finwait2);
2252 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2253 				tcpgo(tpriv, &tcb->katimer);
2254 			}
2255 			break;
2256 		case Finwait2:
2257 			update(s, &seg);
2258 			break;
2259 		case Closing:
2260 			update(s, &seg);
2261 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2262 				tcphalt(tpriv, &tcb->rtt_timer);
2263 				tcphalt(tpriv, &tcb->acktimer);
2264 				tcphalt(tpriv, &tcb->katimer);
2265 				tcpsetstate(s, Time_wait);
2266 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2267 				tcpgo(tpriv, &tcb->timer);
2268 			}
2269 			break;
2270 		case Last_ack:
2271 			update(s, &seg);
2272 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2273 				localclose(s, nil);
2274 				goto raise;
2275 			}
			/* falls through while something is still unacknowledged */
2276 		case Time_wait:
2277 			tcb->flags |= FORCE;
2278 			if(tcb->timer.state != TcptimerON)
2279 				tcpgo(tpriv, &tcb->timer);
2280 		}
2281 
		/* urgent data: record the urgent pointer and pull the urgent bytes off the block */
2282 		if((seg.flags&URG) && seg.urg) {
2283 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2284 				tcb->rcv.urg = seg.urg + seg.seq;
2285 				pullblock(&bp, seg.urg);
2286 			}
2287 		}
2288 		else
2289 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2290 			tcb->rcv.urg = tcb->rcv.nxt;
2291 
		/* segment text, by state */
2292 		if(length == 0) {
2293 			if(bp != nil)
2294 				freeblist(bp);
2295 		}
2296 		else {
2297 			switch(tcb->state){
2298 			default:
2299 				/* Ignore segment text */
2300 				if(bp != nil)
2301 					freeblist(bp);
2302 				break;
2303 
2304 			case Syn_received:
2305 			case Established:
2306 			case Finwait1:
2307 				/* If we still have some data place on
2308 				 * receive queue
2309 				 */
2310 				if(bp) {
2311 					bp = packblock(bp);
2312 					if(bp == nil)
2313 						panic("tcp packblock");
2314 					qpassnolim(s->rq, bp);
2315 					bp = nil;
2316 
2317 					/*
2318 					 *  Force an ack every 2 data messages.  This is
2319 					 *  a hack for rob to make his home system run
2320 					 *  faster.
2321 					 *
2322 					 *  this also keeps the standard TCP congestion
2323 					 *  control working since it needs an ack every
2324 					 *  2 max segs worth.  This is not quite that,
2325 					 *  but under a real stream is equivalent since
2326 					 *  every packet has a max seg in it.
2327 					 */
2328 					if(++(tcb->rcv.una) >= 2)
2329 						tcb->flags |= FORCE;
2330 				}
2331 				tcb->rcv.nxt += length;
2332 
2333 				/*
2334 				 *  update our rcv window
2335 				 */
2336 				tcprcvwin(s);
2337 
2338 				/*
2339 				 *  turn on the acktimer if there's something
2340 				 *  to ack
2341 				 */
2342 				if(tcb->acktimer.state != TcptimerON)
2343 					tcpgo(tpriv, &tcb->acktimer);
2344 
2345 				break;
2346 			case Finwait2:
2347 				/* no process to read the data, send a reset */
2348 				if(bp != nil)
2349 					freeblist(bp);
2350 				sndrst(tcp, source, dest, length, &seg, version,
2351 					"send to Finwait2");
2352 				qunlock(s);
2353 				poperror();
2354 				return;
2355 			}
2356 		}
2357 
		/* the peer is closing: a FIN takes one sequence number and is always acked */
2358 		if(seg.flags & FIN) {
2359 			tcb->flags |= FORCE;
2360 
2361 			switch(tcb->state) {
2362 			case Syn_received:
2363 			case Established:
2364 				tcb->rcv.nxt++;
2365 				tcpsetstate(s, Close_wait);
2366 				break;
2367 			case Finwait1:
2368 				tcb->rcv.nxt++;
				/* Time_wait if everything of ours is acked, otherwise Closing */
2369 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2370 					tcphalt(tpriv, &tcb->rtt_timer);
2371 					tcphalt(tpriv, &tcb->acktimer);
2372 					tcphalt(tpriv, &tcb->katimer);
2373 					tcpsetstate(s, Time_wait);
2374 					tcb->timer.start = MSL2*(1000/MSPTICK);
2375 					tcpgo(tpriv, &tcb->timer);
2376 				}
2377 				else
2378 					tcpsetstate(s, Closing);
2379 				break;
2380 			case Finwait2:
2381 				tcb->rcv.nxt++;
2382 				tcphalt(tpriv, &tcb->rtt_timer);
2383 				tcphalt(tpriv, &tcb->acktimer);
2384 				tcphalt(tpriv, &tcb->katimer);
2385 				tcpsetstate(s, Time_wait);
2386 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2387 				tcpgo(tpriv, &tcb->timer);
2388 				break;
2389 			case Close_wait:
2390 			case Closing:
2391 			case Last_ack:
2392 				break;
2393 			case Time_wait:
2394 				tcpgo(tpriv, &tcb->timer);
2395 				break;
2396 			}
2397 		}
2398 
2399 		/*
2400 		 *  get next adjacent segment from the resequence queue.
2401 		 *  dump/trim any overlapping segments
2402 		 */
2403 		for(;;) {
2404 			if(tcb->reseq == nil)
2405 				goto output;
2406 
			/* the next queued segment starts beyond rcv.nxt: there is still a gap */
2407 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2408 				goto output;
2409 
2410 			getreseq(tcb, &seg, &bp, &length);
2411 
			/* something usable remains after trimming: process it in the outer loop */
2412 			if(tcptrim(tcb, &seg, &bp, &length) == 0)
2413 				break;
2414 		}
2415 	}
	/* normal exit: let tcpoutput send whatever is due (acks, data) */
2416 output:
2417 	tcpoutput(s);
2418 	qunlock(s);
2419 	poperror();
2420 	return;
	/* drop exit: unlock, free the block and call tcpkick on the conversation */
2421 raise:
2422 	qunlock(s);
2423 	poperror();
2424 	freeblist(bp);
2425 	tcpkick(s);
2426 }
2426 
2427 /*
2428  *  always enters and exits with the s locked.  We drop
2429  *  the lock to ipoput the packet so some care has to be
2430  *  taken by callers.
2431  */
2432 void
2433 tcpoutput(Conv *s)
2434 {
2435 	Tcp seg;
2436 	int msgs;
2437 	Tcpctl *tcb;
2438 	Block *hbp, *bp;
2439 	int sndcnt, n;
2440 	ulong ssize, dsize, usable, sent;
2441 	Fs *f;
2442 	Tcppriv *tpriv;
2443 	uchar version;
2444 
2445 	f = s->p->f;
2446 	tpriv = s->p->priv;
2447 	version = s->ipversion;
2448 
2449 	for(msgs = 0; msgs < 100; msgs++) {
2450 		tcb = (Tcpctl*)s->ptcl;
2451 
2452 		switch(tcb->state) {
2453 		case Listen:
2454 		case Closed:
2455 		case Finwait2:
2456 			return;
2457 		}
2458 
2459 		/* force an ack when a window has opened up */
2460 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2461 			tcb->rcv.blocked = 0;
2462 			tcb->flags |= FORCE;
2463 		}
2464 
2465 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2466 		sent = tcb->snd.ptr - tcb->snd.una;
2467 
2468 		/* Don't send anything else until our SYN has been acked */
2469 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2470 			break;
2471 
2472 		/* Compute usable segment based on offered window and limit
2473 		 * window probes to one
2474 		 */
2475 		if(tcb->snd.wnd == 0){
2476 			if(sent != 0) {
2477 				if((tcb->flags&FORCE) == 0)
2478 					break;
2479 //				tcb->snd.ptr = tcb->snd.una;
2480 			}
2481 			usable = 1;
2482 		}
2483 		else {
2484 			usable = tcb->cwind;
2485 			if(tcb->snd.wnd < usable)
2486 				usable = tcb->snd.wnd;
2487 			usable -= sent;
2488 		}
2489 		ssize = sndcnt-sent;
2490 		if(ssize && usable < 2)
2491 			netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2492 				tcb->snd.wnd, tcb->cwind);
2493 		if(usable < ssize)
2494 			ssize = usable;
2495 		if(tcb->mss < ssize)
2496 			ssize = tcb->mss;
2497 		dsize = ssize;
2498 		seg.urg = 0;
2499 
2500 		if(ssize == 0)
2501 		if((tcb->flags&FORCE) == 0)
2502 			break;
2503 
2504 		tcb->flags &= ~FORCE;
2505 		tcprcvwin(s);
2506 
2507 		/* By default we will generate an ack */
2508 		tcphalt(tpriv, &tcb->acktimer);
2509 		tcb->rcv.una = 0;
2510 		seg.source = s->lport;
2511 		seg.dest = s->rport;
2512 		seg.flags = ACK;
2513 		seg.mss = 0;
2514 		seg.ws = 0;
2515 		switch(tcb->state){
2516 		case Syn_sent:
2517 			seg.flags = 0;
2518 			if(tcb->snd.ptr == tcb->iss){
2519 				seg.flags |= SYN;
2520 				dsize--;
2521 				seg.mss = tcb->mss;
2522 				seg.ws = tcb->scale;
2523 			}
2524 			break;
2525 		case Syn_received:
2526 			/*
2527 			 *  don't send any data with a SYN/ACK packet
2528 			 *  because Linux rejects the packet in its
2529 			 *  attempt to solve the SYN attack problem
2530 			 */
2531 			if(tcb->snd.ptr == tcb->iss){
2532 				seg.flags |= SYN;
2533 				dsize = 0;
2534 				ssize = 1;
2535 				seg.mss = tcb->mss;
2536 				seg.ws = tcb->scale;
2537 			}
2538 			break;
2539 		}
2540 		seg.seq = tcb->snd.ptr;
2541 		seg.ack = tcb->rcv.nxt;
2542 		seg.wnd = tcb->rcv.wnd;
2543 
2544 		/* Pull out data to send */
2545 		bp = nil;
2546 		if(dsize != 0) {
2547 			bp = qcopy(s->wq, dsize, sent);
2548 			if(BLEN(bp) != dsize) {
2549 				seg.flags |= FIN;
2550 				dsize--;
2551 			}
2552 		}
2553 
2554 		if(sent+dsize == sndcnt)
2555 			seg.flags |= PSH;
2556 
2557 		/* keep track of balance of resent data */
2558 		if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2559 			n = tcb->snd.nxt - tcb->snd.ptr;
2560 			if(ssize < n)
2561 				n = ssize;
2562 			tcb->resent += n;
2563 			netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n",
2564 				s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2565 			tpriv->stats[RetransSegs]++;
2566 		}
2567 
2568 		tcb->snd.ptr += ssize;
2569 
2570 		/* Pull up the send pointer so we can accept acks
2571 		 * for this window
2572 		 */
2573 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2574 			tcb->snd.nxt = tcb->snd.ptr;
2575 
2576 		/* Build header, link data and compute cksum */
2577 		switch(version){
2578 		case V4:
2579 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2580 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2581 			if(hbp == nil) {
2582 				freeblist(bp);
2583 				return;
2584 			}
2585 			break;
2586 		case V6:
2587 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2588 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2589 			if(hbp == nil) {
2590 				freeblist(bp);
2591 				return;
2592 			}
2593 			break;
2594 		default:
2595 			hbp = nil;	/* to suppress a warning */
2596 			panic("tcpoutput: version %d", version);
2597 		}
2598 
2599 		/* Start the transmission timers if there is new data and we
2600 		 * expect acknowledges
2601 		 */
2602 		if(ssize != 0){
2603 			if(tcb->timer.state != TcptimerON)
2604 				tcpgo(tpriv, &tcb->timer);
2605 
2606 			/*  If round trip timer isn't running, start it.
2607 			 *  measure the longest packet only in case the
2608 			 *  transmission time dominates RTT
2609 			 */
2610 			if(tcb->rtt_timer.state != TcptimerON)
2611 			if(ssize == tcb->mss) {
2612 				tcpgo(tpriv, &tcb->rtt_timer);
2613 				tcb->rttseq = tcb->snd.ptr;
2614 			}
2615 		}
2616 
2617 		tpriv->stats[OutSegs]++;
2618 
2619 		/* put off the next keep alive */
2620 		tcpgo(tpriv, &tcb->katimer);
2621 
2622 		switch(version){
2623 		case V4:
2624 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2625 				/* a negative return means no route */
2626 				localclose(s, "no route");
2627 			}
2628 			break;
2629 		case V6:
2630 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2631 				/* a negative return means no route */
2632 				localclose(s, "no route");
2633 			}
2634 			break;
2635 		default:
2636 			panic("tcpoutput2: version %d", version);
2637 		}
2638 		if((msgs%4) == 1){
2639 			qunlock(s);
2640 			sched();
2641 			qlock(s);
2642 		}
2643 	}
2644 }
2645 
2646 /*
2647  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2648  */
2649 void
2650 tcpsendka(Conv *s)
2651 {
2652 	Tcp seg;
2653 	Tcpctl *tcb;
2654 	Block *hbp,*dbp;
2655 
2656 	tcb = (Tcpctl*)s->ptcl;
2657 
2658 	dbp = nil;
2659 	seg.urg = 0;
2660 	seg.source = s->lport;
2661 	seg.dest = s->rport;
2662 	seg.flags = ACK|PSH;
2663 	seg.mss = 0;
2664 	seg.ws = 0;
2665 	if(tcpporthogdefense)
2666 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2667 	else
2668 		seg.seq = tcb->snd.una-1;
2669 	seg.ack = tcb->rcv.nxt;
2670 	tcb->rcv.una = 0;
2671 	seg.wnd = tcb->rcv.wnd;
2672 	if(tcb->state == Finwait2){
2673 		seg.flags |= FIN;
2674 	} else {
2675 		dbp = allocb(1);
2676 		dbp->wp++;
2677 	}
2678 
2679 	if(isv4(s->raddr)) {
2680 		/* Build header, link data and compute cksum */
2681 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2682 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2683 		if(hbp == nil) {
2684 			freeblist(dbp);
2685 			return;
2686 		}
2687 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2688 	}
2689 	else {
2690 		/* Build header, link data and compute cksum */
2691 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2692 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2693 		if(hbp == nil) {
2694 			freeblist(dbp);
2695 			return;
2696 		}
2697 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2698 	}
2699 }
2700 
2701 /*
2702  *  set connection to time out after 12 minutes
2703  */
2704 void
2705 tcpsetkacounter(Tcpctl *tcb)
2706 {
2707 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2708 	if(tcb->kacounter < 3)
2709 		tcb->kacounter = 3;
2710 }
2711 
2712 /*
2713  *  if we've timed out, close the connection
2714  *  otherwise, send a keepalive and restart the timer
2715  */
2716 void
2717 tcpkeepalive(void *v)
2718 {
2719 	Tcpctl *tcb;
2720 	Conv *s;
2721 
2722 	s = v;
2723 	tcb = (Tcpctl*)s->ptcl;
2724 	if(waserror()){
2725 		qunlock(s);
2726 		nexterror();
2727 	}
2728 	qlock(s);
2729 	if(tcb->state != Closed){
2730 		if(--(tcb->kacounter) <= 0) {
2731 			localclose(s, Etimedout);
2732 		} else {
2733 			tcpsendka(s);
2734 			tcpgo(s->p->priv, &tcb->katimer);
2735 		}
2736 	}
2737 	qunlock(s);
2738 	poperror();
2739 }
2740 
2741 /*
2742  *  start keepalive timer
2743  */
2744 char*
2745 tcpstartka(Conv *s, char **f, int n)
2746 {
2747 	Tcpctl *tcb;
2748 	int x;
2749 
2750 	tcb = (Tcpctl*)s->ptcl;
2751 	if(tcb->state != Established)
2752 		return "connection must be in Establised state";
2753 	if(n > 1){
2754 		x = atoi(f[1]);
2755 		if(x >= MSPTICK)
2756 			tcb->katimer.start = x/MSPTICK;
2757 	}
2758 	tcpsetkacounter(tcb);
2759 	tcpgo(s->p->priv, &tcb->katimer);
2760 
2761 	return nil;
2762 }
2763 
2764 /*
2765  *  turn checksums on/off
2766  */
2767 char*
2768 tcpsetchecksum(Conv *s, char **f, int)
2769 {
2770 	Tcpctl *tcb;
2771 
2772 	tcb = (Tcpctl*)s->ptcl;
2773 	tcb->nochecksum = !atoi(f[1]);
2774 
2775 	return nil;
2776 }
2777 
2778 void
2779 tcprxmit(Conv *s)
2780 {
2781 	Tcpctl *tcb;
2782 
2783 	tcb = (Tcpctl*)s->ptcl;
2784 
2785 	tcb->flags |= RETRAN|FORCE;
2786 	tcb->snd.ptr = tcb->snd.una;
2787 
2788 	/*
2789 	 *  We should be halving the slow start threshhold (down to one
2790 	 *  mss) but leaving it at mss seems to work well enough
2791 	 */
2792  	tcb->ssthresh = tcb->mss;
2793 
2794 	/*
2795 	 *  pull window down to a single packet
2796 	 */
2797 	tcb->cwind = tcb->mss;
2798 	tcpoutput(s);
2799 }
2800 
2801 void
2802 tcptimeout(void *arg)
2803 {
2804 	Conv *s;
2805 	Tcpctl *tcb;
2806 	int maxback;
2807 	Tcppriv *tpriv;
2808 
2809 	s = (Conv*)arg;
2810 	tpriv = s->p->priv;
2811 	tcb = (Tcpctl*)s->ptcl;
2812 
2813 	if(waserror()){
2814 		qunlock(s);
2815 		nexterror();
2816 	}
2817 	qlock(s);
2818 	switch(tcb->state){
2819 	default:
2820 		tcb->backoff++;
2821 		if(tcb->state == Syn_sent)
2822 			maxback = MAXBACKMS/2;
2823 		else
2824 			maxback = MAXBACKMS;
2825 		tcb->backedoff += tcb->timer.start * MSPTICK;
2826 		if(tcb->backedoff >= maxback) {
2827 			localclose(s, Etimedout);
2828 			break;
2829 		}
2830 		netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW);
2831 		tcpsettimer(tcb);
2832 		tcprxmit(s);
2833 		tpriv->stats[RetransTimeouts]++;
2834 		tcb->snd.dupacks = 0;
2835 		break;
2836 	case Time_wait:
2837 		localclose(s, nil);
2838 		break;
2839 	case Closed:
2840 		break;
2841 	}
2842 	qunlock(s);
2843 	poperror();
2844 }
2845 
2846 int
2847 inwindow(Tcpctl *tcb, int seq)
2848 {
2849 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2850 }
2851 
2852 /*
2853  *  set up state for a received SYN (or SYN ACK) packet
2854  */
2855 void
2856 procsyn(Conv *s, Tcp *seg)
2857 {
2858 	Tcpctl *tcb;
2859 	Tcppriv *tpriv;
2860 
2861 	tcb = (Tcpctl*)s->ptcl;
2862 	tcb->flags |= FORCE;
2863 
2864 	tcb->rcv.nxt = seg->seq + 1;
2865 	tcb->rcv.urg = tcb->rcv.nxt;
2866 	tcb->irs = seg->seq;
2867 
2868 	/* our sending max segment size cannot be bigger than what he asked for */
2869 	if(seg->mss != 0 && seg->mss < tcb->mss) {
2870 		tcb->mss = seg->mss;
2871 		tpriv = s->p->priv;
2872 		tpriv->stats[Mss] = tcb->mss;
2873 	}
2874 
2875 	/* the congestion window always starts out as a single segment */
2876 	tcb->snd.wnd = seg->wnd;
2877 	tcb->cwind = tcb->mss;
2878 }
2879 
2880 int
2881 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2882 {
2883 	Reseq *rp, *rp1;
2884 	int i, rqlen, qmax;
2885 
2886 	rp = malloc(sizeof(Reseq));
2887 	if(rp == nil){
2888 		freeblist(bp);	/* bp always consumed by add_reseq */
2889 		return 0;
2890 	}
2891 
2892 	rp->seg = *seg;
2893 	rp->bp = bp;
2894 	rp->length = length;
2895 
2896 	/* Place on reassembly list sorting by starting seq number */
2897 	rp1 = tcb->reseq;
2898 	if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2899 		rp->next = rp1;
2900 		tcb->reseq = rp;
2901 		if(rp->next != nil)
2902 			tpriv->stats[OutOfOrder]++;
2903 		return 0;
2904 	}
2905 
2906 	rqlen = 0;
2907 	for(i = 0;; i++) {
2908 		rqlen += rp1->length;
2909 		if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2910 			rp->next = rp1->next;
2911 			rp1->next = rp;
2912 			if(rp->next != nil)
2913 				tpriv->stats[OutOfOrder]++;
2914 			break;
2915 		}
2916 		rp1 = rp1->next;
2917 	}
2918 	qmax = QMAX<<tcb->rcv.scale;
2919 	if(rqlen > qmax){
2920 		print("resequence queue > window: %d > %d\n", rqlen, qmax);
2921 		i = 0;
2922 	  	for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2923 	  		print("%#lux %#lux %#ux\n", rp1->seg.seq,
2924 	  			rp1->seg.ack, rp1->seg.flags);
2925 			if(i++ > 10){
2926 				print("...\n");
2927 				break;
2928 			}
2929 		}
2930 
2931 		/*
2932 		 * delete entire reassembly queue; wait for retransmit.
2933 		 * - should we be smarter and only delete the tail?
2934 		 */
2935 		for(rp = tcb->reseq; rp != nil; rp = rp1){
2936 			rp1 = rp->next;
2937 			freeblist(rp->bp);
2938 			free(rp);
2939 		}
2940 		tcb->reseq = nil;
2941 
2942 	  	return -1;
2943 	}
2944 	return 0;
2945 }
2946 
2947 void
2948 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2949 {
2950 	Reseq *rp;
2951 
2952 	rp = tcb->reseq;
2953 	if(rp == nil)
2954 		return;
2955 
2956 	tcb->reseq = rp->next;
2957 
2958 	*seg = rp->seg;
2959 	*bp = rp->bp;
2960 	*length = rp->length;
2961 
2962 	free(rp);
2963 }
2964 
2965 int
2966 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2967 {
2968 	ushort len;
2969 	uchar accept;
2970 	int dupcnt, excess;
2971 
2972 	accept = 0;
2973 	len = *length;
2974 	if(seg->flags & SYN)
2975 		len++;
2976 	if(seg->flags & FIN)
2977 		len++;
2978 
2979 	if(tcb->rcv.wnd == 0) {
2980 		if(len == 0 && seg->seq == tcb->rcv.nxt)
2981 			return 0;
2982 	}
2983 	else {
2984 		/* Some part of the segment should be in the window */
2985 		if(inwindow(tcb,seg->seq))
2986 			accept++;
2987 		else
2988 		if(len != 0) {
2989 			if(inwindow(tcb, seg->seq+len-1) ||
2990 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2991 				accept++;
2992 		}
2993 	}
2994 	if(!accept) {
2995 		freeblist(*bp);
2996 		return -1;
2997 	}
2998 	dupcnt = tcb->rcv.nxt - seg->seq;
2999 	if(dupcnt > 0){
3000 		tcb->rerecv += dupcnt;
3001 		if(seg->flags & SYN){
3002 			seg->flags &= ~SYN;
3003 			seg->seq++;
3004 
3005 			if(seg->urg > 1)
3006 				seg->urg--;
3007 			else
3008 				seg->flags &= ~URG;
3009 			dupcnt--;
3010 		}
3011 		if(dupcnt > 0){
3012 			pullblock(bp, (ushort)dupcnt);
3013 			seg->seq += dupcnt;
3014 			*length -= dupcnt;
3015 
3016 			if(seg->urg > dupcnt)
3017 				seg->urg -= dupcnt;
3018 			else {
3019 				seg->flags &= ~URG;
3020 				seg->urg = 0;
3021 			}
3022 		}
3023 	}
3024 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3025 	if(excess > 0) {
3026 		tcb->rerecv += excess;
3027 		*length -= excess;
3028 		*bp = trimblock(*bp, 0, *length);
3029 		if(*bp == nil)
3030 			panic("presotto is a boofhead");
3031 		seg->flags &= ~FIN;
3032 	}
3033 	return 0;
3034 }
3035 
3036 void
3037 tcpadvise(Proto *tcp, Block *bp, char *msg)
3038 {
3039 	Tcp4hdr *h4;
3040 	Tcp6hdr *h6;
3041 	Tcpctl *tcb;
3042 	uchar source[IPaddrlen];
3043 	uchar dest[IPaddrlen];
3044 	ushort psource, pdest;
3045 	Conv *s, **p;
3046 
3047 	h4 = (Tcp4hdr*)(bp->rp);
3048 	h6 = (Tcp6hdr*)(bp->rp);
3049 
3050 	if((h4->vihl&0xF0)==IP_VER4) {
3051 		v4tov6(dest, h4->tcpdst);
3052 		v4tov6(source, h4->tcpsrc);
3053 		psource = nhgets(h4->tcpsport);
3054 		pdest = nhgets(h4->tcpdport);
3055 	}
3056 	else {
3057 		ipmove(dest, h6->tcpdst);
3058 		ipmove(source, h6->tcpsrc);
3059 		psource = nhgets(h6->tcpsport);
3060 		pdest = nhgets(h6->tcpdport);
3061 	}
3062 
3063 	/* Look for a connection */
3064 	qlock(tcp);
3065 	for(p = tcp->conv; *p; p++) {
3066 		s = *p;
3067 		tcb = (Tcpctl*)s->ptcl;
3068 		if(s->rport == pdest)
3069 		if(s->lport == psource)
3070 		if(tcb->state != Closed)
3071 		if(ipcmp(s->raddr, dest) == 0)
3072 		if(ipcmp(s->laddr, source) == 0){
3073 			qlock(s);
3074 			qunlock(tcp);
3075 			switch(tcb->state){
3076 			case Syn_sent:
3077 				localclose(s, msg);
3078 				break;
3079 			}
3080 			qunlock(s);
3081 			freeblist(bp);
3082 			return;
3083 		}
3084 	}
3085 	qunlock(tcp);
3086 	freeblist(bp);
3087 }
3088 
3089 static char*
3090 tcpporthogdefensectl(char *val)
3091 {
3092 	if(strcmp(val, "on") == 0)
3093 		tcpporthogdefense = 1;
3094 	else if(strcmp(val, "off") == 0)
3095 		tcpporthogdefense = 0;
3096 	else
3097 		return "unknown value for tcpporthogdefense";
3098 	return nil;
3099 }
3100 
3101 /* called with c qlocked */
3102 char*
3103 tcpctl(Conv* c, char** f, int n)
3104 {
3105 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3106 		return tcphangup(c);
3107 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3108 		return tcpstartka(c, f, n);
3109 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3110 		return tcpsetchecksum(c, f, n);
3111 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3112 		return tcpporthogdefensectl(f[1]);
3113 	return "unknown control request";
3114 }
3115 
3116 int
3117 tcpstats(Proto *tcp, char *buf, int len)
3118 {
3119 	Tcppriv *priv;
3120 	char *p, *e;
3121 	int i;
3122 
3123 	priv = tcp->priv;
3124 	p = buf;
3125 	e = p+len;
3126 	for(i = 0; i < Nstats; i++)
3127 		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3128 	return p - buf;
3129 }
3130 
3131 /*
3132  *  garbage collect any stale conversations:
3133  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3134  *	- Finwait2 after 5 minutes
3135  *
3136  *  this is called whenever we run out of channels.  Both checks are
3137  *  of questionable validity so we try to use them only when we're
3138  *  up against the wall.
3139  */
3140 int
3141 tcpgc(Proto *tcp)
3142 {
3143 	Conv *c, **pp, **ep;
3144 	int n;
3145 	Tcpctl *tcb;
3146 
3147 
3148 	n = 0;
3149 	ep = &tcp->conv[tcp->nc];
3150 	for(pp = tcp->conv; pp < ep; pp++) {
3151 		c = *pp;
3152 		if(c == nil)
3153 			break;
3154 		if(!canqlock(c))
3155 			continue;
3156 		tcb = (Tcpctl*)c->ptcl;
3157 		switch(tcb->state){
3158 		case Syn_received:
3159 			if(NOW - tcb->time > 5000){
3160 				localclose(c, "timed out");
3161 				n++;
3162 			}
3163 			break;
3164 		case Finwait2:
3165 			if(NOW - tcb->time > 5*60*1000){
3166 				localclose(c, "timed out");
3167 				n++;
3168 			}
3169 			break;
3170 		}
3171 		qunlock(c);
3172 	}
3173 	return n;
3174 }
3175 
3176 void
3177 tcpsettimer(Tcpctl *tcb)
3178 {
3179 	int x;
3180 
3181 	/* round trip dependency */
3182 	x = backoff(tcb->backoff) *
3183 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3184 
3185 	/* bounded twixt 1/2 and 64 seconds */
3186 	if(x < 500/MSPTICK)
3187 		x = 500/MSPTICK;
3188 	else if(x > (64000/MSPTICK))
3189 		x = 64000/MSPTICK;
3190 	tcb->timer.start = x;
3191 }
3192 
3193 void
3194 tcpinit(Fs *fs)
3195 {
3196 	Proto *tcp;
3197 	Tcppriv *tpriv;
3198 
3199 	tcp = smalloc(sizeof(Proto));
3200 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3201 	tcp->name = "tcp";
3202 	tcp->connect = tcpconnect;
3203 	tcp->announce = tcpannounce;
3204 	tcp->ctl = tcpctl;
3205 	tcp->state = tcpstate;
3206 	tcp->create = tcpcreate;
3207 	tcp->close = tcpclose;
3208 	tcp->rcv = tcpiput;
3209 	tcp->advise = tcpadvise;
3210 	tcp->stats = tcpstats;
3211 	tcp->inuse = tcpinuse;
3212 	tcp->gc = tcpgc;
3213 	tcp->ipproto = IP_TCPPROTO;
3214 	tcp->nc = scalednconv();
3215 	tcp->ptclsize = sizeof(Tcpctl);
3216 	tpriv->stats[MaxConn] = tcp->nc;
3217 
3218 	Fsproto(fs, tcp);
3219 }
3220 
3221 void
3222 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3223 {
3224 	if(rcvscale){
3225 		tcb->rcv.scale = rcvscale & 0xff;
3226 		tcb->snd.scale = sndscale & 0xff;
3227 		tcb->window = QMAX<<tcb->snd.scale;
3228 		qsetlimit(s->rq, tcb->window);
3229 	} else {
3230 		tcb->rcv.scale = 0;
3231 		tcb->snd.scale = 0;
3232 		tcb->window = QMAX;
3233 		qsetlimit(s->rq, tcb->window);
3234 	}
3235 }
3236