xref: /plan9-contrib/sys/src/9/ip/tcp.c (revision 41aa733568302a52b3e933f0702c3962684de9a2)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  Protocol constants: header geometry, timer parameters, TCP header
 *  flag bits, option codes, Tcpctl flag bits, connection states and
 *  limbo/window limits.
 */
10 enum
11 {
12 	QMAX		= 64*1024-1,
13 	IP_TCPPROTO	= 6,
14 
	/*
	 * v4 header geometry.  In Tcp4hdr the first TCP4_IPLEN bytes are
	 * followed by TCP4_PHDRSIZE bytes of pseudo header; the checksum
	 * in htontcp4() is started at offset TCP4_IPLEN.
	 */
15 	TCP4_IPLEN	= 8,
16 	TCP4_PHDRSIZE	= 12,
17 	TCP4_HDRSIZE	= 20,		/* tcp header without options */
18 	TCP4_TCBPHDRSZ	= 40,		/* bytes of prototype header copied per packet */
19 	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,
20 
	/* v6 equivalents; the whole 40 byte ip header doubles as pseudo header */
21 	TCP6_IPLEN	= 0,
22 	TCP6_PHDRSIZE	= 40,
23 	TCP6_HDRSIZE	= 20,
24 	TCP6_TCBPHDRSZ	= 60,
25 	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,
26 
	/* Tcptimer.state values */
27 	TcptimerOFF	= 0,
28 	TcptimerON	= 1,
29 	TcptimerDONE	= 2,
30 	MAX_TIME	= (1<<20),	/* Forever */
31 	TCP_ACK		= 50,		/* Timed ack sequence in ms */
32 	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */
33 
	/* flag bits in the tcp header */
34 	URG		= 0x20,		/* Data marked urgent */
35 	ACK		= 0x10,		/* Acknowledge is valid */
36 	PSH		= 0x08,		/* Whole data pipe is pushed */
37 	RST		= 0x04,		/* Reset connection */
38 	SYN		= 0x02,		/* Pkt. is synchronise */
39 	FIN		= 0x01,		/* Start close down */
40 
	/* tcp option kinds and lengths */
41 	EOLOPT		= 0,
42 	NOOPOPT		= 1,
43 	MSSOPT		= 2,
44 	MSS_LENGTH	= 4,		/* Maximum segment size */
45 	WSOPT		= 3,
46 	WS_LENGTH	= 3,		/* Bits to scale window size by */
47 	MSL2		= 10,
48 	MSPTICK		= 50,		/* Milliseconds per timer tick */
49 	DEF_MSS		= 1460,		/* Default maximum segment */
50 	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
51 	DEF_RTT		= 500,		/* Default round trip */
52 	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
53 	TCP_LISTEN	= 0,		/* Listen connection */
54 	TCP_CONNECT	= 1,		/* Outgoing connection */
55 	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */
56 
57 	TCPREXMTTHRESH	= 3,		/* dupack threshhold for rxt */
58 
	/* bit flags; FORCE, CLONE and ACTIVE are or'd into Tcpctl.flags in this file */
59 	FORCE		= 1,
60 	CLONE		= 2,
61 	RETRAN		= 4,
62 	ACTIVE		= 8,
63 	SYNACK		= 16,
64 
	/* inittcpctl() seeds srtt with tcp_irtt<<LOGAGAIN */
65 	LOGAGAIN	= 3,
66 	LOGDGAIN	= 2,
67 
68 	Closed		= 0,		/* Connection states */
69 	Listen,
70 	Syn_sent,
71 	Syn_received,
72 	Established,
73 	Finwait1,
74 	Finwait2,
75 	Close_wait,
76 	Closing,
77 	Last_ack,
78 	Time_wait,
79 
80 	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
81 	NLHT		= 256,		/* hash table size, must be a power of 2 */
82 	LHTMASK		= NLHT-1,
83 
84 	/*
85 	 * window is 64kb * 2ⁿ
86 	 * these factors determine the ultimate bandwidth-delay product.
87 	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 *
	 * NOTE(review): both scales below are 4, which gives 64kb * 2⁴ = 1mb;
	 * the 2⁵ figure above looks stale -- confirm which is intended.
88 	 */
89 	Maxqscale	= 4,		/* maximum queuing scale */
90 	Defadvscale	= 4,		/* default advertisement */
91 };
92 
/*
 *  Printable connection state names, indexed by Tcpctl.state
 *  (tcpstate() prints tcpstates[s->state]).
 */
93 /* Must correspond to the enumeration above */
94 char *tcpstates[] =
95 {
96 	"Closed",	"Listen",	"Syn_sent", "Syn_received",
97 	"Established",	"Finwait1",	"Finwait2", "Close_wait",
98 	"Closing",	"Last_ack",	"Time_wait"
99 };
100 
/*
 *  A countdown timer.  Running timers are chained through next/prev
 *  on Tcppriv.timers by timerstate(); tcpackproc() decrements count
 *  once per pass (it sleeps MSPTICK ms between passes) and calls
 *  func(arg) for timers whose count reaches zero.
 */
101 typedef struct Tcptimer Tcptimer;
102 struct Tcptimer
103 {
104 	Tcptimer	*next;
105 	Tcptimer	*prev;
106 	Tcptimer	*readynext;	/* chain of expired timers built by tcpackproc */
107 	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
108 	int	start;			/* count is reloaded from this by tcpgo; 0 never starts */
109 	int	count;			/* passes left before expiry */
110 	void	(*func)(void*);		/* called on expiry if not nil */
111 	void	*arg;			/* argument handed to func */
112 };
113 
114 /*
115  *  v4 and v6 pseudo headers used for
116  *  checksumming tcp
117  */
/*
 *  v4 layout: TCP4_IPLEN (8) bytes of ip header, then the
 *  TCP4_PHDRSIZE (12) byte pseudo header (Unused through tcpdst),
 *  then the tcp header proper.
 */
118 typedef struct Tcp4hdr Tcp4hdr;
119 struct Tcp4hdr
120 {
121 	uchar	vihl;		/* Version and header length */
122 	uchar	tos;		/* Type of service */
123 	uchar	length[2];	/* packet length */
124 	uchar	id[2];		/* Identification */
125 	uchar	frag[2];	/* Fragment information */
126 	uchar	Unused;
127 	uchar	proto;
128 	uchar	tcplen[2];	/* tcp header + data length; set in htontcp4 */
129 	uchar	tcpsrc[4];
130 	uchar	tcpdst[4];
131 	/* same as v6 from here on */
132 	uchar	tcpsport[2];
133 	uchar	tcpdport[2];
134 	uchar	tcpseq[4];
135 	uchar	tcpack[4];
136 	uchar	tcpflag[2];	/* (hdrlen<<10) | flags, see htontcp4 */
137 	uchar	tcpwin[2];
138 	uchar	tcpcksum[2];
139 	uchar	tcpurg[2];
140 	/* Options segment */
141 	uchar	tcpopt[1];
142 };
143 
/*
 *  v6 header.  While the checksum is computed in htontcp6() the
 *  leading fields are borrowed as a pseudo header: vcf holds the
 *  tcp length, ploadlen and proto are zeroed and ttl holds the
 *  protocol number; they are put back afterwards.
 */
144 typedef struct Tcp6hdr Tcp6hdr;
145 struct Tcp6hdr
146 {
147 	uchar	vcf[4];
148 	uchar	ploadlen[2];
149 	uchar	proto;
150 	uchar	ttl;
151 	uchar	tcpsrc[IPaddrlen];
152 	uchar	tcpdst[IPaddrlen];
153 	/* same as v4 from here on */
154 	uchar	tcpsport[2];
155 	uchar	tcpdport[2];
156 	uchar	tcpseq[4];
157 	uchar	tcpack[4];
158 	uchar	tcpflag[2];	/* (hdrlen<<10) | flags, see htontcp6 */
159 	uchar	tcpwin[2];
160 	uchar	tcpcksum[2];
161 	uchar	tcpurg[2];
162 	/* Options segment */
163 	uchar	tcpopt[1];
164 };
165 
166 /*
167  *  this represents the control info
168  *  for a single packet.  It is derived from
169  *  a packet in ntohtcp{4,6}() and stuck into
170  *  a packet in htontcp{4,6}().
171  */
172 typedef struct Tcp Tcp;
173 struct	Tcp
174 {
175 	ushort	source;
176 	ushort	dest;
177 	ulong	seq;
178 	ulong	ack;
179 	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
180 	uchar	update;	/* cleared by ntohtcp{4,6}() */
181 	ushort	ws;	/* window scale option */
182 	ulong	wnd;	/* prescaled window*/
183 	ushort	urg;
184 	ushort	mss;	/* max segment size option (if not zero) */
185 	ushort	len;	/* size of data */
186 };
187 
188 /*
189  *  this header is malloc'd to thread together fragments
190  *  waiting to be coalesced
191  */
192 typedef struct Reseq Reseq;
193 struct Reseq
194 {
195 	Reseq	*next;		/* next entry on Tcpctl.reseq */
196 	Tcp	seg;		/* control info of the held segment */
197 	Block	*bp;		/* the segment's data */
198 	ushort	length;
199 };
200 
201 /*
202  *  the qlock in the Conv locks this structure
203  */
/*
 *  Per-conversation TCP control block, reached through Conv.ptcl
 *  and zeroed and re-initialised by inittcpctl().
 */
204 typedef struct Tcpctl Tcpctl;
205 struct Tcpctl
206 {
207 	uchar	state;			/* Connection state */
208 	uchar	type;			/* Listening or active connection */
209 	uchar	code;			/* Icmp code */
210 	struct {
211 		ulong	una;		/* Unacked data pointer */
212 		ulong	nxt;		/* Next sequence expected */
213 		ulong	ptr;		/* Data pointer */
214 		ulong	wnd;		/* Tcp send window */
215 		ulong	urg;		/* Urgent data pointer */
216 		ulong	wl2;
217 		uint	scale;		/* how much to right shift window */
218 					/* in xmitted packets */
219 		/* to implement tahoe and reno TCP */
220 		ulong	dupacks;	/* number of duplicate acks rcvd */
221 		ulong	partialack;
222 		int	recovery;	/* loss recovery flag */
223 		int	retransmit;	/* retransmit 1 packet @ una flag */
224 		int	rto;		/* non-zero limits slow start to 1 mss per ack; */
					/* cleared in congestion avoidance (tcpabcincr) */
225 		ulong	rxt;		/* right window marker for recovery */
226 					/* "recover" rfc3782 */
227 	} snd;
228 	struct {
229 		ulong	nxt;		/* Receive pointer to next uchar slot */
230 		ulong	wnd;		/* Receive window incoming */
231 		ulong	wsnt;		/* Last wptr sent.  important to */
232 					/* track for large bdp */
233 		ulong	wptr;		/* rcv.nxt + rcv.wnd as set by tcprcvwin */
234 		ulong	urg;		/* Urgent pointer */
235 		ulong	ackptr;		/* last acked sequence */
236 		int	blocked;	/* set by tcprcvwin when the window closes */
237 		uint	scale;		/* how much to left shift window in */
238 					/* rcv'd packets */
239 	} rcv;
240 	ulong	iss;			/* Initial sequence number */
241 	ulong	cwind;			/* Congestion window */
242 	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
243 	uint	scale;			/* desired snd.scale */
244 	ulong	ssthresh;		/* Slow start threshold */
245 	int	resent;			/* Bytes just resent */
246 	int	irs;			/* Initial received sequence */
247 	ushort	mss;			/* Maximum segment size */
248 	int	rerecv;			/* Overlap of data rereceived */
249 	ulong	window;			/* Our receive window (queue) */
250 	uint	qscale;			/* Log2 of our receive window (queue) */
251 	uchar	backoff;		/* Exponential backoff counter */
252 	int	backedoff;		/* ms we've backed off for rexmits */
253 	uchar	flags;			/* State flags */
254 	Reseq	*reseq;			/* Resequencing queue */
255 	int	nreseq;			/* printed as "rq nreseq.reseqlen" by tcpstate; */
256 	int	reseqlen;		/* presumably entries and bytes queued -- confirm */
257 	Tcptimer	timer;			/* Activity timer */
258 	Tcptimer	acktimer;		/* Acknowledge timer */
259 	Tcptimer	rtt_timer;		/* Round trip timer */
260 	Tcptimer	katimer;		/* keep alive timer */
261 	ulong	rttseq;			/* Round trip sequence */
262 	int	srtt;			/* Smoothed round trip */
263 	int	mdev;			/* Mean deviation of round trip */
264 	int	kacounter;		/* count down for keep alive */
265 	uint	sndsyntime;		/* time syn sent */
266 	ulong	time;			/* time Finwait2 or Syn_received was sent */
267 	ulong	timeuna;		/* snd.una when time was set */
268 	int	nochecksum;		/* non-zero means don't send checksums */
269 	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */
270 
271 	union {
272 		Tcp4hdr	tcp4hdr;
273 		Tcp6hdr	tcp6hdr;
274 	} protohdr;		/* prototype header */
275 };
276 
277 /*
278  *  New calls are put in limbo rather than having a conversation structure
279  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
280  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
281  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
282  *
283  *  In particular they aren't on a listener's queue so that they don't figure
284  *  in the input queue limit.
285  *
286  *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
287  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
288  *  there is no hashing of this list.
 *
 *  NOTE(review): Tcppriv keeps limbo entries in lht[NLHT], described in the
 *  enum as a hash table, so the "no hashing" remark may be out of date --
 *  confirm.
289  */
290 typedef struct Limbo Limbo;
291 struct Limbo
292 {
293 	Limbo	*next;
294 
295 	uchar	laddr[IPaddrlen];
296 	uchar	raddr[IPaddrlen];
297 	ushort	lport;
298 	ushort	rport;
299 	ulong	irs;		/* initial received sequence */
300 	ulong	iss;		/* initial sent sequence */
301 	ushort	mss;		/* mss from the other end */
302 	ushort	rcvscale;	/* how much to scale rcvd windows */
303 	ushort	sndscale;	/* how much to scale sent windows */
304 	ulong	lastsend;	/* last time we sent a synack */
305 	uchar	version;	/* v4 or v6 */
306 	uchar	rexmits;	/* number of retransmissions */
307 };
308 
309 int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
310 
/*
 *  Indices into Tcppriv.stats[]; statnames[] holds the printable
 *  name for each index.  Nstats must stay last: it sizes both arrays.
 */
311 enum {
312 	/* MIB stats */
313 	MaxConn,
314 	Mss,
315 	ActiveOpens,
316 	PassiveOpens,
317 	EstabResets,
318 	CurrEstab,
319 	InSegs,
320 	OutSegs,
321 	RetransSegs,
322 	RetransSegsSent,
323 	RetransTimeouts,
324 	InErrs,
325 	OutRsts,
326 
327 	/* non-MIB stats */
328 	CsumErrs,
329 	HlenErrs,
330 	LenErrs,
331 	Resequenced,
332 	OutOfOrder,
333 	ReseqBytelim,
334 	ReseqPktlim,
335 	Delayack,
336 	Wopenack,
337 
338 	Recovery,
339 	RecoveryDone,
340 	RecoveryRTO,
341 	RecoveryNoSeq,
342 	RecoveryCwind,
343 	RecoveryPA,
344 
345 	Nstats
346 };
347 
/*
 *  Printable names for the statistics, one per index of the stats
 *  enumeration.  Entries are placed by index, so their order here
 *  need not follow the enum (OutOfOrder and Resequenced are swapped).
 *  Mss is published under the name "MaxSegment".
 */
348 static char *statnames[Nstats] =
349 {
350 [MaxConn]	"MaxConn",
351 [Mss]		"MaxSegment",
352 [ActiveOpens]	"ActiveOpens",
353 [PassiveOpens]	"PassiveOpens",
354 [EstabResets]	"EstabResets",
355 [CurrEstab]	"CurrEstab",
356 [InSegs]	"InSegs",
357 [OutSegs]	"OutSegs",
358 [RetransSegs]	"RetransSegs",
359 [RetransSegsSent]	"RetransSegsSent",
360 [RetransTimeouts]	"RetransTimeouts",
361 [InErrs]	"InErrs",
362 [OutRsts]	"OutRsts",
363 [CsumErrs]	"CsumErrs",
364 [HlenErrs]	"HlenErrs",
365 [LenErrs]	"LenErrs",
366 [OutOfOrder]	"OutOfOrder",
367 [Resequenced]	"Resequenced",
368 [ReseqBytelim]	"ReseqBytelim",
369 [ReseqPktlim]	"ReseqPktlim",
370 [Delayack]	"Delayack",
371 [Wopenack]	"Wopenack",
372 
373 [Recovery]	"Recovery",
374 [RecoveryDone]	"RecoveryDone",
375 [RecoveryRTO]	"RecoveryRTO",
376 
377 [RecoveryNoSeq]	"RecoveryNoSeq",
378 [RecoveryCwind]	"RecoveryCwind",
379 [RecoveryPA]	"RecoveryPA",
380 };
381 
/*
 *  Per-protocol-instance private state, reached through Proto.priv.
 */
382 typedef struct Tcppriv Tcppriv;
383 struct Tcppriv
384 {
385 	/* List of active timers */
386 	QLock	tl;		/* held while the timers list is walked or changed */
387 	Tcptimer *timers;
388 
389 	/* hash table for matching conversations */
390 	Ipht	ht;
391 
392 	/* calls in limbo waiting for an ACK to our SYN ACK */
393 	int	nlimbo;
394 	Limbo	*lht[NLHT];
395 
396 	/* for keeping track of tcpackproc */
397 	QLock	apl;		/* serialises starting the kproc in tcpstart */
398 	int	ackprocstarted;	/* set to 1 once tcpackproc has been started */
399 
400 	uvlong	stats[Nstats];	/* counters indexed by the stats enumeration */
401 };
402 
403 /*
404  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
405  *  solution to hijacked systems staking out port's as a form
406  *  of DoS attack.
407  *
408  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
409  *  that number gets acked by the other end, we shut down the connection.
410  *  Look for tcpporthogdefense in the code.
411  */
412 int tcpporthogdefense = 0;
413 
414 static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
415 static	int	dumpreseq(Tcpctl*);
416 static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
417 static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
418 static	void	limborexmit(Proto*);
419 static	void	localclose(Conv*, char*);
420 static	void	procsyn(Conv*, Tcp*);
421 static	void	tcpacktimer(void*);
422 static	void	tcpiput(Proto*, Ipifc*, Block*);
423 static	void	tcpkeepalive(void*);
424 static	void	tcpoutput(Conv*);
425 static	void	tcprcvwin(Conv*);
426 static	void	tcprxmit(Conv*);
427 static	void	tcpsetkacounter(Tcpctl*);
428 static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
429 static	void	tcpsettimer(Tcpctl*);
430 static	void	tcpsndsyn(Conv*, Tcpctl*);
431 static	void	tcpstart(Conv*, int);
432 static	void	tcpsynackrtt(Conv*);
433 static	void	tcptimeout(void*);
434 static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
435 
436 static void
tcpsetstate(Conv * s,uchar newstate)437 tcpsetstate(Conv *s, uchar newstate)
438 {
439 	Tcpctl *tcb;
440 	uchar oldstate;
441 	Tcppriv *tpriv;
442 
443 	tpriv = s->p->priv;
444 
445 	tcb = (Tcpctl*)s->ptcl;
446 
447 	oldstate = tcb->state;
448 	if(oldstate == newstate)
449 		return;
450 
451 	if(oldstate == Established)
452 		tpriv->stats[CurrEstab]--;
453 	if(newstate == Established)
454 		tpriv->stats[CurrEstab]++;
455 
456 	switch(newstate) {
457 	case Closed:
458 		qclose(s->rq);
459 		qclose(s->wq);
460 		qclose(s->eq);
461 		break;
462 
463 	case Close_wait:		/* Remote closes */
464 		qhangup(s->rq, nil);
465 		break;
466 	}
467 
468 	tcb->state = newstate;
469 
470 	if(oldstate == Syn_sent && newstate != Closed)
471 		Fsconnected(s, nil);
472 }
473 
474 static char*
tcpconnect(Conv * c,char ** argv,int argc)475 tcpconnect(Conv *c, char **argv, int argc)
476 {
477 	char *e;
478 	Tcpctl *tcb;
479 
480 	tcb = (Tcpctl*)(c->ptcl);
481 	if(tcb->state != Closed)
482 		return Econinuse;
483 
484 	e = Fsstdconnect(c, argv, argc);
485 	if(e != nil)
486 		return e;
487 	tcpstart(c, TCP_CONNECT);
488 
489 	return nil;
490 }
491 
492 static int
tcpstate(Conv * c,char * state,int n)493 tcpstate(Conv *c, char *state, int n)
494 {
495 	Tcpctl *s;
496 
497 	s = (Tcpctl*)(c->ptcl);
498 
499 	return snprint(state, n,
500 		"%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
501 		"swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
502 		"timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
503 		tcpstates[s->state],
504 		c->rq ? qlen(c->rq) : 0,
505 		c->wq ? qlen(c->wq) : 0,
506 		s->nreseq, s->reseqlen,
507 		s->srtt, s->mdev, s->ssthresh,
508 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
509 		s->qscale,
510 		s->timer.start, s->timer.count, s->rerecv,
511 		s->katimer.start, s->katimer.count);
512 }
513 
514 static int
tcpinuse(Conv * c)515 tcpinuse(Conv *c)
516 {
517 	Tcpctl *s;
518 
519 	s = (Tcpctl*)(c->ptcl);
520 	return s->state != Closed;
521 }
522 
523 static char*
tcpannounce(Conv * c,char ** argv,int argc)524 tcpannounce(Conv *c, char **argv, int argc)
525 {
526 	char *e;
527 	Tcpctl *tcb;
528 
529 	tcb = (Tcpctl*)(c->ptcl);
530 	if(tcb->state != Closed)
531 		return Econinuse;
532 
533 	e = Fsstdannounce(c, argv, argc);
534 	if(e != nil)
535 		return e;
536 	tcpstart(c, TCP_LISTEN);
537 	Fsconnected(c, nil);
538 
539 	return nil;
540 }
541 
542 static void
tcpclosestate(Conv * c,Tcpctl * tcb,int state)543 tcpclosestate(Conv *c, Tcpctl *tcb, int state)
544 {
545 	tcb->flgcnt++;
546 	tcb->snd.nxt++;
547 	tcpsetstate(c, state);
548 	tcpoutput(c);
549 }
550 
551 /* close the output half of a tcp connection */
552 static char *
tcpxmitclose(Conv * c)553 tcpxmitclose(Conv *c)
554 {
555 	Tcpctl *tcb;
556 
557 	qhangup(c->wq, nil);
558 
559 	tcb = (Tcpctl*)c->ptcl;
560 	switch(tcb->state) {
561 	case Listen:
562 		/*
563 		 *  reset any incoming calls to this listener
564 		 */
565 		Fsconnected(c, "Hangup");
566 		/* fall through */
567 	case Closed:
568 	case Syn_sent:
569 		localclose(c, nil);
570 		break;
571 	case Syn_received:
572 	case Established:
573 	case Close_wait:
574 		tcpclosestate(c, tcb, tcb->state);
575 		break;
576 	}
577 	return nil;
578 }
579 
580 /*
581  *  tcpclose is always called with the q locked
582  */
583 static void
tcpclose(Conv * c)584 tcpclose(Conv *c)
585 {
586 	Tcpctl *tcb;
587 
588 	tcb = (Tcpctl*)c->ptcl;
589 
590 	qhangup(c->rq, nil);
591 	qhangup(c->wq, nil);
592 	qhangup(c->eq, nil);
593 	qflush(c->rq);
594 
595 	switch(tcb->state) {
596 	case Listen:
597 		/*
598 		 *  reset any incoming calls to this listener
599 		 */
600 		Fsconnected(c, "Hangup");
601 		/* fall through */
602 	case Closed:
603 	case Syn_sent:
604 		localclose(c, nil);
605 		break;
606 	case Syn_received:
607 	case Established:
608 		tcpclosestate(c, tcb, Finwait1);
609 		break;
610 	case Close_wait:
611 		tcpclosestate(c, tcb, Last_ack);
612 		break;
613 	}
614 }
615 
616 static void
tcpkick(void * x)617 tcpkick(void *x)
618 {
619 	Conv *s = x;
620 	Tcpctl *tcb;
621 
622 	tcb = (Tcpctl*)s->ptcl;
623 
624 	if(waserror()){
625 		qunlock(s);
626 		nexterror();
627 	}
628 	qlock(s);
629 
630 	switch(tcb->state) {
631 	case Syn_sent:
632 	case Syn_received:
633 	case Established:
634 	case Close_wait:
635 		/*
636 		 * Push data
637 		 */
638 		tcpoutput(s);
639 		break;
640 	default:
641 		localclose(s, "Hangup");
642 		break;
643 	}
644 
645 	qunlock(s);
646 	poperror();
647 }
648 
649 static int seq_lt(ulong, ulong);
650 
651 static void
tcprcvwin(Conv * s)652 tcprcvwin(Conv *s)				/* Call with tcb locked */
653 {
654 	int w;
655 	Tcpctl *tcb;
656 
657 	tcb = (Tcpctl*)s->ptcl;
658 	w = tcb->window - qlen(s->rq);
659 	if(w < 0)
660 		w = 0;
661 	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
662 	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
663 		w = tcb->rcv.wptr - tcb->rcv.nxt;
664 	if(w != tcb->rcv.wnd)
665 	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
666 		tcb->rcv.blocked = 1;
667 		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
668 			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
669 	}
670 	tcb->rcv.wnd = w;
671 	tcb->rcv.wptr = tcb->rcv.nxt + w;
672 }
673 
674 static void
tcpacktimer(void * v)675 tcpacktimer(void *v)
676 {
677 	Tcpctl *tcb;
678 	Conv *s;
679 
680 	s = v;
681 	tcb = (Tcpctl*)s->ptcl;
682 
683 	if(waserror()){
684 		qunlock(s);
685 		nexterror();
686 	}
687 	qlock(s);
688 	if(tcb->state != Closed){
689 		tcb->flags |= FORCE;
690 		tcpoutput(s);
691 	}
692 	qunlock(s);
693 	poperror();
694 }
695 
696 static void
tcpcongestion(Tcpctl * tcb)697 tcpcongestion(Tcpctl *tcb)
698 {
699 	ulong inflight;
700 
701 	inflight = tcb->snd.nxt - tcb->snd.una;
702 	if(inflight > tcb->cwind)
703 		inflight = tcb->cwind;
704 	tcb->ssthresh = inflight / 2;
705 	if(tcb->ssthresh < 2*tcb->mss)
706 		tcb->ssthresh = 2*tcb->mss;
707 }
708 
709 enum {
710 	L	= 2,	/* aggressive slow start; legal values ∈ (1.0, 2.0) */
711 };
712 
713 static void
tcpabcincr(Tcpctl * tcb,uint acked)714 tcpabcincr(Tcpctl *tcb, uint acked)
715 {
716 	uint limit;
717 
718 	tcb->abcbytes += acked;
719 	if(tcb->cwind < tcb->ssthresh){
720 		/* slow start */
721 		if(tcb->snd.rto)
722 			limit = tcb->mss;
723 		else
724 			limit = L*tcb->mss;
725 		tcb->cwind += MIN(tcb->abcbytes, limit);
726 		tcb->abcbytes = 0;
727 	} else {
728 		tcb->snd.rto = 0;
729 		/* avoidance */
730 		if(tcb->abcbytes >= tcb->cwind){
731 			tcb->abcbytes -= tcb->cwind;
732 			tcb->cwind += tcb->mss;
733 		}
734 	}
735 }
736 
737 static void
tcpcreate(Conv * c)738 tcpcreate(Conv *c)
739 {
740 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
741 	c->wq = qopen(QMAX, Qkick, tcpkick, c);
742 }
743 
744 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)745 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
746 {
747 	if(newstate != TcptimerON){
748 		if(t->state == TcptimerON){
749 			/* unchain */
750 			if(priv->timers == t){
751 				priv->timers = t->next;
752 				if(t->prev != nil)
753 					panic("timerstate1");
754 			}
755 			if(t->next)
756 				t->next->prev = t->prev;
757 			if(t->prev)
758 				t->prev->next = t->next;
759 			t->next = t->prev = nil;
760 		}
761 	} else {
762 		if(t->state != TcptimerON){
763 			/* chain */
764 			if(t->prev != nil || t->next != nil)
765 				panic("timerstate2");
766 			t->prev = nil;
767 			t->next = priv->timers;
768 			if(t->next)
769 				t->next->prev = t;
770 			priv->timers = t;
771 		}
772 	}
773 	t->state = newstate;
774 }
775 
776 static void
tcpackproc(void * a)777 tcpackproc(void *a)
778 {
779 	Tcptimer *t, *tp, *timeo;
780 	Proto *tcp;
781 	Tcppriv *priv;
782 	int loop;
783 
784 	tcp = a;
785 	priv = tcp->priv;
786 
787 	for(;;) {
788 		tsleep(&up->sleep, return0, 0, MSPTICK);
789 
790 		qlock(&priv->tl);
791 		timeo = nil;
792 		loop = 0;
793 		for(t = priv->timers; t != nil; t = tp) {
794 			if(loop++ > 10000)
795 				panic("tcpackproc1");
796 			tp = t->next;
797 			if(t->state == TcptimerON) {
798 				t->count--;
799 				if(t->count == 0) {
800 					timerstate(priv, t, TcptimerDONE);
801 					t->readynext = timeo;
802 					timeo = t;
803 				}
804 			}
805 		}
806 		qunlock(&priv->tl);
807 
808 		loop = 0;
809 		for(t = timeo; t != nil; t = t->readynext) {
810 			if(loop++ > 10000)
811 				panic("tcpackproc2");
812 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
813 				(*t->func)(t->arg);
814 				poperror();
815 			}
816 		}
817 
818 		limborexmit(tcp);
819 	}
820 }
821 
822 static void
tcpgo(Tcppriv * priv,Tcptimer * t)823 tcpgo(Tcppriv *priv, Tcptimer *t)
824 {
825 	if(t == nil || t->start == 0)
826 		return;
827 
828 	qlock(&priv->tl);
829 	t->count = t->start;
830 	timerstate(priv, t, TcptimerON);
831 	qunlock(&priv->tl);
832 }
833 
834 static void
tcphalt(Tcppriv * priv,Tcptimer * t)835 tcphalt(Tcppriv *priv, Tcptimer *t)
836 {
837 	if(t == nil)
838 		return;
839 
840 	qlock(&priv->tl);
841 	timerstate(priv, t, TcptimerOFF);
842 	qunlock(&priv->tl);
843 }
844 
/*
 *  Exponential backoff multiplier: 2 to the power n.
 *
 *  The shift count is clamped to 0..30 because shifting a signed
 *  int by a negative count or into/past the sign bit is undefined.
 *  Results for counts already in that range are unchanged.
 */
static int
backoff(int n)
{
	enum { Maxshift = 30 };

	if(n < 0)
		n = 0;
	if(n > Maxshift)
		n = Maxshift;
	return 1 << n;
}
850 
851 static void
localclose(Conv * s,char * reason)852 localclose(Conv *s, char *reason)	/* called with tcb locked */
853 {
854 	Tcpctl *tcb;
855 	Tcppriv *tpriv;
856 
857 	tpriv = s->p->priv;
858 	tcb = (Tcpctl*)s->ptcl;
859 
860 	iphtrem(&tpriv->ht, s);
861 
862 	tcphalt(tpriv, &tcb->timer);
863 	tcphalt(tpriv, &tcb->rtt_timer);
864 	tcphalt(tpriv, &tcb->acktimer);
865 	tcphalt(tpriv, &tcb->katimer);
866 
867 	/* Flush reassembly queue; nothing more can arrive */
868 	dumpreseq(tcb);
869 
870 	if(tcb->state == Syn_sent)
871 		Fsconnected(s, reason);
872 	if(s->state == Announced)
873 		wakeup(&s->listenr);
874 
875 	qhangup(s->rq, reason);
876 	qhangup(s->wq, reason);
877 
878 	tcpsetstate(s, Closed);
879 }
880 
881 /* mtu (- TCP + IP hdr len) of 1st hop */
882 static int
tcpmtu(Proto * tcp,uchar * addr,int version,uint * scale)883 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
884 {
885 	Ipifc *ifc;
886 	int mtu;
887 
888 	ifc = findipifc(tcp->f, addr, 0);
889 	switch(version){
890 	default:
891 	case V4:
892 		mtu = DEF_MSS;
893 		if(ifc != nil)
894 			mtu = ifc->maxtu - ifc->medium->hsize - (TCP4_PKT + TCP4_HDRSIZE);
895 		break;
896 	case V6:
897 		mtu = DEF_MSS6;
898 		if(ifc != nil)
899 			mtu = ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE);
900 		break;
901 	}
902 	/*
903 	 * set the ws.  it doesn't commit us to anything.
904 	 * ws is the ultimate limit to the bandwidth-delay product.
905 	 */
906 	*scale = Defadvscale;
907 
908 	return mtu;
909 }
910 
911 static void
inittcpctl(Conv * s,int mode)912 inittcpctl(Conv *s, int mode)
913 {
914 	Tcpctl *tcb;
915 	Tcp4hdr* h4;
916 	Tcp6hdr* h6;
917 	Tcppriv *tpriv;
918 	int mss;
919 
920 	tcb = (Tcpctl*)s->ptcl;
921 
922 	memset(tcb, 0, sizeof(Tcpctl));
923 
924 	tcb->ssthresh = QMAX;			/* reset by tcpsetscale() */
925 	tcb->srtt = tcp_irtt<<LOGAGAIN;
926 	tcb->mdev = 0;
927 
928 	/* setup timers */
929 	tcb->timer.start = tcp_irtt / MSPTICK;
930 	tcb->timer.func = tcptimeout;
931 	tcb->timer.arg = s;
932 	tcb->rtt_timer.start = MAX_TIME;
933 	tcb->acktimer.start = TCP_ACK / MSPTICK;
934 	tcb->acktimer.func = tcpacktimer;
935 	tcb->acktimer.arg = s;
936 	tcb->katimer.start = DEF_KAT / MSPTICK;
937 	tcb->katimer.func = tcpkeepalive;
938 	tcb->katimer.arg = s;
939 
940 	mss = DEF_MSS;
941 
942 	/* create a prototype(pseudo) header */
943 	if(mode != TCP_LISTEN){
944 		if(ipcmp(s->laddr, IPnoaddr) == 0)
945 			findlocalip(s->p->f, s->laddr, s->raddr);
946 
947 		switch(s->ipversion){
948 		case V4:
949 			h4 = &tcb->protohdr.tcp4hdr;
950 			memset(h4, 0, sizeof(*h4));
951 			h4->proto = IP_TCPPROTO;
952 			hnputs(h4->tcpsport, s->lport);
953 			hnputs(h4->tcpdport, s->rport);
954 			v6tov4(h4->tcpsrc, s->laddr);
955 			v6tov4(h4->tcpdst, s->raddr);
956 			break;
957 		case V6:
958 			h6 = &tcb->protohdr.tcp6hdr;
959 			memset(h6, 0, sizeof(*h6));
960 			h6->proto = IP_TCPPROTO;
961 			hnputs(h6->tcpsport, s->lport);
962 			hnputs(h6->tcpdport, s->rport);
963 			ipmove(h6->tcpsrc, s->laddr);
964 			ipmove(h6->tcpdst, s->raddr);
965 			mss = DEF_MSS6;
966 			break;
967 		default:
968 			panic("inittcpctl: version %d", s->ipversion);
969 		}
970 	}
971 
972 	tcb->mss = tcb->cwind = mss;
973 	tcb->abcbytes = 0;
974 	tpriv = s->p->priv;
975 	tpriv->stats[Mss] = tcb->mss;
976 
977 	/* default is no window scaling */
978 	tcpsetscale(s, tcb, 0, 0);
979 }
980 
981 /*
982  *  called with s qlocked
983  */
984 static void
tcpstart(Conv * s,int mode)985 tcpstart(Conv *s, int mode)
986 {
987 	Tcpctl *tcb;
988 	Tcppriv *tpriv;
989 	char kpname[KNAMELEN];
990 
991 	tpriv = s->p->priv;
992 
993 	if(tpriv->ackprocstarted == 0){
994 		qlock(&tpriv->apl);
995 		if(tpriv->ackprocstarted == 0){
996 			snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
997 			kproc(kpname, tcpackproc, s->p);
998 			tpriv->ackprocstarted = 1;
999 		}
1000 		qunlock(&tpriv->apl);
1001 	}
1002 
1003 	tcb = (Tcpctl*)s->ptcl;
1004 
1005 	inittcpctl(s, mode);
1006 
1007 	iphtadd(&tpriv->ht, s);
1008 	switch(mode) {
1009 	case TCP_LISTEN:
1010 		tpriv->stats[PassiveOpens]++;
1011 		tcb->flags |= CLONE;
1012 		tcpsetstate(s, Listen);
1013 		break;
1014 
1015 	case TCP_CONNECT:
1016 		tpriv->stats[ActiveOpens]++;
1017 		tcb->flags |= ACTIVE;
1018 		tcpsndsyn(s, tcb);
1019 		tcpsetstate(s, Syn_sent);
1020 		tcpoutput(s);
1021 		break;
1022 	}
1023 }
1024 
1025 static char*
tcpflag(char * buf,char * e,ushort flag)1026 tcpflag(char *buf, char *e, ushort flag)
1027 {
1028 	char *p;
1029 
1030 	p = seprint(buf, e, "%d", flag>>10);	/* Head len */
1031 	if(flag & URG)
1032 		p = seprint(p, e, " URG");
1033 	if(flag & ACK)
1034 		p = seprint(p, e, " ACK");
1035 	if(flag & PSH)
1036 		p = seprint(p, e, " PSH");
1037 	if(flag & RST)
1038 		p = seprint(p, e, " RST");
1039 	if(flag & SYN)
1040 		p = seprint(p, e, " SYN");
1041 	if(flag & FIN)
1042 		p = seprint(p, e, " FIN");
1043 	USED(p);
1044 	return buf;
1045 }
1046 
1047 static Block*
htontcp6(Tcp * tcph,Block * data,Tcp6hdr * ph,Tcpctl * tcb)1048 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1049 {
1050 	int dlen;
1051 	Tcp6hdr *h;
1052 	ushort csum;
1053 	ushort hdrlen, optpad = 0;
1054 	uchar *opt;
1055 
1056 	hdrlen = TCP6_HDRSIZE;
1057 	if(tcph->flags & SYN){
1058 		if(tcph->mss)
1059 			hdrlen += MSS_LENGTH;
1060 		if(tcph->ws)
1061 			hdrlen += WS_LENGTH;
1062 		optpad = hdrlen & 3;
1063 		if(optpad)
1064 			optpad = 4 - optpad;
1065 		hdrlen += optpad;
1066 	}
1067 
1068 	if(data) {
1069 		dlen = blocklen(data);
1070 		data = padblock(data, hdrlen + TCP6_PKT);
1071 		if(data == nil)
1072 			return nil;
1073 	}
1074 	else {
1075 		dlen = 0;
1076 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
1077 		if(data == nil)
1078 			return nil;
1079 		data->wp += hdrlen + TCP6_PKT;
1080 	}
1081 
1082 	/* copy in pseudo ip header plus port numbers */
1083 	h = (Tcp6hdr *)(data->rp);
1084 	memmove(h, ph, TCP6_TCBPHDRSZ);
1085 
1086 	/* compose pseudo tcp header, do cksum calculation */
1087 	hnputl(h->vcf, hdrlen + dlen);
1088 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1089 	h->ttl = ph->proto;
1090 
1091 	/* copy in variable bits */
1092 	hnputl(h->tcpseq, tcph->seq);
1093 	hnputl(h->tcpack, tcph->ack);
1094 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1095 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1096 	hnputs(h->tcpurg, tcph->urg);
1097 
1098 	if(tcph->flags & SYN){
1099 		opt = h->tcpopt;
1100 		if(tcph->mss != 0){
1101 			*opt++ = MSSOPT;
1102 			*opt++ = MSS_LENGTH;
1103 			hnputs(opt, tcph->mss);
1104 			opt += 2;
1105 		}
1106 		if(tcph->ws != 0){
1107 			*opt++ = WSOPT;
1108 			*opt++ = WS_LENGTH;
1109 			*opt++ = tcph->ws;
1110 		}
1111 		while(optpad-- > 0)
1112 			*opt++ = NOOPOPT;
1113 	}
1114 
1115 	if(tcb != nil && tcb->nochecksum){
1116 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1117 	} else {
1118 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1119 		hnputs(h->tcpcksum, csum);
1120 	}
1121 
1122 	/* move from pseudo header back to normal ip header */
1123 	memset(h->vcf, 0, 4);
1124 	h->vcf[0] = IP_VER6;
1125 	hnputs(h->ploadlen, hdrlen+dlen);
1126 	h->proto = ph->proto;
1127 
1128 	return data;
1129 }
1130 
1131 static Block*
htontcp4(Tcp * tcph,Block * data,Tcp4hdr * ph,Tcpctl * tcb)1132 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1133 {
1134 	int dlen;
1135 	Tcp4hdr *h;
1136 	ushort csum;
1137 	ushort hdrlen, optpad = 0;
1138 	uchar *opt;
1139 
1140 	hdrlen = TCP4_HDRSIZE;
1141 	if(tcph->flags & SYN){
1142 		if(tcph->mss)
1143 			hdrlen += MSS_LENGTH;
1144 		if(1)
1145 			hdrlen += WS_LENGTH;
1146 		optpad = hdrlen & 3;
1147 		if(optpad)
1148 			optpad = 4 - optpad;
1149 		hdrlen += optpad;
1150 	}
1151 
1152 	if(data) {
1153 		dlen = blocklen(data);
1154 		data = padblock(data, hdrlen + TCP4_PKT);
1155 		if(data == nil)
1156 			return nil;
1157 	}
1158 	else {
1159 		dlen = 0;
1160 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1161 		if(data == nil)
1162 			return nil;
1163 		data->wp += hdrlen + TCP4_PKT;
1164 	}
1165 
1166 	/* copy in pseudo ip header plus port numbers */
1167 	h = (Tcp4hdr *)(data->rp);
1168 	memmove(h, ph, TCP4_TCBPHDRSZ);
1169 
1170 	/* copy in variable bits */
1171 	hnputs(h->tcplen, hdrlen + dlen);
1172 	hnputl(h->tcpseq, tcph->seq);
1173 	hnputl(h->tcpack, tcph->ack);
1174 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1175 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1176 	hnputs(h->tcpurg, tcph->urg);
1177 
1178 	if(tcph->flags & SYN){
1179 		opt = h->tcpopt;
1180 		if(tcph->mss != 0){
1181 			*opt++ = MSSOPT;
1182 			*opt++ = MSS_LENGTH;
1183 			hnputs(opt, tcph->mss);
1184 			opt += 2;
1185 		}
1186 		/* always offer.  rfc1323 §2.2 */
1187 		if(1){
1188 			*opt++ = WSOPT;
1189 			*opt++ = WS_LENGTH;
1190 			*opt++ = tcph->ws;
1191 		}
1192 		while(optpad-- > 0)
1193 			*opt++ = NOOPOPT;
1194 	}
1195 
1196 	if(tcb != nil && tcb->nochecksum){
1197 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1198 	} else {
1199 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1200 		hnputs(h->tcpcksum, csum);
1201 	}
1202 
1203 	return data;
1204 }
1205 
1206 static int
ntohtcp6(Tcp * tcph,Block ** bpp)1207 ntohtcp6(Tcp *tcph, Block **bpp)
1208 {
1209 	Tcp6hdr *h;
1210 	uchar *optr;
1211 	ushort hdrlen;
1212 	ushort optlen;
1213 	int n;
1214 
1215 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1216 	if(*bpp == nil)
1217 		return -1;
1218 
1219 	h = (Tcp6hdr *)((*bpp)->rp);
1220 	tcph->source = nhgets(h->tcpsport);
1221 	tcph->dest = nhgets(h->tcpdport);
1222 	tcph->seq = nhgetl(h->tcpseq);
1223 	tcph->ack = nhgetl(h->tcpack);
1224 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1225 	if(hdrlen < TCP6_HDRSIZE) {
1226 		freeblist(*bpp);
1227 		return -1;
1228 	}
1229 
1230 	tcph->flags = h->tcpflag[1];
1231 	tcph->wnd = nhgets(h->tcpwin);
1232 	tcph->urg = nhgets(h->tcpurg);
1233 	tcph->mss = 0;
1234 	tcph->ws = 0;
1235 	tcph->update = 0;
1236 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1237 
1238 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1239 	if(*bpp == nil)
1240 		return -1;
1241 
1242 	optr = h->tcpopt;
1243 	n = hdrlen - TCP6_HDRSIZE;
1244 	while(n > 0 && *optr != EOLOPT) {
1245 		if(*optr == NOOPOPT) {
1246 			n--;
1247 			optr++;
1248 			continue;
1249 		}
1250 		optlen = optr[1];
1251 		if(optlen < 2 || optlen > n)
1252 			break;
1253 		switch(*optr) {
1254 		case MSSOPT:
1255 			if(optlen == MSS_LENGTH)
1256 				tcph->mss = nhgets(optr+2);
1257 			break;
1258 		case WSOPT:
1259 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1260 				tcph->ws = *(optr+2);
1261 			break;
1262 		}
1263 		n -= optlen;
1264 		optr += optlen;
1265 	}
1266 	return hdrlen;
1267 }
1268 
1269 static int
ntohtcp4(Tcp * tcph,Block ** bpp)1270 ntohtcp4(Tcp *tcph, Block **bpp)
1271 {
1272 	Tcp4hdr *h;
1273 	uchar *optr;
1274 	ushort hdrlen;
1275 	ushort optlen;
1276 	int n;
1277 
1278 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1279 	if(*bpp == nil)
1280 		return -1;
1281 
1282 	h = (Tcp4hdr *)((*bpp)->rp);
1283 	tcph->source = nhgets(h->tcpsport);
1284 	tcph->dest = nhgets(h->tcpdport);
1285 	tcph->seq = nhgetl(h->tcpseq);
1286 	tcph->ack = nhgetl(h->tcpack);
1287 
1288 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1289 	if(hdrlen < TCP4_HDRSIZE) {
1290 		freeblist(*bpp);
1291 		return -1;
1292 	}
1293 
1294 	tcph->flags = h->tcpflag[1];
1295 	tcph->wnd = nhgets(h->tcpwin);
1296 	tcph->urg = nhgets(h->tcpurg);
1297 	tcph->mss = 0;
1298 	tcph->ws = 0;
1299 	tcph->update = 0;
1300 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1301 
1302 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1303 	if(*bpp == nil)
1304 		return -1;
1305 
1306 	optr = h->tcpopt;
1307 	n = hdrlen - TCP4_HDRSIZE;
1308 	while(n > 0 && *optr != EOLOPT) {
1309 		if(*optr == NOOPOPT) {
1310 			n--;
1311 			optr++;
1312 			continue;
1313 		}
1314 		optlen = optr[1];
1315 		if(optlen < 2 || optlen > n)
1316 			break;
1317 		switch(*optr) {
1318 		case MSSOPT:
1319 			if(optlen == MSS_LENGTH)
1320 				tcph->mss = nhgets(optr+2);
1321 			break;
1322 		case WSOPT:
1323 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1324 				tcph->ws = *(optr+2);
1325 			break;
1326 		}
1327 		n -= optlen;
1328 		optr += optlen;
1329 	}
1330 	return hdrlen;
1331 }
1332 
1333 /*
1334  *  For outgoing calls, generate an initial sequence
1335  *  number and put a SYN on the send queue
1336  */
1337 static void
tcpsndsyn(Conv * s,Tcpctl * tcb)1338 tcpsndsyn(Conv *s, Tcpctl *tcb)
1339 {
1340 	Tcppriv *tpriv;
1341 
1342 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1343 	tcb->rttseq = tcb->iss;
1344 	tcb->snd.wl2 = tcb->iss;
1345 	tcb->snd.una = tcb->iss;
1346 	tcb->snd.rxt = tcb->iss;
1347 	tcb->snd.ptr = tcb->rttseq;
1348 	tcb->snd.nxt = tcb->rttseq;
1349 	tcb->flgcnt++;
1350 	tcb->flags |= FORCE;
1351 	tcb->sndsyntime = NOW;
1352 
1353 	/* set desired mss and scale */
1354 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1355 	tpriv = s->p->priv;
1356 	tpriv->stats[Mss] = tcb->mss;
1357 }
1358 
1359 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1360 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1361 {
1362 	Block *hbp;
1363 	uchar rflags;
1364 	Tcppriv *tpriv;
1365 	Tcp4hdr ph4;
1366 	Tcp6hdr ph6;
1367 
1368 	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1369 
1370 	tpriv = tcp->priv;
1371 
1372 	if(seg->flags & RST)
1373 		return;
1374 
1375 	/* make pseudo header */
1376 	switch(version) {
1377 	case V4:
1378 		memset(&ph4, 0, sizeof(ph4));
1379 		ph4.vihl = IP_VER4;
1380 		v6tov4(ph4.tcpsrc, dest);
1381 		v6tov4(ph4.tcpdst, source);
1382 		ph4.proto = IP_TCPPROTO;
1383 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1384 		hnputs(ph4.tcpsport, seg->dest);
1385 		hnputs(ph4.tcpdport, seg->source);
1386 		break;
1387 	case V6:
1388 		memset(&ph6, 0, sizeof(ph6));
1389 		ph6.vcf[0] = IP_VER6;
1390 		ipmove(ph6.tcpsrc, dest);
1391 		ipmove(ph6.tcpdst, source);
1392 		ph6.proto = IP_TCPPROTO;
1393 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1394 		hnputs(ph6.tcpsport, seg->dest);
1395 		hnputs(ph6.tcpdport, seg->source);
1396 		break;
1397 	default:
1398 		panic("sndrst: version %d", version);
1399 	}
1400 
1401 	tpriv->stats[OutRsts]++;
1402 	rflags = RST;
1403 
1404 	/* convince the other end that this reset is in band */
1405 	if(seg->flags & ACK) {
1406 		seg->seq = seg->ack;
1407 		seg->ack = 0;
1408 	}
1409 	else {
1410 		rflags |= ACK;
1411 		seg->ack = seg->seq;
1412 		seg->seq = 0;
1413 		if(seg->flags & SYN)
1414 			seg->ack++;
1415 		seg->ack += length;
1416 		if(seg->flags & FIN)
1417 			seg->ack++;
1418 	}
1419 	seg->flags = rflags;
1420 	seg->wnd = 0;
1421 	seg->urg = 0;
1422 	seg->mss = 0;
1423 	seg->ws = 0;
1424 	switch(version) {
1425 	case V4:
1426 		hbp = htontcp4(seg, nil, &ph4, nil);
1427 		if(hbp == nil)
1428 			return;
1429 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1430 		break;
1431 	case V6:
1432 		hbp = htontcp6(seg, nil, &ph6, nil);
1433 		if(hbp == nil)
1434 			return;
1435 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1436 		break;
1437 	default:
1438 		panic("sndrst2: version %d", version);
1439 	}
1440 }
1441 
1442 /*
1443  * close the conversation
1444  */
1445 static char*
tcpclose2(Conv * s)1446 tcpclose2(Conv *s)
1447 {
1448 	tcpclose(s);
1449 	return nil;
1450 }
1451 
1452 /*
1453  *  send a reset to the remote side and close the conversation
1454  *  called with s qlocked
1455  */
1456 static char*
tcphangup(Conv * s)1457 tcphangup(Conv *s)
1458 {
1459 	Tcp seg;
1460 	Tcpctl *tcb;
1461 	Block *hbp;
1462 
1463 	tcb = (Tcpctl*)s->ptcl;
1464 	if(waserror())
1465 		return commonerror();
1466 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1467 		if(!waserror()){
1468 			memset(&seg, 0, sizeof seg);
1469 			seg.flags = RST | ACK;
1470 			seg.ack = tcb->rcv.nxt;
1471 			tcb->rcv.ackptr = seg.ack;
1472 			seg.seq = tcb->snd.ptr;
1473 			seg.wnd = 0;
1474 			seg.urg = 0;
1475 			seg.mss = 0;
1476 			seg.ws = 0;
1477 			switch(s->ipversion) {
1478 			case V4:
1479 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1480 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1481 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1482 				break;
1483 			case V6:
1484 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1485 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1486 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1487 				break;
1488 			default:
1489 				panic("tcphangup: version %d", s->ipversion);
1490 			}
1491 			poperror();
1492 		}
1493 	}
1494 	localclose(s, nil);
1495 	poperror();
1496 	return nil;
1497 }
1498 
1499 /*
1500  *  (re)send a SYN ACK
1501  */
1502 static int
sndsynack(Proto * tcp,Limbo * lp)1503 sndsynack(Proto *tcp, Limbo *lp)
1504 {
1505 	Block *hbp;
1506 	Tcp4hdr ph4;
1507 	Tcp6hdr ph6;
1508 	Tcp seg;
1509 	uint scale;
1510 
1511 	/* make pseudo header */
1512 	switch(lp->version) {
1513 	case V4:
1514 		memset(&ph4, 0, sizeof(ph4));
1515 		ph4.vihl = IP_VER4;
1516 		v6tov4(ph4.tcpsrc, lp->laddr);
1517 		v6tov4(ph4.tcpdst, lp->raddr);
1518 		ph4.proto = IP_TCPPROTO;
1519 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1520 		hnputs(ph4.tcpsport, lp->lport);
1521 		hnputs(ph4.tcpdport, lp->rport);
1522 		break;
1523 	case V6:
1524 		memset(&ph6, 0, sizeof(ph6));
1525 		ph6.vcf[0] = IP_VER6;
1526 		ipmove(ph6.tcpsrc, lp->laddr);
1527 		ipmove(ph6.tcpdst, lp->raddr);
1528 		ph6.proto = IP_TCPPROTO;
1529 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1530 		hnputs(ph6.tcpsport, lp->lport);
1531 		hnputs(ph6.tcpdport, lp->rport);
1532 		break;
1533 	default:
1534 		panic("sndrst: version %d", lp->version);
1535 	}
1536 
1537 	memset(&seg, 0, sizeof seg);
1538 	seg.seq = lp->iss;
1539 	seg.ack = lp->irs+1;
1540 	seg.flags = SYN|ACK;
1541 	seg.urg = 0;
1542 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1543 	seg.wnd = QMAX;
1544 
1545 	/* if the other side set scale, we should too */
1546 	if(lp->rcvscale){
1547 		seg.ws = scale;
1548 		lp->sndscale = scale;
1549 	} else {
1550 		seg.ws = 0;
1551 		lp->sndscale = 0;
1552 	}
1553 
1554 	switch(lp->version) {
1555 	case V4:
1556 		hbp = htontcp4(&seg, nil, &ph4, nil);
1557 		if(hbp == nil)
1558 			return -1;
1559 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1560 		break;
1561 	case V6:
1562 		hbp = htontcp6(&seg, nil, &ph6, nil);
1563 		if(hbp == nil)
1564 			return -1;
1565 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1566 		break;
1567 	default:
1568 		panic("sndsnack: version %d", lp->version);
1569 	}
1570 	lp->lastsend = NOW;
1571 	return 0;
1572 }
1573 
/*
 *  limbo hash: last two bytes of address a plus port p, masked to
 *  the table size.  Both arguments are parenthesized so that an
 *  expression argument cannot regroup with the sum.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1575 
1576 /*
1577  *  put a call into limbo and respond with a SYN ACK
1578  *
1579  *  called with proto locked
1580  */
1581 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1582 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1583 {
1584 	Limbo *lp, **l;
1585 	Tcppriv *tpriv;
1586 	int h;
1587 
1588 	tpriv = s->p->priv;
1589 	h = hashipa(source, seg->source);
1590 
1591 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1592 		lp = *l;
1593 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1594 			continue;
1595 		if(ipcmp(lp->raddr, source) != 0)
1596 			continue;
1597 		if(ipcmp(lp->laddr, dest) != 0)
1598 			continue;
1599 
1600 		/* each new SYN restarts the retransmits */
1601 		lp->irs = seg->seq;
1602 		break;
1603 	}
1604 	lp = *l;
1605 	if(lp == nil){
1606 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1607 			lp = tpriv->lht[h];
1608 			tpriv->lht[h] = lp->next;
1609 			lp->next = nil;
1610 		} else {
1611 			lp = malloc(sizeof(*lp));
1612 			if(lp == nil)
1613 				return;
1614 			tpriv->nlimbo++;
1615 		}
1616 		*l = lp;
1617 		lp->version = version;
1618 		ipmove(lp->laddr, dest);
1619 		ipmove(lp->raddr, source);
1620 		lp->lport = seg->dest;
1621 		lp->rport = seg->source;
1622 		lp->mss = seg->mss;
1623 		lp->rcvscale = seg->ws;
1624 		lp->irs = seg->seq;
1625 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1626 	}
1627 
1628 	if(sndsynack(s->p, lp) < 0){
1629 		*l = lp->next;
1630 		tpriv->nlimbo--;
1631 		free(lp);
1632 	}
1633 }
1634 
1635 /*
1636  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1637  */
1638 static void
limborexmit(Proto * tcp)1639 limborexmit(Proto *tcp)
1640 {
1641 	Tcppriv *tpriv;
1642 	Limbo **l, *lp;
1643 	int h;
1644 	int seen;
1645 	ulong now;
1646 
1647 	tpriv = tcp->priv;
1648 
1649 	if(!canqlock(tcp))
1650 		return;
1651 	seen = 0;
1652 	now = NOW;
1653 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1654 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1655 			lp = *l;
1656 			seen++;
1657 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1658 				continue;
1659 
1660 			/* time it out after 1 second */
1661 			if(++(lp->rexmits) > 5){
1662 				tpriv->nlimbo--;
1663 				*l = lp->next;
1664 				free(lp);
1665 				continue;
1666 			}
1667 
1668 			/* if we're being attacked, don't bother resending SYN ACK's */
1669 			if(tpriv->nlimbo > 100)
1670 				continue;
1671 
1672 			if(sndsynack(tcp, lp) < 0){
1673 				tpriv->nlimbo--;
1674 				*l = lp->next;
1675 				free(lp);
1676 				continue;
1677 			}
1678 
1679 			l = &lp->next;
1680 		}
1681 	}
1682 	qunlock(tcp);
1683 }
1684 
1685 /*
1686  *  lookup call in limbo.  if found, throw it out.
1687  *
1688  *  called with proto locked
1689  */
1690 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1691 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1692 {
1693 	Limbo *lp, **l;
1694 	int h;
1695 	Tcppriv *tpriv;
1696 
1697 	tpriv = s->p->priv;
1698 
1699 	/* find a call in limbo */
1700 	h = hashipa(src, segp->source);
1701 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1702 		lp = *l;
1703 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1704 			continue;
1705 		if(ipcmp(lp->laddr, dst) != 0)
1706 			continue;
1707 		if(ipcmp(lp->raddr, src) != 0)
1708 			continue;
1709 
1710 		/* RST can only follow the SYN */
1711 		if(segp->seq == lp->irs+1){
1712 			tpriv->nlimbo--;
1713 			*l = lp->next;
1714 			free(lp);
1715 		}
1716 		break;
1717 	}
1718 }
1719 
1720 static void
initialwindow(Tcpctl * tcb)1721 initialwindow(Tcpctl *tcb)
1722 {
1723 	/* RFC 3390 initial window */
1724 	if(tcb->mss < 1095)
1725 		tcb->cwind = 4*tcb->mss;
1726 	else if(tcb->mss < 2190)
1727 		tcb->cwind = 2*2190;
1728 	else
1729 		tcb->cwind = 2*tcb->mss;
1730 }
1731 
1732 /*
1733  *  come here when we finally get an ACK to our SYN-ACK.
1734  *  lookup call in limbo.  if found, create a new conversation
1735  *
1736  *  called with proto locked
1737  */
1738 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1739 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1740 {
1741 	Conv *new;
1742 	Tcpctl *tcb;
1743 	Tcppriv *tpriv;
1744 	Tcp4hdr *h4;
1745 	Tcp6hdr *h6;
1746 	Limbo *lp, **l;
1747 	int h;
1748 
1749 	/* unless it's just an ack, it can't be someone coming out of limbo */
1750 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1751 		return nil;
1752 
1753 	tpriv = s->p->priv;
1754 
1755 	/* find a call in limbo */
1756 	h = hashipa(src, segp->source);
1757 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1758 		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1759 			src, segp->source, lp->raddr, lp->rport,
1760 			dst, segp->dest, lp->laddr, lp->lport,
1761 			version, lp->version
1762 		);
1763 
1764 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1765 			continue;
1766 		if(ipcmp(lp->laddr, dst) != 0)
1767 			continue;
1768 		if(ipcmp(lp->raddr, src) != 0)
1769 			continue;
1770 
1771 		/* we're assuming no data with the initial SYN */
1772 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1773 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1774 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1775 			lp = nil;
1776 		} else {
1777 			tpriv->nlimbo--;
1778 			*l = lp->next;
1779 		}
1780 		break;
1781 	}
1782 	if(lp == nil)
1783 		return nil;
1784 
1785 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1786 	if(new == nil)
1787 		return nil;
1788 
1789 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1790 	tcb = (Tcpctl*)new->ptcl;
1791 	tcb->flags &= ~CLONE;
1792 	tcb->timer.arg = new;
1793 	tcb->timer.state = TcptimerOFF;
1794 	tcb->acktimer.arg = new;
1795 	tcb->acktimer.state = TcptimerOFF;
1796 	tcb->katimer.arg = new;
1797 	tcb->katimer.state = TcptimerOFF;
1798 	tcb->rtt_timer.arg = new;
1799 	tcb->rtt_timer.state = TcptimerOFF;
1800 
1801 	tcb->irs = lp->irs;
1802 	tcb->rcv.nxt = tcb->irs+1;
1803 	tcb->rcv.wptr = tcb->rcv.nxt;
1804 	tcb->rcv.wsnt = 0;
1805 	tcb->rcv.urg = tcb->rcv.nxt;
1806 
1807 	tcb->iss = lp->iss;
1808 	tcb->rttseq = tcb->iss;
1809 	tcb->snd.wl2 = tcb->iss;
1810 	tcb->snd.una = tcb->iss+1;
1811 	tcb->snd.ptr = tcb->iss+1;
1812 	tcb->snd.nxt = tcb->iss+1;
1813 	tcb->snd.rxt = tcb->iss+1;
1814 	tcb->flgcnt = 0;
1815 	tcb->flags |= SYNACK;
1816 
1817 	/* set desired mss and scale */
1818 	tcb->mss = tcpmtu(s->p, dst, s->ipversion, &tcb->scale);
1819 
1820 	/* our sending max segment size cannot be bigger than what he asked for */
1821 	if(lp->mss != 0 && lp->mss < tcb->mss)
1822 		tcb->mss = lp->mss;
1823 	tpriv->stats[Mss] = tcb->mss;
1824 
1825 	/* window scaling */
1826 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1827 
1828 	/* congestion window */
1829 	tcb->snd.wnd = segp->wnd;
1830 	initialwindow(tcb);
1831 
1832 	/* set initial round trip time */
1833 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1834 	tcpsynackrtt(new);
1835 
1836 	free(lp);
1837 
1838 	/* set up proto header */
1839 	switch(version){
1840 	case V4:
1841 		h4 = &tcb->protohdr.tcp4hdr;
1842 		memset(h4, 0, sizeof(*h4));
1843 		h4->proto = IP_TCPPROTO;
1844 		hnputs(h4->tcpsport, new->lport);
1845 		hnputs(h4->tcpdport, new->rport);
1846 		v6tov4(h4->tcpsrc, dst);
1847 		v6tov4(h4->tcpdst, src);
1848 		break;
1849 	case V6:
1850 		h6 = &tcb->protohdr.tcp6hdr;
1851 		memset(h6, 0, sizeof(*h6));
1852 		h6->proto = IP_TCPPROTO;
1853 		hnputs(h6->tcpsport, new->lport);
1854 		hnputs(h6->tcpdport, new->rport);
1855 		ipmove(h6->tcpsrc, dst);
1856 		ipmove(h6->tcpdst, src);
1857 		break;
1858 	default:
1859 		panic("tcpincoming: version %d", new->ipversion);
1860 	}
1861 
1862 	tcpsetstate(new, Established);
1863 
1864 	iphtadd(&tpriv->ht, new);
1865 
1866 	return new;
1867 }
1868 
/*
 *  Is x inside the closed interval [low, high]?  When low > high
 *  the interval is taken to wrap around the top of the number
 *  space, so x qualifies if it lies at or above low, or at or
 *  below high.  Returns 1 or 0.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	int above, below;

	above = x >= low;
	below = x <= high;
	if(low <= high)
		return above && below;
	return above || below;
}
1882 
/*
 *  x < y in sequence space: the sign of the difference, taken as
 *  an int, decides, so the comparison survives wraparound.
 */
static int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1888 
/*
 *  x <= y in sequence space: the sign of the difference, taken as
 *  an int, decides, so the comparison survives wraparound.
 */
static int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1894 
/*
 *  x > y in sequence space: the sign of the difference, taken as
 *  an int, decides, so the comparison survives wraparound.
 */
static int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1900 
/*
 *  x >= y in sequence space: the sign of the difference, taken as
 *  an int, decides, so the comparison survives wraparound.
 */
static int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1906 
1907 /*
1908  *  use the time between the first SYN and it's ack as the
1909  *  initial round trip time
1910  */
1911 static void
tcpsynackrtt(Conv * s)1912 tcpsynackrtt(Conv *s)
1913 {
1914 	Tcpctl *tcb;
1915 	int delta;
1916 	Tcppriv *tpriv;
1917 
1918 	tcb = (Tcpctl*)s->ptcl;
1919 	tpriv = s->p->priv;
1920 
1921 	delta = NOW - tcb->sndsyntime;
1922 	tcb->srtt = delta<<LOGAGAIN;
1923 	tcb->mdev = delta<<LOGDGAIN;
1924 
1925 	/* halt round trip timer */
1926 	tcphalt(tpriv, &tcb->rtt_timer);
1927 }
1928 
/*
 *  Apply the acknowledgement and window information in seg to the
 *  send side of conversation s: duplicate-ack counting and fast
 *  retransmit/recovery, send window update, congestion window
 *  adjustment, round trip measurement, discarding acknowledged
 *  data from the write queue, and (re)starting or halting the
 *  retransmit timer.
 *
 *  A segment is processed at most once; seg->update is set on the
 *  first call and later calls for the same segment return at once.
 */
static void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	Tcppriv *tpriv;

	if(seg->update)
		return;
	seg->update = 1;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/* catch zero-window updates, update window & recover */
	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
	    seq_lt(seg->ack, tcb->snd.ptr)){
		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
			seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
		tcb->snd.wnd = seg->wnd;
		goto recovery;
	}

	/* newreno fast retransmit */
	/* a duplicate ack: same ack as snd.una while data is still outstanding */
	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
	    ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
recovery:
		if(tcb->snd.recovery){
			/* already recovering: inflate the window by one segment */
			tpriv->stats[RecoveryCwind]++;
			tcb->cwind += tcb->mss;
		}else if(seq_le(tcb->snd.rxt, seg->ack)){
			/* enter recovery; snd.rxt records where recovery will end */
			tpriv->stats[Recovery]++;
			tcb->abcbytes = 0;
			tcb->snd.recovery = 1;
			tcb->snd.partialack = 0;
			tcb->snd.rxt = tcb->snd.nxt;
			tcpcongestion(tcb);
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
			tcprxmit(s);
		}else{
			tpriv->stats[RecoveryNoSeq]++;
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
			/* don't enter fast retransmit, don't change ssthresh */
		}
	}else if(tcb->snd.recovery){
		tpriv->stats[RecoveryCwind]++;
		tcb->cwind += tcb->mss;
	}

	/*
	 *  update window
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		/* clear dupack if we advance wl2 */
		if(tcb->snd.wl2 != seg->ack)
			tcb->snd.dupacks = 0;
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing new acknowledged: everything below needs forward progress */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
			tcb->backedoff = MAXBACKMS/4;
		return;
	}

	/* Compute the new send window size */
	acked = seg->ack - tcb->snd.una;

	/* avoid slow start and timers for SYN acks */
	if((tcb->flags & SYNACK) == 0) {
		/*
		 * first ack seen on this connection; one unit of acked and
		 * one pending flag are written off here -- presumably the
		 * SYN itself (NOTE(review): confirm against flgcnt's users).
		 */
		tcb->flags |= SYNACK;
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/*
	 * congestion control
	 */
	if(tcb->snd.recovery){
		if(seq_ge(seg->ack, tcb->snd.rxt)){
			/* recovery finished; deflate window */
			tpriv->stats[RecoveryDone]++;
			tcb->snd.dupacks = 0;
			tcb->snd.recovery = 0;
			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
			if(tcb->ssthresh < tcb->cwind)
				tcb->cwind = tcb->ssthresh;
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
				tcb->cwind, tcb->ssthresh);
		} else {
			/* partial ack; we lost more than one segment */
			tpriv->stats[RecoveryPA]++;
			if(tcb->cwind > acked)
				tcb->cwind -= acked;
			else{
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
				tcb->cwind = tcb->mss;
			}
			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);

			if(acked >= tcb->mss)
				tcb->cwind += tcb->mss;
			tcb->snd.partialack++;
		}
	} else
		tcpabcincr(tcb, acked);

	/* Adjust the timers according to the round trip time */
	/* TODO: fix sloppy treatment of overflow cases here. */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* a measurement is only taken when no retransmission is flagged */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* elapsed timer ticks, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1; /* else all close sys's will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				/* srtt and mdev are kept scaled by 2^LOGAGAIN and 2^LOGDGAIN */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/*
	 * drop the acknowledged bytes from the write queue.  if fewer
	 * than acked bytes were discarded, a pending flag is written
	 * off -- presumably the ack covered a flag such as FIN rather
	 * than data (NOTE(review): confirm).
	 */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;
	tcb->snd.una = seg->ack;

	/* newreno fast recovery */
	if(tcb->snd.recovery)
		tcprxmit(s);

	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* keep the retransmit timer running only while data is unacknowledged */
	if(tcb->snd.una != tcb->snd.nxt){
		/* `impatient' variant */
		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
			tcb->time = NOW;
			tcb->timeuna = tcb->snd.una;
			tcpgo(tpriv, &tcb->timer);
		}
	} else
		tcphalt(tpriv, &tcb->timer);

	/* the send pointer never trails what has been acknowledged */
	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	if(!tcb->snd.recovery)
		tcb->flags &= ~RETRAN;
	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2107 
2108 static void
tcpiput(Proto * tcp,Ipifc *,Block * bp)2109 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2110 {
2111 	Tcp seg;
2112 	Tcp4hdr *h4;
2113 	Tcp6hdr *h6;
2114 	int hdrlen;
2115 	Tcpctl *tcb;
2116 	ushort length, csum;
2117 	uchar source[IPaddrlen], dest[IPaddrlen];
2118 	Conv *s;
2119 	Fs *f;
2120 	Tcppriv *tpriv;
2121 	uchar version;
2122 
2123 	f = tcp->f;
2124 	tpriv = tcp->priv;
2125 
2126 	tpriv->stats[InSegs]++;
2127 
2128 	h4 = (Tcp4hdr*)(bp->rp);
2129 	h6 = (Tcp6hdr*)(bp->rp);
2130 
2131 	if((h4->vihl&0xF0)==IP_VER4) {
2132 		version = V4;
2133 		length = nhgets(h4->length);
2134 		v4tov6(dest, h4->tcpdst);
2135 		v4tov6(source, h4->tcpsrc);
2136 
2137 		h4->Unused = 0;
2138 		hnputs(h4->tcplen, length-TCP4_PKT);
2139 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2140 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2141 			tpriv->stats[CsumErrs]++;
2142 			tpriv->stats[InErrs]++;
2143 			netlog(f, Logtcp, "bad tcp proto cksum\n");
2144 			freeblist(bp);
2145 			return;
2146 		}
2147 
2148 		hdrlen = ntohtcp4(&seg, &bp);
2149 		if(hdrlen < 0){
2150 			tpriv->stats[HlenErrs]++;
2151 			tpriv->stats[InErrs]++;
2152 			netlog(f, Logtcp, "bad tcp hdr len\n");
2153 			return;
2154 		}
2155 
2156 		/* trim the packet to the size claimed by the datagram */
2157 		length -= hdrlen+TCP4_PKT;
2158 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2159 		if(bp == nil){
2160 			tpriv->stats[LenErrs]++;
2161 			tpriv->stats[InErrs]++;
2162 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2163 			return;
2164 		}
2165 	}
2166 	else {
2167 		int ttl = h6->ttl;
2168 		int proto = h6->proto;
2169 
2170 		version = V6;
2171 		length = nhgets(h6->ploadlen);
2172 		ipmove(dest, h6->tcpdst);
2173 		ipmove(source, h6->tcpsrc);
2174 
2175 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2176 		h6->ttl = proto;
2177 		hnputl(h6->vcf, length);
2178 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2179 		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2180 			tpriv->stats[CsumErrs]++;
2181 			tpriv->stats[InErrs]++;
2182 			netlog(f, Logtcp,
2183 			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2184 				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2185 			freeblist(bp);
2186 			return;
2187 		}
2188 		h6->ttl = ttl;
2189 		h6->proto = proto;
2190 		hnputs(h6->ploadlen, length);
2191 
2192 		hdrlen = ntohtcp6(&seg, &bp);
2193 		if(hdrlen < 0){
2194 			tpriv->stats[HlenErrs]++;
2195 			tpriv->stats[InErrs]++;
2196 			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2197 			return;
2198 		}
2199 
2200 		/* trim the packet to the size claimed by the datagram */
2201 		length -= hdrlen;
2202 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2203 		if(bp == nil){
2204 			tpriv->stats[LenErrs]++;
2205 			tpriv->stats[InErrs]++;
2206 			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2207 			return;
2208 		}
2209 	}
2210 
2211 	/* lock protocol while searching for a conversation */
2212 	qlock(tcp);
2213 
2214 	/* Look for a matching conversation */
2215 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2216 	if(s == nil){
2217 		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2218 			source, seg.source, dest, seg.dest);
2219 reset:
2220 		qunlock(tcp);
2221 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2222 		freeblist(bp);
2223 		return;
2224 	}
2225 
2226 	/* if it's a listener, look for the right flags and get a new conv */
2227 	tcb = (Tcpctl*)s->ptcl;
2228 	if(tcb->state == Listen){
2229 		if(seg.flags & RST){
2230 			limborst(s, &seg, source, dest, version);
2231 			qunlock(tcp);
2232 			freeblist(bp);
2233 			return;
2234 		}
2235 
2236 		/* if this is a new SYN, put the call into limbo */
2237 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2238 			limbo(s, source, dest, &seg, version);
2239 			qunlock(tcp);
2240 			freeblist(bp);
2241 			return;
2242 		}
2243 
2244 		/*
2245 		 *  if there's a matching call in limbo, tcpincoming will
2246 		 *  return it in state Syn_received
2247 		 */
2248 		s = tcpincoming(s, &seg, source, dest, version);
2249 		if(s == nil)
2250 			goto reset;
2251 	}
2252 
2253 	/* The rest of the input state machine is run with the control block
2254 	 * locked and implements the state machine directly out of the RFC.
2255 	 * Out-of-band data is ignored - it was always a bad idea.
2256 	 */
2257 	tcb = (Tcpctl*)s->ptcl;
2258 	if(waserror()){
2259 		qunlock(s);
2260 		nexterror();
2261 	}
2262 	qlock(s);
2263 	qunlock(tcp);
2264 
2265 	/* fix up window */
2266 	seg.wnd <<= tcb->rcv.scale;
2267 
2268 	/* every input packet in puts off the keep alive time out */
2269 	tcpsetkacounter(tcb);
2270 
2271 	switch(tcb->state) {
2272 	case Closed:
2273 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2274 		goto raise;
2275 	case Syn_sent:
2276 		if(seg.flags & ACK) {
2277 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2278 				sndrst(tcp, source, dest, length, &seg, version,
2279 					 "bad seq in Syn_sent");
2280 				goto raise;
2281 			}
2282 		}
2283 		if(seg.flags & RST) {
2284 			if(seg.flags & ACK)
2285 				localclose(s, Econrefused);
2286 			goto raise;
2287 		}
2288 
2289 		if(seg.flags & SYN) {
2290 			procsyn(s, &seg);
2291 			if(seg.flags & ACK){
2292 				update(s, &seg);
2293 				tcpsynackrtt(s);
2294 				tcpsetstate(s, Established);
2295 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2296 			}
2297 			else {
2298 				tcb->time = NOW;
2299 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2300 			}
2301 
2302 			if(length != 0 || (seg.flags & FIN))
2303 				break;
2304 
2305 			freeblist(bp);
2306 			goto output;
2307 		}
2308 		else
2309 			freeblist(bp);
2310 
2311 		qunlock(s);
2312 		poperror();
2313 		return;
2314 	case Syn_received:
2315 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2316 		if(seg.flags & ACK)
2317 			tcpsynackrtt(s);
2318 		break;
2319 	}
2320 
2321 	/*
2322 	 *  One DOS attack is to open connections to us and then forget about them,
2323 	 *  thereby tying up a conv at no long term cost to the attacker.
2324 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2325 	 *  corresponding code in tcpsendka().
2326 	 */
2327 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2328 		if(tcpporthogdefense
2329 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2330 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2331 				source, seg.source, dest, seg.dest, seg.flags,
2332 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2333 			localclose(s, "stateless hog");
2334 		}
2335 	}
2336 
2337 	/* Cut the data to fit the receive window */
2338 	tcprcvwin(s);
2339 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2340 		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2341 		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
2342 			"%lud-%lud l %d from %I\n", seg.seq,
2343 			seg.seq + length - 1, tcb->rcv.nxt,
2344 			tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
2345 		update(s, &seg);
2346 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2347 			tcphalt(tpriv, &tcb->rtt_timer);
2348 			tcphalt(tpriv, &tcb->acktimer);
2349 			tcphalt(tpriv, &tcb->katimer);
2350 			tcpsetstate(s, Time_wait);
2351 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2352 			tcpgo(tpriv, &tcb->timer);
2353 		}
2354 		if(!(seg.flags & RST)) {
2355 			tcb->flags |= FORCE;
2356 			goto output;
2357 		}
2358 		qunlock(s);
2359 		poperror();
2360 		return;
2361 	}
2362 
2363 	/* Cannot accept so answer with a rst */
2364 	if(length && tcb->state == Closed) {
2365 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2366 		goto raise;
2367 	}
2368 
2369 	/* The segment is beyond the current receive pointer so
2370 	 * queue the data in the resequence queue
2371 	 */
2372 	if(seg.seq != tcb->rcv.nxt)
2373 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2374 		update(s, &seg);
2375 		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2376 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
2377 				s->laddr, s->lport);
2378 		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
2379 		goto output;
2380 	}
2381 
2382 	if(tcb->nreseq > 0)
2383 		tcb->flags |= FORCE; /* filled hole in seq. space; RFC 5681 §3.2 */
2384 
2385 	/*
2386 	 *  keep looping till we've processed this packet plus any
2387 	 *  adjacent packets in the resequence queue
2388 	 */
2389 	for(;;) {
2390 		if(seg.flags & RST) {
2391 			if(tcb->state == Established) {
2392 				tpriv->stats[EstabResets]++;
2393 				if(tcb->rcv.nxt != seg.seq)
2394 					netlog(f, Logtcp, "out of order RST "
2395 						"rcvd: %I.%d -> %I.%d, rcv.nxt "
2396 						"%lux seq %lux\n",
2397 						s->raddr, s->rport, s->laddr,
2398 						s->lport, tcb->rcv.nxt, seg.seq);
2399 			}
2400 			localclose(s, Econrefused);
2401 			goto raise;
2402 		}
2403 
2404 		if((seg.flags&ACK) == 0)
2405 			goto raise;
2406 
2407 		switch(tcb->state) {
2408 		case Syn_received:
2409 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2410 				sndrst(tcp, source, dest, length, &seg, version,
2411 					"bad seq in Syn_received");
2412 				goto raise;
2413 			}
2414 			update(s, &seg);
2415 			tcpsetstate(s, Established);
2416 		case Established:
2417 		case Close_wait:
2418 			update(s, &seg);
2419 			break;
2420 		case Finwait1:
2421 			update(s, &seg);
2422 			if(qlen(s->wq)+tcb->flgcnt == 0){
2423 				tcphalt(tpriv, &tcb->rtt_timer);
2424 				tcphalt(tpriv, &tcb->acktimer);
2425 				tcpsetkacounter(tcb);
2426 				tcb->time = NOW;
2427 				tcpsetstate(s, Finwait2);
2428 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2429 				tcpgo(tpriv, &tcb->katimer);
2430 			}
2431 			break;
2432 		case Finwait2:
2433 			update(s, &seg);
2434 			break;
2435 		case Closing:
2436 			update(s, &seg);
2437 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2438 				tcphalt(tpriv, &tcb->rtt_timer);
2439 				tcphalt(tpriv, &tcb->acktimer);
2440 				tcphalt(tpriv, &tcb->katimer);
2441 				tcpsetstate(s, Time_wait);
2442 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2443 				tcpgo(tpriv, &tcb->timer);
2444 			}
2445 			break;
2446 		case Last_ack:
2447 			update(s, &seg);
2448 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2449 				localclose(s, nil);
2450 				goto raise;
2451 			}
2452 		case Time_wait:
2453 			tcb->flags |= FORCE;
2454 			if(tcb->timer.state != TcptimerON)
2455 				tcpgo(tpriv, &tcb->timer);
2456 		}
2457 
2458 		if((seg.flags&URG) && seg.urg) {
2459 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2460 				tcb->rcv.urg = seg.urg + seg.seq;
2461 				pullblock(&bp, seg.urg);
2462 			}
2463 		}
2464 		else
2465 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2466 			tcb->rcv.urg = tcb->rcv.nxt;
2467 
2468 		if(length == 0) {
2469 			if(bp != nil)
2470 				freeblist(bp);
2471 		}
2472 		else {
2473 			switch(tcb->state){
2474 			default:
2475 				/* Ignore segment text */
2476 				if(bp != nil)
2477 					freeblist(bp);
2478 				break;
2479 
2480 			case Syn_received:
2481 			case Established:
2482 			case Finwait1:
2483 				/* If we still have some data place on
2484 				 * receive queue
2485 				 */
2486 				if(bp) {
2487 					bp = packblock(bp);
2488 					if(bp == nil)
2489 						panic("tcp packblock");
2490 					qpassnolim(s->rq, bp);
2491 					bp = nil;
2492 				}
2493 				tcb->rcv.nxt += length;
2494 
2495 				/*
2496 				 *  turn on the acktimer if there's something
2497 				 *  to ack
2498 				 */
2499 				if(tcb->acktimer.state != TcptimerON)
2500 					tcpgo(tpriv, &tcb->acktimer);
2501 
2502 				break;
2503 			case Finwait2:
2504 				/* no process to read the data, send a reset */
2505 				if(bp != nil)
2506 					freeblist(bp);
2507 				sndrst(tcp, source, dest, length, &seg, version,
2508 					"send to Finwait2");
2509 				qunlock(s);
2510 				poperror();
2511 				return;
2512 			}
2513 		}
2514 
2515 		if(seg.flags & FIN) {
2516 			tcb->flags |= FORCE;
2517 
2518 			switch(tcb->state) {
2519 			case Syn_received:
2520 			case Established:
2521 				tcb->rcv.nxt++;
2522 				tcpsetstate(s, Close_wait);
2523 				break;
2524 			case Finwait1:
2525 				tcb->rcv.nxt++;
2526 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2527 					tcphalt(tpriv, &tcb->rtt_timer);
2528 					tcphalt(tpriv, &tcb->acktimer);
2529 					tcphalt(tpriv, &tcb->katimer);
2530 					tcpsetstate(s, Time_wait);
2531 					tcb->timer.start = MSL2*(1000/MSPTICK);
2532 					tcpgo(tpriv, &tcb->timer);
2533 				}
2534 				else
2535 					tcpsetstate(s, Closing);
2536 				break;
2537 			case Finwait2:
2538 				tcb->rcv.nxt++;
2539 				tcphalt(tpriv, &tcb->rtt_timer);
2540 				tcphalt(tpriv, &tcb->acktimer);
2541 				tcphalt(tpriv, &tcb->katimer);
2542 				tcpsetstate(s, Time_wait);
2543 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2544 				tcpgo(tpriv, &tcb->timer);
2545 				break;
2546 			case Close_wait:
2547 			case Closing:
2548 			case Last_ack:
2549 				break;
2550 			case Time_wait:
2551 				tcpgo(tpriv, &tcb->timer);
2552 				break;
2553 			}
2554 		}
2555 
2556 		/*
2557 		 *  get next adjacent segment from the resequence queue.
2558 		 *  dump/trim any overlapping segments
2559 		 */
2560 		for(;;) {
2561 			if(tcb->reseq == nil)
2562 				goto output;
2563 
2564 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2565 				goto output;
2566 
2567 			getreseq(tcb, &seg, &bp, &length);
2568 
2569 			tcprcvwin(s);
2570 			if(tcptrim(tcb, &seg, &bp, &length) == 0){
2571 				tcb->flags |= FORCE;
2572 				break;
2573 			}
2574 		}
2575 	}
2576 output:
2577 	tcpoutput(s);
2578 	qunlock(s);
2579 	poperror();
2580 	return;
2581 raise:
2582 	qunlock(s);
2583 	poperror();
2584 	freeblist(bp);
2585 	tcpkick(s);
2586 }
2587 
2588 /*
2589  *  always enters and exits with the s locked.  We drop
2590  *  the lock to ipoput the packet so some care has to be
2591  *  taken by callers.
2592  */
/*
 *  Send whatever conversation s is currently allowed to send:
 *  at most 100 segments per call.  Nothing is sent in the Listen,
 *  Closed and Finwait2 states.  When FORCE is set in tcb->flags a
 *  segment (at least a bare ack) goes out even if there is no data
 *  to send; FORCE is cleared once that segment has been built.
 *
 *  In the visible code the lock is released and immediately
 *  retaken after every fourth segment; it is not released around
 *  each individual ipoput call.
 *
 *  If the header cannot be built the data block is freed and the
 *  function returns.  If ipoput4/ipoput6 returns a negative value
 *  the conversation is closed with "no route".
 */
2593 static void
tcpoutput(Conv * s)2594 tcpoutput(Conv *s)
2595 {
2596 	Tcp seg;
2597 	uint msgs;
2598 	Tcpctl *tcb;
2599 	Block *hbp, *bp;
2600 	int sndcnt;
2601 	ulong ssize, dsize, sent;
2602 	Fs *f;
2603 	Tcppriv *tpriv;
2604 	uchar version;
2605 
2606 	f = s->p->f;
2607 	tpriv = s->p->priv;
2608 	version = s->ipversion;
2609 
2610 	tcb = (Tcpctl*)s->ptcl;
2611 
2612 	/* force ack every 2*mss */
2613 	if((tcb->flags & FORCE) == 0 &&
2614 	    tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2615 		tpriv->stats[Delayack]++;
2616 		tcb->flags |= FORCE;
2617 	}
2618 
2619 	/* force ack if window opening */
2620 	if((tcb->flags & FORCE) == 0){
2621 		tcprcvwin(s);
		/* wptr has moved at least 2*mss past the last window edge we sent */
2622 		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2623 			tpriv->stats[Wopenack]++;
2624 			tcb->flags |= FORCE;
2625 		}
2626 	}
2627 
	/* one iteration per segment; the cap of 100 bounds the work done per call */
2628 	for(msgs = 0; msgs < 100; msgs++) {
		/* re-checked every pass: localclose below may change the state */
2629 		switch(tcb->state) {
2630 		case Listen:
2631 		case Closed:
2632 		case Finwait2:
2633 			return;
2634 		}
2635 
2636 		/* Don't send anything else until our SYN has been acked */
2637 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2638 			break;
2639 
2640 		/* force an ack when a window has opened up */
2641 		tcprcvwin(s);
2642 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2643 			tcb->rcv.blocked = 0;
2644 			tcb->flags |= FORCE;
2645 		}
2646 
		/*
		 *  sndcnt: bytes on the write queue plus flgcnt.
		 *  sent: distance from snd.una to snd.ptr, i.e. what has
		 *  already been handed out but not yet acked.
		 */
2647 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2648 		sent = tcb->snd.ptr - tcb->snd.una;
2649 		ssize = sndcnt;
2650 		if(tcb->snd.wnd == 0){
2651 			/* zero window probe */
2652 			if(sent > 0 && !(tcb->flags & FORCE))
2653 				break;	/* already probing, rto re-probes */
			/* probe with at most one unit of unsent sequence space */
2654 			if(ssize < sent)
2655 				ssize = 0;
2656 			else{
2657 				ssize -= sent;
2658 				if(ssize > 0)
2659 					ssize = 1;
2660 			}
2661 		} else {
2662 			/* calculate usable segment size */
2663 			if(ssize > tcb->cwind)
2664 				ssize = tcb->cwind;
2665 			if(ssize > tcb->snd.wnd)
2666 				ssize = tcb->snd.wnd;
2667 
			/* subtract what is already outstanding, then cap at one mss */
2668 			if(ssize < sent)
2669 				ssize = 0;
2670 			else {
2671 				ssize -= sent;
2672 				if(ssize > tcb->mss)
2673 					ssize = tcb->mss;
2674 			}
2675 		}
2676 
		/* ssize counts sequence space; dsize counts data bytes to copy */
2677 		dsize = ssize;
2678 		seg.urg = 0;
2679 
		/*
		 *  unless forced, stop when there is nothing to send, or
		 *  when the segment would be short (< mss), snd.ptr is at
		 *  snd.nxt and more than TCPREXMTTHRESH*mss is outstanding.
		 */
2680 		if(!(tcb->flags & FORCE))
2681 			if(ssize == 0 ||
2682 			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
2683 			    sent > TCPREXMTTHRESH * tcb->mss)
2684 				break;
2685 
2686 		tcb->flags &= ~FORCE;
2687 
2688 		/* By default we will generate an ack */
2689 		tcphalt(tpriv, &tcb->acktimer);
2690 		seg.source = s->lport;
2691 		seg.dest = s->rport;
2692 		seg.flags = ACK;
2693 		seg.mss = 0;
2694 		seg.ws = 0;
2695 		seg.update = 0;
2696 		switch(tcb->state){
2697 		case Syn_sent:
			/* no ACK flag in Syn_sent; the initial segment carries SYN plus options */
2698 			seg.flags = 0;
2699 			if(tcb->snd.ptr == tcb->iss){
2700 				seg.flags |= SYN;
				/* one unit of ssize is the SYN itself, not data */
2701 				dsize--;
2702 				seg.mss = tcb->mss;
2703 				seg.ws = tcb->scale;
2704 			}
2705 			break;
2706 		case Syn_received:
2707 			/*
2708 			 *  don't send any data with a SYN/ACK packet
2709 			 *  because Linux rejects the packet in its
2710 			 *  attempt to solve the SYN attack problem
2711 			 */
2712 			if(tcb->snd.ptr == tcb->iss){
2713 				seg.flags |= SYN;
2714 				dsize = 0;
2715 				ssize = 1;
2716 				seg.mss = tcb->mss;
2717 				seg.ws = tcb->scale;
2718 			}
2719 			break;
2720 		}
2721 		seg.seq = tcb->snd.ptr;
2722 		seg.ack = tcb->rcv.nxt;
2723 		seg.wnd = tcb->rcv.wnd;
2724 
2725 		/* Pull out data to send */
2726 		bp = nil;
2727 		if(dsize != 0) {
			/* copy dsize bytes starting at offset sent in the write queue */
2728 			bp = qcopy(s->wq, dsize, sent);
			/*
			 *  fewer bytes came back than were asked for: send FIN
			 *  in place of the missing byte (presumably the unit
			 *  accounted for by flgcnt -- confirm against tcpclose).
			 */
2729 			if(BLEN(bp) != dsize) {
2730 				seg.flags |= FIN;
2731 				dsize--;
2732 			}
2733 		}
2734 
		/* push when this segment carries the last of what is queued */
2735 		if(sent+dsize == sndcnt && dsize)
2736 			seg.flags |= PSH;
2737 
2738 		tcb->snd.ptr += ssize;
2739 
2740 		/* Pull up the send pointer so we can accept acks
2741 		 * for this window
2742 		 */
2743 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2744 			tcb->snd.nxt = tcb->snd.ptr;
2745 
2746 		/* Build header, link data and compute cksum */
2747 		switch(version){
2748 		case V4:
2749 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2750 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2751 			if(hbp == nil) {
2752 				freeblist(bp);
2753 				return;
2754 			}
2755 			break;
2756 		case V6:
2757 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2758 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2759 			if(hbp == nil) {
2760 				freeblist(bp);
2761 				return;
2762 			}
2763 			break;
2764 		default:
2765 			hbp = nil;	/* to suppress a warning */
2766 			panic("tcpoutput: version %d", version);
2767 		}
2768 
2769 		/* Start the transmission timers if there is new data and we
2770 		 * expect acknowledges
2771 		 */
2772 		if(ssize != 0){
2773 			if(tcb->timer.state != TcptimerON){
2774 				tcb->time = NOW;
2775 				tcb->timeuna = tcb->snd.una;
2776 				tcpgo(tpriv, &tcb->timer);
2777 			}
2778 
2779 			/*  If round trip timer isn't running, start it.
2780 			 *  measure the longest packet only in case the
2781 			 *  transmission time dominates RTT
2782 			 */
			/* never time a retransmission; only time full-sized segments */
2783 			if(tcb->snd.retransmit == 0)
2784 			if(tcb->rtt_timer.state != TcptimerON)
2785 			if(ssize == tcb->mss) {
2786 				tcpgo(tpriv, &tcb->rtt_timer);
2787 				tcb->rttseq = tcb->snd.ptr;
2788 			}
2789 		}
2790 
2791 		tpriv->stats[OutSegs]++;
2792 		if(tcb->snd.retransmit)
2793 			tpriv->stats[RetransSegsSent]++;
		/* remember the ack and window edge this segment advertises */
2794 		tcb->rcv.ackptr = seg.ack;
2795 		tcb->rcv.wsnt = tcb->rcv.wptr;
2796 
2797 		/* put off the next keep alive */
2798 		tcpgo(tpriv, &tcb->katimer);
2799 
2800 		switch(version){
2801 		case V4:
2802 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2803 				/* a negative return means no route */
2804 				localclose(s, "no route");
2805 			}
2806 			break;
2807 		case V6:
2808 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2809 				/* a negative return means no route */
2810 				localclose(s, "no route");
2811 			}
2812 			break;
2813 		default:
2814 			panic("tcpoutput2: version %d", version);
2815 		}
		/* after every fourth segment, briefly give up the lock */
2816 		if((msgs%4) == 3){
2817 			qunlock(s);
2818 			qlock(s);
2819 		}
2820 	}
2821 }
2822 
2823 /*
2824  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2825  */
2826 static void
tcpsendka(Conv * s)2827 tcpsendka(Conv *s)
2828 {
2829 	Tcp seg;
2830 	Tcpctl *tcb;
2831 	Block *hbp,*dbp;
2832 
2833 	tcb = (Tcpctl*)s->ptcl;
2834 
2835 	dbp = nil;
2836 	memset(&seg, 0, sizeof seg);
2837 	seg.urg = 0;
2838 	seg.source = s->lport;
2839 	seg.dest = s->rport;
2840 	seg.flags = ACK|PSH;
2841 	seg.mss = 0;
2842 	seg.ws = 0;
2843 	if(tcpporthogdefense)
2844 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2845 	else
2846 		seg.seq = tcb->snd.una-1;
2847 	seg.ack = tcb->rcv.nxt;
2848 	tcb->rcv.ackptr = seg.ack;
2849 	tcprcvwin(s);
2850 	seg.wnd = tcb->rcv.wnd;
2851 	if(tcb->state == Finwait2){
2852 		seg.flags |= FIN;
2853 	} else {
2854 		dbp = allocb(1);
2855 		dbp->wp++;
2856 	}
2857 
2858 	if(isv4(s->raddr)) {
2859 		/* Build header, link data and compute cksum */
2860 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2861 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2862 		if(hbp == nil) {
2863 			freeblist(dbp);
2864 			return;
2865 		}
2866 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2867 	}
2868 	else {
2869 		/* Build header, link data and compute cksum */
2870 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2871 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2872 		if(hbp == nil) {
2873 			freeblist(dbp);
2874 			return;
2875 		}
2876 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2877 	}
2878 }
2879 
2880 /*
2881  *  set connection to time out after 12 minutes
2882  */
2883 static void
tcpsetkacounter(Tcpctl * tcb)2884 tcpsetkacounter(Tcpctl *tcb)
2885 {
2886 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2887 	if(tcb->kacounter < 3)
2888 		tcb->kacounter = 3;
2889 }
2890 
2891 /*
2892  *  if we've timed out, close the connection
2893  *  otherwise, send a keepalive and restart the timer
2894  */
2895 static void
tcpkeepalive(void * v)2896 tcpkeepalive(void *v)
2897 {
2898 	Tcpctl *tcb;
2899 	Conv *s;
2900 
2901 	s = v;
2902 	tcb = (Tcpctl*)s->ptcl;
2903 	if(waserror()){
2904 		qunlock(s);
2905 		nexterror();
2906 	}
2907 	qlock(s);
2908 	if(tcb->state != Closed){
2909 		if(--(tcb->kacounter) <= 0) {
2910 			localclose(s, Etimedout);
2911 		} else {
2912 			tcpsendka(s);
2913 			tcpgo(s->p->priv, &tcb->katimer);
2914 		}
2915 	}
2916 	qunlock(s);
2917 	poperror();
2918 }
2919 
2920 /*
2921  *  start keepalive timer
2922  */
2923 static char*
tcpstartka(Conv * s,char ** f,int n)2924 tcpstartka(Conv *s, char **f, int n)
2925 {
2926 	Tcpctl *tcb;
2927 	int x;
2928 
2929 	tcb = (Tcpctl*)s->ptcl;
2930 	if(tcb->state != Established)
2931 		return "connection must be in Establised state";
2932 	if(n > 1){
2933 		x = atoi(f[1]);
2934 		if(x >= MSPTICK)
2935 			tcb->katimer.start = x/MSPTICK;
2936 	}
2937 	tcpsetkacounter(tcb);
2938 	tcpgo(s->p->priv, &tcb->katimer);
2939 
2940 	return nil;
2941 }
2942 
2943 /*
2944  *  turn checksums on/off
2945  */
2946 static char*
tcpsetchecksum(Conv * s,char ** f,int)2947 tcpsetchecksum(Conv *s, char **f, int)
2948 {
2949 	Tcpctl *tcb;
2950 
2951 	tcb = (Tcpctl*)s->ptcl;
2952 	tcb->nochecksum = !atoi(f[1]);
2953 
2954 	return nil;
2955 }
2956 
2957 /*
2958  *  retransmit (at most) one segment at snd.una.
2959  *  preserve cwind & snd.ptr
2960  */
2961 static void
tcprxmit(Conv * s)2962 tcprxmit(Conv *s)
2963 {
2964 	Tcpctl *tcb;
2965 	Tcppriv *tpriv;
2966 	ulong tcwind, tptr;
2967 
2968 	tcb = (Tcpctl*)s->ptcl;
2969 	tcb->flags |= RETRAN|FORCE;
2970 
2971 	tptr = tcb->snd.ptr;
2972 	tcwind = tcb->cwind;
2973 	tcb->snd.ptr = tcb->snd.una;
2974 	tcb->cwind = tcb->mss;
2975 	tcb->snd.retransmit = 1;
2976 	tcpoutput(s);
2977 	tcb->snd.retransmit = 0;
2978 	tcb->cwind = tcwind;
2979 	tcb->snd.ptr = tptr;
2980 
2981 	tpriv = s->p->priv;
2982 	tpriv->stats[RetransSegs]++;
2983 }
2984 
2985 /*
2986  *  TODO: RFC 4138 F-RTO
2987  */
2988 static void
tcptimeout(void * arg)2989 tcptimeout(void *arg)
2990 {
2991 	Conv *s;
2992 	Tcpctl *tcb;
2993 	int maxback;
2994 	Tcppriv *tpriv;
2995 
2996 	s = (Conv*)arg;
2997 	tpriv = s->p->priv;
2998 	tcb = (Tcpctl*)s->ptcl;
2999 
3000 	if(waserror()){
3001 		qunlock(s);
3002 		nexterror();
3003 	}
3004 	qlock(s);
3005 	switch(tcb->state){
3006 	default:
3007 		tcb->backoff++;
3008 		if(tcb->state == Syn_sent)
3009 			maxback = MAXBACKMS/2;
3010 		else
3011 			maxback = MAXBACKMS;
3012 		tcb->backedoff += tcb->timer.start * MSPTICK;
3013 		if(tcb->backedoff >= maxback) {
3014 			localclose(s, Etimedout);
3015 			break;
3016 		}
3017 		netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
3018 			tcb->srtt, tcb->mdev, NOW - tcb->time,
3019 			tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
3020 			tcpstates[s->state]);
3021 		tcpsettimer(tcb);
3022 		if(tcb->snd.rto == 0)
3023 			tcpcongestion(tcb);
3024 		tcprxmit(s);
3025 		tcb->snd.ptr = tcb->snd.una;
3026 		tcb->cwind = tcb->mss;
3027 		tcb->snd.rto = 1;
3028 		tpriv->stats[RetransTimeouts]++;
3029 
3030 		if(tcb->snd.recovery){
3031 			tcb->snd.dupacks = 0;		/* reno rto */
3032 			tcb->snd.recovery = 0;
3033 			tpriv->stats[RecoveryRTO]++;
3034 			tcb->snd.rxt = tcb->snd.nxt;
3035 			netlog(s->p->f, Logtcpwin,
3036 				"rto recovery rxt @%lud\n", tcb->snd.nxt);
3037 		}
3038 
3039 		tcb->abcbytes = 0;
3040 		break;
3041 	case Time_wait:
3042 		localclose(s, nil);
3043 		break;
3044 	case Closed:
3045 		break;
3046 	}
3047 	qunlock(s);
3048 	poperror();
3049 }
3050 
3051 static int
inwindow(Tcpctl * tcb,int seq)3052 inwindow(Tcpctl *tcb, int seq)
3053 {
3054 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3055 }
3056 
3057 /*
3058  *  set up state for a received SYN (or SYN ACK) packet
3059  */
3060 static void
procsyn(Conv * s,Tcp * seg)3061 procsyn(Conv *s, Tcp *seg)
3062 {
3063 	Tcpctl *tcb;
3064 	Tcppriv *tpriv;
3065 
3066 	tcb = (Tcpctl*)s->ptcl;
3067 	tcb->flags |= FORCE;
3068 
3069 	tcb->rcv.nxt = seg->seq + 1;
3070 	tcb->rcv.wptr = tcb->rcv.nxt;
3071 	tcb->rcv.wsnt = 0;
3072 	tcb->rcv.urg = tcb->rcv.nxt;
3073 	tcb->irs = seg->seq;
3074 
3075 	/* our sending max segment size cannot be bigger than what he asked for */
3076 	if(seg->mss != 0 && seg->mss < tcb->mss) {
3077 		tcb->mss = seg->mss;
3078 		tpriv = s->p->priv;
3079 		tpriv->stats[Mss] = tcb->mss;
3080 	}
3081 
3082 	tcb->snd.wnd = seg->wnd;
3083 	initialwindow(tcb);
3084 }
3085 
3086 static int
dumpreseq(Tcpctl * tcb)3087 dumpreseq(Tcpctl *tcb)
3088 {
3089 	Reseq *r, *next;
3090 
3091 	for(r = tcb->reseq; r != nil; r = next){
3092 		next = r->next;
3093 		freeblist(r->bp);
3094 		free(r);
3095 	}
3096 	tcb->reseq = nil;
3097 	tcb->nreseq = 0;
3098 	tcb->reseqlen = 0;
3099 	return -1;
3100 }
3101 
3102 static void
logreseq(Fs * f,Reseq * r,ulong n)3103 logreseq(Fs *f, Reseq *r, ulong n)
3104 {
3105 	char *s;
3106 
3107 	for(; r != nil; r = r->next){
3108 		s = nil;
3109 		if(r->next == nil && r->seg.seq != n)
3110 			s = "hole/end";
3111 		else if(r->next == nil)
3112 			s = "end";
3113 		else if(r->seg.seq != n)
3114 			s = "hole";
3115 		if(s != nil)
3116 			netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3117 				n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3118 		n = r->seg.seq + r->seg.len;
3119 	}
3120 }
3121 
3122 static int
addreseq(Fs * f,Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)3123 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3124 {
3125 	Reseq *rp, **rr;
3126 	int qmax;
3127 
3128 	rp = malloc(sizeof *rp);
3129 	if(rp == nil){
3130 		freeblist(bp);		/* bp always consumed by addreseq */
3131 		return 0;
3132 	}
3133 
3134 	rp->seg = *seg;
3135 	rp->bp = bp;
3136 	rp->length = length;
3137 
3138 	tcb->reseqlen += length;
3139 	tcb->nreseq++;
3140 
3141 	/* Place on reassembly list sorting by starting seq number */
3142 	for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3143 		if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3144 			rp->next = *rr;
3145 			*rr = rp;
3146 			tpriv->stats[Resequenced]++;
3147 			if(rp->next != nil)
3148 				tpriv->stats[OutOfOrder]++;
3149 			break;
3150 		}
3151 
3152 	qmax = tcb->window;
3153 	if(tcb->reseqlen > qmax){
3154 		netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3155 			tcb->reseqlen, qmax, tcb->nreseq);
3156 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3157 		tpriv->stats[ReseqBytelim]++;
3158 		return dumpreseq(tcb);
3159 	}
3160 	qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3161 	if(tcb->nreseq > qmax){
3162 		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3163 			tcb->nreseq, qmax, tcb->reseqlen);
3164 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3165 		tpriv->stats[ReseqPktlim]++;
3166 		return dumpreseq(tcb);
3167 	}
3168 	return 0;
3169 }
3170 
3171 static void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3172 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3173 {
3174 	Reseq *rp;
3175 
3176 	rp = tcb->reseq;
3177 	if(rp == nil)
3178 		return;
3179 
3180 	tcb->reseq = rp->next;
3181 
3182 	*seg = rp->seg;
3183 	*bp = rp->bp;
3184 	*length = rp->length;
3185 
3186 	tcb->nreseq--;
3187 	tcb->reseqlen -= rp->length;
3188 
3189 	free(rp);
3190 }
3191 
/*
 *  Fit a received segment to the receive window.
 *
 *  seg, *bp and *length describe the segment and are adjusted in
 *  place: sequence space before rcv.nxt (already received) is
 *  dropped from the front, and data beyond rcv.nxt+rcv.wnd is
 *  dropped from the back.  Both amounts are added to tcb->rerecv.
 *
 *  Returns 0 if the segment is acceptable, -1 if no part of it is;
 *  in the latter case *bp has been freed.
 */
3192 static int
tcptrim(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3193 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3194 {
3195 	ushort len;
3196 	uchar accept;
3197 	int dupcnt, excess;
3198 
3199 	accept = 0;
	/* len is the sequence space used: data plus one each for SYN and FIN */
3200 	len = *length;
3201 	if(seg->flags & SYN)
3202 		len++;
3203 	if(seg->flags & FIN)
3204 		len++;
3205 
3206 	if(tcb->rcv.wnd == 0) {
		/*
		 *  zero window: only an empty segment exactly at rcv.nxt
		 *  is acceptable, and it needs no trimming.  anything
		 *  else leaves accept at 0 and is rejected below.
		 */
3207 		if(len == 0 && seg->seq == tcb->rcv.nxt)
3208 			return 0;
3209 	}
3210 	else {
3211 		/* Some part of the segment should be in the window */
3212 		if(inwindow(tcb,seg->seq))
3213 			accept++;
3214 		else
3215 		if(len != 0) {
			/* last unit inside the window, or the segment spans rcv.nxt */
3216 			if(inwindow(tcb, seg->seq+len-1) ||
3217 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3218 				accept++;
3219 		}
3220 	}
3221 	if(!accept) {
3222 		freeblist(*bp);
3223 		return -1;
3224 	}
	/* dupcnt > 0: the segment starts before rcv.nxt, so its head is a duplicate */
3225 	dupcnt = tcb->rcv.nxt - seg->seq;
3226 	if(dupcnt > 0){
3227 		tcb->rerecv += dupcnt;
		/* a SYN accounts for the first duplicated sequence number */
3228 		if(seg->flags & SYN){
3229 			seg->flags &= ~SYN;
3230 			seg->seq++;
3231 
3232 			if(seg->urg > 1)
3233 				seg->urg--;
3234 			else
3235 				seg->flags &= ~URG;
3236 			dupcnt--;
3237 		}
		/* whatever is left of dupcnt is duplicate data: drop it from the front */
3238 		if(dupcnt > 0){
3239 			pullblock(bp, (ushort)dupcnt);
3240 			seg->seq += dupcnt;
3241 			*length -= dupcnt;
3242 
3243 			if(seg->urg > dupcnt)
3244 				seg->urg -= dupcnt;
3245 			else {
3246 				seg->flags &= ~URG;
3247 				seg->urg = 0;
3248 			}
3249 		}
3250 	}
	/* excess > 0: the data runs past the end of the receive window */
3251 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3252 	if(excess > 0) {
3253 		tcb->rerecv += excess;
3254 		*length -= excess;
3255 		*bp = trimblock(*bp, 0, *length);
3256 		if(*bp == nil)
3257 			panic("presotto is a boofhead");
		/* the tail was cut off, so a FIN on this segment no longer applies */
3258 		seg->flags &= ~FIN;
3259 	}
3260 	return 0;
3261 }
3262 
3263 static void
tcpadvise(Proto * tcp,Block * bp,char * msg)3264 tcpadvise(Proto *tcp, Block *bp, char *msg)
3265 {
3266 	Tcp4hdr *h4;
3267 	Tcp6hdr *h6;
3268 	Tcpctl *tcb;
3269 	uchar source[IPaddrlen];
3270 	uchar dest[IPaddrlen];
3271 	ushort psource, pdest;
3272 	Conv *s, **p;
3273 
3274 	h4 = (Tcp4hdr*)(bp->rp);
3275 	h6 = (Tcp6hdr*)(bp->rp);
3276 
3277 	if((h4->vihl&0xF0)==IP_VER4) {
3278 		v4tov6(dest, h4->tcpdst);
3279 		v4tov6(source, h4->tcpsrc);
3280 		psource = nhgets(h4->tcpsport);
3281 		pdest = nhgets(h4->tcpdport);
3282 	}
3283 	else {
3284 		ipmove(dest, h6->tcpdst);
3285 		ipmove(source, h6->tcpsrc);
3286 		psource = nhgets(h6->tcpsport);
3287 		pdest = nhgets(h6->tcpdport);
3288 	}
3289 
3290 	/* Look for a connection */
3291 	qlock(tcp);
3292 	for(p = tcp->conv; *p; p++) {
3293 		s = *p;
3294 		tcb = (Tcpctl*)s->ptcl;
3295 		if(s->rport == pdest)
3296 		if(s->lport == psource)
3297 		if(tcb->state != Closed)
3298 		if(ipcmp(s->raddr, dest) == 0)
3299 		if(ipcmp(s->laddr, source) == 0){
3300 			qlock(s);
3301 			qunlock(tcp);
3302 			switch(tcb->state){
3303 			case Syn_sent:
3304 				localclose(s, msg);
3305 				break;
3306 			}
3307 			qunlock(s);
3308 			freeblist(bp);
3309 			return;
3310 		}
3311 	}
3312 	qunlock(tcp);
3313 	freeblist(bp);
3314 }
3315 
3316 static char*
tcpporthogdefensectl(char * val)3317 tcpporthogdefensectl(char *val)
3318 {
3319 	if(strcmp(val, "on") == 0)
3320 		tcpporthogdefense = 1;
3321 	else if(strcmp(val, "off") == 0)
3322 		tcpporthogdefense = 0;
3323 	else
3324 		return "unknown value for tcpporthogdefense";
3325 	return nil;
3326 }
3327 
3328 /* called with c qlocked */
3329 static char*
tcpctl(Conv * c,char ** f,int n)3330 tcpctl(Conv* c, char** f, int n)
3331 {
3332 	if(n == 1 && strcmp(f[0], "close") == 0)
3333 		return tcpclose2(c);
3334 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3335 		return tcphangup(c);
3336 	if(n == 1 && strcmp(f[0], "hangupxmit") == 0)
3337 		return tcpxmitclose(c);
3338 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3339 		return tcpstartka(c, f, n);
3340 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3341 		return tcpsetchecksum(c, f, n);
3342 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3343 		return tcpporthogdefensectl(f[1]);
3344 	return "unknown control request";
3345 }
3346 
3347 static int
tcpstats(Proto * tcp,char * buf,int len)3348 tcpstats(Proto *tcp, char *buf, int len)
3349 {
3350 	Tcppriv *priv;
3351 	char *p, *e;
3352 	int i;
3353 
3354 	priv = tcp->priv;
3355 	p = buf;
3356 	e = p+len;
3357 	for(i = 0; i < Nstats; i++)
3358 		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3359 	return p - buf;
3360 }
3361 
3362 /*
3363  *  garbage collect any stale conversations:
3364  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3365  *	- Finwait2 after 5 minutes
3366  *
3367  *  this is called whenever we run out of channels.  Both checks are
3368  *  of questionable validity so we try to use them only when we're
3369  *  up against the wall.
3370  */
3371 static int
tcpgc(Proto * tcp)3372 tcpgc(Proto *tcp)
3373 {
3374 	Conv *c, **pp, **ep;
3375 	int n;
3376 	Tcpctl *tcb;
3377 
3378 
3379 	n = 0;
3380 	ep = &tcp->conv[tcp->nc];
3381 	for(pp = tcp->conv; pp < ep; pp++) {
3382 		c = *pp;
3383 		if(c == nil)
3384 			break;
3385 		if(!canqlock(c))
3386 			continue;
3387 		tcb = (Tcpctl*)c->ptcl;
3388 		switch(tcb->state){
3389 		case Syn_received:
3390 			if(NOW - tcb->time > 5000){
3391 				localclose(c, Etimedout);
3392 				n++;
3393 			}
3394 			break;
3395 		case Finwait2:
3396 			if(NOW - tcb->time > 5*60*1000){
3397 				localclose(c, Etimedout);
3398 				n++;
3399 			}
3400 			break;
3401 		}
3402 		qunlock(c);
3403 	}
3404 	return n;
3405 }
3406 
3407 static void
tcpsettimer(Tcpctl * tcb)3408 tcpsettimer(Tcpctl *tcb)
3409 {
3410 	int x;
3411 
3412 	/* round trip dependency */
3413 	x = backoff(tcb->backoff) *
3414 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3415 
3416 	/* bounded twixt 0.3 and 64 seconds */
3417 	if(x < 300/MSPTICK)
3418 		x = 300/MSPTICK;
3419 	else if(x > (64000/MSPTICK))
3420 		x = 64000/MSPTICK;
3421 	tcb->timer.start = x;
3422 }
3423 
3424 void
tcpinit(Fs * fs)3425 tcpinit(Fs *fs)
3426 {
3427 	Proto *tcp;
3428 	Tcppriv *tpriv;
3429 
3430 	tcp = smalloc(sizeof(Proto));
3431 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3432 	tcp->name = "tcp";
3433 	tcp->connect = tcpconnect;
3434 	tcp->announce = tcpannounce;
3435 	tcp->ctl = tcpctl;
3436 	tcp->state = tcpstate;
3437 	tcp->create = tcpcreate;
3438 	tcp->close = tcpclose;
3439 	tcp->rcv = tcpiput;
3440 	tcp->advise = tcpadvise;
3441 	tcp->stats = tcpstats;
3442 	tcp->inuse = tcpinuse;
3443 	tcp->gc = tcpgc;
3444 	tcp->ipproto = IP_TCPPROTO;
3445 	tcp->nc = scalednconv();
3446 	tcp->ptclsize = sizeof(Tcpctl);
3447 	tpriv->stats[MaxConn] = tcp->nc;
3448 
3449 	Fsproto(fs, tcp);
3450 }
3451 
3452 static void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3453 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3454 {
3455 	/*
3456 	 * guess at reasonable queue sizes.  there's no current way
3457 	 * to know how many nic receive buffers we can safely tie up in the
3458 	 * tcp stack, and we don't adjust our queues to maximize throughput
3459 	 * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
3460 	 * respected, but we still control our own buffer commitment by
3461 	 * keeping a seperate qscale.
3462 	 */
3463 	tcb->rcv.scale = rcvscale & 0xff;
3464 	tcb->snd.scale = sndscale & 0xff;
3465 	tcb->qscale = rcvscale & 0xff;
3466 	if(rcvscale > Maxqscale)
3467 		tcb->qscale = Maxqscale;
3468 
3469 	if(rcvscale != tcb->rcv.scale)
3470 		netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3471 			"qlen %d >> window %ud lport %d\n",
3472 			tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3473 	tcb->window = QMAX << tcb->qscale;
3474 	tcb->ssthresh = tcb->window;
3475 
3476 	/*
3477 	 * it's important to set wq large enough to cover the full
3478 	 * bandwidth-delay product.  it's possible to be in loss
3479 	 * recovery with a big window, and we need to keep sending
3480 	 * into the inflated window.  the difference can be huge
3481 	 * for even modest (70ms) ping times.
3482 	 */
3483 	qsetlimit(s->rq, tcb->window);
3484 	qsetlimit(s->wq, tcb->window);
3485 	tcprcvwin(s);
3486 }
3487