xref: /plan9/sys/src/9/ip/tcp.c (revision 8e860520aa6bfda4bc4f23391be5d54c6baed7e6)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
/*
 *  Protocol constants: header sizes, timer and retransmit parameters,
 *  TCP flag bits, option codes, Tcpctl.flags bits, connection states
 *  and limbo/queue sizing.
 */
10 enum
11 {
12 	QMAX		= 64*1024-1,	/* limit passed to qopen() for rq and wq */
13 	IP_TCPPROTO	= 6,
14 
15 	TCP4_IPLEN	= 8,		/* Tcp4hdr bytes before the pseudo header */
16 	TCP4_PHDRSIZE	= 12,		/* Tcp4hdr pseudo header bytes (Unused..tcpdst) */
17 	TCP4_HDRSIZE	= 20,		/* TCP header without options */
18 	TCP4_TCBPHDRSZ	= 40,		/* prototype header bytes copied by htontcp4() */
19 	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,	/* bytes in front of the TCP header */
20 
21 	TCP6_IPLEN	= 0,
22 	TCP6_PHDRSIZE	= 40,
23 	TCP6_HDRSIZE	= 20,		/* TCP header without options */
24 	TCP6_TCBPHDRSZ	= 60,		/* prototype header bytes copied by htontcp6() */
25 	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,	/* bytes in front of the TCP header */
26 
27 	TcptimerOFF	= 0,		/* Tcptimer.state values */
28 	TcptimerON	= 1,
29 	TcptimerDONE	= 2,
30 	MAX_TIME 	= (1<<20),	/* Forever */
31 	TCP_ACK		= 50,		/* Timed ack sequence in ms */
32 	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */
33 
34 	URG		= 0x20,		/* Data marked urgent */
35 	ACK		= 0x10,		/* Acknowledge is valid */
36 	PSH		= 0x08,		/* Whole data pipe is pushed */
37 	RST		= 0x04,		/* Reset connection */
38 	SYN		= 0x02,		/* Pkt. is synchronise */
39 	FIN		= 0x01,		/* Start close down */
40 
41 	EOLOPT		= 0,		/* TCP option kinds and lengths */
42 	NOOPOPT		= 1,
43 	MSSOPT		= 2,
44 	MSS_LENGTH	= 4,		/* Maximum segment size */
45 	WSOPT		= 3,
46 	WS_LENGTH	= 3,		/* Bits to scale window size by */
47 	MSL2		= 10,
48 	MSPTICK		= 50,		/* Milliseconds per timer tick */
49 	DEF_MSS		= 1460,		/* Default maximum segment */
50 	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
51 	DEF_RTT		= 500,		/* Default round trip */
52 	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
53 	TCP_LISTEN	= 0,		/* Listen connection */
54 	TCP_CONNECT	= 1,		/* Outgoing connection */
55 	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */
56 
57 	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */
58 
59 	FORCE		= 1,		/* bits or'ed into Tcpctl.flags */
60 	CLONE		= 2,
61 	RETRAN		= 4,
62 	ACTIVE		= 8,
63 	SYNACK		= 16,
64 
65 	LOGAGAIN	= 3,		/* srtt is stored shifted left by this (see inittcpctl) */
66 	LOGDGAIN	= 2,		/* presumably the matching shift for mdev -- TODO confirm */
67 
68 	Closed		= 0,		/* Connection states */
69 	Listen,
70 	Syn_sent,
71 	Syn_received,
72 	Established,
73 	Finwait1,
74 	Finwait2,
75 	Close_wait,
76 	Closing,
77 	Last_ack,
78 	Time_wait,
79 
80 	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
81 	NLHT		= 256,		/* hash table size, must be a power of 2 */
82 	LHTMASK		= NLHT-1,
83 
84 	/*
85 	 * window is 64kb * 2ⁿ
86 	 * these factors determine the ultimate bandwidth-delay product.
87 	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 *
	 * NOTE(review): the constants below are 4, which gives
	 * 64kb * 2⁴ = 1mb rather than the 2mb worked out above;
	 * confirm which of the comment or the values is intended.
88 	 */
89 	Maxqscale	= 4,		/* maximum queuing scale */
90 	Defadvscale	= 4,		/* default advertisement */
91 };
92 
/*
 *  Printable names of the connection states, indexed by Tcpctl.state
 *  (tcpstate() prints tcpstates[s->state]).
 */
93 /* Must correspond to the enumeration above: same order as Closed..Time_wait */
94 char *tcpstates[] =
95 {
96 	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
97 	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
98 	"Closing", 	"Last_ack", 	"Time_wait"
99 };
100 
/*
 *  A tick-driven timer.  Running timers are linked through next/prev
 *  onto Tcppriv.timers by timerstate(); tcpackproc() decrements count
 *  once per MSPTICK and calls func(arg) when it reaches zero.
 */
101 typedef struct Tcptimer Tcptimer;
102 struct Tcptimer
103 {
104 	Tcptimer	*next;		/* active timer list links */
105 	Tcptimer	*prev;
106 	Tcptimer	*readynext;	/* list of timers that expired in the current tick */
107 	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
108 	int	start;			/* initial count in ticks; 0 means tcpgo() ignores the timer */
109 	int	count;			/* ticks left; loaded from start by tcpgo() */
110 	void	(*func)(void*);		/* called on expiry if not nil */
111 	void	*arg;			/* argument passed to func */
112 };
113 
114 /*
115  *  v4 and v6 pseudo headers used for
116  *  checksumming tcp
117  */
/*
 *  Tcp4hdr overlays the start of an IPv4 packet.  The first TCP4_IPLEN (8)
 *  bytes are skipped by the checksum; the next TCP4_PHDRSIZE (12) bytes,
 *  Unused through tcpdst, are the pseudo header; the TCP header follows.
 */
118 typedef struct Tcp4hdr Tcp4hdr;
119 struct Tcp4hdr
120 {
121 	uchar	vihl;		/* Version and header length */
122 	uchar	tos;		/* Type of service */
123 	uchar	length[2];	/* packet length */
124 	uchar	id[2];		/* Identification */
125 	uchar	frag[2];	/* Fragment information */
126 	uchar	Unused;
127 	uchar	proto;
128 	uchar	tcplen[2];	/* TCP header + data length, set by htontcp4() */
129 	uchar	tcpsrc[4];
130 	uchar	tcpdst[4];
131 	/* same as v6 from here on */
132 	uchar	tcpsport[2];
133 	uchar	tcpdport[2];
134 	uchar	tcpseq[4];
135 	uchar	tcpack[4];
136 	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
137 	uchar	tcpwin[2];
138 	uchar	tcpcksum[2];
139 	uchar	tcpurg[2];
140 	/* Options segment */
141 	uchar	tcpopt[1];
142 };
143 
/*
 *  Tcp6hdr overlays an IPv6 header (TCP6_PHDRSIZE = 40 bytes) followed
 *  by the TCP header.  htontcp6() temporarily rewrites vcf, ploadlen,
 *  proto and ttl into pseudo-header form for the checksum and then
 *  restores the real IP header fields.
 */
144 typedef struct Tcp6hdr Tcp6hdr;
145 struct Tcp6hdr
146 {
147 	uchar	vcf[4];
148 	uchar	ploadlen[2];
149 	uchar	proto;
150 	uchar	ttl;
151 	uchar	tcpsrc[IPaddrlen];
152 	uchar	tcpdst[IPaddrlen];
153 	/* same as v4 from here on */
154 	uchar	tcpsport[2];
155 	uchar	tcpdport[2];
156 	uchar	tcpseq[4];
157 	uchar	tcpack[4];
158 	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
159 	uchar	tcpwin[2];
160 	uchar	tcpcksum[2];
161 	uchar	tcpurg[2];
162 	/* Options segment */
163 	uchar	tcpopt[1];
164 };
165 
166 /*
167  *  this represents the control info
168  *  for a single packet.  It is derived from
169  *  a packet in ntohtcp{4,6}() and stuck into
170  *  a packet in htontcp{4,6}().
171  */
172 typedef struct Tcp Tcp;
173 struct	Tcp
174 {
175 	ushort	source;	/* source port */
176 	ushort	dest;	/* destination port */
177 	ulong	seq;
178 	ulong	ack;
179 	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
180 	uchar	update;	/* cleared by ntohtcp{4,6}() */
181 	ushort	ws;	/* window scale option */
182 	ulong	wnd;	/* prescaled window*/
183 	ushort	urg;
184 	ushort	mss;	/* max segment size option (if not zero) */
185 	ushort	len;	/* size of data */
186 };
187 
188 /*
189  *  this header is malloc'd to thread together fragments
190  *  waiting to be coalesced
191  */
192 typedef struct Reseq Reseq;
193 struct Reseq
194 {
195 	Reseq	*next;		/* next entry on Tcpctl.reseq */
196 	Tcp	seg;		/* control info of the queued segment */
197 	Block	*bp;		/* presumably the segment's data -- TODO confirm in addreseq() */
198 	ushort	length;		/* presumably the data length passed to addreseq() -- TODO confirm */
199 };
200 
201 /*
202  *  the qlock in the Conv locks this structure
203  */
/*
 *  Per-conversation TCP control block, reached through Conv.ptcl and
 *  zeroed and initialised by inittcpctl().
 */
204 typedef struct Tcpctl Tcpctl;
205 struct Tcpctl
206 {
207 	uchar	state;			/* Connection state */
208 	uchar	type;			/* Listening or active connection */
209 	uchar	code;			/* Icmp code */
210 	struct {
211 		ulong	una;		/* Unacked data pointer */
212 		ulong	nxt;		/* Next sequence expected */
213 		ulong	ptr;		/* Data pointer */
214 		ulong	wnd;		/* Tcp send window */
215 		ulong	urg;		/* Urgent data pointer */
216 		ulong	wl2;
217 		uint	scale;		/* how much to right shift window */
218 					/* in xmitted packets */
219 		/* to implement tahoe and reno TCP */
220 		ulong	dupacks;	/* number of duplicate acks rcvd */
221 		ulong	partialack;
222 		int	recovery;	/* loss recovery flag */
223 		int	retransmit;	/* retransmit 1 packet @ una flag */
224 		int	rto;
225 		ulong	rxt;		/* right window marker for recovery */
226 					/* "recover" rfc3782 */
227 	} snd;
228 	struct {
229 		ulong	nxt;		/* Receive pointer to next uchar slot */
230 		ulong	wnd;		/* Receive window incoming */
231 		ulong	wsnt;		/* Last wptr sent.  important to */
232 					/* track for large bdp */
233 		ulong	wptr;		/* right edge of the window: rcv.nxt + rcv.wnd (see tcprcvwin) */
234 		ulong	urg;		/* Urgent pointer */
235 		ulong	ackptr;		/* last acked sequence */
236 		int	blocked;	/* set by tcprcvwin() when the window becomes very small */
237 		uint	scale;		/* how much to left shift window in */
238 					/* rcv'd packets */
239 	} rcv;
240 	ulong	iss;			/* Initial sequence number */
241 	ulong	cwind;			/* Congestion window */
242 	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
243 	uint	scale;			/* desired snd.scale */
244 	ulong	ssthresh;		/* Slow start threshold */
245 	int	resent;			/* Bytes just resent */
246 	int	irs;			/* Initial received sequence */
247 	ushort	mss;			/* Maximum segment size */
248 	int	rerecv;			/* Overlap of data rereceived */
249 	ulong	window;			/* Our receive window (queue) */
250 	uint	qscale;			/* Log2 of our receive window (queue) */
251 	uchar	backoff;		/* Exponential backoff counter */
252 	int	backedoff;		/* ms we've backed off for rexmits */
253 	uchar	flags;			/* State flags */
254 	Reseq	*reseq;			/* Resequencing queue */
255 	int	nreseq;
256 	int	reseqlen;
257 	Tcptimer	timer;			/* Activity timer */
258 	Tcptimer	acktimer;		/* Acknowledge timer */
259 	Tcptimer	rtt_timer;		/* Round trip timer */
260 	Tcptimer	katimer;		/* keep alive timer */
261 	ulong	rttseq;			/* Round trip sequence */
262 	int	srtt;			/* Smoothed round trip */
263 	int	mdev;			/* Mean deviation of round trip */
264 	int	kacounter;		/* count down for keep alive */
265 	uint	sndsyntime;		/* time syn sent */
266 	ulong	time;			/* time Finwait2 or Syn_received was sent */
267 	ulong	timeuna;		/* snd.una when time was set */
268 	int	nochecksum;		/* non-zero means don't send checksums */
269 	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */
					/* NOTE(review): "SEQ" is presumably SYN -- confirm */
270 
271 	union {
272 		Tcp4hdr	tcp4hdr;
273 		Tcp6hdr	tcp6hdr;
274 	} protohdr;		/* prototype header */
275 };
276 
277 /*
278  *  New calls are put in limbo rather than having a conversation structure
279  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
280  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
281  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
282  *
283  *  In particular they aren't on a listener's queue so that they don't figure
284  *  in the input queue limit.
285  *
286  *  If 1/2 of a T3 was attacking with SYN packets, we'd have a permanent queue
287  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
288  *  there is no hashing of this list.
289  */
290 typedef struct Limbo Limbo;
291 struct Limbo
292 {
293 	Limbo	*next;
294 
295 	uchar	laddr[IPaddrlen];
296 	uchar	raddr[IPaddrlen];
297 	ushort	lport;
298 	ushort	rport;
299 	ulong	irs;		/* initial received sequence */
300 	ulong	iss;		/* initial sent sequence */
301 	ushort	mss;		/* mss from the other end */
302 	ushort	rcvscale;	/* how much to scale rcvd windows */
303 	ushort	sndscale;	/* how much to scale sent windows */
304 	ulong	lastsend;	/* last time we sent a synack */
305 	uchar	version;	/* v4 or v6 */
306 	uchar	rexmits;	/* number of retransmissions */
307 };
308 
309 int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time, in ms (divided by MSPTICK for timer ticks) */
310 
/*
 *  Indices into Tcppriv.stats[] and statnames[]; Nstats is the
 *  number of counters.
 */
311 enum {
312 	/* MIB stats */
313 	MaxConn,
314 	Mss,
315 	ActiveOpens,
316 	PassiveOpens,
317 	EstabResets,
318 	CurrEstab,
319 	InSegs,
320 	OutSegs,
321 	RetransSegs,
322 	RetransSegsSent,
323 	RetransTimeouts,
324 	InErrs,
325 	OutRsts,
326 
327 	/* non-MIB stats */
328 	CsumErrs,
329 	HlenErrs,
330 	LenErrs,
331 	Resequenced,
332 	OutOfOrder,
333 	ReseqBytelim,
334 	ReseqPktlim,
335 	Delayack,
336 	Wopenack,
337 
338 	Recovery,
339 	RecoveryDone,
340 	RecoveryRTO,
341 	RecoveryNoSeq,
342 	RecoveryCwind,
343 	RecoveryPA,
344 
345 	Nstats
346 };
347 
/*
 *  Printable name of each counter, indexed by the stat enum.  The
 *  entries use designated indices, so their textual order (e.g.
 *  OutOfOrder before Resequenced) need not match the enum order.
 */
348 static char *statnames[Nstats] =
349 {
350 [MaxConn]	"MaxConn",
351 [Mss]		"MaxSegment",
352 [ActiveOpens]	"ActiveOpens",
353 [PassiveOpens]	"PassiveOpens",
354 [EstabResets]	"EstabResets",
355 [CurrEstab]	"CurrEstab",
356 [InSegs]	"InSegs",
357 [OutSegs]	"OutSegs",
358 [RetransSegs]	"RetransSegs",
359 [RetransSegsSent]	"RetransSegsSent",
360 [RetransTimeouts]	"RetransTimeouts",
361 [InErrs]	"InErrs",
362 [OutRsts]	"OutRsts",
363 [CsumErrs]	"CsumErrs",
364 [HlenErrs]	"HlenErrs",
365 [LenErrs]	"LenErrs",
366 [OutOfOrder]	"OutOfOrder",
367 [Resequenced]	"Resequenced",
368 [ReseqBytelim]	"ReseqBytelim",
369 [ReseqPktlim]	"ReseqPktlim",
370 [Delayack]	"Delayack",
371 [Wopenack]	"Wopenack",
372 
373 [Recovery]	"Recovery",
374 [RecoveryDone]	"RecoveryDone",
375 [RecoveryRTO]	"RecoveryRTO",
376 
377 [RecoveryNoSeq]	"RecoveryNoSeq",
378 [RecoveryCwind]	"RecoveryCwind",
379 [RecoveryPA]	"RecoveryPA",
380 };
381 
/*
 *  Per-protocol private state, reached through Proto.priv.
 */
382 typedef struct Tcppriv Tcppriv;
383 struct Tcppriv
384 {
385 	/* List of active timers */
386 	QLock 	tl;		/* held by tcpgo(), tcphalt() and tcpackproc() around timers */
387 	Tcptimer *timers;
388 
389 	/* hash table for matching conversations */
390 	Ipht	ht;
391 
392 	/* calls in limbo waiting for an ACK to our SYN ACK */
393 	int	nlimbo;
394 	Limbo	*lht[NLHT];
395 
396 	/* for keeping track of tcpackproc */
397 	QLock	apl;		/* held by tcpstart() while starting the kproc */
398 	int	ackprocstarted;	/* non-zero once tcpackproc has been started */
399 
400 	uvlong	stats[Nstats];	/* counters indexed by the stat enum */
401 };
402 
403 /*
404  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
405  *  solution to hijacked systems staking out ports as a form
406  *  of DoS attack.
407  *
408  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
409  *  that number gets acked by the other end, we shut down the connection.
410  *  Look for tcpporthogdefense in the code.
411  */
412 int tcpporthogdefense = 0;
413 
/*
 *  Forward declarations of file-local functions that are used
 *  before they are defined.
 */
414 static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
415 static	int	dumpreseq(Tcpctl*);
416 static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
417 static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
418 static	void	limborexmit(Proto*);
419 static	void	localclose(Conv*, char*);
420 static	void	procsyn(Conv*, Tcp*);
421 static	void	tcpacktimer(void*);
422 static	void	tcpiput(Proto*, Ipifc*, Block*);
423 static	void	tcpkeepalive(void*);
424 static	void	tcpoutput(Conv*);
425 static	void	tcprcvwin(Conv*);
426 static	void	tcprxmit(Conv*);
427 static	void	tcpsetkacounter(Tcpctl*);
428 static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
429 static	void	tcpsettimer(Tcpctl*);
430 static	void	tcpsndsyn(Conv*, Tcpctl*);
431 static	void	tcpstart(Conv*, int);
432 static	void	tcpsynackrtt(Conv*);
433 static	void	tcptimeout(void*);
434 static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
435 
436 static void
tcpsetstate(Conv * s,uchar newstate)437 tcpsetstate(Conv *s, uchar newstate)
438 {
439 	Tcpctl *tcb;
440 	uchar oldstate;
441 	Tcppriv *tpriv;
442 
443 	tpriv = s->p->priv;
444 
445 	tcb = (Tcpctl*)s->ptcl;
446 
447 	oldstate = tcb->state;
448 	if(oldstate == newstate)
449 		return;
450 
451 	if(oldstate == Established)
452 		tpriv->stats[CurrEstab]--;
453 	if(newstate == Established)
454 		tpriv->stats[CurrEstab]++;
455 
456 	switch(newstate) {
457 	case Closed:
458 		qclose(s->rq);
459 		qclose(s->wq);
460 		qclose(s->eq);
461 		break;
462 
463 	case Close_wait:		/* Remote closes */
464 		qhangup(s->rq, nil);
465 		break;
466 	}
467 
468 	tcb->state = newstate;
469 
470 	if(oldstate == Syn_sent && newstate != Closed)
471 		Fsconnected(s, nil);
472 }
473 
474 static char*
tcpconnect(Conv * c,char ** argv,int argc)475 tcpconnect(Conv *c, char **argv, int argc)
476 {
477 	char *e;
478 	Tcpctl *tcb;
479 
480 	tcb = (Tcpctl*)(c->ptcl);
481 	if(tcb->state != Closed)
482 		return Econinuse;
483 
484 	e = Fsstdconnect(c, argv, argc);
485 	if(e != nil)
486 		return e;
487 	tcpstart(c, TCP_CONNECT);
488 
489 	return nil;
490 }
491 
492 static int
tcpstate(Conv * c,char * state,int n)493 tcpstate(Conv *c, char *state, int n)
494 {
495 	Tcpctl *s;
496 
497 	s = (Tcpctl*)(c->ptcl);
498 
499 	return snprint(state, n,
500 		"%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
501 		"swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
502 		"timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
503 		tcpstates[s->state],
504 		c->rq ? qlen(c->rq) : 0,
505 		c->wq ? qlen(c->wq) : 0,
506 		s->nreseq, s->reseqlen,
507 		s->srtt, s->mdev, s->ssthresh,
508 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
509 		s->qscale,
510 		s->timer.start, s->timer.count, s->rerecv,
511 		s->katimer.start, s->katimer.count);
512 }
513 
514 static int
tcpinuse(Conv * c)515 tcpinuse(Conv *c)
516 {
517 	Tcpctl *s;
518 
519 	s = (Tcpctl*)(c->ptcl);
520 	return s->state != Closed;
521 }
522 
523 static char*
tcpannounce(Conv * c,char ** argv,int argc)524 tcpannounce(Conv *c, char **argv, int argc)
525 {
526 	char *e;
527 	Tcpctl *tcb;
528 
529 	tcb = (Tcpctl*)(c->ptcl);
530 	if(tcb->state != Closed)
531 		return Econinuse;
532 
533 	e = Fsstdannounce(c, argv, argc);
534 	if(e != nil)
535 		return e;
536 	tcpstart(c, TCP_LISTEN);
537 	Fsconnected(c, nil);
538 
539 	return nil;
540 }
541 
542 /*
543  *  tcpclose is always called with the q locked
544  */
545 static void
tcpclose(Conv * c)546 tcpclose(Conv *c)
547 {
548 	Tcpctl *tcb;
549 
550 	tcb = (Tcpctl*)c->ptcl;
551 
552 	qhangup(c->rq, nil);
553 	qhangup(c->wq, nil);
554 	qhangup(c->eq, nil);
555 	qflush(c->rq);
556 
557 	switch(tcb->state) {
558 	case Listen:
559 		/*
560 		 *  reset any incoming calls to this listener
561 		 */
562 		Fsconnected(c, "Hangup");
563 
564 		localclose(c, nil);
565 		break;
566 	case Closed:
567 	case Syn_sent:
568 		localclose(c, nil);
569 		break;
570 	case Syn_received:
571 	case Established:
572 		tcb->flgcnt++;
573 		tcb->snd.nxt++;
574 		tcpsetstate(c, Finwait1);
575 		tcpoutput(c);
576 		break;
577 	case Close_wait:
578 		tcb->flgcnt++;
579 		tcb->snd.nxt++;
580 		tcpsetstate(c, Last_ack);
581 		tcpoutput(c);
582 		break;
583 	}
584 }
585 
586 static void
tcpkick(void * x)587 tcpkick(void *x)
588 {
589 	Conv *s = x;
590 	Tcpctl *tcb;
591 
592 	tcb = (Tcpctl*)s->ptcl;
593 
594 	if(waserror()){
595 		qunlock(s);
596 		nexterror();
597 	}
598 	qlock(s);
599 
600 	switch(tcb->state) {
601 	case Syn_sent:
602 	case Syn_received:
603 	case Established:
604 	case Close_wait:
605 		/*
606 		 * Push data
607 		 */
608 		tcpoutput(s);
609 		break;
610 	default:
611 		localclose(s, "Hangup");
612 		break;
613 	}
614 
615 	qunlock(s);
616 	poperror();
617 }
618 
/* forward declaration: tcprcvwin() below calls seq_lt() before its definition */
619 static int seq_lt(ulong, ulong);
620 
621 static void
tcprcvwin(Conv * s)622 tcprcvwin(Conv *s)				/* Call with tcb locked */
623 {
624 	int w;
625 	Tcpctl *tcb;
626 
627 	tcb = (Tcpctl*)s->ptcl;
628 	w = tcb->window - qlen(s->rq);
629 	if(w < 0)
630 		w = 0;
631 	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
632 	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
633 		w = tcb->rcv.wptr - tcb->rcv.nxt;
634 	if(w != tcb->rcv.wnd)
635 	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
636 		tcb->rcv.blocked = 1;
637 		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
638 			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
639 	}
640 	tcb->rcv.wnd = w;
641 	tcb->rcv.wptr = tcb->rcv.nxt + w;
642 }
643 
644 static void
tcpacktimer(void * v)645 tcpacktimer(void *v)
646 {
647 	Tcpctl *tcb;
648 	Conv *s;
649 
650 	s = v;
651 	tcb = (Tcpctl*)s->ptcl;
652 
653 	if(waserror()){
654 		qunlock(s);
655 		nexterror();
656 	}
657 	qlock(s);
658 	if(tcb->state != Closed){
659 		tcb->flags |= FORCE;
660 		tcpoutput(s);
661 	}
662 	qunlock(s);
663 	poperror();
664 }
665 
666 static void
tcpcongestion(Tcpctl * tcb)667 tcpcongestion(Tcpctl *tcb)
668 {
669 	ulong inflight;
670 
671 	inflight = tcb->snd.nxt - tcb->snd.una;
672 	if(inflight > tcb->cwind)
673 		inflight = tcb->cwind;
674 	tcb->ssthresh = inflight / 2;
675 	if(tcb->ssthresh < 2*tcb->mss)
676 		tcb->ssthresh = 2*tcb->mss;
677 }
678 
/*
 *  Slow-start growth limit, in segments, used by tcpabcincr() when
 *  snd.rto is clear.
 *  NOTE(review): the interval in the comment below excludes 2, yet the
 *  value used is 2 -- presumably (1.0, 2.0] is meant; confirm.
 */
679 enum {
680 	L	= 2,	/* aggressive slow start; legal values ∈ (1.0, 2.0) */
681 };
682 
683 static void
tcpabcincr(Tcpctl * tcb,uint acked)684 tcpabcincr(Tcpctl *tcb, uint acked)
685 {
686 	uint limit;
687 
688 	tcb->abcbytes += acked;
689 	if(tcb->cwind < tcb->ssthresh){
690 		/* slow start */
691 		if(tcb->snd.rto)
692 			limit = tcb->mss;
693 		else
694 			limit = L*tcb->mss;
695 		tcb->cwind += MIN(tcb->abcbytes, limit);
696 		tcb->abcbytes = 0;
697 	} else {
698 		tcb->snd.rto = 0;
699 		/* avoidance */
700 		if(tcb->abcbytes >= tcb->cwind){
701 			tcb->abcbytes -= tcb->cwind;
702 			tcb->cwind += tcb->mss;
703 		}
704 	}
705 }
706 
707 static void
tcpcreate(Conv * c)708 tcpcreate(Conv *c)
709 {
710 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
711 	c->wq = qopen(QMAX, Qkick, tcpkick, c);
712 }
713 
714 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)715 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
716 {
717 	if(newstate != TcptimerON){
718 		if(t->state == TcptimerON){
719 			/* unchain */
720 			if(priv->timers == t){
721 				priv->timers = t->next;
722 				if(t->prev != nil)
723 					panic("timerstate1");
724 			}
725 			if(t->next)
726 				t->next->prev = t->prev;
727 			if(t->prev)
728 				t->prev->next = t->next;
729 			t->next = t->prev = nil;
730 		}
731 	} else {
732 		if(t->state != TcptimerON){
733 			/* chain */
734 			if(t->prev != nil || t->next != nil)
735 				panic("timerstate2");
736 			t->prev = nil;
737 			t->next = priv->timers;
738 			if(t->next)
739 				t->next->prev = t;
740 			priv->timers = t;
741 		}
742 	}
743 	t->state = newstate;
744 }
745 
746 static void
tcpackproc(void * a)747 tcpackproc(void *a)
748 {
749 	Tcptimer *t, *tp, *timeo;
750 	Proto *tcp;
751 	Tcppriv *priv;
752 	int loop;
753 
754 	tcp = a;
755 	priv = tcp->priv;
756 
757 	for(;;) {
758 		tsleep(&up->sleep, return0, 0, MSPTICK);
759 
760 		qlock(&priv->tl);
761 		timeo = nil;
762 		loop = 0;
763 		for(t = priv->timers; t != nil; t = tp) {
764 			if(loop++ > 10000)
765 				panic("tcpackproc1");
766 			tp = t->next;
767  			if(t->state == TcptimerON) {
768 				t->count--;
769 				if(t->count == 0) {
770 					timerstate(priv, t, TcptimerDONE);
771 					t->readynext = timeo;
772 					timeo = t;
773 				}
774 			}
775 		}
776 		qunlock(&priv->tl);
777 
778 		loop = 0;
779 		for(t = timeo; t != nil; t = t->readynext) {
780 			if(loop++ > 10000)
781 				panic("tcpackproc2");
782 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
783 				(*t->func)(t->arg);
784 				poperror();
785 			}
786 		}
787 
788 		limborexmit(tcp);
789 	}
790 }
791 
792 static void
tcpgo(Tcppriv * priv,Tcptimer * t)793 tcpgo(Tcppriv *priv, Tcptimer *t)
794 {
795 	if(t == nil || t->start == 0)
796 		return;
797 
798 	qlock(&priv->tl);
799 	t->count = t->start;
800 	timerstate(priv, t, TcptimerON);
801 	qunlock(&priv->tl);
802 }
803 
804 static void
tcphalt(Tcppriv * priv,Tcptimer * t)805 tcphalt(Tcppriv *priv, Tcptimer *t)
806 {
807 	if(t == nil)
808 		return;
809 
810 	qlock(&priv->tl);
811 	timerstate(priv, t, TcptimerOFF);
812 	qunlock(&priv->tl);
813 }
814 
/*
 *  Exponential backoff multiplier: 2 to the power n.
 *
 *  Shifting an int by a negative amount, or by its width or more, is
 *  undefined, and shifting 1 into the sign bit overflows.  The argument
 *  comes from a uchar counter (Tcpctl.backoff) that can in principle
 *  exceed that range, so n is clamped: negative values yield 1 and
 *  large values yield the largest positive power of two an int holds.
 */
static int
backoff(int n)
{
	enum { Maxshift = 8*sizeof(int) - 2 };

	if(n < 0)
		return 1;
	if(n > Maxshift)
		n = Maxshift;
	return 1 << n;
}
820 
821 static void
localclose(Conv * s,char * reason)822 localclose(Conv *s, char *reason)	/* called with tcb locked */
823 {
824 	Tcpctl *tcb;
825 	Tcppriv *tpriv;
826 
827 	tpriv = s->p->priv;
828 	tcb = (Tcpctl*)s->ptcl;
829 
830 	iphtrem(&tpriv->ht, s);
831 
832 	tcphalt(tpriv, &tcb->timer);
833 	tcphalt(tpriv, &tcb->rtt_timer);
834 	tcphalt(tpriv, &tcb->acktimer);
835 	tcphalt(tpriv, &tcb->katimer);
836 
837 	/* Flush reassembly queue; nothing more can arrive */
838 	dumpreseq(tcb);
839 
840 	if(tcb->state == Syn_sent)
841 		Fsconnected(s, reason);
842 	if(s->state == Announced)
843 		wakeup(&s->listenr);
844 
845 	qhangup(s->rq, reason);
846 	qhangup(s->wq, reason);
847 
848 	tcpsetstate(s, Closed);
849 }
850 
851 /* mtu (- TCP + IP hdr len) of 1st hop */
852 static int
tcpmtu(Proto * tcp,uchar * addr,int version,uint * scale)853 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
854 {
855 	Ipifc *ifc;
856 	int mtu;
857 
858 	ifc = findipifc(tcp->f, addr, 0);
859 	switch(version){
860 	default:
861 	case V4:
862 		mtu = DEF_MSS;
863 		if(ifc != nil)
864 			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
865 		break;
866 	case V6:
867 		mtu = DEF_MSS6;
868 		if(ifc != nil)
869 			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
870 		break;
871 	}
872 	/*
873 	 * set the ws.  it doesn't commit us to anything.
874 	 * ws is the ultimate limit to the bandwidth-delay product.
875 	 */
876 	*scale = Defadvscale;
877 
878 	return mtu;
879 }
880 
881 static void
inittcpctl(Conv * s,int mode)882 inittcpctl(Conv *s, int mode)
883 {
884 	Tcpctl *tcb;
885 	Tcp4hdr* h4;
886 	Tcp6hdr* h6;
887 	Tcppriv *tpriv;
888 	int mss;
889 
890 	tcb = (Tcpctl*)s->ptcl;
891 
892 	memset(tcb, 0, sizeof(Tcpctl));
893 
894 	tcb->ssthresh = QMAX;			/* reset by tcpsetscale() */
895 	tcb->srtt = tcp_irtt<<LOGAGAIN;
896 	tcb->mdev = 0;
897 
898 	/* setup timers */
899 	tcb->timer.start = tcp_irtt / MSPTICK;
900 	tcb->timer.func = tcptimeout;
901 	tcb->timer.arg = s;
902 	tcb->rtt_timer.start = MAX_TIME;
903 	tcb->acktimer.start = TCP_ACK / MSPTICK;
904 	tcb->acktimer.func = tcpacktimer;
905 	tcb->acktimer.arg = s;
906 	tcb->katimer.start = DEF_KAT / MSPTICK;
907 	tcb->katimer.func = tcpkeepalive;
908 	tcb->katimer.arg = s;
909 
910 	mss = DEF_MSS;
911 
912 	/* create a prototype(pseudo) header */
913 	if(mode != TCP_LISTEN){
914 		if(ipcmp(s->laddr, IPnoaddr) == 0)
915 			findlocalip(s->p->f, s->laddr, s->raddr);
916 
917 		switch(s->ipversion){
918 		case V4:
919 			h4 = &tcb->protohdr.tcp4hdr;
920 			memset(h4, 0, sizeof(*h4));
921 			h4->proto = IP_TCPPROTO;
922 			hnputs(h4->tcpsport, s->lport);
923 			hnputs(h4->tcpdport, s->rport);
924 			v6tov4(h4->tcpsrc, s->laddr);
925 			v6tov4(h4->tcpdst, s->raddr);
926 			break;
927 		case V6:
928 			h6 = &tcb->protohdr.tcp6hdr;
929 			memset(h6, 0, sizeof(*h6));
930 			h6->proto = IP_TCPPROTO;
931 			hnputs(h6->tcpsport, s->lport);
932 			hnputs(h6->tcpdport, s->rport);
933 			ipmove(h6->tcpsrc, s->laddr);
934 			ipmove(h6->tcpdst, s->raddr);
935 			mss = DEF_MSS6;
936 			break;
937 		default:
938 			panic("inittcpctl: version %d", s->ipversion);
939 		}
940 	}
941 
942 	tcb->mss = tcb->cwind = mss;
943 	tcb->abcbytes = 0;
944 	tpriv = s->p->priv;
945 	tpriv->stats[Mss] = tcb->mss;
946 
947 	/* default is no window scaling */
948 	tcpsetscale(s, tcb, 0, 0);
949 }
950 
951 /*
952  *  called with s qlocked
953  */
954 static void
tcpstart(Conv * s,int mode)955 tcpstart(Conv *s, int mode)
956 {
957 	Tcpctl *tcb;
958 	Tcppriv *tpriv;
959 	char kpname[KNAMELEN];
960 
961 	tpriv = s->p->priv;
962 
963 	if(tpriv->ackprocstarted == 0){
964 		qlock(&tpriv->apl);
965 		if(tpriv->ackprocstarted == 0){
966 			snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
967 			kproc(kpname, tcpackproc, s->p);
968 			tpriv->ackprocstarted = 1;
969 		}
970 		qunlock(&tpriv->apl);
971 	}
972 
973 	tcb = (Tcpctl*)s->ptcl;
974 
975 	inittcpctl(s, mode);
976 
977 	iphtadd(&tpriv->ht, s);
978 	switch(mode) {
979 	case TCP_LISTEN:
980 		tpriv->stats[PassiveOpens]++;
981 		tcb->flags |= CLONE;
982 		tcpsetstate(s, Listen);
983 		break;
984 
985 	case TCP_CONNECT:
986 		tpriv->stats[ActiveOpens]++;
987 		tcb->flags |= ACTIVE;
988 		tcpsndsyn(s, tcb);
989 		tcpsetstate(s, Syn_sent);
990 		tcpoutput(s);
991 		break;
992 	}
993 }
994 
995 static char*
tcpflag(char * buf,char * e,ushort flag)996 tcpflag(char *buf, char *e, ushort flag)
997 {
998 	char *p;
999 
1000 	p = seprint(buf, e, "%d", flag>>10);	/* Head len */
1001 	if(flag & URG)
1002 		p = seprint(p, e, " URG");
1003 	if(flag & ACK)
1004 		p = seprint(p, e, " ACK");
1005 	if(flag & PSH)
1006 		p = seprint(p, e, " PSH");
1007 	if(flag & RST)
1008 		p = seprint(p, e, " RST");
1009 	if(flag & SYN)
1010 		p = seprint(p, e, " SYN");
1011 	if(flag & FIN)
1012 		p = seprint(p, e, " FIN");
1013 	USED(p);
1014 	return buf;
1015 }
1016 
1017 static Block*
htontcp6(Tcp * tcph,Block * data,Tcp6hdr * ph,Tcpctl * tcb)1018 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1019 {
1020 	int dlen;
1021 	Tcp6hdr *h;
1022 	ushort csum;
1023 	ushort hdrlen, optpad = 0;
1024 	uchar *opt;
1025 
1026 	hdrlen = TCP6_HDRSIZE;
1027 	if(tcph->flags & SYN){
1028 		if(tcph->mss)
1029 			hdrlen += MSS_LENGTH;
1030 		if(tcph->ws)
1031 			hdrlen += WS_LENGTH;
1032 		optpad = hdrlen & 3;
1033 		if(optpad)
1034 			optpad = 4 - optpad;
1035 		hdrlen += optpad;
1036 	}
1037 
1038 	if(data) {
1039 		dlen = blocklen(data);
1040 		data = padblock(data, hdrlen + TCP6_PKT);
1041 		if(data == nil)
1042 			return nil;
1043 	}
1044 	else {
1045 		dlen = 0;
1046 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
1047 		if(data == nil)
1048 			return nil;
1049 		data->wp += hdrlen + TCP6_PKT;
1050 	}
1051 
1052 	/* copy in pseudo ip header plus port numbers */
1053 	h = (Tcp6hdr *)(data->rp);
1054 	memmove(h, ph, TCP6_TCBPHDRSZ);
1055 
1056 	/* compose pseudo tcp header, do cksum calculation */
1057 	hnputl(h->vcf, hdrlen + dlen);
1058 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1059 	h->ttl = ph->proto;
1060 
1061 	/* copy in variable bits */
1062 	hnputl(h->tcpseq, tcph->seq);
1063 	hnputl(h->tcpack, tcph->ack);
1064 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1065 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1066 	hnputs(h->tcpurg, tcph->urg);
1067 
1068 	if(tcph->flags & SYN){
1069 		opt = h->tcpopt;
1070 		if(tcph->mss != 0){
1071 			*opt++ = MSSOPT;
1072 			*opt++ = MSS_LENGTH;
1073 			hnputs(opt, tcph->mss);
1074 			opt += 2;
1075 		}
1076 		if(tcph->ws != 0){
1077 			*opt++ = WSOPT;
1078 			*opt++ = WS_LENGTH;
1079 			*opt++ = tcph->ws;
1080 		}
1081 		while(optpad-- > 0)
1082 			*opt++ = NOOPOPT;
1083 	}
1084 
1085 	if(tcb != nil && tcb->nochecksum){
1086 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1087 	} else {
1088 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1089 		hnputs(h->tcpcksum, csum);
1090 	}
1091 
1092 	/* move from pseudo header back to normal ip header */
1093 	memset(h->vcf, 0, 4);
1094 	h->vcf[0] = IP_VER6;
1095 	hnputs(h->ploadlen, hdrlen+dlen);
1096 	h->proto = ph->proto;
1097 
1098 	return data;
1099 }
1100 
1101 static Block*
htontcp4(Tcp * tcph,Block * data,Tcp4hdr * ph,Tcpctl * tcb)1102 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1103 {
1104 	int dlen;
1105 	Tcp4hdr *h;
1106 	ushort csum;
1107 	ushort hdrlen, optpad = 0;
1108 	uchar *opt;
1109 
1110 	hdrlen = TCP4_HDRSIZE;
1111 	if(tcph->flags & SYN){
1112 		if(tcph->mss)
1113 			hdrlen += MSS_LENGTH;
1114 		if(1)
1115 			hdrlen += WS_LENGTH;
1116 		optpad = hdrlen & 3;
1117 		if(optpad)
1118 			optpad = 4 - optpad;
1119 		hdrlen += optpad;
1120 	}
1121 
1122 	if(data) {
1123 		dlen = blocklen(data);
1124 		data = padblock(data, hdrlen + TCP4_PKT);
1125 		if(data == nil)
1126 			return nil;
1127 	}
1128 	else {
1129 		dlen = 0;
1130 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1131 		if(data == nil)
1132 			return nil;
1133 		data->wp += hdrlen + TCP4_PKT;
1134 	}
1135 
1136 	/* copy in pseudo ip header plus port numbers */
1137 	h = (Tcp4hdr *)(data->rp);
1138 	memmove(h, ph, TCP4_TCBPHDRSZ);
1139 
1140 	/* copy in variable bits */
1141 	hnputs(h->tcplen, hdrlen + dlen);
1142 	hnputl(h->tcpseq, tcph->seq);
1143 	hnputl(h->tcpack, tcph->ack);
1144 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1145 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1146 	hnputs(h->tcpurg, tcph->urg);
1147 
1148 	if(tcph->flags & SYN){
1149 		opt = h->tcpopt;
1150 		if(tcph->mss != 0){
1151 			*opt++ = MSSOPT;
1152 			*opt++ = MSS_LENGTH;
1153 			hnputs(opt, tcph->mss);
1154 			opt += 2;
1155 		}
1156 		/* always offer.  rfc1323 §2.2 */
1157 		if(1){
1158 			*opt++ = WSOPT;
1159 			*opt++ = WS_LENGTH;
1160 			*opt++ = tcph->ws;
1161 		}
1162 		while(optpad-- > 0)
1163 			*opt++ = NOOPOPT;
1164 	}
1165 
1166 	if(tcb != nil && tcb->nochecksum){
1167 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1168 	} else {
1169 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1170 		hnputs(h->tcpcksum, csum);
1171 	}
1172 
1173 	return data;
1174 }
1175 
1176 static int
ntohtcp6(Tcp * tcph,Block ** bpp)1177 ntohtcp6(Tcp *tcph, Block **bpp)
1178 {
1179 	Tcp6hdr *h;
1180 	uchar *optr;
1181 	ushort hdrlen;
1182 	ushort optlen;
1183 	int n;
1184 
1185 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1186 	if(*bpp == nil)
1187 		return -1;
1188 
1189 	h = (Tcp6hdr *)((*bpp)->rp);
1190 	tcph->source = nhgets(h->tcpsport);
1191 	tcph->dest = nhgets(h->tcpdport);
1192 	tcph->seq = nhgetl(h->tcpseq);
1193 	tcph->ack = nhgetl(h->tcpack);
1194 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1195 	if(hdrlen < TCP6_HDRSIZE) {
1196 		freeblist(*bpp);
1197 		return -1;
1198 	}
1199 
1200 	tcph->flags = h->tcpflag[1];
1201 	tcph->wnd = nhgets(h->tcpwin);
1202 	tcph->urg = nhgets(h->tcpurg);
1203 	tcph->mss = 0;
1204 	tcph->ws = 0;
1205 	tcph->update = 0;
1206 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1207 
1208 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1209 	if(*bpp == nil)
1210 		return -1;
1211 
1212 	optr = h->tcpopt;
1213 	n = hdrlen - TCP6_HDRSIZE;
1214 	while(n > 0 && *optr != EOLOPT) {
1215 		if(*optr == NOOPOPT) {
1216 			n--;
1217 			optr++;
1218 			continue;
1219 		}
1220 		optlen = optr[1];
1221 		if(optlen < 2 || optlen > n)
1222 			break;
1223 		switch(*optr) {
1224 		case MSSOPT:
1225 			if(optlen == MSS_LENGTH)
1226 				tcph->mss = nhgets(optr+2);
1227 			break;
1228 		case WSOPT:
1229 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1230 				tcph->ws = *(optr+2);
1231 			break;
1232 		}
1233 		n -= optlen;
1234 		optr += optlen;
1235 	}
1236 	return hdrlen;
1237 }
1238 
1239 static int
ntohtcp4(Tcp * tcph,Block ** bpp)1240 ntohtcp4(Tcp *tcph, Block **bpp)
1241 {
1242 	Tcp4hdr *h;
1243 	uchar *optr;
1244 	ushort hdrlen;
1245 	ushort optlen;
1246 	int n;
1247 
1248 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1249 	if(*bpp == nil)
1250 		return -1;
1251 
1252 	h = (Tcp4hdr *)((*bpp)->rp);
1253 	tcph->source = nhgets(h->tcpsport);
1254 	tcph->dest = nhgets(h->tcpdport);
1255 	tcph->seq = nhgetl(h->tcpseq);
1256 	tcph->ack = nhgetl(h->tcpack);
1257 
1258 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1259 	if(hdrlen < TCP4_HDRSIZE) {
1260 		freeblist(*bpp);
1261 		return -1;
1262 	}
1263 
1264 	tcph->flags = h->tcpflag[1];
1265 	tcph->wnd = nhgets(h->tcpwin);
1266 	tcph->urg = nhgets(h->tcpurg);
1267 	tcph->mss = 0;
1268 	tcph->ws = 0;
1269 	tcph->update = 0;
1270 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1271 
1272 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1273 	if(*bpp == nil)
1274 		return -1;
1275 
1276 	optr = h->tcpopt;
1277 	n = hdrlen - TCP4_HDRSIZE;
1278 	while(n > 0 && *optr != EOLOPT) {
1279 		if(*optr == NOOPOPT) {
1280 			n--;
1281 			optr++;
1282 			continue;
1283 		}
1284 		optlen = optr[1];
1285 		if(optlen < 2 || optlen > n)
1286 			break;
1287 		switch(*optr) {
1288 		case MSSOPT:
1289 			if(optlen == MSS_LENGTH)
1290 				tcph->mss = nhgets(optr+2);
1291 			break;
1292 		case WSOPT:
1293 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1294 				tcph->ws = *(optr+2);
1295 			break;
1296 		}
1297 		n -= optlen;
1298 		optr += optlen;
1299 	}
1300 	return hdrlen;
1301 }
1302 
1303 /*
1304  *  For outgoing calls, generate an initial sequence
1305  *  number and put a SYN on the send queue
1306  */
1307 static void
tcpsndsyn(Conv * s,Tcpctl * tcb)1308 tcpsndsyn(Conv *s, Tcpctl *tcb)
1309 {
1310 	Tcppriv *tpriv;
1311 
1312 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1313 	tcb->rttseq = tcb->iss;
1314 	tcb->snd.wl2 = tcb->iss;
1315 	tcb->snd.una = tcb->iss;
1316 	tcb->snd.rxt = tcb->iss;
1317 	tcb->snd.ptr = tcb->rttseq;
1318 	tcb->snd.nxt = tcb->rttseq;
1319 	tcb->flgcnt++;
1320 	tcb->flags |= FORCE;
1321 	tcb->sndsyntime = NOW;
1322 
1323 	/* set desired mss and scale */
1324 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1325 	tpriv = s->p->priv;
1326 	tpriv->stats[Mss] = tcb->mss;
1327 }
1328 
1329 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1330 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1331 {
1332 	Block *hbp;
1333 	uchar rflags;
1334 	Tcppriv *tpriv;
1335 	Tcp4hdr ph4;
1336 	Tcp6hdr ph6;
1337 
1338 	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1339 
1340 	tpriv = tcp->priv;
1341 
1342 	if(seg->flags & RST)
1343 		return;
1344 
1345 	/* make pseudo header */
1346 	switch(version) {
1347 	case V4:
1348 		memset(&ph4, 0, sizeof(ph4));
1349 		ph4.vihl = IP_VER4;
1350 		v6tov4(ph4.tcpsrc, dest);
1351 		v6tov4(ph4.tcpdst, source);
1352 		ph4.proto = IP_TCPPROTO;
1353 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1354 		hnputs(ph4.tcpsport, seg->dest);
1355 		hnputs(ph4.tcpdport, seg->source);
1356 		break;
1357 	case V6:
1358 		memset(&ph6, 0, sizeof(ph6));
1359 		ph6.vcf[0] = IP_VER6;
1360 		ipmove(ph6.tcpsrc, dest);
1361 		ipmove(ph6.tcpdst, source);
1362 		ph6.proto = IP_TCPPROTO;
1363 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1364 		hnputs(ph6.tcpsport, seg->dest);
1365 		hnputs(ph6.tcpdport, seg->source);
1366 		break;
1367 	default:
1368 		panic("sndrst: version %d", version);
1369 	}
1370 
1371 	tpriv->stats[OutRsts]++;
1372 	rflags = RST;
1373 
1374 	/* convince the other end that this reset is in band */
1375 	if(seg->flags & ACK) {
1376 		seg->seq = seg->ack;
1377 		seg->ack = 0;
1378 	}
1379 	else {
1380 		rflags |= ACK;
1381 		seg->ack = seg->seq;
1382 		seg->seq = 0;
1383 		if(seg->flags & SYN)
1384 			seg->ack++;
1385 		seg->ack += length;
1386 		if(seg->flags & FIN)
1387 			seg->ack++;
1388 	}
1389 	seg->flags = rflags;
1390 	seg->wnd = 0;
1391 	seg->urg = 0;
1392 	seg->mss = 0;
1393 	seg->ws = 0;
1394 	switch(version) {
1395 	case V4:
1396 		hbp = htontcp4(seg, nil, &ph4, nil);
1397 		if(hbp == nil)
1398 			return;
1399 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1400 		break;
1401 	case V6:
1402 		hbp = htontcp6(seg, nil, &ph6, nil);
1403 		if(hbp == nil)
1404 			return;
1405 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1406 		break;
1407 	default:
1408 		panic("sndrst2: version %d", version);
1409 	}
1410 }
1411 
1412 /*
1413  *  send a reset to the remote side and close the conversation
1414  *  called with s qlocked
1415  */
1416 static char*
tcphangup(Conv * s)1417 tcphangup(Conv *s)
1418 {
1419 	Tcp seg;
1420 	Tcpctl *tcb;
1421 	Block *hbp;
1422 
1423 	tcb = (Tcpctl*)s->ptcl;
1424 	if(waserror())
1425 		return commonerror();
1426 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1427 		if(!waserror()){
1428 			memset(&seg, 0, sizeof seg);
1429 			seg.flags = RST | ACK;
1430 			seg.ack = tcb->rcv.nxt;
1431 			tcb->rcv.ackptr = seg.ack;
1432 			seg.seq = tcb->snd.ptr;
1433 			seg.wnd = 0;
1434 			seg.urg = 0;
1435 			seg.mss = 0;
1436 			seg.ws = 0;
1437 			switch(s->ipversion) {
1438 			case V4:
1439 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1440 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1441 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1442 				break;
1443 			case V6:
1444 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1445 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1446 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1447 				break;
1448 			default:
1449 				panic("tcphangup: version %d", s->ipversion);
1450 			}
1451 			poperror();
1452 		}
1453 	}
1454 	localclose(s, nil);
1455 	poperror();
1456 	return nil;
1457 }
1458 
1459 /*
1460  *  (re)send a SYN ACK
1461  */
1462 static int
sndsynack(Proto * tcp,Limbo * lp)1463 sndsynack(Proto *tcp, Limbo *lp)
1464 {
1465 	Block *hbp;
1466 	Tcp4hdr ph4;
1467 	Tcp6hdr ph6;
1468 	Tcp seg;
1469 	uint scale;
1470 
1471 	/* make pseudo header */
1472 	switch(lp->version) {
1473 	case V4:
1474 		memset(&ph4, 0, sizeof(ph4));
1475 		ph4.vihl = IP_VER4;
1476 		v6tov4(ph4.tcpsrc, lp->laddr);
1477 		v6tov4(ph4.tcpdst, lp->raddr);
1478 		ph4.proto = IP_TCPPROTO;
1479 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1480 		hnputs(ph4.tcpsport, lp->lport);
1481 		hnputs(ph4.tcpdport, lp->rport);
1482 		break;
1483 	case V6:
1484 		memset(&ph6, 0, sizeof(ph6));
1485 		ph6.vcf[0] = IP_VER6;
1486 		ipmove(ph6.tcpsrc, lp->laddr);
1487 		ipmove(ph6.tcpdst, lp->raddr);
1488 		ph6.proto = IP_TCPPROTO;
1489 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1490 		hnputs(ph6.tcpsport, lp->lport);
1491 		hnputs(ph6.tcpdport, lp->rport);
1492 		break;
1493 	default:
1494 		panic("sndrst: version %d", lp->version);
1495 	}
1496 
1497 	memset(&seg, 0, sizeof seg);
1498 	seg.seq = lp->iss;
1499 	seg.ack = lp->irs+1;
1500 	seg.flags = SYN|ACK;
1501 	seg.urg = 0;
1502 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1503 	seg.wnd = QMAX;
1504 
1505 	/* if the other side set scale, we should too */
1506 	if(lp->rcvscale){
1507 		seg.ws = scale;
1508 		lp->sndscale = scale;
1509 	} else {
1510 		seg.ws = 0;
1511 		lp->sndscale = 0;
1512 	}
1513 
1514 	switch(lp->version) {
1515 	case V4:
1516 		hbp = htontcp4(&seg, nil, &ph4, nil);
1517 		if(hbp == nil)
1518 			return -1;
1519 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1520 		break;
1521 	case V6:
1522 		hbp = htontcp6(&seg, nil, &ph6, nil);
1523 		if(hbp == nil)
1524 			return -1;
1525 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1526 		break;
1527 	default:
1528 		panic("sndsnack: version %d", lp->version);
1529 	}
1530 	lp->lastsend = NOW;
1531 	return 0;
1532 }
1533 
/* hash the last two bytes of address a and port p into an index for the limbo table (lht) */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1535 
1536 /*
1537  *  put a call into limbo and respond with a SYN ACK
1538  *
1539  *  called with proto locked
1540  */
1541 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1542 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1543 {
1544 	Limbo *lp, **l;
1545 	Tcppriv *tpriv;
1546 	int h;
1547 
1548 	tpriv = s->p->priv;
1549 	h = hashipa(source, seg->source);
1550 
1551 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1552 		lp = *l;
1553 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1554 			continue;
1555 		if(ipcmp(lp->raddr, source) != 0)
1556 			continue;
1557 		if(ipcmp(lp->laddr, dest) != 0)
1558 			continue;
1559 
1560 		/* each new SYN restarts the retransmits */
1561 		lp->irs = seg->seq;
1562 		break;
1563 	}
1564 	lp = *l;
1565 	if(lp == nil){
1566 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1567 			lp = tpriv->lht[h];
1568 			tpriv->lht[h] = lp->next;
1569 			lp->next = nil;
1570 		} else {
1571 			lp = malloc(sizeof(*lp));
1572 			if(lp == nil)
1573 				return;
1574 			tpriv->nlimbo++;
1575 		}
1576 		*l = lp;
1577 		lp->version = version;
1578 		ipmove(lp->laddr, dest);
1579 		ipmove(lp->raddr, source);
1580 		lp->lport = seg->dest;
1581 		lp->rport = seg->source;
1582 		lp->mss = seg->mss;
1583 		lp->rcvscale = seg->ws;
1584 		lp->irs = seg->seq;
1585 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1586 	}
1587 
1588 	if(sndsynack(s->p, lp) < 0){
1589 		*l = lp->next;
1590 		tpriv->nlimbo--;
1591 		free(lp);
1592 	}
1593 }
1594 
1595 /*
1596  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1597  */
1598 static void
limborexmit(Proto * tcp)1599 limborexmit(Proto *tcp)
1600 {
1601 	Tcppriv *tpriv;
1602 	Limbo **l, *lp;
1603 	int h;
1604 	int seen;
1605 	ulong now;
1606 
1607 	tpriv = tcp->priv;
1608 
1609 	if(!canqlock(tcp))
1610 		return;
1611 	seen = 0;
1612 	now = NOW;
1613 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1614 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1615 			lp = *l;
1616 			seen++;
1617 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1618 				continue;
1619 
1620 			/* time it out after 1 second */
1621 			if(++(lp->rexmits) > 5){
1622 				tpriv->nlimbo--;
1623 				*l = lp->next;
1624 				free(lp);
1625 				continue;
1626 			}
1627 
1628 			/* if we're being attacked, don't bother resending SYN ACK's */
1629 			if(tpriv->nlimbo > 100)
1630 				continue;
1631 
1632 			if(sndsynack(tcp, lp) < 0){
1633 				tpriv->nlimbo--;
1634 				*l = lp->next;
1635 				free(lp);
1636 				continue;
1637 			}
1638 
1639 			l = &lp->next;
1640 		}
1641 	}
1642 	qunlock(tcp);
1643 }
1644 
1645 /*
1646  *  lookup call in limbo.  if found, throw it out.
1647  *
1648  *  called with proto locked
1649  */
1650 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1651 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1652 {
1653 	Limbo *lp, **l;
1654 	int h;
1655 	Tcppriv *tpriv;
1656 
1657 	tpriv = s->p->priv;
1658 
1659 	/* find a call in limbo */
1660 	h = hashipa(src, segp->source);
1661 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1662 		lp = *l;
1663 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1664 			continue;
1665 		if(ipcmp(lp->laddr, dst) != 0)
1666 			continue;
1667 		if(ipcmp(lp->raddr, src) != 0)
1668 			continue;
1669 
1670 		/* RST can only follow the SYN */
1671 		if(segp->seq == lp->irs+1){
1672 			tpriv->nlimbo--;
1673 			*l = lp->next;
1674 			free(lp);
1675 		}
1676 		break;
1677 	}
1678 }
1679 
1680 static void
initialwindow(Tcpctl * tcb)1681 initialwindow(Tcpctl *tcb)
1682 {
1683 	/* RFC 3390 initial window */
1684 	if(tcb->mss < 1095)
1685 		tcb->cwind = 4*tcb->mss;
1686 	else if(tcb->mss < 2190)
1687 		tcb->cwind = 2*2190;
1688 	else
1689 		tcb->cwind = 2*tcb->mss;
1690 }
1691 
1692 /*
1693  *  come here when we finally get an ACK to our SYN-ACK.
1694  *  lookup call in limbo.  if found, create a new conversation
1695  *
1696  *  called with proto locked
1697  */
1698 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1699 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1700 {
1701 	Conv *new;
1702 	Tcpctl *tcb;
1703 	Tcppriv *tpriv;
1704 	Tcp4hdr *h4;
1705 	Tcp6hdr *h6;
1706 	Limbo *lp, **l;
1707 	int h;
1708 
1709 	/* unless it's just an ack, it can't be someone coming out of limbo */
1710 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1711 		return nil;
1712 
1713 	tpriv = s->p->priv;
1714 
1715 	/* find a call in limbo */
1716 	h = hashipa(src, segp->source);
1717 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1718 		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1719 			src, segp->source, lp->raddr, lp->rport,
1720 			dst, segp->dest, lp->laddr, lp->lport,
1721 			version, lp->version
1722  		);
1723 
1724 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1725 			continue;
1726 		if(ipcmp(lp->laddr, dst) != 0)
1727 			continue;
1728 		if(ipcmp(lp->raddr, src) != 0)
1729 			continue;
1730 
1731 		/* we're assuming no data with the initial SYN */
1732 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1733 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1734 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1735 			lp = nil;
1736 		} else {
1737 			tpriv->nlimbo--;
1738 			*l = lp->next;
1739 		}
1740 		break;
1741 	}
1742 	if(lp == nil)
1743 		return nil;
1744 
1745 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1746 	if(new == nil)
1747 		return nil;
1748 
1749 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1750 	tcb = (Tcpctl*)new->ptcl;
1751 	tcb->flags &= ~CLONE;
1752 	tcb->timer.arg = new;
1753 	tcb->timer.state = TcptimerOFF;
1754 	tcb->acktimer.arg = new;
1755 	tcb->acktimer.state = TcptimerOFF;
1756 	tcb->katimer.arg = new;
1757 	tcb->katimer.state = TcptimerOFF;
1758 	tcb->rtt_timer.arg = new;
1759 	tcb->rtt_timer.state = TcptimerOFF;
1760 
1761 	tcb->irs = lp->irs;
1762 	tcb->rcv.nxt = tcb->irs+1;
1763 	tcb->rcv.wptr = tcb->rcv.nxt;
1764 	tcb->rcv.wsnt = 0;
1765 	tcb->rcv.urg = tcb->rcv.nxt;
1766 
1767 	tcb->iss = lp->iss;
1768 	tcb->rttseq = tcb->iss;
1769 	tcb->snd.wl2 = tcb->iss;
1770 	tcb->snd.una = tcb->iss+1;
1771 	tcb->snd.ptr = tcb->iss+1;
1772 	tcb->snd.nxt = tcb->iss+1;
1773 	tcb->snd.rxt = tcb->iss+1;
1774 	tcb->flgcnt = 0;
1775 	tcb->flags |= SYNACK;
1776 
1777 	/* our sending max segment size cannot be bigger than what he asked for */
1778 	if(lp->mss != 0 && lp->mss < tcb->mss) {
1779 		tcb->mss = lp->mss;
1780 		tpriv->stats[Mss] = tcb->mss;
1781 	}
1782 
1783 	/* window scaling */
1784 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1785 
1786 	/* congestion window */
1787 	tcb->snd.wnd = segp->wnd;
1788 	initialwindow(tcb);
1789 
1790 	/* set initial round trip time */
1791 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1792 	tcpsynackrtt(new);
1793 
1794 	free(lp);
1795 
1796 	/* set up proto header */
1797 	switch(version){
1798 	case V4:
1799 		h4 = &tcb->protohdr.tcp4hdr;
1800 		memset(h4, 0, sizeof(*h4));
1801 		h4->proto = IP_TCPPROTO;
1802 		hnputs(h4->tcpsport, new->lport);
1803 		hnputs(h4->tcpdport, new->rport);
1804 		v6tov4(h4->tcpsrc, dst);
1805 		v6tov4(h4->tcpdst, src);
1806 		break;
1807 	case V6:
1808 		h6 = &tcb->protohdr.tcp6hdr;
1809 		memset(h6, 0, sizeof(*h6));
1810 		h6->proto = IP_TCPPROTO;
1811 		hnputs(h6->tcpsport, new->lport);
1812 		hnputs(h6->tcpdport, new->rport);
1813 		ipmove(h6->tcpsrc, dst);
1814 		ipmove(h6->tcpdst, src);
1815 		break;
1816 	default:
1817 		panic("tcpincoming: version %d", new->ipversion);
1818 	}
1819 
1820 	tcpsetstate(new, Established);
1821 
1822 	iphtadd(&tpriv->ht, new);
1823 
1824 	return new;
1825 }
1826 
1827 static int
seq_within(ulong x,ulong low,ulong high)1828 seq_within(ulong x, ulong low, ulong high)
1829 {
1830 	if(low <= high){
1831 		if(low <= x && x <= high)
1832 			return 1;
1833 	}
1834 	else {
1835 		if(x >= low || x <= high)
1836 			return 1;
1837 	}
1838 	return 0;
1839 }
1840 
1841 static int
seq_lt(ulong x,ulong y)1842 seq_lt(ulong x, ulong y)
1843 {
1844 	return (int)(x-y) < 0;
1845 }
1846 
1847 static int
seq_le(ulong x,ulong y)1848 seq_le(ulong x, ulong y)
1849 {
1850 	return (int)(x-y) <= 0;
1851 }
1852 
1853 static int
seq_gt(ulong x,ulong y)1854 seq_gt(ulong x, ulong y)
1855 {
1856 	return (int)(x-y) > 0;
1857 }
1858 
1859 static int
seq_ge(ulong x,ulong y)1860 seq_ge(ulong x, ulong y)
1861 {
1862 	return (int)(x-y) >= 0;
1863 }
1864 
1865 /*
1866  *  use the time between the first SYN and it's ack as the
1867  *  initial round trip time
1868  */
1869 static void
tcpsynackrtt(Conv * s)1870 tcpsynackrtt(Conv *s)
1871 {
1872 	Tcpctl *tcb;
1873 	int delta;
1874 	Tcppriv *tpriv;
1875 
1876 	tcb = (Tcpctl*)s->ptcl;
1877 	tpriv = s->p->priv;
1878 
1879 	delta = NOW - tcb->sndsyntime;
1880 	tcb->srtt = delta<<LOGAGAIN;
1881 	tcb->mdev = delta<<LOGDGAIN;
1882 
1883 	/* halt round trip timer */
1884 	tcphalt(tpriv, &tcb->rtt_timer);
1885 }
1886 
1887 static void
update(Conv * s,Tcp * seg)1888 update(Conv *s, Tcp *seg)
1889 {
1890 	int rtt, delta;
1891 	Tcpctl *tcb;
1892 	ulong acked;
1893 	Tcppriv *tpriv;
1894 
1895 	if(seg->update)
1896 		return;
1897 	seg->update = 1;
1898 
1899 	tpriv = s->p->priv;
1900 	tcb = (Tcpctl*)s->ptcl;
1901 
1902 	/* catch zero-window updates, update window & recover */
1903 	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
1904 	    seq_lt(seg->ack, tcb->snd.ptr)){
1905 		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1906 			seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
1907 		tcb->snd.wnd = seg->wnd;
1908 		goto recovery;
1909 	}
1910 
1911 	/* newreno fast retransmit */
1912 	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
1913 	    ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
1914 recovery:
1915 		if(tcb->snd.recovery){
1916 			tpriv->stats[RecoveryCwind]++;
1917 			tcb->cwind += tcb->mss;
1918 		}else if(seq_le(tcb->snd.rxt, seg->ack)){
1919 			tpriv->stats[Recovery]++;
1920 			tcb->abcbytes = 0;
1921 			tcb->snd.recovery = 1;
1922 			tcb->snd.partialack = 0;
1923 			tcb->snd.rxt = tcb->snd.nxt;
1924 			tcpcongestion(tcb);
1925 			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1926 			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1927 				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1928 			tcprxmit(s);
1929 		}else{
1930 			tpriv->stats[RecoveryNoSeq]++;
1931 			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1932 				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1933 			/* don't enter fast retransmit, don't change ssthresh */
1934 		}
1935 	}else if(tcb->snd.recovery){
1936 		tpriv->stats[RecoveryCwind]++;
1937 		tcb->cwind += tcb->mss;
1938 	}
1939 
1940 	/*
1941 	 *  update window
1942 	 */
1943 	if(seq_gt(seg->ack, tcb->snd.wl2)
1944 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1945 		/* clear dupack if we advance wl2 */
1946 		if(tcb->snd.wl2 != seg->ack)
1947 			tcb->snd.dupacks = 0;
1948 		tcb->snd.wnd = seg->wnd;
1949 		tcb->snd.wl2 = seg->ack;
1950 	}
1951 
1952 	if(!seq_gt(seg->ack, tcb->snd.una)){
1953 		/*
1954 		 *  don't let us hangup if sending into a closed window and
1955 		 *  we're still getting acks
1956 		 */
1957 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1958 			tcb->backedoff = MAXBACKMS/4;
1959 		return;
1960 	}
1961 
1962 	/* Compute the new send window size */
1963 	acked = seg->ack - tcb->snd.una;
1964 
1965 	/* avoid slow start and timers for SYN acks */
1966 	if((tcb->flags & SYNACK) == 0) {
1967 		tcb->flags |= SYNACK;
1968 		acked--;
1969 		tcb->flgcnt--;
1970 		goto done;
1971 	}
1972 
1973 	/*
1974 	 * congestion control
1975 	 */
1976 	if(tcb->snd.recovery){
1977 		if(seq_ge(seg->ack, tcb->snd.rxt)){
1978 			/* recovery finished; deflate window */
1979 			tpriv->stats[RecoveryDone]++;
1980 			tcb->snd.dupacks = 0;
1981 			tcb->snd.recovery = 0;
1982 			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
1983 			if(tcb->ssthresh < tcb->cwind)
1984 				tcb->cwind = tcb->ssthresh;
1985 			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
1986 				tcb->cwind, tcb->ssthresh);
1987 		} else {
1988 			/* partial ack; we lost more than one segment */
1989 			tpriv->stats[RecoveryPA]++;
1990 			if(tcb->cwind > acked)
1991 				tcb->cwind -= acked;
1992 			else{
1993 				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
1994 				tcb->cwind = tcb->mss;
1995 			}
1996 			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
1997 				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
1998 
1999 			if(acked >= tcb->mss)
2000 				tcb->cwind += tcb->mss;
2001 			tcb->snd.partialack++;
2002 		}
2003 	} else
2004 		tcpabcincr(tcb, acked);
2005 
2006 	/* Adjust the timers according to the round trip time */
2007 	/* TODO: fix sloppy treatment of overflow cases here. */
2008 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2009 		tcphalt(tpriv, &tcb->rtt_timer);
2010 		if((tcb->flags&RETRAN) == 0) {
2011 			tcb->backoff = 0;
2012 			tcb->backedoff = 0;
2013 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2014 			if(rtt == 0)
2015 				rtt = 1; /* else all close sys's will rexmit in 0 time */
2016 			rtt *= MSPTICK;
2017 			if(tcb->srtt == 0) {
2018 				tcb->srtt = rtt << LOGAGAIN;
2019 				tcb->mdev = rtt << LOGDGAIN;
2020 			} else {
2021 				delta = rtt - (tcb->srtt>>LOGAGAIN);
2022 				tcb->srtt += delta;
2023 				if(tcb->srtt <= 0)
2024 					tcb->srtt = 1;
2025 
2026 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2027 				tcb->mdev += delta;
2028 				if(tcb->mdev <= 0)
2029 					tcb->mdev = 1;
2030 			}
2031 			tcpsettimer(tcb);
2032 		}
2033 	}
2034 
2035 done:
2036 	if(qdiscard(s->wq, acked) < acked)
2037 		tcb->flgcnt--;
2038 	tcb->snd.una = seg->ack;
2039 
2040 	/* newreno fast recovery */
2041 	if(tcb->snd.recovery)
2042 		tcprxmit(s);
2043 
2044 	if(seq_gt(seg->ack, tcb->snd.urg))
2045 		tcb->snd.urg = seg->ack;
2046 
2047 	if(tcb->snd.una != tcb->snd.nxt){
2048 		/* `impatient' variant */
2049 		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2050 			tcb->time = NOW;
2051 			tcb->timeuna = tcb->snd.una;
2052 			tcpgo(tpriv, &tcb->timer);
2053 		}
2054 	} else
2055 		tcphalt(tpriv, &tcb->timer);
2056 
2057 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2058 		tcb->snd.ptr = tcb->snd.una;
2059 
2060 	if(!tcb->snd.recovery)
2061 		tcb->flags &= ~RETRAN;
2062 	tcb->backoff = 0;
2063 	tcb->backedoff = 0;
2064 }
2065 
2066 static void
tcpiput(Proto * tcp,Ipifc *,Block * bp)2067 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2068 {
2069 	Tcp seg;
2070 	Tcp4hdr *h4;
2071 	Tcp6hdr *h6;
2072 	int hdrlen;
2073 	Tcpctl *tcb;
2074 	ushort length, csum;
2075 	uchar source[IPaddrlen], dest[IPaddrlen];
2076 	Conv *s;
2077 	Fs *f;
2078 	Tcppriv *tpriv;
2079 	uchar version;
2080 
2081 	f = tcp->f;
2082 	tpriv = tcp->priv;
2083 
2084 	tpriv->stats[InSegs]++;
2085 
2086 	h4 = (Tcp4hdr*)(bp->rp);
2087 	h6 = (Tcp6hdr*)(bp->rp);
2088 
2089 	if((h4->vihl&0xF0)==IP_VER4) {
2090 		version = V4;
2091 		length = nhgets(h4->length);
2092 		v4tov6(dest, h4->tcpdst);
2093 		v4tov6(source, h4->tcpsrc);
2094 
2095 		h4->Unused = 0;
2096 		hnputs(h4->tcplen, length-TCP4_PKT);
2097 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2098 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2099 			tpriv->stats[CsumErrs]++;
2100 			tpriv->stats[InErrs]++;
2101 			netlog(f, Logtcp, "bad tcp proto cksum\n");
2102 			freeblist(bp);
2103 			return;
2104 		}
2105 
2106 		hdrlen = ntohtcp4(&seg, &bp);
2107 		if(hdrlen < 0){
2108 			tpriv->stats[HlenErrs]++;
2109 			tpriv->stats[InErrs]++;
2110 			netlog(f, Logtcp, "bad tcp hdr len\n");
2111 			return;
2112 		}
2113 
2114 		/* trim the packet to the size claimed by the datagram */
2115 		length -= hdrlen+TCP4_PKT;
2116 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2117 		if(bp == nil){
2118 			tpriv->stats[LenErrs]++;
2119 			tpriv->stats[InErrs]++;
2120 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2121 			return;
2122 		}
2123 	}
2124 	else {
2125 		int ttl = h6->ttl;
2126 		int proto = h6->proto;
2127 
2128 		version = V6;
2129 		length = nhgets(h6->ploadlen);
2130 		ipmove(dest, h6->tcpdst);
2131 		ipmove(source, h6->tcpsrc);
2132 
2133 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2134 		h6->ttl = proto;
2135 		hnputl(h6->vcf, length);
2136 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2137 		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2138 			tpriv->stats[CsumErrs]++;
2139 			tpriv->stats[InErrs]++;
2140 			netlog(f, Logtcp,
2141 			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2142 				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2143 			freeblist(bp);
2144 			return;
2145 		}
2146 		h6->ttl = ttl;
2147 		h6->proto = proto;
2148 		hnputs(h6->ploadlen, length);
2149 
2150 		hdrlen = ntohtcp6(&seg, &bp);
2151 		if(hdrlen < 0){
2152 			tpriv->stats[HlenErrs]++;
2153 			tpriv->stats[InErrs]++;
2154 			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2155 			return;
2156 		}
2157 
2158 		/* trim the packet to the size claimed by the datagram */
2159 		length -= hdrlen;
2160 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2161 		if(bp == nil){
2162 			tpriv->stats[LenErrs]++;
2163 			tpriv->stats[InErrs]++;
2164 			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2165 			return;
2166 		}
2167 	}
2168 
2169 	/* lock protocol while searching for a conversation */
2170 	qlock(tcp);
2171 
2172 	/* Look for a matching conversation */
2173 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2174 	if(s == nil){
2175 		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2176 			source, seg.source, dest, seg.dest);
2177 reset:
2178 		qunlock(tcp);
2179 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2180 		freeblist(bp);
2181 		return;
2182 	}
2183 
2184 	/* if it's a listener, look for the right flags and get a new conv */
2185 	tcb = (Tcpctl*)s->ptcl;
2186 	if(tcb->state == Listen){
2187 		if(seg.flags & RST){
2188 			limborst(s, &seg, source, dest, version);
2189 			qunlock(tcp);
2190 			freeblist(bp);
2191 			return;
2192 		}
2193 
2194 		/* if this is a new SYN, put the call into limbo */
2195 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2196 			limbo(s, source, dest, &seg, version);
2197 			qunlock(tcp);
2198 			freeblist(bp);
2199 			return;
2200 		}
2201 
2202 		/*
2203 		 *  if there's a matching call in limbo, tcpincoming will
2204 		 *  return it in state Syn_received
2205 		 */
2206 		s = tcpincoming(s, &seg, source, dest, version);
2207 		if(s == nil)
2208 			goto reset;
2209 	}
2210 
2211 	/* The rest of the input state machine is run with the control block
2212 	 * locked and implements the state machine directly out of the RFC.
2213 	 * Out-of-band data is ignored - it was always a bad idea.
2214 	 */
2215 	tcb = (Tcpctl*)s->ptcl;
2216 	if(waserror()){
2217 		qunlock(s);
2218 		nexterror();
2219 	}
2220 	qlock(s);
2221 	qunlock(tcp);
2222 
2223 	/* fix up window */
2224 	seg.wnd <<= tcb->rcv.scale;
2225 
2226 	/* every input packet in puts off the keep alive time out */
2227 	tcpsetkacounter(tcb);
2228 
2229 	switch(tcb->state) {
2230 	case Closed:
2231 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2232 		goto raise;
2233 	case Syn_sent:
2234 		if(seg.flags & ACK) {
2235 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2236 				sndrst(tcp, source, dest, length, &seg, version,
2237 					 "bad seq in Syn_sent");
2238 				goto raise;
2239 			}
2240 		}
2241 		if(seg.flags & RST) {
2242 			if(seg.flags & ACK)
2243 				localclose(s, Econrefused);
2244 			goto raise;
2245 		}
2246 
2247 		if(seg.flags & SYN) {
2248 			procsyn(s, &seg);
2249 			if(seg.flags & ACK){
2250 				update(s, &seg);
2251 				tcpsynackrtt(s);
2252 				tcpsetstate(s, Established);
2253 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2254 			}
2255 			else {
2256 				tcb->time = NOW;
2257 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2258 			}
2259 
2260 			if(length != 0 || (seg.flags & FIN))
2261 				break;
2262 
2263 			freeblist(bp);
2264 			goto output;
2265 		}
2266 		else
2267 			freeblist(bp);
2268 
2269 		qunlock(s);
2270 		poperror();
2271 		return;
2272 	case Syn_received:
2273 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2274 		if(seg.flags & ACK)
2275 			tcpsynackrtt(s);
2276 		break;
2277 	}
2278 
2279 	/*
2280 	 *  One DOS attack is to open connections to us and then forget about them,
2281 	 *  thereby tying up a conv at no long term cost to the attacker.
2282 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2283 	 *  corresponding code in tcpsendka().
2284 	 */
2285 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2286 		if(tcpporthogdefense
2287 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2288 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2289 				source, seg.source, dest, seg.dest, seg.flags,
2290 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2291 			localclose(s, "stateless hog");
2292 		}
2293 	}
2294 
2295 	/* Cut the data to fit the receive window */
2296 	tcprcvwin(s);
2297 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2298 		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2299 		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
2300 			"%lud-%lud l %d from %I\n", seg.seq,
2301 			seg.seq + length - 1, tcb->rcv.nxt,
2302 			tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
2303 		update(s, &seg);
2304 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2305 			tcphalt(tpriv, &tcb->rtt_timer);
2306 			tcphalt(tpriv, &tcb->acktimer);
2307 			tcphalt(tpriv, &tcb->katimer);
2308 			tcpsetstate(s, Time_wait);
2309 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2310 			tcpgo(tpriv, &tcb->timer);
2311 		}
2312 		if(!(seg.flags & RST)) {
2313 			tcb->flags |= FORCE;
2314 			goto output;
2315 		}
2316 		qunlock(s);
2317 		poperror();
2318 		return;
2319 	}
2320 
2321 	/* Cannot accept so answer with a rst */
2322 	if(length && tcb->state == Closed) {
2323 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2324 		goto raise;
2325 	}
2326 
2327 	/* The segment is beyond the current receive pointer so
2328 	 * queue the data in the resequence queue
2329 	 */
2330 	if(seg.seq != tcb->rcv.nxt)
2331 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2332 		update(s, &seg);
2333 		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2334 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
2335 				s->laddr, s->lport);
2336 		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
2337 		goto output;
2338 	}
2339 
2340 	if(tcb->nreseq > 0)
2341 		tcb->flags |= FORCE; /* filled hole in seq. space; RFC 5681 §3.2 */
2342 
2343 	/*
2344 	 *  keep looping till we've processed this packet plus any
2345 	 *  adjacent packets in the resequence queue
2346 	 */
2347 	for(;;) {
2348 		if(seg.flags & RST) {
2349 			if(tcb->state == Established) {
2350 				tpriv->stats[EstabResets]++;
2351 				if(tcb->rcv.nxt != seg.seq)
2352 					netlog(f, Logtcp, "out of order RST "
2353 						"rcvd: %I.%d -> %I.%d, rcv.nxt "
2354 						"%lux seq %lux\n",
2355 						s->raddr, s->rport, s->laddr,
2356 						s->lport, tcb->rcv.nxt, seg.seq);
2357 			}
2358 			localclose(s, Econrefused);
2359 			goto raise;
2360 		}
2361 
2362 		if((seg.flags&ACK) == 0)
2363 			goto raise;
2364 
2365 		switch(tcb->state) {
2366 		case Syn_received:
2367 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2368 				sndrst(tcp, source, dest, length, &seg, version,
2369 					"bad seq in Syn_received");
2370 				goto raise;
2371 			}
2372 			update(s, &seg);
2373 			tcpsetstate(s, Established);
2374 		case Established:
2375 		case Close_wait:
2376 			update(s, &seg);
2377 			break;
2378 		case Finwait1:
2379 			update(s, &seg);
2380 			if(qlen(s->wq)+tcb->flgcnt == 0){
2381 				tcphalt(tpriv, &tcb->rtt_timer);
2382 				tcphalt(tpriv, &tcb->acktimer);
2383 				tcpsetkacounter(tcb);
2384 				tcb->time = NOW;
2385 				tcpsetstate(s, Finwait2);
2386 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2387 				tcpgo(tpriv, &tcb->katimer);
2388 			}
2389 			break;
2390 		case Finwait2:
2391 			update(s, &seg);
2392 			break;
2393 		case Closing:
2394 			update(s, &seg);
2395 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2396 				tcphalt(tpriv, &tcb->rtt_timer);
2397 				tcphalt(tpriv, &tcb->acktimer);
2398 				tcphalt(tpriv, &tcb->katimer);
2399 				tcpsetstate(s, Time_wait);
2400 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2401 				tcpgo(tpriv, &tcb->timer);
2402 			}
2403 			break;
2404 		case Last_ack:
2405 			update(s, &seg);
2406 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2407 				localclose(s, nil);
2408 				goto raise;
2409 			}
2410 		case Time_wait:
2411 			tcb->flags |= FORCE;
2412 			if(tcb->timer.state != TcptimerON)
2413 				tcpgo(tpriv, &tcb->timer);
2414 		}
2415 
2416 		if((seg.flags&URG) && seg.urg) {
2417 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2418 				tcb->rcv.urg = seg.urg + seg.seq;
2419 				pullblock(&bp, seg.urg);
2420 			}
2421 		}
2422 		else
2423 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2424 			tcb->rcv.urg = tcb->rcv.nxt;
2425 
2426 		if(length == 0) {
2427 			if(bp != nil)
2428 				freeblist(bp);
2429 		}
2430 		else {
2431 			switch(tcb->state){
2432 			default:
2433 				/* Ignore segment text */
2434 				if(bp != nil)
2435 					freeblist(bp);
2436 				break;
2437 
2438 			case Syn_received:
2439 			case Established:
2440 			case Finwait1:
2441 				/* If we still have some data place on
2442 				 * receive queue
2443 				 */
2444 				if(bp) {
2445 					bp = packblock(bp);
2446 					if(bp == nil)
2447 						panic("tcp packblock");
2448 					qpassnolim(s->rq, bp);
2449 					bp = nil;
2450 				}
2451 				tcb->rcv.nxt += length;
2452 
2453 				/*
2454 				 *  turn on the acktimer if there's something
2455 				 *  to ack
2456 				 */
2457 				if(tcb->acktimer.state != TcptimerON)
2458 					tcpgo(tpriv, &tcb->acktimer);
2459 
2460 				break;
2461 			case Finwait2:
2462 				/* no process to read the data, send a reset */
2463 				if(bp != nil)
2464 					freeblist(bp);
2465 				sndrst(tcp, source, dest, length, &seg, version,
2466 					"send to Finwait2");
2467 				qunlock(s);
2468 				poperror();
2469 				return;
2470 			}
2471 		}
2472 
2473 		if(seg.flags & FIN) {
2474 			tcb->flags |= FORCE;
2475 
2476 			switch(tcb->state) {
2477 			case Syn_received:
2478 			case Established:
2479 				tcb->rcv.nxt++;
2480 				tcpsetstate(s, Close_wait);
2481 				break;
2482 			case Finwait1:
2483 				tcb->rcv.nxt++;
2484 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2485 					tcphalt(tpriv, &tcb->rtt_timer);
2486 					tcphalt(tpriv, &tcb->acktimer);
2487 					tcphalt(tpriv, &tcb->katimer);
2488 					tcpsetstate(s, Time_wait);
2489 					tcb->timer.start = MSL2*(1000/MSPTICK);
2490 					tcpgo(tpriv, &tcb->timer);
2491 				}
2492 				else
2493 					tcpsetstate(s, Closing);
2494 				break;
2495 			case Finwait2:
2496 				tcb->rcv.nxt++;
2497 				tcphalt(tpriv, &tcb->rtt_timer);
2498 				tcphalt(tpriv, &tcb->acktimer);
2499 				tcphalt(tpriv, &tcb->katimer);
2500 				tcpsetstate(s, Time_wait);
2501 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2502 				tcpgo(tpriv, &tcb->timer);
2503 				break;
2504 			case Close_wait:
2505 			case Closing:
2506 			case Last_ack:
2507 				break;
2508 			case Time_wait:
2509 				tcpgo(tpriv, &tcb->timer);
2510 				break;
2511 			}
2512 		}
2513 
2514 		/*
2515 		 *  get next adjacent segment from the resequence queue.
2516 		 *  dump/trim any overlapping segments
2517 		 */
2518 		for(;;) {
2519 			if(tcb->reseq == nil)
2520 				goto output;
2521 
2522 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2523 				goto output;
2524 
2525 			getreseq(tcb, &seg, &bp, &length);
2526 
2527 			tcprcvwin(s);
2528 			if(tcptrim(tcb, &seg, &bp, &length) == 0){
2529 				tcb->flags |= FORCE;
2530 				break;
2531 			}
2532 		}
2533 	}
2534 output:
2535 	tcpoutput(s);
2536 	qunlock(s);
2537 	poperror();
2538 	return;
2539 raise:
2540 	qunlock(s);
2541 	poperror();
2542 	freeblist(bp);
2543 	tcpkick(s);
2544 }
2545 
2546 /*
2547  *  always enters and exits with the s locked.  We drop
2548  *  the lock to ipoput the packet so some care has to be
2549  *  taken by callers.
2550  */
2551 static void
tcpoutput(Conv * s)2552 tcpoutput(Conv *s)
2553 {
2554 	Tcp seg;
2555 	uint msgs;
2556 	Tcpctl *tcb;
2557 	Block *hbp, *bp;
2558 	int sndcnt;
2559 	ulong ssize, dsize, sent;
2560 	Fs *f;
2561 	Tcppriv *tpriv;
2562 	uchar version;
2563 
2564 	f = s->p->f;
2565 	tpriv = s->p->priv;
2566 	version = s->ipversion;
2567 
2568 	tcb = (Tcpctl*)s->ptcl;
2569 
2570 	/* force ack every 2*mss */
2571 	if((tcb->flags & FORCE) == 0 &&
2572 	    tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2573 		tpriv->stats[Delayack]++;
2574 		tcb->flags |= FORCE;
2575 	}
2576 
2577 	/* force ack if window opening */
2578 	if((tcb->flags & FORCE) == 0){
2579 		tcprcvwin(s);
2580 		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2581 			tpriv->stats[Wopenack]++;
2582 			tcb->flags |= FORCE;
2583 		}
2584 	}
2585 
2586 	for(msgs = 0; msgs < 100; msgs++) {
2587 		switch(tcb->state) {
2588 		case Listen:
2589 		case Closed:
2590 		case Finwait2:
2591 			return;
2592 		}
2593 
2594 		/* Don't send anything else until our SYN has been acked */
2595 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2596 			break;
2597 
2598 		/* force an ack when a window has opened up */
2599 		tcprcvwin(s);
2600 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2601 			tcb->rcv.blocked = 0;
2602 			tcb->flags |= FORCE;
2603 		}
2604 
2605 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2606 		sent = tcb->snd.ptr - tcb->snd.una;
2607 		ssize = sndcnt;
2608 		if(tcb->snd.wnd == 0){
2609 			/* zero window probe */
2610 			if(sent > 0 && !(tcb->flags & FORCE))
2611 				break;	/* already probing, rto re-probes */
2612 			if(ssize < sent)
2613 				ssize = 0;
2614 			else{
2615 				ssize -= sent;
2616 				if(ssize > 0)
2617 					ssize = 1;
2618 			}
2619 		} else {
2620 			/* calculate usable segment size */
2621 			if(ssize > tcb->cwind)
2622 				ssize = tcb->cwind;
2623 			if(ssize > tcb->snd.wnd)
2624 				ssize = tcb->snd.wnd;
2625 
2626 			if(ssize < sent)
2627 				ssize = 0;
2628 			else {
2629 				ssize -= sent;
2630 				if(ssize > tcb->mss)
2631 					ssize = tcb->mss;
2632 			}
2633 		}
2634 
2635 		dsize = ssize;
2636 		seg.urg = 0;
2637 
2638 		if(!(tcb->flags & FORCE))
2639 			if(ssize == 0 ||
2640 			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
2641 			    sent > TCPREXMTTHRESH * tcb->mss)
2642 				break;
2643 
2644 		tcb->flags &= ~FORCE;
2645 
2646 		/* By default we will generate an ack */
2647 		tcphalt(tpriv, &tcb->acktimer);
2648 		seg.source = s->lport;
2649 		seg.dest = s->rport;
2650 		seg.flags = ACK;
2651 		seg.mss = 0;
2652 		seg.ws = 0;
2653 		seg.update = 0;
2654 		switch(tcb->state){
2655 		case Syn_sent:
2656 			seg.flags = 0;
2657 			if(tcb->snd.ptr == tcb->iss){
2658 				seg.flags |= SYN;
2659 				dsize--;
2660 				seg.mss = tcb->mss;
2661 				seg.ws = tcb->scale;
2662 			}
2663 			break;
2664 		case Syn_received:
2665 			/*
2666 			 *  don't send any data with a SYN/ACK packet
2667 			 *  because Linux rejects the packet in its
2668 			 *  attempt to solve the SYN attack problem
2669 			 */
2670 			if(tcb->snd.ptr == tcb->iss){
2671 				seg.flags |= SYN;
2672 				dsize = 0;
2673 				ssize = 1;
2674 				seg.mss = tcb->mss;
2675 				seg.ws = tcb->scale;
2676 			}
2677 			break;
2678 		}
2679 		seg.seq = tcb->snd.ptr;
2680 		seg.ack = tcb->rcv.nxt;
2681 		seg.wnd = tcb->rcv.wnd;
2682 
2683 		/* Pull out data to send */
2684 		bp = nil;
2685 		if(dsize != 0) {
2686 			bp = qcopy(s->wq, dsize, sent);
2687 			if(BLEN(bp) != dsize) {
2688 				seg.flags |= FIN;
2689 				dsize--;
2690 			}
2691 		}
2692 
2693 		if(sent+dsize == sndcnt && dsize)
2694 			seg.flags |= PSH;
2695 
2696 		tcb->snd.ptr += ssize;
2697 
2698 		/* Pull up the send pointer so we can accept acks
2699 		 * for this window
2700 		 */
2701 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2702 			tcb->snd.nxt = tcb->snd.ptr;
2703 
2704 		/* Build header, link data and compute cksum */
2705 		switch(version){
2706 		case V4:
2707 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2708 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2709 			if(hbp == nil) {
2710 				freeblist(bp);
2711 				return;
2712 			}
2713 			break;
2714 		case V6:
2715 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2716 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2717 			if(hbp == nil) {
2718 				freeblist(bp);
2719 				return;
2720 			}
2721 			break;
2722 		default:
2723 			hbp = nil;	/* to suppress a warning */
2724 			panic("tcpoutput: version %d", version);
2725 		}
2726 
2727 		/* Start the transmission timers if there is new data and we
2728 		 * expect acknowledges
2729 		 */
2730 		if(ssize != 0){
2731 			if(tcb->timer.state != TcptimerON){
2732 				tcb->time = NOW;
2733 				tcb->timeuna = tcb->snd.una;
2734 				tcpgo(tpriv, &tcb->timer);
2735 			}
2736 
2737 			/*  If round trip timer isn't running, start it.
2738 			 *  measure the longest packet only in case the
2739 			 *  transmission time dominates RTT
2740 			 */
2741 			if(tcb->snd.retransmit == 0)
2742 			if(tcb->rtt_timer.state != TcptimerON)
2743 			if(ssize == tcb->mss) {
2744 				tcpgo(tpriv, &tcb->rtt_timer);
2745 				tcb->rttseq = tcb->snd.ptr;
2746 			}
2747 		}
2748 
2749 		tpriv->stats[OutSegs]++;
2750 		if(tcb->snd.retransmit)
2751 			tpriv->stats[RetransSegsSent]++;
2752 		tcb->rcv.ackptr = seg.ack;
2753 		tcb->rcv.wsnt = tcb->rcv.wptr;
2754 
2755 		/* put off the next keep alive */
2756 		tcpgo(tpriv, &tcb->katimer);
2757 
2758 		switch(version){
2759 		case V4:
2760 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2761 				/* a negative return means no route */
2762 				localclose(s, "no route");
2763 			}
2764 			break;
2765 		case V6:
2766 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2767 				/* a negative return means no route */
2768 				localclose(s, "no route");
2769 			}
2770 			break;
2771 		default:
2772 			panic("tcpoutput2: version %d", version);
2773 		}
2774 		if((msgs%4) == 3){
2775 			qunlock(s);
2776 			qlock(s);
2777 		}
2778 	}
2779 }
2780 
2781 /*
2782  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2783  */
2784 static void
tcpsendka(Conv * s)2785 tcpsendka(Conv *s)
2786 {
2787 	Tcp seg;
2788 	Tcpctl *tcb;
2789 	Block *hbp,*dbp;
2790 
2791 	tcb = (Tcpctl*)s->ptcl;
2792 
2793 	dbp = nil;
2794 	memset(&seg, 0, sizeof seg);
2795 	seg.urg = 0;
2796 	seg.source = s->lport;
2797 	seg.dest = s->rport;
2798 	seg.flags = ACK|PSH;
2799 	seg.mss = 0;
2800 	seg.ws = 0;
2801 	if(tcpporthogdefense)
2802 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2803 	else
2804 		seg.seq = tcb->snd.una-1;
2805 	seg.ack = tcb->rcv.nxt;
2806 	tcb->rcv.ackptr = seg.ack;
2807 	tcprcvwin(s);
2808 	seg.wnd = tcb->rcv.wnd;
2809 	if(tcb->state == Finwait2){
2810 		seg.flags |= FIN;
2811 	} else {
2812 		dbp = allocb(1);
2813 		dbp->wp++;
2814 	}
2815 
2816 	if(isv4(s->raddr)) {
2817 		/* Build header, link data and compute cksum */
2818 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2819 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2820 		if(hbp == nil) {
2821 			freeblist(dbp);
2822 			return;
2823 		}
2824 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2825 	}
2826 	else {
2827 		/* Build header, link data and compute cksum */
2828 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2829 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2830 		if(hbp == nil) {
2831 			freeblist(dbp);
2832 			return;
2833 		}
2834 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2835 	}
2836 }
2837 
2838 /*
2839  *  set connection to time out after 12 minutes
2840  */
2841 static void
tcpsetkacounter(Tcpctl * tcb)2842 tcpsetkacounter(Tcpctl *tcb)
2843 {
2844 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2845 	if(tcb->kacounter < 3)
2846 		tcb->kacounter = 3;
2847 }
2848 
2849 /*
2850  *  if we've timed out, close the connection
2851  *  otherwise, send a keepalive and restart the timer
2852  */
2853 static void
tcpkeepalive(void * v)2854 tcpkeepalive(void *v)
2855 {
2856 	Tcpctl *tcb;
2857 	Conv *s;
2858 
2859 	s = v;
2860 	tcb = (Tcpctl*)s->ptcl;
2861 	if(waserror()){
2862 		qunlock(s);
2863 		nexterror();
2864 	}
2865 	qlock(s);
2866 	if(tcb->state != Closed){
2867 		if(--(tcb->kacounter) <= 0) {
2868 			localclose(s, Etimedout);
2869 		} else {
2870 			tcpsendka(s);
2871 			tcpgo(s->p->priv, &tcb->katimer);
2872 		}
2873 	}
2874 	qunlock(s);
2875 	poperror();
2876 }
2877 
2878 /*
2879  *  start keepalive timer
2880  */
2881 static char*
tcpstartka(Conv * s,char ** f,int n)2882 tcpstartka(Conv *s, char **f, int n)
2883 {
2884 	Tcpctl *tcb;
2885 	int x;
2886 
2887 	tcb = (Tcpctl*)s->ptcl;
2888 	if(tcb->state != Established)
2889 		return "connection must be in Establised state";
2890 	if(n > 1){
2891 		x = atoi(f[1]);
2892 		if(x >= MSPTICK)
2893 			tcb->katimer.start = x/MSPTICK;
2894 	}
2895 	tcpsetkacounter(tcb);
2896 	tcpgo(s->p->priv, &tcb->katimer);
2897 
2898 	return nil;
2899 }
2900 
2901 /*
2902  *  turn checksums on/off
2903  */
2904 static char*
tcpsetchecksum(Conv * s,char ** f,int)2905 tcpsetchecksum(Conv *s, char **f, int)
2906 {
2907 	Tcpctl *tcb;
2908 
2909 	tcb = (Tcpctl*)s->ptcl;
2910 	tcb->nochecksum = !atoi(f[1]);
2911 
2912 	return nil;
2913 }
2914 
2915 /*
2916  *  retransmit (at most) one segment at snd.una.
2917  *  preserve cwind & snd.ptr
2918  */
2919 static void
tcprxmit(Conv * s)2920 tcprxmit(Conv *s)
2921 {
2922 	Tcpctl *tcb;
2923 	Tcppriv *tpriv;
2924 	ulong tcwind, tptr;
2925 
2926 	tcb = (Tcpctl*)s->ptcl;
2927 	tcb->flags |= RETRAN|FORCE;
2928 
2929 	tptr = tcb->snd.ptr;
2930 	tcwind = tcb->cwind;
2931 	tcb->snd.ptr = tcb->snd.una;
2932 	tcb->cwind = tcb->mss;
2933 	tcb->snd.retransmit = 1;
2934 	tcpoutput(s);
2935 	tcb->snd.retransmit = 0;
2936 	tcb->cwind = tcwind;
2937 	tcb->snd.ptr = tptr;
2938 
2939 	tpriv = s->p->priv;
2940 	tpriv->stats[RetransSegs]++;
2941 }
2942 
2943 /*
2944  *  TODO: RFC 4138 F-RTO
2945  */
2946 static void
tcptimeout(void * arg)2947 tcptimeout(void *arg)
2948 {
2949 	Conv *s;
2950 	Tcpctl *tcb;
2951 	int maxback;
2952 	Tcppriv *tpriv;
2953 
2954 	s = (Conv*)arg;
2955 	tpriv = s->p->priv;
2956 	tcb = (Tcpctl*)s->ptcl;
2957 
2958 	if(waserror()){
2959 		qunlock(s);
2960 		nexterror();
2961 	}
2962 	qlock(s);
2963 	switch(tcb->state){
2964 	default:
2965 		tcb->backoff++;
2966 		if(tcb->state == Syn_sent)
2967 			maxback = MAXBACKMS/2;
2968 		else
2969 			maxback = MAXBACKMS;
2970 		tcb->backedoff += tcb->timer.start * MSPTICK;
2971 		if(tcb->backedoff >= maxback) {
2972 			localclose(s, Etimedout);
2973 			break;
2974 		}
2975 		netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2976 			tcb->srtt, tcb->mdev, NOW - tcb->time,
2977 			tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2978 			tcpstates[s->state]);
2979 		tcpsettimer(tcb);
2980 		if(tcb->snd.rto == 0)
2981 			tcpcongestion(tcb);
2982 		tcprxmit(s);
2983 		tcb->snd.ptr = tcb->snd.una;
2984 		tcb->cwind = tcb->mss;
2985 		tcb->snd.rto = 1;
2986 		tpriv->stats[RetransTimeouts]++;
2987 
2988 		if(tcb->snd.recovery){
2989 			tcb->snd.dupacks = 0;		/* reno rto */
2990 			tcb->snd.recovery = 0;
2991 			tpriv->stats[RecoveryRTO]++;
2992 			tcb->snd.rxt = tcb->snd.nxt;
2993 			netlog(s->p->f, Logtcpwin,
2994 				"rto recovery rxt @%lud\n", tcb->snd.nxt);
2995 		}
2996 
2997 		tcb->abcbytes = 0;
2998 		break;
2999 	case Time_wait:
3000 		localclose(s, nil);
3001 		break;
3002 	case Closed:
3003 		break;
3004 	}
3005 	qunlock(s);
3006 	poperror();
3007 }
3008 
3009 static int
inwindow(Tcpctl * tcb,int seq)3010 inwindow(Tcpctl *tcb, int seq)
3011 {
3012 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3013 }
3014 
3015 /*
3016  *  set up state for a received SYN (or SYN ACK) packet
3017  */
3018 static void
procsyn(Conv * s,Tcp * seg)3019 procsyn(Conv *s, Tcp *seg)
3020 {
3021 	Tcpctl *tcb;
3022 	Tcppriv *tpriv;
3023 
3024 	tcb = (Tcpctl*)s->ptcl;
3025 	tcb->flags |= FORCE;
3026 
3027 	tcb->rcv.nxt = seg->seq + 1;
3028 	tcb->rcv.wptr = tcb->rcv.nxt;
3029 	tcb->rcv.wsnt = 0;
3030 	tcb->rcv.urg = tcb->rcv.nxt;
3031 	tcb->irs = seg->seq;
3032 
3033 	/* our sending max segment size cannot be bigger than what he asked for */
3034 	if(seg->mss != 0 && seg->mss < tcb->mss) {
3035 		tcb->mss = seg->mss;
3036 		tpriv = s->p->priv;
3037 		tpriv->stats[Mss] = tcb->mss;
3038 	}
3039 
3040 	tcb->snd.wnd = seg->wnd;
3041 	initialwindow(tcb);
3042 }
3043 
3044 static int
dumpreseq(Tcpctl * tcb)3045 dumpreseq(Tcpctl *tcb)
3046 {
3047 	Reseq *r, *next;
3048 
3049 	for(r = tcb->reseq; r != nil; r = next){
3050 		next = r->next;
3051 		freeblist(r->bp);
3052 		free(r);
3053 	}
3054 	tcb->reseq = nil;
3055 	tcb->nreseq = 0;
3056 	tcb->reseqlen = 0;
3057 	return -1;
3058 }
3059 
3060 static void
logreseq(Fs * f,Reseq * r,ulong n)3061 logreseq(Fs *f, Reseq *r, ulong n)
3062 {
3063 	char *s;
3064 
3065 	for(; r != nil; r = r->next){
3066 		s = nil;
3067 		if(r->next == nil && r->seg.seq != n)
3068 			s = "hole/end";
3069 		else if(r->next == nil)
3070 			s = "end";
3071 		else if(r->seg.seq != n)
3072 			s = "hole";
3073 		if(s != nil)
3074 			netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3075 				n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3076 		n = r->seg.seq + r->seg.len;
3077 	}
3078 }
3079 
3080 static int
addreseq(Fs * f,Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)3081 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3082 {
3083 	Reseq *rp, **rr;
3084 	int qmax;
3085 
3086 	rp = malloc(sizeof *rp);
3087 	if(rp == nil){
3088 		freeblist(bp);		/* bp always consumed by addreseq */
3089 		return 0;
3090 	}
3091 
3092 	rp->seg = *seg;
3093 	rp->bp = bp;
3094 	rp->length = length;
3095 
3096 	tcb->reseqlen += length;
3097 	tcb->nreseq++;
3098 
3099 	/* Place on reassembly list sorting by starting seq number */
3100 	for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3101 		if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3102 			rp->next = *rr;
3103 			*rr = rp;
3104 			tpriv->stats[Resequenced]++;
3105 			if(rp->next != nil)
3106 				tpriv->stats[OutOfOrder]++;
3107 			break;
3108 		}
3109 
3110 	qmax = tcb->window;
3111 	if(tcb->reseqlen > qmax){
3112 		netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3113 			tcb->reseqlen, qmax, tcb->nreseq);
3114 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3115 		tpriv->stats[ReseqBytelim]++;
3116 		return dumpreseq(tcb);
3117 	}
3118 	qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3119 	if(tcb->nreseq > qmax){
3120 		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3121 			tcb->nreseq, qmax, tcb->reseqlen);
3122 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3123 		tpriv->stats[ReseqPktlim]++;
3124 		return dumpreseq(tcb);
3125 	}
3126 	return 0;
3127 }
3128 
3129 static void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3130 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3131 {
3132 	Reseq *rp;
3133 
3134 	rp = tcb->reseq;
3135 	if(rp == nil)
3136 		return;
3137 
3138 	tcb->reseq = rp->next;
3139 
3140 	*seg = rp->seg;
3141 	*bp = rp->bp;
3142 	*length = rp->length;
3143 
3144 	tcb->nreseq--;
3145 	tcb->reseqlen -= rp->length;
3146 
3147 	free(rp);
3148 }
3149 
3150 static int
tcptrim(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3151 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3152 {
3153 	ushort len;
3154 	uchar accept;
3155 	int dupcnt, excess;
3156 
3157 	accept = 0;
3158 	len = *length;
3159 	if(seg->flags & SYN)
3160 		len++;
3161 	if(seg->flags & FIN)
3162 		len++;
3163 
3164 	if(tcb->rcv.wnd == 0) {
3165 		if(len == 0 && seg->seq == tcb->rcv.nxt)
3166 			return 0;
3167 	}
3168 	else {
3169 		/* Some part of the segment should be in the window */
3170 		if(inwindow(tcb,seg->seq))
3171 			accept++;
3172 		else
3173 		if(len != 0) {
3174 			if(inwindow(tcb, seg->seq+len-1) ||
3175 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3176 				accept++;
3177 		}
3178 	}
3179 	if(!accept) {
3180 		freeblist(*bp);
3181 		return -1;
3182 	}
3183 	dupcnt = tcb->rcv.nxt - seg->seq;
3184 	if(dupcnt > 0){
3185 		tcb->rerecv += dupcnt;
3186 		if(seg->flags & SYN){
3187 			seg->flags &= ~SYN;
3188 			seg->seq++;
3189 
3190 			if(seg->urg > 1)
3191 				seg->urg--;
3192 			else
3193 				seg->flags &= ~URG;
3194 			dupcnt--;
3195 		}
3196 		if(dupcnt > 0){
3197 			pullblock(bp, (ushort)dupcnt);
3198 			seg->seq += dupcnt;
3199 			*length -= dupcnt;
3200 
3201 			if(seg->urg > dupcnt)
3202 				seg->urg -= dupcnt;
3203 			else {
3204 				seg->flags &= ~URG;
3205 				seg->urg = 0;
3206 			}
3207 		}
3208 	}
3209 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3210 	if(excess > 0) {
3211 		tcb->rerecv += excess;
3212 		*length -= excess;
3213 		*bp = trimblock(*bp, 0, *length);
3214 		if(*bp == nil)
3215 			panic("presotto is a boofhead");
3216 		seg->flags &= ~FIN;
3217 	}
3218 	return 0;
3219 }
3220 
3221 static void
tcpadvise(Proto * tcp,Block * bp,char * msg)3222 tcpadvise(Proto *tcp, Block *bp, char *msg)
3223 {
3224 	Tcp4hdr *h4;
3225 	Tcp6hdr *h6;
3226 	Tcpctl *tcb;
3227 	uchar source[IPaddrlen];
3228 	uchar dest[IPaddrlen];
3229 	ushort psource, pdest;
3230 	Conv *s, **p;
3231 
3232 	h4 = (Tcp4hdr*)(bp->rp);
3233 	h6 = (Tcp6hdr*)(bp->rp);
3234 
3235 	if((h4->vihl&0xF0)==IP_VER4) {
3236 		v4tov6(dest, h4->tcpdst);
3237 		v4tov6(source, h4->tcpsrc);
3238 		psource = nhgets(h4->tcpsport);
3239 		pdest = nhgets(h4->tcpdport);
3240 	}
3241 	else {
3242 		ipmove(dest, h6->tcpdst);
3243 		ipmove(source, h6->tcpsrc);
3244 		psource = nhgets(h6->tcpsport);
3245 		pdest = nhgets(h6->tcpdport);
3246 	}
3247 
3248 	/* Look for a connection */
3249 	qlock(tcp);
3250 	for(p = tcp->conv; *p; p++) {
3251 		s = *p;
3252 		tcb = (Tcpctl*)s->ptcl;
3253 		if(s->rport == pdest)
3254 		if(s->lport == psource)
3255 		if(tcb->state != Closed)
3256 		if(ipcmp(s->raddr, dest) == 0)
3257 		if(ipcmp(s->laddr, source) == 0){
3258 			qlock(s);
3259 			qunlock(tcp);
3260 			switch(tcb->state){
3261 			case Syn_sent:
3262 				localclose(s, msg);
3263 				break;
3264 			}
3265 			qunlock(s);
3266 			freeblist(bp);
3267 			return;
3268 		}
3269 	}
3270 	qunlock(tcp);
3271 	freeblist(bp);
3272 }
3273 
3274 static char*
tcpporthogdefensectl(char * val)3275 tcpporthogdefensectl(char *val)
3276 {
3277 	if(strcmp(val, "on") == 0)
3278 		tcpporthogdefense = 1;
3279 	else if(strcmp(val, "off") == 0)
3280 		tcpporthogdefense = 0;
3281 	else
3282 		return "unknown value for tcpporthogdefense";
3283 	return nil;
3284 }
3285 
3286 /* called with c qlocked */
3287 static char*
tcpctl(Conv * c,char ** f,int n)3288 tcpctl(Conv* c, char** f, int n)
3289 {
3290 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3291 		return tcphangup(c);
3292 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3293 		return tcpstartka(c, f, n);
3294 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3295 		return tcpsetchecksum(c, f, n);
3296 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3297 		return tcpporthogdefensectl(f[1]);
3298 	return "unknown control request";
3299 }
3300 
3301 static int
tcpstats(Proto * tcp,char * buf,int len)3302 tcpstats(Proto *tcp, char *buf, int len)
3303 {
3304 	Tcppriv *priv;
3305 	char *p, *e;
3306 	int i;
3307 
3308 	priv = tcp->priv;
3309 	p = buf;
3310 	e = p+len;
3311 	for(i = 0; i < Nstats; i++)
3312 		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3313 	return p - buf;
3314 }
3315 
3316 /*
3317  *  garbage collect any stale conversations:
3318  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3319  *	- Finwait2 after 5 minutes
3320  *
3321  *  this is called whenever we run out of channels.  Both checks are
3322  *  of questionable validity so we try to use them only when we're
3323  *  up against the wall.
3324  */
3325 static int
tcpgc(Proto * tcp)3326 tcpgc(Proto *tcp)
3327 {
3328 	Conv *c, **pp, **ep;
3329 	int n;
3330 	Tcpctl *tcb;
3331 
3332 
3333 	n = 0;
3334 	ep = &tcp->conv[tcp->nc];
3335 	for(pp = tcp->conv; pp < ep; pp++) {
3336 		c = *pp;
3337 		if(c == nil)
3338 			break;
3339 		if(!canqlock(c))
3340 			continue;
3341 		tcb = (Tcpctl*)c->ptcl;
3342 		switch(tcb->state){
3343 		case Syn_received:
3344 			if(NOW - tcb->time > 5000){
3345 				localclose(c, Etimedout);
3346 				n++;
3347 			}
3348 			break;
3349 		case Finwait2:
3350 			if(NOW - tcb->time > 5*60*1000){
3351 				localclose(c, Etimedout);
3352 				n++;
3353 			}
3354 			break;
3355 		}
3356 		qunlock(c);
3357 	}
3358 	return n;
3359 }
3360 
3361 static void
tcpsettimer(Tcpctl * tcb)3362 tcpsettimer(Tcpctl *tcb)
3363 {
3364 	int x;
3365 
3366 	/* round trip dependency */
3367 	x = backoff(tcb->backoff) *
3368 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3369 
3370 	/* bounded twixt 0.3 and 64 seconds */
3371 	if(x < 300/MSPTICK)
3372 		x = 300/MSPTICK;
3373 	else if(x > (64000/MSPTICK))
3374 		x = 64000/MSPTICK;
3375 	tcb->timer.start = x;
3376 }
3377 
3378 void
tcpinit(Fs * fs)3379 tcpinit(Fs *fs)
3380 {
3381 	Proto *tcp;
3382 	Tcppriv *tpriv;
3383 
3384 	tcp = smalloc(sizeof(Proto));
3385 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3386 	tcp->name = "tcp";
3387 	tcp->connect = tcpconnect;
3388 	tcp->announce = tcpannounce;
3389 	tcp->ctl = tcpctl;
3390 	tcp->state = tcpstate;
3391 	tcp->create = tcpcreate;
3392 	tcp->close = tcpclose;
3393 	tcp->rcv = tcpiput;
3394 	tcp->advise = tcpadvise;
3395 	tcp->stats = tcpstats;
3396 	tcp->inuse = tcpinuse;
3397 	tcp->gc = tcpgc;
3398 	tcp->ipproto = IP_TCPPROTO;
3399 	tcp->nc = scalednconv();
3400 	tcp->ptclsize = sizeof(Tcpctl);
3401 	tpriv->stats[MaxConn] = tcp->nc;
3402 
3403 	Fsproto(fs, tcp);
3404 }
3405 
3406 static void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3407 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3408 {
3409 	/*
3410 	 * guess at reasonable queue sizes.  there's no current way
3411 	 * to know how many nic receive buffers we can safely tie up in the
3412 	 * tcp stack, and we don't adjust our queues to maximize throughput
3413 	 * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
3414 	 * respected, but we still control our own buffer commitment by
3415 	 * keeping a seperate qscale.
3416 	 */
3417 	tcb->rcv.scale = rcvscale & 0xff;
3418 	tcb->snd.scale = sndscale & 0xff;
3419 	tcb->qscale = rcvscale & 0xff;
3420 	if(rcvscale > Maxqscale)
3421 		tcb->qscale = Maxqscale;
3422 
3423 	if(rcvscale != tcb->rcv.scale)
3424 		netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3425 			"qlen %d >> window %ud lport %d\n",
3426 			tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3427 	tcb->window = QMAX << tcb->qscale;
3428 	tcb->ssthresh = tcb->window;
3429 
3430 	/*
3431 	 * it's important to set wq large enough to cover the full
3432 	 * bandwidth-delay product.  it's possible to be in loss
3433 	 * recovery with a big window, and we need to keep sending
3434 	 * into the inflated window.  the difference can be huge
3435 	 * for even modest (70ms) ping times.
3436 	 */
3437 	qsetlimit(s->rq, tcb->window);
3438 	qsetlimit(s->wq, tcb->window);
3439 	tcprcvwin(s);
3440 }
3441