xref: /plan9-contrib/sys/src/9k/ip/tcp.c (revision 41165bba57fea051ea0ad3129955d848d8699516)
1 #include	"u.h"
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"ip.h"
9 
10 enum	/* protocol constants, header flag bits, Tcpctl flag bits and connection states */
11 {
12 	QMAX		= 64*1024-1,	/* queue limit given to qopen() in tcpcreate(); initial ssthresh */
13 	IP_TCPPROTO	= 6,
14 
15 	TCP4_IPLEN	= 8,		/* offset at which the v4 checksum starts (see htontcp4) */
16 	TCP4_PHDRSIZE	= 12,		/* bytes of v4 pseudo header covered by the checksum */
17 	TCP4_HDRSIZE	= 20,		/* TCP header without options */
18 	TCP4_TCBPHDRSZ	= 40,		/* bytes copied from the prototype header in htontcp4 */
19 	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,
20 
21 	TCP6_IPLEN	= 0,		/* offset at which the v6 checksum starts (see htontcp6) */
22 	TCP6_PHDRSIZE	= 40,		/* bytes of v6 pseudo header covered by the checksum */
23 	TCP6_HDRSIZE	= 20,		/* TCP header without options */
24 	TCP6_TCBPHDRSZ	= 60,		/* bytes copied from the prototype header in htontcp6 */
25 	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,
26 
27 	TcptimerOFF	= 0,		/* Tcptimer.state values */
28 	TcptimerON	= 1,
29 	TcptimerDONE	= 2,
30 	MAX_TIME 	= (1<<20),	/* Forever */
31 	TCP_ACK		= 50,		/* Timed ack sequence in ms */
32 	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */
33 
34 	URG		= 0x20,		/* Data marked urgent */
35 	ACK		= 0x10,		/* Acknowledge is valid */
36 	PSH		= 0x08,		/* Whole data pipe is pushed */
37 	RST		= 0x04,		/* Reset connection */
38 	SYN		= 0x02,		/* Pkt. is synchronise */
39 	FIN		= 0x01,		/* Start close down */
40 
41 	EOLOPT		= 0,		/* TCP option kinds and lengths */
42 	NOOPOPT		= 1,
43 	MSSOPT		= 2,
44 	MSS_LENGTH	= 4,		/* Maximum segment size */
45 	WSOPT		= 3,
46 	WS_LENGTH	= 3,		/* Bits to scale window size by */
47 	MSL2		= 10,
48 	MSPTICK		= 50,		/* Milliseconds per timer tick */
49 	DEF_MSS		= 1460,		/* Default maximum segment */
50 	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
51 	DEF_RTT		= 500,		/* Default round trip */
52 	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
53 	TCP_LISTEN	= 0,		/* Listen connection */
54 	TCP_CONNECT	= 1,		/* Outgoing connection */
55 	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */
56 
57 	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */
58 
59 	FORCE		= 1,		/* FORCE, CLONE and ACTIVE are or'ed into Tcpctl.flags in this file */
60 	CLONE		= 2,
61 	RETRAN		= 4,
62 	ACTIVE		= 8,
63 	SYNACK		= 16,
64 
65 	LOGAGAIN	= 3,		/* srtt is kept shifted left by this much (see inittcpctl) */
66 	LOGDGAIN	= 2,
67 
68 	Closed		= 0,		/* Connection states */
69 	Listen,
70 	Syn_sent,
71 	Syn_received,
72 	Established,
73 	Finwait1,
74 	Finwait2,
75 	Close_wait,
76 	Closing,
77 	Last_ack,
78 	Time_wait,
79 
80 	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
81 	NLHT		= 256,		/* hash table size, must be a power of 2 */
82 	LHTMASK		= NLHT-1,
83 
84 	/*
85 	 * window is 64kb * 2ⁿ
86 	 * these factors determine the ultimate bandwidth-delay product.
87 	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
88 	 * NOTE(review): Maxqscale below is 4, i.e. 64kb * 2⁴ = 1mb; the
89 	 * 2⁵ figure above may be stale -- confirm.
90 	 */
89 	Maxqscale	= 4,		/* maximum queuing scale */
90 	Defadvscale	= 4,		/* default advertisement */
91 };
92 
93 /* Must correspond to the enumeration above; tcpstate() indexes this by Tcpctl.state */
94 char *tcpstates[] =
95 {
96 	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
97 	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
98 	"Closing", 	"Last_ack", 	"Time_wait"
99 };
100 
101 typedef struct Tcptimer Tcptimer;
102 struct Tcptimer	/* one countdown timer; running timers are chained on Tcppriv.timers */
103 {
104 	Tcptimer	*next;		/* links on the running list, maintained by timerstate() */
105 	Tcptimer	*prev;
106 	Tcptimer	*readynext;	/* list of timers that expired during one tcpackproc pass */
107 	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
108 	int	start;			/* reload value in ticks; tcpgo() ignores a timer with start == 0 */
109 	int	count;			/* ticks remaining; decremented by tcpackproc */
110 	void	(*func)(void*);		/* called by tcpackproc on expiry, if not nil */
111 	void	*arg;			/* argument passed to func */
112 };
113 
114 /*
115  *  v4 and v6 pseudo headers used for
116  *  checksumming tcp
117  */
118 typedef struct Tcp4hdr Tcp4hdr;
119 struct Tcp4hdr	/* htontcp4 checksums from Unused (offset TCP4_IPLEN) onward */
120 {
121 	uchar	vihl;		/* Version and header length */
122 	uchar	tos;		/* Type of service */
123 	uchar	length[2];	/* packet length */
124 	uchar	id[2];		/* Identification */
125 	uchar	frag[2];	/* Fragment information */
126 	uchar	Unused;
127 	uchar	proto;
128 	uchar	tcplen[2];	/* TCP header + data length, set in htontcp4 */
129 	uchar	tcpsrc[4];
130 	uchar	tcpdst[4];
131 	uchar	tcpsport[2];
132 	uchar	tcpdport[2];
133 	uchar	tcpseq[4];
134 	uchar	tcpack[4];
135 	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
136 	uchar	tcpwin[2];
137 	uchar	tcpcksum[2];
138 	uchar	tcpurg[2];
139 	/* Options segment */
140 	uchar	tcpopt[1];
141 };
142 
143 typedef struct Tcp6hdr Tcp6hdr;
144 struct Tcp6hdr	/* htontcp6 borrows vcf and ttl as pseudo-header fields while checksumming */
145 {
146 	uchar	vcf[4];		/* holds the TCP length during the checksum, then the version */
147 	uchar	ploadlen[2];	/* zeroed for the checksum, then set to TCP header + data length */
148 	uchar	proto;		/* zeroed for the checksum, then restored from the prototype */
149 	uchar	ttl;		/* holds the protocol number during the checksum */
150 	uchar	tcpsrc[IPaddrlen];
151 	uchar	tcpdst[IPaddrlen];
152 	uchar	tcpsport[2];
153 	uchar	tcpdport[2];
154 	uchar	tcpseq[4];
155 	uchar	tcpack[4];
156 	uchar	tcpflag[2];	/* header length in the top bits, flag bits in the low byte */
157 	uchar	tcpwin[2];
158 	uchar	tcpcksum[2];
159 	uchar	tcpurg[2];
160 	/* Options segment */
161 	uchar	tcpopt[1];
162 };
163 
164 /*
165  *  this represents the control info
166  *  for a single packet.  It is derived from
167  *  a packet in ntohtcp{4,6}() and stuck into
168  *  a packet in htontcp{4,6}().
169  */
170 typedef struct Tcp Tcp;
171 struct	Tcp
172 {
173 	ushort	source;	/* source port */
174 	ushort	dest;	/* destination port */
175 	ulong	seq;
176 	ulong	ack;
177 	uchar	flags;	/* URG, ACK, PSH, RST, SYN, FIN bits */
178 	uchar	update;	/* cleared by ntohtcp{4,6}() */
179 	ushort	ws;	/* window scale option */
180 	ulong	wnd;	/* prescaled window*/
181 	ushort	urg;
182 	ushort	mss;	/* max segment size option (if not zero) */
183 	ushort	len;	/* size of data */
184 };
185 
186 /*
187  *  this header is malloc'd to thread together fragments
188  *  waiting to be coalesced
189  */
190 typedef struct Reseq Reseq;
191 struct Reseq
192 {
193 	Reseq	*next;		/* next entry on Tcpctl.reseq */
194 	Tcp	seg;		/* parsed header of the held segment */
195 	Block	*bp;		/* the segment's data */
196 	ushort	length;
197 };
198 
199 /*
200  *  the qlock in the Conv locks this structure
201  */
202 typedef struct Tcpctl Tcpctl;
203 struct Tcpctl
204 {
205 	uchar	state;			/* Connection state */
206 	uchar	type;			/* Listening or active connection */
207 	uchar	code;			/* Icmp code */
208 	struct {
209 		ulong	una;		/* Unacked data pointer */
210 		ulong	nxt;		/* Next sequence expected */
211 		ulong	ptr;		/* Data pointer */
212 		ulong	wnd;		/* Tcp send window */
213 		ulong	urg;		/* Urgent data pointer */
214 		ulong	wl2;
215 		uint	scale;		/* how much to right shift window */
216 					/* in xmitted packets */
217 		/* to implement tahoe and reno TCP */
218 		ulong	dupacks;	/* number of duplicate acks rcvd */
219 		ulong	partialack;
220 		int	recovery;	/* loss recovery flag */
221 		int	retransmit;	/* retransmit 1 packet @ una flag */
222 		int	rto;		/* while set, tcpabcincr() limits slow-start growth to one mss */
223 		ulong	rxt;		/* right window marker for recovery */
224 					/* "recover" rfc3782 */
225 	} snd;
226 	struct {
227 		ulong	nxt;		/* Receive pointer to next uchar slot */
228 		ulong	wnd;		/* Receive window incoming */
229 		ulong	wsnt;		/* Last wptr sent.  important to */
230 					/* track for large bdp */
231 		ulong	wptr;		/* right edge of the window: nxt + wnd, set in tcprcvwin() */
232 		ulong	urg;		/* Urgent pointer */
233 		ulong	ackptr;		/* last acked sequence */
234 		int	blocked;	/* set in tcprcvwin() when the window has nearly closed */
235 		uint	scale;		/* how much to left shift window in */
236 					/* rcv'd packets */
237 	} rcv;
238 	ulong	iss;			/* Initial sequence number */
239 	ulong	cwind;			/* Congestion window */
240 	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
241 	uint	scale;			/* desired snd.scale */
242 	ulong	ssthresh;		/* Slow start threshold */
243 	int	resent;			/* Bytes just resent */
244 	int	irs;			/* Initial received sequence */
245 	ushort	mss;			/* Maximum segment size */
246 	int	rerecv;			/* Overlap of data rereceived */
247 	ulong	window;			/* Our receive window (queue) */
248 	uint	qscale;			/* Log2 of our receive window (queue) */
249 	uchar	backoff;		/* Exponential backoff counter */
250 	int	backedoff;		/* ms we've backed off for rexmits */
251 	uchar	flags;			/* State flags */
252 	Reseq	*reseq;			/* Resequencing queue */
253 	int	nreseq;			/* reported by tcpstate() as "rq nreseq.reseqlen" */
254 	int	reseqlen;
255 	Tcptimer	timer;			/* Activity timer */
256 	Tcptimer	acktimer;		/* Acknowledge timer */
257 	Tcptimer	rtt_timer;		/* Round trip timer */
258 	Tcptimer	katimer;		/* keep alive timer */
259 	ulong	rttseq;			/* Round trip sequence */
260 	int	srtt;			/* Smoothed round trip */
261 	int	mdev;			/* Mean deviation of round trip */
262 	int	kacounter;		/* count down for keep alive */
263 	uint	sndsyntime;		/* time syn sent */
264 	ulong	time;			/* time Finwait2 or Syn_received was sent */
265 	ulong	timeuna;		/* snd.una when time was set */
266 	int	nochecksum;		/* non-zero means don't send checksums */
267 	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */
268 
269 	union {
270 		Tcp4hdr	tcp4hdr;
271 		Tcp6hdr	tcp6hdr;
272 	} protohdr;		/* prototype header */
273 };
274 
275 /*
276  *  New calls are put in limbo rather than having a conversation structure
277  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
278  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
279  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
280  *
281  *  In particular they aren't on a listener's queue so that they don't figure
282  *  in the input queue limit.
283  *
284  *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
285  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
286  *  there is no hashing of this list.
287  */
288 typedef struct Limbo Limbo;
289 struct Limbo
290 {
291 	Limbo	*next;
292 
293 	uchar	laddr[IPaddrlen];
294 	uchar	raddr[IPaddrlen];
295 	ushort	lport;
296 	ushort	rport;
297 	ulong	irs;		/* initial received sequence */
298 	ulong	iss;		/* initial sent sequence */
299 	ushort	mss;		/* mss from the other end */
300 	ushort	rcvscale;	/* how much to scale rcvd windows */
301 	ushort	sndscale;	/* how much to scale sent windows */
302 	ulong	lastsend;	/* last time we sent a synack */
303 	uchar	version;	/* v4 or v6 */
304 	uchar	rexmits;	/* number of retransmissions */
305 };
306 
307 int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time; seeds srtt and timer.start in inittcpctl() */
308 
309 enum {	/* indices into Tcppriv.stats[] and statnames[] */
310 	/* MIB stats */
311 	MaxConn,
312 	Mss,		/* set to the connection's mss in inittcpctl() */
313 	ActiveOpens,	/* counted in tcpstart() for TCP_CONNECT */
314 	PassiveOpens,	/* counted in tcpstart() for TCP_LISTEN */
315 	EstabResets,
316 	CurrEstab,	/* adjusted in tcpsetstate() on entering or leaving Established */
317 	InSegs,
318 	OutSegs,
319 	RetransSegs,
320 	RetransSegsSent,
321 	RetransTimeouts,
322 	InErrs,
323 	OutRsts,
324 
325 	/* non-MIB stats */
326 	CsumErrs,
327 	HlenErrs,
328 	LenErrs,
329 	Resequenced,
330 	OutOfOrder,
331 	ReseqBytelim,
332 	ReseqPktlim,
333 	Delayack,
334 	Wopenack,
335 
336 	Recovery,
337 	RecoveryDone,
338 	RecoveryRTO,
339 	RecoveryNoSeq,
340 	RecoveryCwind,
341 	RecoveryPA,
342 
343 	Nstats		/* number of statistics; sizes the arrays */
344 };
345 
346 static char *statnames[Nstats] =	/* printable names, placed by index so the order here is free */
347 {
348 [MaxConn]	"MaxConn",
349 [Mss]		"MaxSegment",
350 [ActiveOpens]	"ActiveOpens",
351 [PassiveOpens]	"PassiveOpens",
352 [EstabResets]	"EstabResets",
353 [CurrEstab]	"CurrEstab",
354 [InSegs]	"InSegs",
355 [OutSegs]	"OutSegs",
356 [RetransSegs]	"RetransSegs",
357 [RetransSegsSent]	"RetransSegsSent",
358 [RetransTimeouts]	"RetransTimeouts",
359 [InErrs]	"InErrs",
360 [OutRsts]	"OutRsts",
361 [CsumErrs]	"CsumErrs",
362 [HlenErrs]	"HlenErrs",
363 [LenErrs]	"LenErrs",
364 [OutOfOrder]	"OutOfOrder",
365 [Resequenced]	"Resequenced",
366 [ReseqBytelim]	"ReseqBytelim",
367 [ReseqPktlim]	"ReseqPktlim",
368 [Delayack]	"Delayack",
369 [Wopenack]	"Wopenack",
370 
371 [Recovery]	"Recovery",
372 [RecoveryDone]	"RecoveryDone",
373 [RecoveryRTO]	"RecoveryRTO",
374 
375 [RecoveryNoSeq]	"RecoveryNoSeq",
376 [RecoveryCwind]	"RecoveryCwind",
377 [RecoveryPA]	"RecoveryPA",
378 };
379 
380 typedef struct Tcppriv Tcppriv;
381 struct Tcppriv	/* per-protocol-instance state, reached through Proto.priv */
382 {
383 	/* List of active timers */
384 	QLock 	tl;		/* held around every timerstate() call in this file */
385 	Tcptimer *timers;
386 
387 	/* hash table for matching conversations */
388 	Ipht	ht;
389 
390 	/* calls in limbo waiting for an ACK to our SYN ACK */
391 	int	nlimbo;
392 	Limbo	*lht[NLHT];
393 
394 	/* for keeping track of tcpackproc */
395 	QLock	apl;		/* serialises the one-time kproc start in tcpstart() */
396 	int	ackprocstarted;
397 
398 	uvlong	stats[Nstats];
399 };
400 
401 /*
402  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
403  *  solution to hijacked systems staking out ports as a form
404  *  of DoS attack.
405  *
406  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
407  *  that number gets acked by the other end, we shut down the connection.
408  *  Look for tcpporthogdefense in the code.
409  */
410 int tcpporthogdefense = 0;
411 
412 static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);	/* forward declarations of file-local routines */
413 static	int	dumpreseq(Tcpctl*);
414 static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
415 static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
416 static	void	limborexmit(Proto*);
417 static	void	localclose(Conv*, char*);
418 static	void	procsyn(Conv*, Tcp*);
419 static	void	tcpacktimer(void*);
420 static	void	tcpiput(Proto*, Ipifc*, Block*);
421 static	void	tcpkeepalive(void*);
422 static	void	tcpoutput(Conv*);
423 static	void	tcprcvwin(Conv*);
424 static	void	tcprxmit(Conv*);
425 static	void	tcpsetkacounter(Tcpctl*);
426 static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
427 static	void	tcpsettimer(Tcpctl*);
428 static	void	tcpsndsyn(Conv*, Tcpctl*);
429 static	void	tcpstart(Conv*, int);
430 static	void	tcpsynackrtt(Conv*);
431 static	void	tcptimeout(void*);
432 static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
433 
434 static void
tcpsetstate(Conv * s,uchar newstate)435 tcpsetstate(Conv *s, uchar newstate)
436 {
437 	Tcpctl *tcb;
438 	uchar oldstate;
439 	Tcppriv *tpriv;
440 
441 	tpriv = s->p->priv;
442 
443 	tcb = (Tcpctl*)s->ptcl;
444 
445 	oldstate = tcb->state;
446 	if(oldstate == newstate)
447 		return;
448 
449 	if(oldstate == Established)
450 		tpriv->stats[CurrEstab]--;
451 	if(newstate == Established)
452 		tpriv->stats[CurrEstab]++;
453 
454 	switch(newstate) {
455 	case Closed:
456 		qclose(s->rq);
457 		qclose(s->wq);
458 		qclose(s->eq);
459 		break;
460 
461 	case Close_wait:		/* Remote closes */
462 		qhangup(s->rq, nil);
463 		break;
464 	}
465 
466 	tcb->state = newstate;
467 
468 	if(oldstate == Syn_sent && newstate != Closed)
469 		Fsconnected(s, nil);
470 }
471 
472 static char*
tcpconnect(Conv * c,char ** argv,int argc)473 tcpconnect(Conv *c, char **argv, int argc)
474 {
475 	char *e;
476 	Tcpctl *tcb;
477 
478 	tcb = (Tcpctl*)(c->ptcl);
479 	if(tcb->state != Closed)
480 		return Econinuse;
481 
482 	e = Fsstdconnect(c, argv, argc);
483 	if(e != nil)
484 		return e;
485 	tcpstart(c, TCP_CONNECT);
486 
487 	return nil;
488 }
489 
490 static int
tcpstate(Conv * c,char * state,int n)491 tcpstate(Conv *c, char *state, int n)
492 {
493 	Tcpctl *s;
494 
495 	s = (Tcpctl*)(c->ptcl);
496 
497 	return snprint(state, n,
498 		"%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
499 		"swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
500 		"timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
501 		tcpstates[s->state],
502 		c->rq ? qlen(c->rq) : 0,
503 		c->wq ? qlen(c->wq) : 0,
504 		s->nreseq, s->reseqlen,
505 		s->srtt, s->mdev, s->ssthresh,
506 		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
507 		s->qscale,
508 		s->timer.start, s->timer.count, s->rerecv,
509 		s->katimer.start, s->katimer.count);
510 }
511 
512 static int
tcpinuse(Conv * c)513 tcpinuse(Conv *c)
514 {
515 	Tcpctl *s;
516 
517 	s = (Tcpctl*)(c->ptcl);
518 	return s->state != Closed;
519 }
520 
521 static char*
tcpannounce(Conv * c,char ** argv,int argc)522 tcpannounce(Conv *c, char **argv, int argc)
523 {
524 	char *e;
525 	Tcpctl *tcb;
526 
527 	tcb = (Tcpctl*)(c->ptcl);
528 	if(tcb->state != Closed)
529 		return Econinuse;
530 
531 	e = Fsstdannounce(c, argv, argc);
532 	if(e != nil)
533 		return e;
534 	tcpstart(c, TCP_LISTEN);
535 	Fsconnected(c, nil);
536 
537 	return nil;
538 }
539 
540 static void
tcpclosestate(Conv * c,Tcpctl * tcb,int state)541 tcpclosestate(Conv *c, Tcpctl *tcb, int state)
542 {
543 	tcb->flgcnt++;
544 	tcb->snd.nxt++;
545 	tcpsetstate(c, state);
546 	tcpoutput(c);
547 }
548 
549 /* close the output half of a tcp connection */
550 static char *
tcpxmitclose(Conv * c)551 tcpxmitclose(Conv *c)
552 {
553 	Tcpctl *tcb;
554 
555 	qhangup(c->wq, nil);
556 
557 	tcb = (Tcpctl*)c->ptcl;
558 	switch(tcb->state) {
559 	case Listen:
560 		/*
561 		 *  reset any incoming calls to this listener
562 		 */
563 		Fsconnected(c, "Hangup");
564 		/* fall through */
565 	case Closed:
566 	case Syn_sent:
567 		localclose(c, nil);
568 		break;
569 	case Syn_received:
570 	case Established:
571 	case Close_wait:
572 		tcpclosestate(c, tcb, tcb->state);
573 		break;
574 	}
575 	return nil;
576 }
577 
578 /*
579  *  tcpclose is always called with the q locked
580  */
581 static void
tcpclose(Conv * c)582 tcpclose(Conv *c)
583 {
584 	Tcpctl *tcb;
585 
586 	tcb = (Tcpctl*)c->ptcl;
587 
588 	qhangup(c->rq, nil);
589 	qhangup(c->wq, nil);
590 	qhangup(c->eq, nil);
591 	qflush(c->rq);
592 
593 	switch(tcb->state) {
594 	case Listen:
595 		/*
596 		 *  reset any incoming calls to this listener
597 		 */
598 		Fsconnected(c, "Hangup");
599 		/* fall through */
600 	case Closed:
601 	case Syn_sent:
602 		localclose(c, nil);
603 		break;
604 	case Syn_received:
605 	case Established:
606 		tcpclosestate(c, tcb, Finwait1);
607 		break;
608 	case Close_wait:
609 		tcpclosestate(c, tcb, Last_ack);
610 		break;
611 	}
612 }
613 
614 static void
tcpkick(void * x)615 tcpkick(void *x)
616 {
617 	Conv *s = x;
618 	Tcpctl *tcb;
619 
620 	tcb = (Tcpctl*)s->ptcl;
621 
622 	if(waserror()){
623 		qunlock(s);
624 		nexterror();
625 	}
626 	qlock(s);
627 
628 	switch(tcb->state) {
629 	case Syn_sent:
630 	case Syn_received:
631 	case Established:
632 	case Close_wait:
633 		/*
634 		 * Push data
635 		 */
636 		tcpoutput(s);
637 		break;
638 	default:
639 		localclose(s, "Hangup");
640 		break;
641 	}
642 
643 	qunlock(s);
644 	poperror();
645 }
646 
647 static int seq_lt(ulong, ulong);	/* forward declaration; tcprcvwin() below uses it */
648 
649 static void
tcprcvwin(Conv * s)650 tcprcvwin(Conv *s)				/* Call with tcb locked */
651 {
652 	int w;
653 	Tcpctl *tcb;
654 
655 	tcb = (Tcpctl*)s->ptcl;
656 	w = tcb->window - qlen(s->rq);
657 	if(w < 0)
658 		w = 0;
659 	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
660 	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
661 		w = tcb->rcv.wptr - tcb->rcv.nxt;
662 	if(w != tcb->rcv.wnd)
663 	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
664 		tcb->rcv.blocked = 1;
665 		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
666 			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
667 	}
668 	tcb->rcv.wnd = w;
669 	tcb->rcv.wptr = tcb->rcv.nxt + w;
670 }
671 
672 static void
tcpacktimer(void * v)673 tcpacktimer(void *v)
674 {
675 	Tcpctl *tcb;
676 	Conv *s;
677 
678 	s = v;
679 	tcb = (Tcpctl*)s->ptcl;
680 
681 	if(waserror()){
682 		qunlock(s);
683 		nexterror();
684 	}
685 	qlock(s);
686 	if(tcb->state != Closed){
687 		tcb->flags |= FORCE;
688 		tcpoutput(s);
689 	}
690 	qunlock(s);
691 	poperror();
692 }
693 
694 static void
tcpcongestion(Tcpctl * tcb)695 tcpcongestion(Tcpctl *tcb)
696 {
697 	ulong inflight;
698 
699 	inflight = tcb->snd.nxt - tcb->snd.una;
700 	if(inflight > tcb->cwind)
701 		inflight = tcb->cwind;
702 	tcb->ssthresh = inflight / 2;
703 	if(tcb->ssthresh < 2*tcb->mss)
704 		tcb->ssthresh = 2*tcb->mss;
705 }
706 
707 enum {	/* tcpabcincr() caps each slow-start increase of cwind at L*mss */
708 	L	= 2,	/* aggressive slow start; legal values ∈ (1.0, 2.0) */
709 };
710 
711 static void
tcpabcincr(Tcpctl * tcb,uint acked)712 tcpabcincr(Tcpctl *tcb, uint acked)
713 {
714 	uint limit;
715 
716 	tcb->abcbytes += acked;
717 	if(tcb->cwind < tcb->ssthresh){
718 		/* slow start */
719 		if(tcb->snd.rto)
720 			limit = tcb->mss;
721 		else
722 			limit = L*tcb->mss;
723 		tcb->cwind += MIN(tcb->abcbytes, limit);
724 		tcb->abcbytes = 0;
725 	} else {
726 		tcb->snd.rto = 0;
727 		/* avoidance */
728 		if(tcb->abcbytes >= tcb->cwind){
729 			tcb->abcbytes -= tcb->cwind;
730 			tcb->cwind += tcb->mss;
731 		}
732 	}
733 }
734 
735 static void
tcpcreate(Conv * c)736 tcpcreate(Conv *c)
737 {
738 	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
739 	c->wq = qopen(QMAX, Qkick, tcpkick, c);
740 }
741 
742 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)743 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
744 {
745 	if(newstate != TcptimerON){
746 		if(t->state == TcptimerON){
747 			/* unchain */
748 			if(priv->timers == t){
749 				priv->timers = t->next;
750 				if(t->prev != nil)
751 					panic("timerstate1");
752 			}
753 			if(t->next)
754 				t->next->prev = t->prev;
755 			if(t->prev)
756 				t->prev->next = t->next;
757 			t->next = t->prev = nil;
758 		}
759 	} else {
760 		if(t->state != TcptimerON){
761 			/* chain */
762 			if(t->prev != nil || t->next != nil)
763 				panic("timerstate2");
764 			t->prev = nil;
765 			t->next = priv->timers;
766 			if(t->next)
767 				t->next->prev = t;
768 			priv->timers = t;
769 		}
770 	}
771 	t->state = newstate;
772 }
773 
774 static void
tcpackproc(void * a)775 tcpackproc(void *a)
776 {
777 	Tcptimer *t, *tp, *timeo;
778 	Proto *tcp;
779 	Tcppriv *priv;
780 	int loop;
781 
782 	tcp = a;
783 	priv = tcp->priv;
784 
785 	for(;;) {
786 		tsleep(&up->sleep, return0, 0, MSPTICK);
787 
788 		qlock(&priv->tl);
789 		timeo = nil;
790 		loop = 0;
791 		for(t = priv->timers; t != nil; t = tp) {
792 			if(loop++ > 10000)
793 				panic("tcpackproc1");
794 			tp = t->next;
795  			if(t->state == TcptimerON) {
796 				t->count--;
797 				if(t->count == 0) {
798 					timerstate(priv, t, TcptimerDONE);
799 					t->readynext = timeo;
800 					timeo = t;
801 				}
802 			}
803 		}
804 		qunlock(&priv->tl);
805 
806 		loop = 0;
807 		for(t = timeo; t != nil; t = t->readynext) {
808 			if(loop++ > 10000)
809 				panic("tcpackproc2");
810 			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
811 				(*t->func)(t->arg);
812 				poperror();
813 			}
814 		}
815 
816 		limborexmit(tcp);
817 	}
818 }
819 
820 static void
tcpgo(Tcppriv * priv,Tcptimer * t)821 tcpgo(Tcppriv *priv, Tcptimer *t)
822 {
823 	if(t == nil || t->start == 0)
824 		return;
825 
826 	qlock(&priv->tl);
827 	t->count = t->start;
828 	timerstate(priv, t, TcptimerON);
829 	qunlock(&priv->tl);
830 }
831 
832 static void
tcphalt(Tcppriv * priv,Tcptimer * t)833 tcphalt(Tcppriv *priv, Tcptimer *t)
834 {
835 	if(t == nil)
836 		return;
837 
838 	qlock(&priv->tl);
839 	timerstate(priv, t, TcptimerOFF);
840 	qunlock(&priv->tl);
841 }
842 
/*
 *  Return 2 to the power n.
 */
static int
backoff(int n)
{
	int factor;

	factor = 1;
	factor <<= n;
	return factor;
}
848 
849 static void
localclose(Conv * s,char * reason)850 localclose(Conv *s, char *reason)	/* called with tcb locked */
851 {
852 	Tcpctl *tcb;
853 	Tcppriv *tpriv;
854 
855 	tpriv = s->p->priv;
856 	tcb = (Tcpctl*)s->ptcl;
857 
858 	iphtrem(&tpriv->ht, s);
859 
860 	tcphalt(tpriv, &tcb->timer);
861 	tcphalt(tpriv, &tcb->rtt_timer);
862 	tcphalt(tpriv, &tcb->acktimer);
863 	tcphalt(tpriv, &tcb->katimer);
864 
865 	/* Flush reassembly queue; nothing more can arrive */
866 	dumpreseq(tcb);
867 
868 	if(tcb->state == Syn_sent)
869 		Fsconnected(s, reason);
870 	if(s->state == Announced)
871 		wakeup(&s->listenr);
872 
873 	qhangup(s->rq, reason);
874 	qhangup(s->wq, reason);
875 
876 	tcpsetstate(s, Closed);
877 }
878 
879 /* mtu (- TCP + IP hdr len) of 1st hop */
880 static int
tcpmtu(Proto * tcp,uchar * addr,int version,uint * scale)881 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
882 {
883 	Ipifc *ifc;
884 	int mtu;
885 
886 	ifc = findipifc(tcp->f, addr, 0);
887 	switch(version){
888 	default:
889 	case V4:
890 		mtu = DEF_MSS;
891 		if(ifc != nil)
892 			mtu = ifc->maxtu - ifc->medium->hsize - (TCP4_PKT + TCP4_HDRSIZE);
893 		break;
894 	case V6:
895 		mtu = DEF_MSS6;
896 		if(ifc != nil)
897 			mtu = ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE);
898 		break;
899 	}
900 	/*
901 	 * set the ws.  it doesn't commit us to anything.
902 	 * ws is the ultimate limit to the bandwidth-delay product.
903 	 */
904 	*scale = Defadvscale;
905 
906 	return mtu;
907 }
908 
909 static void
inittcpctl(Conv * s,int mode)910 inittcpctl(Conv *s, int mode)
911 {
912 	Tcpctl *tcb;
913 	Tcp4hdr* h4;
914 	Tcp6hdr* h6;
915 	Tcppriv *tpriv;
916 	int mss;
917 
918 	tcb = (Tcpctl*)s->ptcl;
919 
920 	memset(tcb, 0, sizeof(Tcpctl));
921 
922 	tcb->ssthresh = QMAX;			/* reset by tcpsetscale() */
923 	tcb->srtt = tcp_irtt<<LOGAGAIN;
924 	tcb->mdev = 0;
925 
926 	/* setup timers */
927 	tcb->timer.start = tcp_irtt / MSPTICK;
928 	tcb->timer.func = tcptimeout;
929 	tcb->timer.arg = s;
930 	tcb->rtt_timer.start = MAX_TIME;
931 	tcb->acktimer.start = TCP_ACK / MSPTICK;
932 	tcb->acktimer.func = tcpacktimer;
933 	tcb->acktimer.arg = s;
934 	tcb->katimer.start = DEF_KAT / MSPTICK;
935 	tcb->katimer.func = tcpkeepalive;
936 	tcb->katimer.arg = s;
937 
938 	mss = DEF_MSS;
939 
940 	/* create a prototype(pseudo) header */
941 	if(mode != TCP_LISTEN){
942 		if(ipcmp(s->laddr, IPnoaddr) == 0)
943 			findlocalip(s->p->f, s->laddr, s->raddr);
944 
945 		switch(s->ipversion){
946 		case V4:
947 			h4 = &tcb->protohdr.tcp4hdr;
948 			memset(h4, 0, sizeof(*h4));
949 			h4->proto = IP_TCPPROTO;
950 			hnputs(h4->tcpsport, s->lport);
951 			hnputs(h4->tcpdport, s->rport);
952 			v6tov4(h4->tcpsrc, s->laddr);
953 			v6tov4(h4->tcpdst, s->raddr);
954 			break;
955 		case V6:
956 			h6 = &tcb->protohdr.tcp6hdr;
957 			memset(h6, 0, sizeof(*h6));
958 			h6->proto = IP_TCPPROTO;
959 			hnputs(h6->tcpsport, s->lport);
960 			hnputs(h6->tcpdport, s->rport);
961 			ipmove(h6->tcpsrc, s->laddr);
962 			ipmove(h6->tcpdst, s->raddr);
963 			mss = DEF_MSS6;
964 			break;
965 		default:
966 			panic("inittcpctl: version %d", s->ipversion);
967 		}
968 	}
969 
970 	tcb->mss = tcb->cwind = mss;
971 	tcb->abcbytes = 0;
972 	tpriv = s->p->priv;
973 	tpriv->stats[Mss] = tcb->mss;
974 
975 	/* default is no window scaling */
976 	tcpsetscale(s, tcb, 0, 0);
977 }
978 
979 /*
980  *  called with s qlocked
981  */
982 static void
tcpstart(Conv * s,int mode)983 tcpstart(Conv *s, int mode)
984 {
985 	Tcpctl *tcb;
986 	Tcppriv *tpriv;
987 	char kpname[KNAMELEN];
988 
989 	tpriv = s->p->priv;
990 
991 	if(tpriv->ackprocstarted == 0){
992 		qlock(&tpriv->apl);
993 		if(tpriv->ackprocstarted == 0){
994 			snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
995 			kproc(kpname, tcpackproc, s->p);
996 			tpriv->ackprocstarted = 1;
997 		}
998 		qunlock(&tpriv->apl);
999 	}
1000 
1001 	tcb = (Tcpctl*)s->ptcl;
1002 
1003 	inittcpctl(s, mode);
1004 
1005 	iphtadd(&tpriv->ht, s);
1006 	switch(mode) {
1007 	case TCP_LISTEN:
1008 		tpriv->stats[PassiveOpens]++;
1009 		tcb->flags |= CLONE;
1010 		tcpsetstate(s, Listen);
1011 		break;
1012 
1013 	case TCP_CONNECT:
1014 		tpriv->stats[ActiveOpens]++;
1015 		tcb->flags |= ACTIVE;
1016 		tcpsndsyn(s, tcb);
1017 		tcpsetstate(s, Syn_sent);
1018 		tcpoutput(s);
1019 		break;
1020 	}
1021 }
1022 
1023 static char*
tcpflag(char * buf,char * e,ushort flag)1024 tcpflag(char *buf, char *e, ushort flag)
1025 {
1026 	char *p;
1027 
1028 	p = seprint(buf, e, "%d", flag>>10);	/* Head len */
1029 	if(flag & URG)
1030 		p = seprint(p, e, " URG");
1031 	if(flag & ACK)
1032 		p = seprint(p, e, " ACK");
1033 	if(flag & PSH)
1034 		p = seprint(p, e, " PSH");
1035 	if(flag & RST)
1036 		p = seprint(p, e, " RST");
1037 	if(flag & SYN)
1038 		p = seprint(p, e, " SYN");
1039 	if(flag & FIN)
1040 		p = seprint(p, e, " FIN");
1041 	USED(p);
1042 	return buf;
1043 }
1044 
1045 static Block*
htontcp6(Tcp * tcph,Block * data,Tcp6hdr * ph,Tcpctl * tcb)1046 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1047 {
1048 	int dlen;
1049 	Tcp6hdr *h;
1050 	ushort csum;
1051 	ushort hdrlen, optpad = 0;
1052 	uchar *opt;
1053 
1054 	hdrlen = TCP6_HDRSIZE;
1055 	if(tcph->flags & SYN){
1056 		if(tcph->mss)
1057 			hdrlen += MSS_LENGTH;
1058 		if(tcph->ws)
1059 			hdrlen += WS_LENGTH;
1060 		optpad = hdrlen & 3;
1061 		if(optpad)
1062 			optpad = 4 - optpad;
1063 		hdrlen += optpad;
1064 	}
1065 
1066 	if(data) {
1067 		dlen = blocklen(data);
1068 		data = padblock(data, hdrlen + TCP6_PKT);
1069 		if(data == nil)
1070 			return nil;
1071 	}
1072 	else {
1073 		dlen = 0;
1074 		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
1075 		if(data == nil)
1076 			return nil;
1077 		data->wp += hdrlen + TCP6_PKT;
1078 	}
1079 
1080 	/* copy in pseudo ip header plus port numbers */
1081 	h = (Tcp6hdr *)(data->rp);
1082 	memmove(h, ph, TCP6_TCBPHDRSZ);
1083 
1084 	/* compose pseudo tcp header, do cksum calculation */
1085 	hnputl(h->vcf, hdrlen + dlen);
1086 	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1087 	h->ttl = ph->proto;
1088 
1089 	/* copy in variable bits */
1090 	hnputl(h->tcpseq, tcph->seq);
1091 	hnputl(h->tcpack, tcph->ack);
1092 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1093 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1094 	hnputs(h->tcpurg, tcph->urg);
1095 
1096 	if(tcph->flags & SYN){
1097 		opt = h->tcpopt;
1098 		if(tcph->mss != 0){
1099 			*opt++ = MSSOPT;
1100 			*opt++ = MSS_LENGTH;
1101 			hnputs(opt, tcph->mss);
1102 			opt += 2;
1103 		}
1104 		if(tcph->ws != 0){
1105 			*opt++ = WSOPT;
1106 			*opt++ = WS_LENGTH;
1107 			*opt++ = tcph->ws;
1108 		}
1109 		while(optpad-- > 0)
1110 			*opt++ = NOOPOPT;
1111 	}
1112 
1113 	if(tcb != nil && tcb->nochecksum){
1114 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1115 	} else {
1116 		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1117 		hnputs(h->tcpcksum, csum);
1118 	}
1119 
1120 	/* move from pseudo header back to normal ip header */
1121 	memset(h->vcf, 0, 4);
1122 	h->vcf[0] = IP_VER6;
1123 	hnputs(h->ploadlen, hdrlen+dlen);
1124 	h->proto = ph->proto;
1125 
1126 	return data;
1127 }
1128 
1129 static Block*
htontcp4(Tcp * tcph,Block * data,Tcp4hdr * ph,Tcpctl * tcb)1130 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1131 {
1132 	int dlen;
1133 	Tcp4hdr *h;
1134 	ushort csum;
1135 	ushort hdrlen, optpad = 0;
1136 	uchar *opt;
1137 
1138 	hdrlen = TCP4_HDRSIZE;
1139 	if(tcph->flags & SYN){
1140 		if(tcph->mss)
1141 			hdrlen += MSS_LENGTH;
1142 		if(1)
1143 			hdrlen += WS_LENGTH;
1144 		optpad = hdrlen & 3;
1145 		if(optpad)
1146 			optpad = 4 - optpad;
1147 		hdrlen += optpad;
1148 	}
1149 
1150 	if(data) {
1151 		dlen = blocklen(data);
1152 		data = padblock(data, hdrlen + TCP4_PKT);
1153 		if(data == nil)
1154 			return nil;
1155 	}
1156 	else {
1157 		dlen = 0;
1158 		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1159 		if(data == nil)
1160 			return nil;
1161 		data->wp += hdrlen + TCP4_PKT;
1162 	}
1163 
1164 	/* copy in pseudo ip header plus port numbers */
1165 	h = (Tcp4hdr *)(data->rp);
1166 	memmove(h, ph, TCP4_TCBPHDRSZ);
1167 
1168 	/* copy in variable bits */
1169 	hnputs(h->tcplen, hdrlen + dlen);
1170 	hnputl(h->tcpseq, tcph->seq);
1171 	hnputl(h->tcpack, tcph->ack);
1172 	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1173 	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1174 	hnputs(h->tcpurg, tcph->urg);
1175 
1176 	if(tcph->flags & SYN){
1177 		opt = h->tcpopt;
1178 		if(tcph->mss != 0){
1179 			*opt++ = MSSOPT;
1180 			*opt++ = MSS_LENGTH;
1181 			hnputs(opt, tcph->mss);
1182 			opt += 2;
1183 		}
1184 		/* always offer.  rfc1323 §2.2 */
1185 		if(1){
1186 			*opt++ = WSOPT;
1187 			*opt++ = WS_LENGTH;
1188 			*opt++ = tcph->ws;
1189 		}
1190 		while(optpad-- > 0)
1191 			*opt++ = NOOPOPT;
1192 	}
1193 
1194 	if(tcb != nil && tcb->nochecksum){
1195 		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1196 	} else {
1197 		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1198 		hnputs(h->tcpcksum, csum);
1199 	}
1200 
1201 	return data;
1202 }
1203 
1204 static int
ntohtcp6(Tcp * tcph,Block ** bpp)1205 ntohtcp6(Tcp *tcph, Block **bpp)
1206 {
1207 	Tcp6hdr *h;
1208 	uchar *optr;
1209 	ushort hdrlen;
1210 	ushort optlen;
1211 	int n;
1212 
1213 	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1214 	if(*bpp == nil)
1215 		return -1;
1216 
1217 	h = (Tcp6hdr *)((*bpp)->rp);
1218 	tcph->source = nhgets(h->tcpsport);
1219 	tcph->dest = nhgets(h->tcpdport);
1220 	tcph->seq = nhgetl(h->tcpseq);
1221 	tcph->ack = nhgetl(h->tcpack);
1222 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1223 	if(hdrlen < TCP6_HDRSIZE) {
1224 		freeblist(*bpp);
1225 		return -1;
1226 	}
1227 
1228 	tcph->flags = h->tcpflag[1];
1229 	tcph->wnd = nhgets(h->tcpwin);
1230 	tcph->urg = nhgets(h->tcpurg);
1231 	tcph->mss = 0;
1232 	tcph->ws = 0;
1233 	tcph->update = 0;
1234 	tcph->len = nhgets(h->ploadlen) - hdrlen;
1235 
1236 	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1237 	if(*bpp == nil)
1238 		return -1;
1239 
1240 	optr = h->tcpopt;
1241 	n = hdrlen - TCP6_HDRSIZE;
1242 	while(n > 0 && *optr != EOLOPT) {
1243 		if(*optr == NOOPOPT) {
1244 			n--;
1245 			optr++;
1246 			continue;
1247 		}
1248 		optlen = optr[1];
1249 		if(optlen < 2 || optlen > n)
1250 			break;
1251 		switch(*optr) {
1252 		case MSSOPT:
1253 			if(optlen == MSS_LENGTH)
1254 				tcph->mss = nhgets(optr+2);
1255 			break;
1256 		case WSOPT:
1257 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1258 				tcph->ws = *(optr+2);
1259 			break;
1260 		}
1261 		n -= optlen;
1262 		optr += optlen;
1263 	}
1264 	return hdrlen;
1265 }
1266 
1267 static int
ntohtcp4(Tcp * tcph,Block ** bpp)1268 ntohtcp4(Tcp *tcph, Block **bpp)
1269 {
1270 	Tcp4hdr *h;
1271 	uchar *optr;
1272 	ushort hdrlen;
1273 	ushort optlen;
1274 	int n;
1275 
1276 	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1277 	if(*bpp == nil)
1278 		return -1;
1279 
1280 	h = (Tcp4hdr *)((*bpp)->rp);
1281 	tcph->source = nhgets(h->tcpsport);
1282 	tcph->dest = nhgets(h->tcpdport);
1283 	tcph->seq = nhgetl(h->tcpseq);
1284 	tcph->ack = nhgetl(h->tcpack);
1285 
1286 	hdrlen = (h->tcpflag[0]>>2) & ~3;
1287 	if(hdrlen < TCP4_HDRSIZE) {
1288 		freeblist(*bpp);
1289 		return -1;
1290 	}
1291 
1292 	tcph->flags = h->tcpflag[1];
1293 	tcph->wnd = nhgets(h->tcpwin);
1294 	tcph->urg = nhgets(h->tcpurg);
1295 	tcph->mss = 0;
1296 	tcph->ws = 0;
1297 	tcph->update = 0;
1298 	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1299 
1300 	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1301 	if(*bpp == nil)
1302 		return -1;
1303 
1304 	optr = h->tcpopt;
1305 	n = hdrlen - TCP4_HDRSIZE;
1306 	while(n > 0 && *optr != EOLOPT) {
1307 		if(*optr == NOOPOPT) {
1308 			n--;
1309 			optr++;
1310 			continue;
1311 		}
1312 		optlen = optr[1];
1313 		if(optlen < 2 || optlen > n)
1314 			break;
1315 		switch(*optr) {
1316 		case MSSOPT:
1317 			if(optlen == MSS_LENGTH)
1318 				tcph->mss = nhgets(optr+2);
1319 			break;
1320 		case WSOPT:
1321 			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1322 				tcph->ws = *(optr+2);
1323 			break;
1324 		}
1325 		n -= optlen;
1326 		optr += optlen;
1327 	}
1328 	return hdrlen;
1329 }
1330 
1331 /*
1332  *  For outgoing calls, generate an initial sequence
1333  *  number and put a SYN on the send queue
1334  */
1335 static void
tcpsndsyn(Conv * s,Tcpctl * tcb)1336 tcpsndsyn(Conv *s, Tcpctl *tcb)
1337 {
1338 	Tcppriv *tpriv;
1339 
1340 	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1341 	tcb->rttseq = tcb->iss;
1342 	tcb->snd.wl2 = tcb->iss;
1343 	tcb->snd.una = tcb->iss;
1344 	tcb->snd.rxt = tcb->iss;
1345 	tcb->snd.ptr = tcb->rttseq;
1346 	tcb->snd.nxt = tcb->rttseq;
1347 	tcb->flgcnt++;
1348 	tcb->flags |= FORCE;
1349 	tcb->sndsyntime = NOW;
1350 
1351 	/* set desired mss and scale */
1352 	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1353 	tpriv = s->p->priv;
1354 	tpriv->stats[Mss] = tcb->mss;
1355 }
1356 
1357 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1358 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1359 {
1360 	Block *hbp;
1361 	uchar rflags;
1362 	Tcppriv *tpriv;
1363 	Tcp4hdr ph4;
1364 	Tcp6hdr ph6;
1365 
1366 	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1367 
1368 	tpriv = tcp->priv;
1369 
1370 	if(seg->flags & RST)
1371 		return;
1372 
1373 	/* make pseudo header */
1374 	switch(version) {
1375 	case V4:
1376 		memset(&ph4, 0, sizeof(ph4));
1377 		ph4.vihl = IP_VER4;
1378 		v6tov4(ph4.tcpsrc, dest);
1379 		v6tov4(ph4.tcpdst, source);
1380 		ph4.proto = IP_TCPPROTO;
1381 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1382 		hnputs(ph4.tcpsport, seg->dest);
1383 		hnputs(ph4.tcpdport, seg->source);
1384 		break;
1385 	case V6:
1386 		memset(&ph6, 0, sizeof(ph6));
1387 		ph6.vcf[0] = IP_VER6;
1388 		ipmove(ph6.tcpsrc, dest);
1389 		ipmove(ph6.tcpdst, source);
1390 		ph6.proto = IP_TCPPROTO;
1391 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1392 		hnputs(ph6.tcpsport, seg->dest);
1393 		hnputs(ph6.tcpdport, seg->source);
1394 		break;
1395 	default:
1396 		panic("sndrst: version %d", version);
1397 	}
1398 
1399 	tpriv->stats[OutRsts]++;
1400 	rflags = RST;
1401 
1402 	/* convince the other end that this reset is in band */
1403 	if(seg->flags & ACK) {
1404 		seg->seq = seg->ack;
1405 		seg->ack = 0;
1406 	}
1407 	else {
1408 		rflags |= ACK;
1409 		seg->ack = seg->seq;
1410 		seg->seq = 0;
1411 		if(seg->flags & SYN)
1412 			seg->ack++;
1413 		seg->ack += length;
1414 		if(seg->flags & FIN)
1415 			seg->ack++;
1416 	}
1417 	seg->flags = rflags;
1418 	seg->wnd = 0;
1419 	seg->urg = 0;
1420 	seg->mss = 0;
1421 	seg->ws = 0;
1422 	switch(version) {
1423 	case V4:
1424 		hbp = htontcp4(seg, nil, &ph4, nil);
1425 		if(hbp == nil)
1426 			return;
1427 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1428 		break;
1429 	case V6:
1430 		hbp = htontcp6(seg, nil, &ph6, nil);
1431 		if(hbp == nil)
1432 			return;
1433 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1434 		break;
1435 	default:
1436 		panic("sndrst2: version %d", version);
1437 	}
1438 }
1439 
1440 /*
1441  * close the conversation
1442  */
1443 static char*
tcpclose2(Conv * s)1444 tcpclose2(Conv *s)
1445 {
1446 	tcpclose(s);
1447 	return nil;
1448 }
1449 
1450 /*
1451  *  send a reset to the remote side and close the conversation
1452  *  called with s qlocked
1453  */
1454 static char*
tcphangup(Conv * s)1455 tcphangup(Conv *s)
1456 {
1457 	Tcp seg;
1458 	Tcpctl *tcb;
1459 	Block *hbp;
1460 
1461 	tcb = (Tcpctl*)s->ptcl;
1462 	if(waserror())
1463 		return up->errstr;
1464 	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1465 		if(!waserror()){
1466 			memset(&seg, 0, sizeof seg);
1467 			seg.flags = RST | ACK;
1468 			seg.ack = tcb->rcv.nxt;
1469 			tcb->rcv.ackptr = seg.ack;
1470 			seg.seq = tcb->snd.ptr;
1471 			seg.wnd = 0;
1472 			seg.urg = 0;
1473 			seg.mss = 0;
1474 			seg.ws = 0;
1475 			switch(s->ipversion) {
1476 			case V4:
1477 				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1478 				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1479 				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1480 				break;
1481 			case V6:
1482 				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1483 				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1484 				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1485 				break;
1486 			default:
1487 				panic("tcphangup: version %d", s->ipversion);
1488 			}
1489 			poperror();
1490 		}
1491 	}
1492 	localclose(s, nil);
1493 	poperror();
1494 	return nil;
1495 }
1496 
1497 /*
1498  *  (re)send a SYN ACK
1499  */
1500 static int
sndsynack(Proto * tcp,Limbo * lp)1501 sndsynack(Proto *tcp, Limbo *lp)
1502 {
1503 	Block *hbp;
1504 	Tcp4hdr ph4;
1505 	Tcp6hdr ph6;
1506 	Tcp seg;
1507 	uint scale;
1508 
1509 	/* make pseudo header */
1510 	switch(lp->version) {
1511 	case V4:
1512 		memset(&ph4, 0, sizeof(ph4));
1513 		ph4.vihl = IP_VER4;
1514 		v6tov4(ph4.tcpsrc, lp->laddr);
1515 		v6tov4(ph4.tcpdst, lp->raddr);
1516 		ph4.proto = IP_TCPPROTO;
1517 		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1518 		hnputs(ph4.tcpsport, lp->lport);
1519 		hnputs(ph4.tcpdport, lp->rport);
1520 		break;
1521 	case V6:
1522 		memset(&ph6, 0, sizeof(ph6));
1523 		ph6.vcf[0] = IP_VER6;
1524 		ipmove(ph6.tcpsrc, lp->laddr);
1525 		ipmove(ph6.tcpdst, lp->raddr);
1526 		ph6.proto = IP_TCPPROTO;
1527 		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1528 		hnputs(ph6.tcpsport, lp->lport);
1529 		hnputs(ph6.tcpdport, lp->rport);
1530 		break;
1531 	default:
1532 		panic("sndrst: version %d", lp->version);
1533 	}
1534 
1535 	memset(&seg, 0, sizeof seg);
1536 	seg.seq = lp->iss;
1537 	seg.ack = lp->irs+1;
1538 	seg.flags = SYN|ACK;
1539 	seg.urg = 0;
1540 	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1541 	seg.wnd = QMAX;
1542 
1543 	/* if the other side set scale, we should too */
1544 	if(lp->rcvscale){
1545 		seg.ws = scale;
1546 		lp->sndscale = scale;
1547 	} else {
1548 		seg.ws = 0;
1549 		lp->sndscale = 0;
1550 	}
1551 
1552 	switch(lp->version) {
1553 	case V4:
1554 		hbp = htontcp4(&seg, nil, &ph4, nil);
1555 		if(hbp == nil)
1556 			return -1;
1557 		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1558 		break;
1559 	case V6:
1560 		hbp = htontcp6(&seg, nil, &ph6, nil);
1561 		if(hbp == nil)
1562 			return -1;
1563 		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1564 		break;
1565 	default:
1566 		panic("sndsnack: version %d", lp->version);
1567 	}
1568 	lp->lastsend = NOW;
1569 	return 0;
1570 }
1571 
/* limbo hash: last two address bytes plus the port, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1573 
1574 /*
1575  *  put a call into limbo and respond with a SYN ACK
1576  *
1577  *  called with proto locked
1578  */
1579 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1580 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1581 {
1582 	Limbo *lp, **l;
1583 	Tcppriv *tpriv;
1584 	int h;
1585 
1586 	tpriv = s->p->priv;
1587 	h = hashipa(source, seg->source);
1588 
1589 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1590 		lp = *l;
1591 		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1592 			continue;
1593 		if(ipcmp(lp->raddr, source) != 0)
1594 			continue;
1595 		if(ipcmp(lp->laddr, dest) != 0)
1596 			continue;
1597 
1598 		/* each new SYN restarts the retransmits */
1599 		lp->irs = seg->seq;
1600 		break;
1601 	}
1602 	lp = *l;
1603 	if(lp == nil){
1604 		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1605 			lp = tpriv->lht[h];
1606 			tpriv->lht[h] = lp->next;
1607 			lp->next = nil;
1608 		} else {
1609 			lp = malloc(sizeof(*lp));
1610 			if(lp == nil)
1611 				return;
1612 			tpriv->nlimbo++;
1613 		}
1614 		*l = lp;
1615 		lp->version = version;
1616 		ipmove(lp->laddr, dest);
1617 		ipmove(lp->raddr, source);
1618 		lp->lport = seg->dest;
1619 		lp->rport = seg->source;
1620 		lp->mss = seg->mss;
1621 		lp->rcvscale = seg->ws;
1622 		lp->irs = seg->seq;
1623 		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1624 	}
1625 
1626 	if(sndsynack(s->p, lp) < 0){
1627 		*l = lp->next;
1628 		tpriv->nlimbo--;
1629 		free(lp);
1630 	}
1631 }
1632 
1633 /*
1634  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1635  */
1636 static void
limborexmit(Proto * tcp)1637 limborexmit(Proto *tcp)
1638 {
1639 	Tcppriv *tpriv;
1640 	Limbo **l, *lp;
1641 	int h;
1642 	int seen;
1643 	ulong now;
1644 
1645 	tpriv = tcp->priv;
1646 
1647 	if(!canqlock(tcp))
1648 		return;
1649 	seen = 0;
1650 	now = NOW;
1651 	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1652 		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1653 			lp = *l;
1654 			seen++;
1655 			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1656 				continue;
1657 
1658 			/* time it out after 1 second */
1659 			if(++(lp->rexmits) > 5){
1660 				tpriv->nlimbo--;
1661 				*l = lp->next;
1662 				free(lp);
1663 				continue;
1664 			}
1665 
1666 			/* if we're being attacked, don't bother resending SYN ACK's */
1667 			if(tpriv->nlimbo > 100)
1668 				continue;
1669 
1670 			if(sndsynack(tcp, lp) < 0){
1671 				tpriv->nlimbo--;
1672 				*l = lp->next;
1673 				free(lp);
1674 				continue;
1675 			}
1676 
1677 			l = &lp->next;
1678 		}
1679 	}
1680 	qunlock(tcp);
1681 }
1682 
1683 /*
1684  *  lookup call in limbo.  if found, throw it out.
1685  *
1686  *  called with proto locked
1687  */
1688 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1689 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1690 {
1691 	Limbo *lp, **l;
1692 	int h;
1693 	Tcppriv *tpriv;
1694 
1695 	tpriv = s->p->priv;
1696 
1697 	/* find a call in limbo */
1698 	h = hashipa(src, segp->source);
1699 	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1700 		lp = *l;
1701 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1702 			continue;
1703 		if(ipcmp(lp->laddr, dst) != 0)
1704 			continue;
1705 		if(ipcmp(lp->raddr, src) != 0)
1706 			continue;
1707 
1708 		/* RST can only follow the SYN */
1709 		if(segp->seq == lp->irs+1){
1710 			tpriv->nlimbo--;
1711 			*l = lp->next;
1712 			free(lp);
1713 		}
1714 		break;
1715 	}
1716 }
1717 
1718 static void
initialwindow(Tcpctl * tcb)1719 initialwindow(Tcpctl *tcb)
1720 {
1721 	/* RFC 3390 initial window */
1722 	if(tcb->mss < 1095)
1723 		tcb->cwind = 4*tcb->mss;
1724 	else if(tcb->mss < 2190)
1725 		tcb->cwind = 2*2190;
1726 	else
1727 		tcb->cwind = 2*tcb->mss;
1728 }
1729 
1730 /*
1731  *  come here when we finally get an ACK to our SYN-ACK.
1732  *  lookup call in limbo.  if found, create a new conversation
1733  *
1734  *  called with proto locked
1735  */
1736 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1737 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1738 {
1739 	Conv *new;
1740 	Tcpctl *tcb;
1741 	Tcppriv *tpriv;
1742 	Tcp4hdr *h4;
1743 	Tcp6hdr *h6;
1744 	Limbo *lp, **l;
1745 	int h;
1746 
1747 	/* unless it's just an ack, it can't be someone coming out of limbo */
1748 	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1749 		return nil;
1750 
1751 	tpriv = s->p->priv;
1752 
1753 	/* find a call in limbo */
1754 	h = hashipa(src, segp->source);
1755 	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1756 		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1757 			src, segp->source, lp->raddr, lp->rport,
1758 			dst, segp->dest, lp->laddr, lp->lport,
1759 			version, lp->version
1760  		);
1761 
1762 		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1763 			continue;
1764 		if(ipcmp(lp->laddr, dst) != 0)
1765 			continue;
1766 		if(ipcmp(lp->raddr, src) != 0)
1767 			continue;
1768 
1769 		/* we're assuming no data with the initial SYN */
1770 		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1771 			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1772 				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1773 			lp = nil;
1774 		} else {
1775 			tpriv->nlimbo--;
1776 			*l = lp->next;
1777 		}
1778 		break;
1779 	}
1780 	if(lp == nil)
1781 		return nil;
1782 
1783 	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1784 	if(new == nil)
1785 		return nil;
1786 
1787 	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1788 	tcb = (Tcpctl*)new->ptcl;
1789 	tcb->flags &= ~CLONE;
1790 	tcb->timer.arg = new;
1791 	tcb->timer.state = TcptimerOFF;
1792 	tcb->acktimer.arg = new;
1793 	tcb->acktimer.state = TcptimerOFF;
1794 	tcb->katimer.arg = new;
1795 	tcb->katimer.state = TcptimerOFF;
1796 	tcb->rtt_timer.arg = new;
1797 	tcb->rtt_timer.state = TcptimerOFF;
1798 
1799 	tcb->irs = lp->irs;
1800 	tcb->rcv.nxt = tcb->irs+1;
1801 	tcb->rcv.wptr = tcb->rcv.nxt;
1802 	tcb->rcv.wsnt = 0;
1803 	tcb->rcv.urg = tcb->rcv.nxt;
1804 
1805 	tcb->iss = lp->iss;
1806 	tcb->rttseq = tcb->iss;
1807 	tcb->snd.wl2 = tcb->iss;
1808 	tcb->snd.una = tcb->iss+1;
1809 	tcb->snd.ptr = tcb->iss+1;
1810 	tcb->snd.nxt = tcb->iss+1;
1811 	tcb->snd.rxt = tcb->iss+1;
1812 	tcb->flgcnt = 0;
1813 	tcb->flags |= SYNACK;
1814 
1815 	/* set desired mss and scale */
1816 	tcb->mss = tcpmtu(s->p, dst, s->ipversion, &tcb->scale);
1817 
1818 	/* our sending max segment size cannot be bigger than what he asked for */
1819 	if(lp->mss != 0 && lp->mss < tcb->mss)
1820 		tcb->mss = lp->mss;
1821 	tpriv->stats[Mss] = tcb->mss;
1822 
1823 	/* window scaling */
1824 	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1825 
1826 	/* congestion window */
1827 	tcb->snd.wnd = segp->wnd;
1828 	initialwindow(tcb);
1829 
1830 	/* set initial round trip time */
1831 	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1832 	tcpsynackrtt(new);
1833 
1834 	free(lp);
1835 
1836 	/* set up proto header */
1837 	switch(version){
1838 	case V4:
1839 		h4 = &tcb->protohdr.tcp4hdr;
1840 		memset(h4, 0, sizeof(*h4));
1841 		h4->proto = IP_TCPPROTO;
1842 		hnputs(h4->tcpsport, new->lport);
1843 		hnputs(h4->tcpdport, new->rport);
1844 		v6tov4(h4->tcpsrc, dst);
1845 		v6tov4(h4->tcpdst, src);
1846 		break;
1847 	case V6:
1848 		h6 = &tcb->protohdr.tcp6hdr;
1849 		memset(h6, 0, sizeof(*h6));
1850 		h6->proto = IP_TCPPROTO;
1851 		hnputs(h6->tcpsport, new->lport);
1852 		hnputs(h6->tcpdport, new->rport);
1853 		ipmove(h6->tcpsrc, dst);
1854 		ipmove(h6->tcpdst, src);
1855 		break;
1856 	default:
1857 		panic("tcpincoming: version %d", new->ipversion);
1858 	}
1859 
1860 	tcpsetstate(new, Established);
1861 
1862 	iphtadd(&tpriv->ht, new);
1863 
1864 	return new;
1865 }
1866 
/*
 *  Return 1 if x lies in the closed interval [low, high] of sequence
 *  space, 0 otherwise.  When low > high the interval is taken to wrap
 *  around the top of the number space.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return x >= low || x <= high;	/* wrapped interval */
	return low <= x && x <= high;
}
1880 
/* sequence-space x < y: the sign of the difference taken as an int */
static int
seq_lt(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d < 0;
}
1886 
/* sequence-space x <= y: the sign of the difference taken as an int */
static int
seq_le(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d <= 0;
}
1892 
/* sequence-space x > y: the sign of the difference taken as an int */
static int
seq_gt(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d > 0;
}
1898 
/* sequence-space x >= y: the sign of the difference taken as an int */
static int
seq_ge(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d >= 0;
}
1904 
1905 /*
1906  *  use the time between the first SYN and it's ack as the
1907  *  initial round trip time
1908  */
1909 static void
tcpsynackrtt(Conv * s)1910 tcpsynackrtt(Conv *s)
1911 {
1912 	Tcpctl *tcb;
1913 	int delta;
1914 	Tcppriv *tpriv;
1915 
1916 	tcb = (Tcpctl*)s->ptcl;
1917 	tpriv = s->p->priv;
1918 
1919 	delta = NOW - tcb->sndsyntime;
1920 	tcb->srtt = delta<<LOGAGAIN;
1921 	tcb->mdev = delta<<LOGDGAIN;
1922 
1923 	/* halt round trip timer */
1924 	tcphalt(tpriv, &tcb->rtt_timer);
1925 }
1926 
1927 static void
update(Conv * s,Tcp * seg)1928 update(Conv *s, Tcp *seg)
1929 {
1930 	int rtt, delta;
1931 	Tcpctl *tcb;
1932 	ulong acked;
1933 	Tcppriv *tpriv;
1934 
1935 	if(seg->update)
1936 		return;
1937 	seg->update = 1;
1938 
1939 	tpriv = s->p->priv;
1940 	tcb = (Tcpctl*)s->ptcl;
1941 
1942 	/* catch zero-window updates, update window & recover */
1943 	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
1944 	    seq_lt(seg->ack, tcb->snd.ptr)){
1945 		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1946 			seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
1947 		tcb->snd.wnd = seg->wnd;
1948 		goto recovery;
1949 	}
1950 
1951 	/* newreno fast retransmit */
1952 	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
1953 	    ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
1954 recovery:
1955 		if(tcb->snd.recovery){
1956 			tpriv->stats[RecoveryCwind]++;
1957 			tcb->cwind += tcb->mss;
1958 		}else if(seq_le(tcb->snd.rxt, seg->ack)){
1959 			tpriv->stats[Recovery]++;
1960 			tcb->abcbytes = 0;
1961 			tcb->snd.recovery = 1;
1962 			tcb->snd.partialack = 0;
1963 			tcb->snd.rxt = tcb->snd.nxt;
1964 			tcpcongestion(tcb);
1965 			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1966 			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1967 				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1968 			tcprxmit(s);
1969 		}else{
1970 			tpriv->stats[RecoveryNoSeq]++;
1971 			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1972 				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1973 			/* don't enter fast retransmit, don't change ssthresh */
1974 		}
1975 	}else if(tcb->snd.recovery){
1976 		tpriv->stats[RecoveryCwind]++;
1977 		tcb->cwind += tcb->mss;
1978 	}
1979 
1980 	/*
1981 	 *  update window
1982 	 */
1983 	if(seq_gt(seg->ack, tcb->snd.wl2)
1984 	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1985 		/* clear dupack if we advance wl2 */
1986 		if(tcb->snd.wl2 != seg->ack)
1987 			tcb->snd.dupacks = 0;
1988 		tcb->snd.wnd = seg->wnd;
1989 		tcb->snd.wl2 = seg->ack;
1990 	}
1991 
1992 	if(!seq_gt(seg->ack, tcb->snd.una)){
1993 		/*
1994 		 *  don't let us hangup if sending into a closed window and
1995 		 *  we're still getting acks
1996 		 */
1997 		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1998 			tcb->backedoff = MAXBACKMS/4;
1999 		return;
2000 	}
2001 
2002 	/* Compute the new send window size */
2003 	acked = seg->ack - tcb->snd.una;
2004 
2005 	/* avoid slow start and timers for SYN acks */
2006 	if((tcb->flags & SYNACK) == 0) {
2007 		tcb->flags |= SYNACK;
2008 		acked--;
2009 		tcb->flgcnt--;
2010 		goto done;
2011 	}
2012 
2013 	/*
2014 	 * congestion control
2015 	 */
2016 	if(tcb->snd.recovery){
2017 		if(seq_ge(seg->ack, tcb->snd.rxt)){
2018 			/* recovery finished; deflate window */
2019 			tpriv->stats[RecoveryDone]++;
2020 			tcb->snd.dupacks = 0;
2021 			tcb->snd.recovery = 0;
2022 			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
2023 			if(tcb->ssthresh < tcb->cwind)
2024 				tcb->cwind = tcb->ssthresh;
2025 			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
2026 				tcb->cwind, tcb->ssthresh);
2027 		} else {
2028 			/* partial ack; we lost more than one segment */
2029 			tpriv->stats[RecoveryPA]++;
2030 			if(tcb->cwind > acked)
2031 				tcb->cwind -= acked;
2032 			else{
2033 				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
2034 				tcb->cwind = tcb->mss;
2035 			}
2036 			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
2037 				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
2038 
2039 			if(acked >= tcb->mss)
2040 				tcb->cwind += tcb->mss;
2041 			tcb->snd.partialack++;
2042 		}
2043 	} else
2044 		tcpabcincr(tcb, acked);
2045 
2046 	/* Adjust the timers according to the round trip time */
2047 	/* TODO: fix sloppy treatment of overflow cases here. */
2048 	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2049 		tcphalt(tpriv, &tcb->rtt_timer);
2050 		if((tcb->flags&RETRAN) == 0) {
2051 			tcb->backoff = 0;
2052 			tcb->backedoff = 0;
2053 			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2054 			if(rtt == 0)
2055 				rtt = 1; /* else all close sys's will rexmit in 0 time */
2056 			rtt *= MSPTICK;
2057 			if(tcb->srtt == 0) {
2058 				tcb->srtt = rtt << LOGAGAIN;
2059 				tcb->mdev = rtt << LOGDGAIN;
2060 			} else {
2061 				delta = rtt - (tcb->srtt>>LOGAGAIN);
2062 				tcb->srtt += delta;
2063 				if(tcb->srtt <= 0)
2064 					tcb->srtt = 1;
2065 
2066 				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2067 				tcb->mdev += delta;
2068 				if(tcb->mdev <= 0)
2069 					tcb->mdev = 1;
2070 			}
2071 			tcpsettimer(tcb);
2072 		}
2073 	}
2074 
2075 done:
2076 	if(qdiscard(s->wq, acked) < acked)
2077 		tcb->flgcnt--;
2078 	tcb->snd.una = seg->ack;
2079 
2080 	/* newreno fast recovery */
2081 	if(tcb->snd.recovery)
2082 		tcprxmit(s);
2083 
2084 	if(seq_gt(seg->ack, tcb->snd.urg))
2085 		tcb->snd.urg = seg->ack;
2086 
2087 	if(tcb->snd.una != tcb->snd.nxt){
2088 		/* `impatient' variant */
2089 		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2090 			tcb->time = NOW;
2091 			tcb->timeuna = tcb->snd.una;
2092 			tcpgo(tpriv, &tcb->timer);
2093 		}
2094 	} else
2095 		tcphalt(tpriv, &tcb->timer);
2096 
2097 	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2098 		tcb->snd.ptr = tcb->snd.una;
2099 
2100 	if(!tcb->snd.recovery)
2101 		tcb->flags &= ~RETRAN;
2102 	tcb->backoff = 0;
2103 	tcb->backedoff = 0;
2104 }
2105 
2106 static void
tcpiput(Proto * tcp,Ipifc *,Block * bp)2107 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2108 {
2109 	Tcp seg;
2110 	Tcp4hdr *h4;
2111 	Tcp6hdr *h6;
2112 	int hdrlen;
2113 	Tcpctl *tcb;
2114 	ushort length, csum;
2115 	uchar source[IPaddrlen], dest[IPaddrlen];
2116 	Conv *s;
2117 	Fs *f;
2118 	Tcppriv *tpriv;
2119 	uchar version;
2120 
2121 	f = tcp->f;
2122 	tpriv = tcp->priv;
2123 
2124 	tpriv->stats[InSegs]++;
2125 
2126 	h4 = (Tcp4hdr*)(bp->rp);
2127 	h6 = (Tcp6hdr*)(bp->rp);
2128 
2129 	if((h4->vihl&0xF0)==IP_VER4) {
2130 		version = V4;
2131 		length = nhgets(h4->length);
2132 		v4tov6(dest, h4->tcpdst);
2133 		v4tov6(source, h4->tcpsrc);
2134 
2135 		h4->Unused = 0;
2136 		hnputs(h4->tcplen, length-TCP4_PKT);
2137 		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2138 			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2139 			tpriv->stats[CsumErrs]++;
2140 			tpriv->stats[InErrs]++;
2141 			netlog(f, Logtcp, "bad tcp proto cksum\n");
2142 			freeblist(bp);
2143 			return;
2144 		}
2145 
2146 		hdrlen = ntohtcp4(&seg, &bp);
2147 		if(hdrlen < 0){
2148 			tpriv->stats[HlenErrs]++;
2149 			tpriv->stats[InErrs]++;
2150 			netlog(f, Logtcp, "bad tcp hdr len\n");
2151 			return;
2152 		}
2153 
2154 		/* trim the packet to the size claimed by the datagram */
2155 		length -= hdrlen+TCP4_PKT;
2156 		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2157 		if(bp == nil){
2158 			tpriv->stats[LenErrs]++;
2159 			tpriv->stats[InErrs]++;
2160 			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2161 			return;
2162 		}
2163 	}
2164 	else {
2165 		int ttl = h6->ttl;
2166 		int proto = h6->proto;
2167 
2168 		version = V6;
2169 		length = nhgets(h6->ploadlen);
2170 		ipmove(dest, h6->tcpdst);
2171 		ipmove(source, h6->tcpsrc);
2172 
2173 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2174 		h6->ttl = proto;
2175 		hnputl(h6->vcf, length);
2176 		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2177 		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2178 			tpriv->stats[CsumErrs]++;
2179 			tpriv->stats[InErrs]++;
2180 			netlog(f, Logtcp,
2181 			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2182 				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2183 			freeblist(bp);
2184 			return;
2185 		}
2186 		h6->ttl = ttl;
2187 		h6->proto = proto;
2188 		hnputs(h6->ploadlen, length);
2189 
2190 		hdrlen = ntohtcp6(&seg, &bp);
2191 		if(hdrlen < 0){
2192 			tpriv->stats[HlenErrs]++;
2193 			tpriv->stats[InErrs]++;
2194 			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2195 			return;
2196 		}
2197 
2198 		/* trim the packet to the size claimed by the datagram */
2199 		length -= hdrlen;
2200 		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2201 		if(bp == nil){
2202 			tpriv->stats[LenErrs]++;
2203 			tpriv->stats[InErrs]++;
2204 			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2205 			return;
2206 		}
2207 	}
2208 
2209 	/* lock protocol while searching for a conversation */
2210 	qlock(tcp);
2211 
2212 	/* Look for a matching conversation */
2213 	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2214 	if(s == nil){
2215 		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2216 			source, seg.source, dest, seg.dest);
2217 reset:
2218 		qunlock(tcp);
2219 		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2220 		freeblist(bp);
2221 		return;
2222 	}
2223 
2224 	/* if it's a listener, look for the right flags and get a new conv */
2225 	tcb = (Tcpctl*)s->ptcl;
2226 	if(tcb->state == Listen){
2227 		if(seg.flags & RST){
2228 			limborst(s, &seg, source, dest, version);
2229 			qunlock(tcp);
2230 			freeblist(bp);
2231 			return;
2232 		}
2233 
2234 		/* if this is a new SYN, put the call into limbo */
2235 		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2236 			limbo(s, source, dest, &seg, version);
2237 			qunlock(tcp);
2238 			freeblist(bp);
2239 			return;
2240 		}
2241 
2242 		/*
2243 		 *  if there's a matching call in limbo, tcpincoming will
2244 		 *  return it in state Syn_received
2245 		 */
2246 		s = tcpincoming(s, &seg, source, dest, version);
2247 		if(s == nil)
2248 			goto reset;
2249 	}
2250 
2251 	/* The rest of the input state machine is run with the control block
2252 	 * locked and implements the state machine directly out of the RFC.
2253 	 * Out-of-band data is ignored - it was always a bad idea.
2254 	 */
2255 	tcb = (Tcpctl*)s->ptcl;
2256 	if(waserror()){
2257 		qunlock(s);
2258 		nexterror();
2259 	}
2260 	qlock(s);
2261 	qunlock(tcp);
2262 
2263 	/* fix up window */
2264 	seg.wnd <<= tcb->rcv.scale;
2265 
2266 	/* every input packet in puts off the keep alive time out */
2267 	tcpsetkacounter(tcb);
2268 
2269 	switch(tcb->state) {
2270 	case Closed:
2271 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2272 		goto raise;
2273 	case Syn_sent:
2274 		if(seg.flags & ACK) {
2275 			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2276 				sndrst(tcp, source, dest, length, &seg, version,
2277 					 "bad seq in Syn_sent");
2278 				goto raise;
2279 			}
2280 		}
2281 		if(seg.flags & RST) {
2282 			if(seg.flags & ACK)
2283 				localclose(s, Econrefused);
2284 			goto raise;
2285 		}
2286 
2287 		if(seg.flags & SYN) {
2288 			procsyn(s, &seg);
2289 			if(seg.flags & ACK){
2290 				update(s, &seg);
2291 				tcpsynackrtt(s);
2292 				tcpsetstate(s, Established);
2293 				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2294 			}
2295 			else {
2296 				tcb->time = NOW;
2297 				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2298 			}
2299 
2300 			if(length != 0 || (seg.flags & FIN))
2301 				break;
2302 
2303 			freeblist(bp);
2304 			goto output;
2305 		}
2306 		else
2307 			freeblist(bp);
2308 
2309 		qunlock(s);
2310 		poperror();
2311 		return;
2312 	case Syn_received:
2313 		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2314 		if(seg.flags & ACK)
2315 			tcpsynackrtt(s);
2316 		break;
2317 	}
2318 
2319 	/*
2320 	 *  One DOS attack is to open connections to us and then forget about them,
2321 	 *  thereby tying up a conv at no long term cost to the attacker.
2322 	 *  This is an attempt to defeat these stateless DOS attacks.  See
2323 	 *  corresponding code in tcpsendka().
2324 	 */
2325 	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2326 		if(tcpporthogdefense
2327 		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2328 			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2329 				source, seg.source, dest, seg.dest, seg.flags,
2330 				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2331 			localclose(s, "stateless hog");
2332 		}
2333 	}
2334 
2335 	/* Cut the data to fit the receive window */
2336 	tcprcvwin(s);
2337 	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2338 		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2339 		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
2340 			"%lud-%lud l %d from %I\n", seg.seq,
2341 			seg.seq + length - 1, tcb->rcv.nxt,
2342 			tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
2343 		update(s, &seg);
2344 		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2345 			tcphalt(tpriv, &tcb->rtt_timer);
2346 			tcphalt(tpriv, &tcb->acktimer);
2347 			tcphalt(tpriv, &tcb->katimer);
2348 			tcpsetstate(s, Time_wait);
2349 			tcb->timer.start = MSL2*(1000 / MSPTICK);
2350 			tcpgo(tpriv, &tcb->timer);
2351 		}
2352 		if(!(seg.flags & RST)) {
2353 			tcb->flags |= FORCE;
2354 			goto output;
2355 		}
2356 		qunlock(s);
2357 		poperror();
2358 		return;
2359 	}
2360 
2361 	/* Cannot accept so answer with a rst */
2362 	if(length && tcb->state == Closed) {
2363 		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2364 		goto raise;
2365 	}
2366 
2367 	/* The segment is beyond the current receive pointer so
2368 	 * queue the data in the resequence queue
2369 	 */
2370 	if(seg.seq != tcb->rcv.nxt)
2371 	if(length != 0 || (seg.flags & (SYN|FIN))) {
2372 		update(s, &seg);
2373 		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2374 			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
2375 				s->laddr, s->lport);
2376 		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
2377 		goto output;
2378 	}
2379 
2380 	if(tcb->nreseq > 0)
2381 		tcb->flags |= FORCE; /* filled hole in seq. space; RFC 5681 §3.2 */
2382 
2383 	/*
2384 	 *  keep looping till we've processed this packet plus any
2385 	 *  adjacent packets in the resequence queue
2386 	 */
2387 	for(;;) {
2388 		if(seg.flags & RST) {
2389 			if(tcb->state == Established) {
2390 				tpriv->stats[EstabResets]++;
2391 				if(tcb->rcv.nxt != seg.seq)
2392 					print("out of order RST rcvd: %I.%d -> "
2393 						"%I.%d, rcv.nxt %lux seq %lux\n",
2394 						s->raddr, s->rport, s->laddr,
2395 						s->lport, tcb->rcv.nxt, seg.seq);
2396 			}
2397 			localclose(s, Econrefused);
2398 			goto raise;
2399 		}
2400 
2401 		if((seg.flags&ACK) == 0)
2402 			goto raise;
2403 
2404 		switch(tcb->state) {
2405 		case Syn_received:
2406 			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2407 				sndrst(tcp, source, dest, length, &seg, version,
2408 					"bad seq in Syn_received");
2409 				goto raise;
2410 			}
2411 			update(s, &seg);
2412 			tcpsetstate(s, Established);
2413 		case Established:
2414 		case Close_wait:
2415 			update(s, &seg);
2416 			break;
2417 		case Finwait1:
2418 			update(s, &seg);
2419 			if(qlen(s->wq)+tcb->flgcnt == 0){
2420 				tcphalt(tpriv, &tcb->rtt_timer);
2421 				tcphalt(tpriv, &tcb->acktimer);
2422 				tcpsetkacounter(tcb);
2423 				tcb->time = NOW;
2424 				tcpsetstate(s, Finwait2);
2425 				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2426 				tcpgo(tpriv, &tcb->katimer);
2427 			}
2428 			break;
2429 		case Finwait2:
2430 			update(s, &seg);
2431 			break;
2432 		case Closing:
2433 			update(s, &seg);
2434 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2435 				tcphalt(tpriv, &tcb->rtt_timer);
2436 				tcphalt(tpriv, &tcb->acktimer);
2437 				tcphalt(tpriv, &tcb->katimer);
2438 				tcpsetstate(s, Time_wait);
2439 				tcb->timer.start = MSL2*(1000 / MSPTICK);
2440 				tcpgo(tpriv, &tcb->timer);
2441 			}
2442 			break;
2443 		case Last_ack:
2444 			update(s, &seg);
2445 			if(qlen(s->wq)+tcb->flgcnt == 0) {
2446 				localclose(s, nil);
2447 				goto raise;
2448 			}
2449 		case Time_wait:
2450 			tcb->flags |= FORCE;
2451 			if(tcb->timer.state != TcptimerON)
2452 				tcpgo(tpriv, &tcb->timer);
2453 		}
2454 
2455 		if((seg.flags&URG) && seg.urg) {
2456 			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2457 				tcb->rcv.urg = seg.urg + seg.seq;
2458 				pullblock(&bp, seg.urg);
2459 			}
2460 		}
2461 		else
2462 		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2463 			tcb->rcv.urg = tcb->rcv.nxt;
2464 
2465 		if(length == 0) {
2466 			if(bp != nil)
2467 				freeblist(bp);
2468 		}
2469 		else {
2470 			switch(tcb->state){
2471 			default:
2472 				/* Ignore segment text */
2473 				if(bp != nil)
2474 					freeblist(bp);
2475 				break;
2476 
2477 			case Syn_received:
2478 			case Established:
2479 			case Finwait1:
2480 				/* If we still have some data place on
2481 				 * receive queue
2482 				 */
2483 				if(bp) {
2484 					bp = packblock(bp);
2485 					if(bp == nil)
2486 						panic("tcp packblock");
2487 					qpassnolim(s->rq, bp);
2488 					bp = nil;
2489 				}
2490 				tcb->rcv.nxt += length;
2491 
2492 				/*
2493 				 *  turn on the acktimer if there's something
2494 				 *  to ack
2495 				 */
2496 				if(tcb->acktimer.state != TcptimerON)
2497 					tcpgo(tpriv, &tcb->acktimer);
2498 
2499 				break;
2500 			case Finwait2:
2501 				/* no process to read the data, send a reset */
2502 				if(bp != nil)
2503 					freeblist(bp);
2504 				sndrst(tcp, source, dest, length, &seg, version,
2505 					"send to Finwait2");
2506 				qunlock(s);
2507 				poperror();
2508 				return;
2509 			}
2510 		}
2511 
2512 		if(seg.flags & FIN) {
2513 			tcb->flags |= FORCE;
2514 
2515 			switch(tcb->state) {
2516 			case Syn_received:
2517 			case Established:
2518 				tcb->rcv.nxt++;
2519 				tcpsetstate(s, Close_wait);
2520 				break;
2521 			case Finwait1:
2522 				tcb->rcv.nxt++;
2523 				if(qlen(s->wq)+tcb->flgcnt == 0) {
2524 					tcphalt(tpriv, &tcb->rtt_timer);
2525 					tcphalt(tpriv, &tcb->acktimer);
2526 					tcphalt(tpriv, &tcb->katimer);
2527 					tcpsetstate(s, Time_wait);
2528 					tcb->timer.start = MSL2*(1000/MSPTICK);
2529 					tcpgo(tpriv, &tcb->timer);
2530 				}
2531 				else
2532 					tcpsetstate(s, Closing);
2533 				break;
2534 			case Finwait2:
2535 				tcb->rcv.nxt++;
2536 				tcphalt(tpriv, &tcb->rtt_timer);
2537 				tcphalt(tpriv, &tcb->acktimer);
2538 				tcphalt(tpriv, &tcb->katimer);
2539 				tcpsetstate(s, Time_wait);
2540 				tcb->timer.start = MSL2 * (1000/MSPTICK);
2541 				tcpgo(tpriv, &tcb->timer);
2542 				break;
2543 			case Close_wait:
2544 			case Closing:
2545 			case Last_ack:
2546 				break;
2547 			case Time_wait:
2548 				tcpgo(tpriv, &tcb->timer);
2549 				break;
2550 			}
2551 		}
2552 
2553 		/*
2554 		 *  get next adjacent segment from the resequence queue.
2555 		 *  dump/trim any overlapping segments
2556 		 */
2557 		for(;;) {
2558 			if(tcb->reseq == nil)
2559 				goto output;
2560 
2561 			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2562 				goto output;
2563 
2564 			getreseq(tcb, &seg, &bp, &length);
2565 
2566 			tcprcvwin(s);
2567 			if(tcptrim(tcb, &seg, &bp, &length) == 0){
2568 				tcb->flags |= FORCE;
2569 				break;
2570 			}
2571 		}
2572 	}
2573 output:
2574 	tcpoutput(s);
2575 	qunlock(s);
2576 	poperror();
2577 	return;
2578 raise:
2579 	qunlock(s);
2580 	poperror();
2581 	freeblist(bp);
2582 	tcpkick(s);
2583 }
2584 
2585 /*
2586  *  always enters and exits with the s locked.  We drop
2587  *  the lock to ipoput the packet so some care has to be
2588  *  taken by callers.
2589  */
2590 static void
tcpoutput(Conv * s)2591 tcpoutput(Conv *s)
2592 {
2593 	Tcp seg;
2594 	uint msgs;
2595 	Tcpctl *tcb;
2596 	Block *hbp, *bp;
2597 	int sndcnt;
2598 	ulong ssize, dsize, sent;
2599 	Fs *f;
2600 	Tcppriv *tpriv;
2601 	uchar version;
2602 
2603 	f = s->p->f;
2604 	tpriv = s->p->priv;
2605 	version = s->ipversion;
2606 
2607 	tcb = (Tcpctl*)s->ptcl;
2608 
2609 	/* force ack every 2*mss */
2610 	if((tcb->flags & FORCE) == 0 &&
2611 	    tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2612 		tpriv->stats[Delayack]++;
2613 		tcb->flags |= FORCE;
2614 	}
2615 
2616 	/* force ack if window opening */
2617 	if((tcb->flags & FORCE) == 0){
2618 		tcprcvwin(s);
2619 		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2620 			tpriv->stats[Wopenack]++;
2621 			tcb->flags |= FORCE;
2622 		}
2623 	}
2624 
2625 	for(msgs = 0; msgs < 100; msgs++) {
2626 		switch(tcb->state) {
2627 		case Listen:
2628 		case Closed:
2629 		case Finwait2:
2630 			return;
2631 		}
2632 
2633 		/* Don't send anything else until our SYN has been acked */
2634 		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2635 			break;
2636 
2637 		/* force an ack when a window has opened up */
2638 		tcprcvwin(s);
2639 		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2640 			tcb->rcv.blocked = 0;
2641 			tcb->flags |= FORCE;
2642 		}
2643 
2644 		sndcnt = qlen(s->wq)+tcb->flgcnt;
2645 		sent = tcb->snd.ptr - tcb->snd.una;
2646 		ssize = sndcnt;
2647 		if(tcb->snd.wnd == 0){
2648 			/* zero window probe */
2649 			if(sent > 0 && !(tcb->flags & FORCE))
2650 				break;	/* already probing, rto re-probes */
2651 			if(ssize < sent)
2652 				ssize = 0;
2653 			else{
2654 				ssize -= sent;
2655 				if(ssize > 0)
2656 					ssize = 1;
2657 			}
2658 		} else {
2659 			/* calculate usable segment size */
2660 			if(ssize > tcb->cwind)
2661 				ssize = tcb->cwind;
2662 			if(ssize > tcb->snd.wnd)
2663 				ssize = tcb->snd.wnd;
2664 
2665 			if(ssize < sent)
2666 				ssize = 0;
2667 			else {
2668 				ssize -= sent;
2669 				if(ssize > tcb->mss)
2670 					ssize = tcb->mss;
2671 			}
2672 		}
2673 
2674 		dsize = ssize;
2675 		seg.urg = 0;
2676 
2677 		if(!(tcb->flags & FORCE))
2678 			if(ssize == 0 ||
2679 			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
2680 			    sent > TCPREXMTTHRESH * tcb->mss)
2681 				break;
2682 
2683 		tcb->flags &= ~FORCE;
2684 
2685 		/* By default we will generate an ack */
2686 		tcphalt(tpriv, &tcb->acktimer);
2687 		seg.source = s->lport;
2688 		seg.dest = s->rport;
2689 		seg.flags = ACK;
2690 		seg.mss = 0;
2691 		seg.ws = 0;
2692 		seg.update = 0;
2693 		switch(tcb->state){
2694 		case Syn_sent:
2695 			seg.flags = 0;
2696 			if(tcb->snd.ptr == tcb->iss){
2697 				seg.flags |= SYN;
2698 				dsize--;
2699 				seg.mss = tcb->mss;
2700 				seg.ws = tcb->scale;
2701 			}
2702 			break;
2703 		case Syn_received:
2704 			/*
2705 			 *  don't send any data with a SYN/ACK packet
2706 			 *  because Linux rejects the packet in its
2707 			 *  attempt to solve the SYN attack problem
2708 			 */
2709 			if(tcb->snd.ptr == tcb->iss){
2710 				seg.flags |= SYN;
2711 				dsize = 0;
2712 				ssize = 1;
2713 				seg.mss = tcb->mss;
2714 				seg.ws = tcb->scale;
2715 			}
2716 			break;
2717 		}
2718 		seg.seq = tcb->snd.ptr;
2719 		seg.ack = tcb->rcv.nxt;
2720 		seg.wnd = tcb->rcv.wnd;
2721 
2722 		/* Pull out data to send */
2723 		bp = nil;
2724 		if(dsize != 0) {
2725 			bp = qcopy(s->wq, dsize, sent);
2726 			if(BLEN(bp) != dsize) {
2727 				seg.flags |= FIN;
2728 				dsize--;
2729 			}
2730 		}
2731 
2732 		if(sent+dsize == sndcnt && dsize)
2733 			seg.flags |= PSH;
2734 
2735 		tcb->snd.ptr += ssize;
2736 
2737 		/* Pull up the send pointer so we can accept acks
2738 		 * for this window
2739 		 */
2740 		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2741 			tcb->snd.nxt = tcb->snd.ptr;
2742 
2743 		/* Build header, link data and compute cksum */
2744 		switch(version){
2745 		case V4:
2746 			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2747 			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2748 			if(hbp == nil) {
2749 				freeblist(bp);
2750 				return;
2751 			}
2752 			break;
2753 		case V6:
2754 			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2755 			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2756 			if(hbp == nil) {
2757 				freeblist(bp);
2758 				return;
2759 			}
2760 			break;
2761 		default:
2762 			hbp = nil;	/* to suppress a warning */
2763 			panic("tcpoutput: version %d", version);
2764 		}
2765 
2766 		/* Start the transmission timers if there is new data and we
2767 		 * expect acknowledges
2768 		 */
2769 		if(ssize != 0){
2770 			if(tcb->timer.state != TcptimerON){
2771 				tcb->time = NOW;
2772 				tcb->timeuna = tcb->snd.una;
2773 				tcpgo(tpriv, &tcb->timer);
2774 			}
2775 
2776 			/*  If round trip timer isn't running, start it.
2777 			 *  measure the longest packet only in case the
2778 			 *  transmission time dominates RTT
2779 			 */
2780 			if(tcb->snd.retransmit == 0)
2781 			if(tcb->rtt_timer.state != TcptimerON)
2782 			if(ssize == tcb->mss) {
2783 				tcpgo(tpriv, &tcb->rtt_timer);
2784 				tcb->rttseq = tcb->snd.ptr;
2785 			}
2786 		}
2787 
2788 		tpriv->stats[OutSegs]++;
2789 		if(tcb->snd.retransmit)
2790 			tpriv->stats[RetransSegsSent]++;
2791 		tcb->rcv.ackptr = seg.ack;
2792 		tcb->rcv.wsnt = tcb->rcv.wptr;
2793 
2794 		/* put off the next keep alive */
2795 		tcpgo(tpriv, &tcb->katimer);
2796 
2797 		switch(version){
2798 		case V4:
2799 			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2800 				/* a negative return means no route */
2801 				localclose(s, "no route");
2802 			}
2803 			break;
2804 		case V6:
2805 			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2806 				/* a negative return means no route */
2807 				localclose(s, "no route");
2808 			}
2809 			break;
2810 		default:
2811 			panic("tcpoutput2: version %d", version);
2812 		}
2813 		if((msgs%4) == 3){
2814 			qunlock(s);
2815 			qlock(s);
2816 		}
2817 	}
2818 }
2819 
2820 /*
2821  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2822  */
2823 static void
tcpsendka(Conv * s)2824 tcpsendka(Conv *s)
2825 {
2826 	Tcp seg;
2827 	Tcpctl *tcb;
2828 	Block *hbp,*dbp;
2829 
2830 	tcb = (Tcpctl*)s->ptcl;
2831 
2832 	dbp = nil;
2833 	memset(&seg, 0, sizeof seg);
2834 	seg.urg = 0;
2835 	seg.source = s->lport;
2836 	seg.dest = s->rport;
2837 	seg.flags = ACK|PSH;
2838 	seg.mss = 0;
2839 	seg.ws = 0;
2840 	if(tcpporthogdefense)
2841 		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2842 	else
2843 		seg.seq = tcb->snd.una-1;
2844 	seg.ack = tcb->rcv.nxt;
2845 	tcb->rcv.ackptr = seg.ack;
2846 	tcprcvwin(s);
2847 	seg.wnd = tcb->rcv.wnd;
2848 	if(tcb->state == Finwait2){
2849 		seg.flags |= FIN;
2850 	} else {
2851 		dbp = allocb(1);
2852 		dbp->wp++;
2853 	}
2854 
2855 	if(isv4(s->raddr)) {
2856 		/* Build header, link data and compute cksum */
2857 		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2858 		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2859 		if(hbp == nil) {
2860 			freeblist(dbp);
2861 			return;
2862 		}
2863 		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2864 	}
2865 	else {
2866 		/* Build header, link data and compute cksum */
2867 		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2868 		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2869 		if(hbp == nil) {
2870 			freeblist(dbp);
2871 			return;
2872 		}
2873 		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2874 	}
2875 }
2876 
2877 /*
2878  *  set connection to time out after 12 minutes
2879  */
2880 static void
tcpsetkacounter(Tcpctl * tcb)2881 tcpsetkacounter(Tcpctl *tcb)
2882 {
2883 	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2884 	if(tcb->kacounter < 3)
2885 		tcb->kacounter = 3;
2886 }
2887 
2888 /*
2889  *  if we've timed out, close the connection
2890  *  otherwise, send a keepalive and restart the timer
2891  */
2892 static void
tcpkeepalive(void * v)2893 tcpkeepalive(void *v)
2894 {
2895 	Tcpctl *tcb;
2896 	Conv *s;
2897 
2898 	s = v;
2899 	tcb = (Tcpctl*)s->ptcl;
2900 	if(waserror()){
2901 		qunlock(s);
2902 		nexterror();
2903 	}
2904 	qlock(s);
2905 	if(tcb->state != Closed){
2906 		if(--(tcb->kacounter) <= 0) {
2907 			localclose(s, Etimedout);
2908 		} else {
2909 			tcpsendka(s);
2910 			tcpgo(s->p->priv, &tcb->katimer);
2911 		}
2912 	}
2913 	qunlock(s);
2914 	poperror();
2915 }
2916 
2917 /*
2918  *  start keepalive timer
2919  */
2920 static char*
tcpstartka(Conv * s,char ** f,int n)2921 tcpstartka(Conv *s, char **f, int n)
2922 {
2923 	Tcpctl *tcb;
2924 	int x;
2925 
2926 	tcb = (Tcpctl*)s->ptcl;
2927 	if(tcb->state != Established)
2928 		return "connection must be in Establised state";
2929 	if(n > 1){
2930 		x = atoi(f[1]);
2931 		if(x >= MSPTICK)
2932 			tcb->katimer.start = x/MSPTICK;
2933 	}
2934 	tcpsetkacounter(tcb);
2935 	tcpgo(s->p->priv, &tcb->katimer);
2936 
2937 	return nil;
2938 }
2939 
2940 /*
2941  *  turn checksums on/off
2942  */
2943 static char*
tcpsetchecksum(Conv * s,char ** f,int)2944 tcpsetchecksum(Conv *s, char **f, int)
2945 {
2946 	Tcpctl *tcb;
2947 
2948 	tcb = (Tcpctl*)s->ptcl;
2949 	tcb->nochecksum = !atoi(f[1]);
2950 
2951 	return nil;
2952 }
2953 
2954 /*
2955  *  retransmit (at most) one segment at snd.una.
2956  *  preserve cwind & snd.ptr
2957  */
2958 static void
tcprxmit(Conv * s)2959 tcprxmit(Conv *s)
2960 {
2961 	Tcpctl *tcb;
2962 	Tcppriv *tpriv;
2963 	ulong tcwind, tptr;
2964 
2965 	tcb = (Tcpctl*)s->ptcl;
2966 	tcb->flags |= RETRAN|FORCE;
2967 
2968 	tptr = tcb->snd.ptr;
2969 	tcwind = tcb->cwind;
2970 	tcb->snd.ptr = tcb->snd.una;
2971 	tcb->cwind = tcb->mss;
2972 	tcb->snd.retransmit = 1;
2973 	tcpoutput(s);
2974 	tcb->snd.retransmit = 0;
2975 	tcb->cwind = tcwind;
2976 	tcb->snd.ptr = tptr;
2977 
2978 	tpriv = s->p->priv;
2979 	tpriv->stats[RetransSegs]++;
2980 }
2981 
2982 /*
2983  *  TODO: RFC 4138 F-RTO
2984  */
2985 static void
tcptimeout(void * arg)2986 tcptimeout(void *arg)
2987 {
2988 	Conv *s;
2989 	Tcpctl *tcb;
2990 	int maxback;
2991 	Tcppriv *tpriv;
2992 
2993 	s = (Conv*)arg;
2994 	tpriv = s->p->priv;
2995 	tcb = (Tcpctl*)s->ptcl;
2996 
2997 	if(waserror()){
2998 		qunlock(s);
2999 		nexterror();
3000 	}
3001 	qlock(s);
3002 	switch(tcb->state){
3003 	default:
3004 		tcb->backoff++;
3005 		if(tcb->state == Syn_sent)
3006 			maxback = MAXBACKMS/2;
3007 		else
3008 			maxback = MAXBACKMS;
3009 		tcb->backedoff += tcb->timer.start * MSPTICK;
3010 		if(tcb->backedoff >= maxback) {
3011 			localclose(s, Etimedout);
3012 			break;
3013 		}
3014 		netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
3015 			tcb->srtt, tcb->mdev, NOW - tcb->time,
3016 			tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
3017 			tcpstates[s->state]);
3018 		tcpsettimer(tcb);
3019 		if(tcb->snd.rto == 0)
3020 			tcpcongestion(tcb);
3021 		tcprxmit(s);
3022 		tcb->snd.ptr = tcb->snd.una;
3023 		tcb->cwind = tcb->mss;
3024 		tcb->snd.rto = 1;
3025 		tpriv->stats[RetransTimeouts]++;
3026 
3027 		if(tcb->snd.recovery){
3028 			tcb->snd.dupacks = 0;		/* reno rto */
3029 			tcb->snd.recovery = 0;
3030 			tpriv->stats[RecoveryRTO]++;
3031 			tcb->snd.rxt = tcb->snd.nxt;
3032 			netlog(s->p->f, Logtcpwin,
3033 				"rto recovery rxt @%lud\n", tcb->snd.nxt);
3034 		}
3035 
3036 		tcb->abcbytes = 0;
3037 		break;
3038 	case Time_wait:
3039 		localclose(s, nil);
3040 		break;
3041 	case Closed:
3042 		break;
3043 	}
3044 	qunlock(s);
3045 	poperror();
3046 }
3047 
3048 static int
inwindow(Tcpctl * tcb,int seq)3049 inwindow(Tcpctl *tcb, int seq)
3050 {
3051 	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3052 }
3053 
3054 /*
3055  *  set up state for a received SYN (or SYN ACK) packet
3056  */
3057 static void
procsyn(Conv * s,Tcp * seg)3058 procsyn(Conv *s, Tcp *seg)
3059 {
3060 	Tcpctl *tcb;
3061 	Tcppriv *tpriv;
3062 
3063 	tcb = (Tcpctl*)s->ptcl;
3064 	tcb->flags |= FORCE;
3065 
3066 	tcb->rcv.nxt = seg->seq + 1;
3067 	tcb->rcv.wptr = tcb->rcv.nxt;
3068 	tcb->rcv.wsnt = 0;
3069 	tcb->rcv.urg = tcb->rcv.nxt;
3070 	tcb->irs = seg->seq;
3071 
3072 	/* our sending max segment size cannot be bigger than what he asked for */
3073 	if(seg->mss != 0 && seg->mss < tcb->mss) {
3074 		tcb->mss = seg->mss;
3075 		tpriv = s->p->priv;
3076 		tpriv->stats[Mss] = tcb->mss;
3077 	}
3078 
3079 	tcb->snd.wnd = seg->wnd;
3080 	initialwindow(tcb);
3081 }
3082 
3083 static int
dumpreseq(Tcpctl * tcb)3084 dumpreseq(Tcpctl *tcb)
3085 {
3086 	Reseq *r, *next;
3087 
3088 	for(r = tcb->reseq; r != nil; r = next){
3089 		next = r->next;
3090 		freeblist(r->bp);
3091 		free(r);
3092 	}
3093 	tcb->reseq = nil;
3094 	tcb->nreseq = 0;
3095 	tcb->reseqlen = 0;
3096 	return -1;
3097 }
3098 
3099 static void
logreseq(Fs * f,Reseq * r,ulong n)3100 logreseq(Fs *f, Reseq *r, ulong n)
3101 {
3102 	char *s;
3103 
3104 	for(; r != nil; r = r->next){
3105 		s = nil;
3106 		if(r->next == nil && r->seg.seq != n)
3107 			s = "hole/end";
3108 		else if(r->next == nil)
3109 			s = "end";
3110 		else if(r->seg.seq != n)
3111 			s = "hole";
3112 		if(s != nil)
3113 			netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3114 				n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3115 		n = r->seg.seq + r->seg.len;
3116 	}
3117 }
3118 
3119 static int
addreseq(Fs * f,Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)3120 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3121 {
3122 	Reseq *rp, **rr;
3123 	int qmax;
3124 
3125 	rp = malloc(sizeof *rp);
3126 	if(rp == nil){
3127 		freeblist(bp);		/* bp always consumed by addreseq */
3128 		return 0;
3129 	}
3130 
3131 	rp->seg = *seg;
3132 	rp->bp = bp;
3133 	rp->length = length;
3134 
3135 	tcb->reseqlen += length;
3136 	tcb->nreseq++;
3137 
3138 	/* Place on reassembly list sorting by starting seq number */
3139 	for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3140 		if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3141 			rp->next = *rr;
3142 			*rr = rp;
3143 			tpriv->stats[Resequenced]++;
3144 			if(rp->next != nil)
3145 				tpriv->stats[OutOfOrder]++;
3146 			break;
3147 		}
3148 
3149 	qmax = tcb->window;
3150 	if(tcb->reseqlen > qmax){
3151 		netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3152 			tcb->reseqlen, qmax, tcb->nreseq);
3153 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3154 		tpriv->stats[ReseqBytelim]++;
3155 		return dumpreseq(tcb);
3156 	}
3157 	qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3158 	if(tcb->nreseq > qmax){
3159 		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3160 			tcb->nreseq, qmax, tcb->reseqlen);
3161 		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3162 		tpriv->stats[ReseqPktlim]++;
3163 		return dumpreseq(tcb);
3164 	}
3165 	return 0;
3166 }
3167 
3168 static void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3169 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3170 {
3171 	Reseq *rp;
3172 
3173 	rp = tcb->reseq;
3174 	if(rp == nil)
3175 		return;
3176 
3177 	tcb->reseq = rp->next;
3178 
3179 	*seg = rp->seg;
3180 	*bp = rp->bp;
3181 	*length = rp->length;
3182 
3183 	tcb->nreseq--;
3184 	tcb->reseqlen -= rp->length;
3185 
3186 	free(rp);
3187 }
3188 
3189 static int
tcptrim(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3190 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3191 {
3192 	ushort len;
3193 	uchar accept;
3194 	int dupcnt, excess;
3195 
3196 	accept = 0;
3197 	len = *length;
3198 	if(seg->flags & SYN)
3199 		len++;
3200 	if(seg->flags & FIN)
3201 		len++;
3202 
3203 	if(tcb->rcv.wnd == 0) {
3204 		if(len == 0 && seg->seq == tcb->rcv.nxt)
3205 			return 0;
3206 	}
3207 	else {
3208 		/* Some part of the segment should be in the window */
3209 		if(inwindow(tcb,seg->seq))
3210 			accept++;
3211 		else
3212 		if(len != 0) {
3213 			if(inwindow(tcb, seg->seq+len-1) ||
3214 			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3215 				accept++;
3216 		}
3217 	}
3218 	if(!accept) {
3219 		freeblist(*bp);
3220 		return -1;
3221 	}
3222 	dupcnt = tcb->rcv.nxt - seg->seq;
3223 	if(dupcnt > 0){
3224 		tcb->rerecv += dupcnt;
3225 		if(seg->flags & SYN){
3226 			seg->flags &= ~SYN;
3227 			seg->seq++;
3228 
3229 			if(seg->urg > 1)
3230 				seg->urg--;
3231 			else
3232 				seg->flags &= ~URG;
3233 			dupcnt--;
3234 		}
3235 		if(dupcnt > 0){
3236 			pullblock(bp, (ushort)dupcnt);
3237 			seg->seq += dupcnt;
3238 			*length -= dupcnt;
3239 
3240 			if(seg->urg > dupcnt)
3241 				seg->urg -= dupcnt;
3242 			else {
3243 				seg->flags &= ~URG;
3244 				seg->urg = 0;
3245 			}
3246 		}
3247 	}
3248 	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3249 	if(excess > 0) {
3250 		tcb->rerecv += excess;
3251 		*length -= excess;
3252 		*bp = trimblock(*bp, 0, *length);
3253 		if(*bp == nil)
3254 			panic("presotto is a boofhead");
3255 		seg->flags &= ~FIN;
3256 	}
3257 	return 0;
3258 }
3259 
3260 static void
tcpadvise(Proto * tcp,Block * bp,char * msg)3261 tcpadvise(Proto *tcp, Block *bp, char *msg)
3262 {
3263 	Tcp4hdr *h4;
3264 	Tcp6hdr *h6;
3265 	Tcpctl *tcb;
3266 	uchar source[IPaddrlen];
3267 	uchar dest[IPaddrlen];
3268 	ushort psource, pdest;
3269 	Conv *s, **p;
3270 
3271 	h4 = (Tcp4hdr*)(bp->rp);
3272 	h6 = (Tcp6hdr*)(bp->rp);
3273 
3274 	if((h4->vihl&0xF0)==IP_VER4) {
3275 		v4tov6(dest, h4->tcpdst);
3276 		v4tov6(source, h4->tcpsrc);
3277 		psource = nhgets(h4->tcpsport);
3278 		pdest = nhgets(h4->tcpdport);
3279 	}
3280 	else {
3281 		ipmove(dest, h6->tcpdst);
3282 		ipmove(source, h6->tcpsrc);
3283 		psource = nhgets(h6->tcpsport);
3284 		pdest = nhgets(h6->tcpdport);
3285 	}
3286 
3287 	/* Look for a connection */
3288 	qlock(tcp);
3289 	for(p = tcp->conv; *p; p++) {
3290 		s = *p;
3291 		tcb = (Tcpctl*)s->ptcl;
3292 		if(s->rport == pdest)
3293 		if(s->lport == psource)
3294 		if(tcb->state != Closed)
3295 		if(ipcmp(s->raddr, dest) == 0)
3296 		if(ipcmp(s->laddr, source) == 0){
3297 			qlock(s);
3298 			qunlock(tcp);
3299 			switch(tcb->state){
3300 			case Syn_sent:
3301 				localclose(s, msg);
3302 				break;
3303 			}
3304 			qunlock(s);
3305 			freeblist(bp);
3306 			return;
3307 		}
3308 	}
3309 	qunlock(tcp);
3310 	freeblist(bp);
3311 }
3312 
3313 static char*
tcpporthogdefensectl(char * val)3314 tcpporthogdefensectl(char *val)
3315 {
3316 	if(strcmp(val, "on") == 0)
3317 		tcpporthogdefense = 1;
3318 	else if(strcmp(val, "off") == 0)
3319 		tcpporthogdefense = 0;
3320 	else
3321 		return "unknown value for tcpporthogdefense";
3322 	return nil;
3323 }
3324 
3325 /* called with c qlocked */
3326 static char*
tcpctl(Conv * c,char ** f,int n)3327 tcpctl(Conv* c, char** f, int n)
3328 {
3329 	if(n == 1 && strcmp(f[0], "close") == 0)
3330 		return tcpclose2(c);
3331 	if(n == 1 && strcmp(f[0], "hangup") == 0)
3332 		return tcphangup(c);
3333 	if(n == 1 && strcmp(f[0], "hangupxmit") == 0)
3334 		return tcpxmitclose(c);
3335 	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3336 		return tcpstartka(c, f, n);
3337 	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3338 		return tcpsetchecksum(c, f, n);
3339 	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3340 		return tcpporthogdefensectl(f[1]);
3341 	return "unknown control request";
3342 }
3343 
3344 static int
tcpstats(Proto * tcp,char * buf,int len)3345 tcpstats(Proto *tcp, char *buf, int len)
3346 {
3347 	Tcppriv *priv;
3348 	char *p, *e;
3349 	int i;
3350 
3351 	priv = tcp->priv;
3352 	p = buf;
3353 	e = p+len;
3354 	for(i = 0; i < Nstats; i++)
3355 		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3356 	return p - buf;
3357 }
3358 
3359 /*
3360  *  garbage collect any stale conversations:
3361  *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3362  *	- Finwait2 after 5 minutes
3363  *
3364  *  this is called whenever we run out of channels.  Both checks are
3365  *  of questionable validity so we try to use them only when we're
3366  *  up against the wall.
3367  */
3368 static int
tcpgc(Proto * tcp)3369 tcpgc(Proto *tcp)
3370 {
3371 	Conv *c, **pp, **ep;
3372 	int n;
3373 	Tcpctl *tcb;
3374 
3375 
3376 	n = 0;
3377 	ep = &tcp->conv[tcp->nc];
3378 	for(pp = tcp->conv; pp < ep; pp++) {
3379 		c = *pp;
3380 		if(c == nil)
3381 			break;
3382 		if(!canqlock(c))
3383 			continue;
3384 		tcb = (Tcpctl*)c->ptcl;
3385 		switch(tcb->state){
3386 		case Syn_received:
3387 			if(NOW - tcb->time > 5000){
3388 				localclose(c, Etimedout);
3389 				n++;
3390 			}
3391 			break;
3392 		case Finwait2:
3393 			if(NOW - tcb->time > 5*60*1000){
3394 				localclose(c, Etimedout);
3395 				n++;
3396 			}
3397 			break;
3398 		}
3399 		qunlock(c);
3400 	}
3401 	return n;
3402 }
3403 
3404 static void
tcpsettimer(Tcpctl * tcb)3405 tcpsettimer(Tcpctl *tcb)
3406 {
3407 	int x;
3408 
3409 	/* round trip dependency */
3410 	x = backoff(tcb->backoff) *
3411 		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3412 
3413 	/* bounded twixt 0.3 and 64 seconds */
3414 	if(x < 300/MSPTICK)
3415 		x = 300/MSPTICK;
3416 	else if(x > (64000/MSPTICK))
3417 		x = 64000/MSPTICK;
3418 	tcb->timer.start = x;
3419 }
3420 
3421 void
tcpinit(Fs * fs)3422 tcpinit(Fs *fs)
3423 {
3424 	Proto *tcp;
3425 	Tcppriv *tpriv;
3426 
3427 	tcp = smalloc(sizeof(Proto));
3428 	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3429 	tcp->name = "tcp";
3430 	tcp->connect = tcpconnect;
3431 	tcp->announce = tcpannounce;
3432 	tcp->ctl = tcpctl;
3433 	tcp->state = tcpstate;
3434 	tcp->create = tcpcreate;
3435 	tcp->close = tcpclose;
3436 	tcp->rcv = tcpiput;
3437 	tcp->advise = tcpadvise;
3438 	tcp->stats = tcpstats;
3439 	tcp->inuse = tcpinuse;
3440 	tcp->gc = tcpgc;
3441 	tcp->ipproto = IP_TCPPROTO;
3442 	tcp->nc = scalednconv();
3443 	tcp->ptclsize = sizeof(Tcpctl);
3444 	tpriv->stats[MaxConn] = tcp->nc;
3445 
3446 	Fsproto(fs, tcp);
3447 }
3448 
3449 static void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3450 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3451 {
3452 	/*
3453 	 * guess at reasonable queue sizes.  there's no current way
3454 	 * to know how many nic receive buffers we can safely tie up in the
3455 	 * tcp stack, and we don't adjust our queues to maximize throughput
3456 	 * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
3457 	 * respected, but we still control our own buffer commitment by
3458 	 * keeping a seperate qscale.
3459 	 */
3460 	tcb->rcv.scale = rcvscale & 0xff;
3461 	tcb->snd.scale = sndscale & 0xff;
3462 	tcb->qscale = rcvscale & 0xff;
3463 	if(rcvscale > Maxqscale)
3464 		tcb->qscale = Maxqscale;
3465 
3466 	if(rcvscale != tcb->rcv.scale)
3467 		netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3468 			"qlen %d >> window %ud lport %d\n",
3469 			tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3470 	tcb->window = QMAX << tcb->qscale;
3471 	tcb->ssthresh = tcb->window;
3472 
3473 	/*
3474 	 * it's important to set wq large enough to cover the full
3475 	 * bandwidth-delay product.  it's possible to be in loss
3476 	 * recovery with a big window, and we need to keep sending
3477 	 * into the inflated window.  the difference can be huge
3478 	 * for even modest (70ms) ping times.
3479 	 */
3480 	qsetlimit(s->rq, tcb->window);
3481 	qsetlimit(s->wq, tcb->window);
3482 	tcprcvwin(s);
3483 }
3484