1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7
8 #include "ip.h"
9
/*
 * Protocol-wide constants: header geometry, timer parameters,
 * TCP header flag bits, option codes, internal output flags and
 * the connection state numbers.
 */
enum
{
	QMAX		= 64*1024-1,	/* limit given to qopen() for rq and wq; initial ssthresh */
	IP_TCPPROTO	= 6,

	TCP4_IPLEN	= 8,		/* offset at which the v4 checksum starts */
	TCP4_PHDRSIZE	= 12,		/* v4 pseudo header bytes covered by the checksum */
	TCP4_HDRSIZE	= 20,		/* TCP header without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes of prototype header copied by htontcp4 */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	TCP6_IPLEN	= 0,		/* offset at which the v6 checksum starts */
	TCP6_PHDRSIZE	= 40,		/* v6 pseudo header bytes covered by the checksum */
	TCP6_HDRSIZE	= 20,		/* TCP header without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes of prototype header copied by htontcp6 */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits kept in Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb * 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 *
	 * NOTE(review): both scales below are 4, which gives 64kb * 2⁴ = 1mb,
	 * not the 2⁵ case worked out above; confirm which is intended.
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};
92
93 /* Must correspond to the enumeration above */
/*
 * Printable names of the connection states, indexed by state number
 * (tcpstate() prints tcpstates[tcb->state]).  The order must match
 * the Closed..Time_wait enumeration above.
 */
char *tcpstates[] =
{
	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
	"Closing", 	"Last_ack", 	"Time_wait"
};
100
/*
 * A countdown timer.  Running timers are kept on the doubly linked
 * list Tcppriv.timers and are decremented once per MSPTICK by
 * tcpackproc(); when count reaches zero func(arg) is called.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* links on Tcppriv.timers while state is TcptimerON */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* links expired timers within one tcpackproc pass */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* initial value of count, in ticks; 0 means never start */
	int	count;			/* ticks remaining */
	void	(*func)(void*);		/* called on expiry, may be nil */
	void	*arg;			/* argument handed to func */
};
113
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;
	uchar	proto;		/* IP_TCPPROTO */
	uchar	tcplen[2];	/* TCP header plus data length, filled in by htontcp4 */
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	/* same as v6 from here on */
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* (header length<<10) | flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
143
/*
 * v6 header followed by the TCP header.  htontcp6() temporarily
 * reuses vcf, ploadlen, proto and ttl as pseudo header fields while
 * the checksum is computed and then restores the real values.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];		/* version, class and flow; TCP length while checksumming */
	uchar	ploadlen[2];	/* payload length */
	uchar	proto;		/* IP_TCPPROTO */
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	/* same as v4 from here on */
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* (header length<<10) | flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
165
166 /*
167 * this represents the control info
168 * for a single packet. It is derived from
169 * a packet in ntohtcp{4,6}() and stuck into
170 * a packet in htontcp{4,6}().
171 */
/* host-order view of one segment's TCP header fields */
typedef struct Tcp Tcp;
struct Tcp
{
	ushort	source;		/* source port */
	ushort	dest;		/* destination port */
	ulong	seq;		/* sequence number */
	ulong	ack;		/* acknowledgement number */
	uchar	flags;		/* URG, ACK, PSH, RST, SYN, FIN bits */
	uchar	update;		/* cleared by ntohtcp{4,6}() */
	ushort	ws;		/* window scale option */
	ulong	wnd;		/* prescaled window*/
	ushort	urg;		/* urgent pointer */
	ushort	mss;		/* max segment size option (if not zero) */
	ushort	len;		/* size of data */
};
187
188 /*
189 * this header is malloc'd to thread together fragments
190 * waiting to be coalesced
191 */
/* one entry of the per-connection resequencing list (Tcpctl.reseq) */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;		/* next queued fragment */
	Tcp	seg;		/* header info of the queued segment */
	Block	*bp;		/* the segment's data */
	ushort	length;		/* presumably the data length of bp -- TODO confirm against addreseq() */
};
200
201 /*
202 * the qlock in the Conv locks this structure
203 */
/*
 * Per-conversation TCP control block, stored in Conv.ptcl and
 * zeroed and initialised by inittcpctl().
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		uint	scale;		/* how much to right shift window */
					/* in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		ulong	partialack;
		int	recovery;	/* loss recovery flag */
		int	retransmit;	/* retransmit 1 packet @ una flag */
		int	rto;
		ulong	rxt;		/* right window marker for recovery */
					/* "recover" rfc3782 */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	wsnt;		/* Last wptr sent.  important to */
					/* track for large bdp */
		ulong	wptr;		/* right edge of the advertised window (rcv.nxt + window) */
		ulong	urg;		/* Urgent pointer */
		ulong	ackptr;		/* last acked sequence */
		int	blocked;	/* set by tcprcvwin() when the window is (nearly) closed */
		uint	scale;		/* how much to left shift window in */
					/* rcv'd packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	ulong	cwind;			/* Congestion window */
	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
	uint	scale;			/* desired snd.scale */
	ulong	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data re-received */
	ulong	window;			/* Our receive window (queue) */
	uint	qscale;			/* Log2 of our receive window (queue) */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags */
	Reseq	*reseq;			/* Resequencing queue */
	int	nreseq;
	int	reseqlen;
	Tcptimer	timer;		/* Activity timer */
	Tcptimer	acktimer;	/* Acknowledge timer */
	Tcptimer	rtt_timer;	/* Round trip timer */
	Tcptimer	katimer;	/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	ulong	timeuna;		/* snd.una when time was set */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (SYN, FIN) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;			/* prototype header */
};
276
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 *
 *  NOTE(review): Tcppriv.lht is declared as an array of NLHT list heads
 *  ("hash table size"), so the remark about no hashing looks stale.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};
308
/*
 * Initial guess at round trip time, in ms.  inittcpctl() seeds
 * srtt and the retransmit timer (tcp_irtt / MSPTICK ticks) from it.
 */
int	tcp_irtt = DEF_RTT;
310
/*
 * Indices into Tcppriv.stats[]; statnames[] gives the printable
 * name of each one.  Nstats must stay last.
 */
enum {
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats
};
347
/*
 * Printable names of the statistics, indexed by the enumeration
 * above.  Entries use designated initialisers, so their textual
 * order here need not match the enumeration order.
 */
static char *statnames[Nstats] =
{
[MaxConn]	"MaxConn",
[Mss]		"MaxSegment",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransSegsSent]	"RetransSegsSent",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
[Resequenced]	"Resequenced",
[ReseqBytelim]	"ReseqBytelim",
[ReseqPktlim]	"ReseqPktlim",
[Delayack]	"Delayack",
[Wopenack]	"Wopenack",

[Recovery]	"Recovery",
[RecoveryDone]	"RecoveryDone",
[RecoveryRTO]	"RecoveryRTO",

[RecoveryNoSeq]	"RecoveryNoSeq",
[RecoveryCwind]	"RecoveryCwind",
[RecoveryPA]	"RecoveryPA",
};
381
/* per-protocol-instance private state, reached through Proto.priv */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* guards timers and every Tcptimer linked on it */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;		/* held by tcpstart() while starting the kproc */
	int	ackprocstarted;	/* non-zero once tcpackproc has been started */

	uvlong	stats[Nstats];	/* counters, indexed by the stats enumeration */
};
402
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
413
/* forward declarations of file-local routines used before their definitions */
static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
static	int	dumpreseq(Tcpctl*);
static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
static	void	limborexmit(Proto*);
static	void	localclose(Conv*, char*);
static	void	procsyn(Conv*, Tcp*);
static	void	tcpacktimer(void*);
static	void	tcpiput(Proto*, Ipifc*, Block*);
static	void	tcpkeepalive(void*);
static	void	tcpoutput(Conv*);
static	void	tcprcvwin(Conv*);
static	void	tcprxmit(Conv*);
static	void	tcpsetkacounter(Tcpctl*);
static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
static	void	tcpsettimer(Tcpctl*);
static	void	tcpsndsyn(Conv*, Tcpctl*);
static	void	tcpstart(Conv*, int);
static	void	tcpsynackrtt(Conv*);
static	void	tcptimeout(void*);
static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
435
436 static void
tcpsetstate(Conv * s,uchar newstate)437 tcpsetstate(Conv *s, uchar newstate)
438 {
439 Tcpctl *tcb;
440 uchar oldstate;
441 Tcppriv *tpriv;
442
443 tpriv = s->p->priv;
444
445 tcb = (Tcpctl*)s->ptcl;
446
447 oldstate = tcb->state;
448 if(oldstate == newstate)
449 return;
450
451 if(oldstate == Established)
452 tpriv->stats[CurrEstab]--;
453 if(newstate == Established)
454 tpriv->stats[CurrEstab]++;
455
456 switch(newstate) {
457 case Closed:
458 qclose(s->rq);
459 qclose(s->wq);
460 qclose(s->eq);
461 break;
462
463 case Close_wait: /* Remote closes */
464 qhangup(s->rq, nil);
465 break;
466 }
467
468 tcb->state = newstate;
469
470 if(oldstate == Syn_sent && newstate != Closed)
471 Fsconnected(s, nil);
472 }
473
474 static char*
tcpconnect(Conv * c,char ** argv,int argc)475 tcpconnect(Conv *c, char **argv, int argc)
476 {
477 char *e;
478 Tcpctl *tcb;
479
480 tcb = (Tcpctl*)(c->ptcl);
481 if(tcb->state != Closed)
482 return Econinuse;
483
484 e = Fsstdconnect(c, argv, argc);
485 if(e != nil)
486 return e;
487 tcpstart(c, TCP_CONNECT);
488
489 return nil;
490 }
491
492 static int
tcpstate(Conv * c,char * state,int n)493 tcpstate(Conv *c, char *state, int n)
494 {
495 Tcpctl *s;
496
497 s = (Tcpctl*)(c->ptcl);
498
499 return snprint(state, n,
500 "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
501 "swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
502 "timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
503 tcpstates[s->state],
504 c->rq ? qlen(c->rq) : 0,
505 c->wq ? qlen(c->wq) : 0,
506 s->nreseq, s->reseqlen,
507 s->srtt, s->mdev, s->ssthresh,
508 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
509 s->qscale,
510 s->timer.start, s->timer.count, s->rerecv,
511 s->katimer.start, s->katimer.count);
512 }
513
514 static int
tcpinuse(Conv * c)515 tcpinuse(Conv *c)
516 {
517 Tcpctl *s;
518
519 s = (Tcpctl*)(c->ptcl);
520 return s->state != Closed;
521 }
522
523 static char*
tcpannounce(Conv * c,char ** argv,int argc)524 tcpannounce(Conv *c, char **argv, int argc)
525 {
526 char *e;
527 Tcpctl *tcb;
528
529 tcb = (Tcpctl*)(c->ptcl);
530 if(tcb->state != Closed)
531 return Econinuse;
532
533 e = Fsstdannounce(c, argv, argc);
534 if(e != nil)
535 return e;
536 tcpstart(c, TCP_LISTEN);
537 Fsconnected(c, nil);
538
539 return nil;
540 }
541
542 /*
543 * tcpclose is always called with the q locked
544 */
545 static void
tcpclose(Conv * c)546 tcpclose(Conv *c)
547 {
548 Tcpctl *tcb;
549
550 tcb = (Tcpctl*)c->ptcl;
551
552 qhangup(c->rq, nil);
553 qhangup(c->wq, nil);
554 qhangup(c->eq, nil);
555 qflush(c->rq);
556
557 switch(tcb->state) {
558 case Listen:
559 /*
560 * reset any incoming calls to this listener
561 */
562 Fsconnected(c, "Hangup");
563
564 localclose(c, nil);
565 break;
566 case Closed:
567 case Syn_sent:
568 localclose(c, nil);
569 break;
570 case Syn_received:
571 case Established:
572 tcb->flgcnt++;
573 tcb->snd.nxt++;
574 tcpsetstate(c, Finwait1);
575 tcpoutput(c);
576 break;
577 case Close_wait:
578 tcb->flgcnt++;
579 tcb->snd.nxt++;
580 tcpsetstate(c, Last_ack);
581 tcpoutput(c);
582 break;
583 }
584 }
585
586 static void
tcpkick(void * x)587 tcpkick(void *x)
588 {
589 Conv *s = x;
590 Tcpctl *tcb;
591
592 tcb = (Tcpctl*)s->ptcl;
593
594 if(waserror()){
595 qunlock(s);
596 nexterror();
597 }
598 qlock(s);
599
600 switch(tcb->state) {
601 case Syn_sent:
602 case Syn_received:
603 case Established:
604 case Close_wait:
605 /*
606 * Push data
607 */
608 tcpoutput(s);
609 break;
610 default:
611 localclose(s, "Hangup");
612 break;
613 }
614
615 qunlock(s);
616 poperror();
617 }
618
/* forward declaration; used by tcprcvwin() to compare sequence numbers */
static int seq_lt(ulong, ulong);
620
/*
 * Recompute the receive window from the free space in the read
 * queue and record it in rcv.wnd and rcv.wptr.  Sets rcv.blocked
 * (and logs) when the new window differs from the old one and is
 * either zero after scaling or smaller than a quarter segment while
 * the queue window exceeds four segments.
 */
static void
tcprcvwin(Conv *s)				/* Call with tcb locked */
{
	int w;
	Tcpctl *tcb;

	tcb = (Tcpctl*)s->ptcl;
	/* free space left in the receive queue */
	w = tcb->window - qlen(s->rq);
	if(w < 0)
		w = 0;
	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
		w = tcb->rcv.wptr - tcb->rcv.nxt;
	/* note: the inner if belongs to the outer one; only a changed window can block */
	if(w != tcb->rcv.wnd)
	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
		tcb->rcv.blocked = 1;
		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
	}
	tcb->rcv.wnd = w;
	tcb->rcv.wptr = tcb->rcv.nxt + w;
}
643
644 static void
tcpacktimer(void * v)645 tcpacktimer(void *v)
646 {
647 Tcpctl *tcb;
648 Conv *s;
649
650 s = v;
651 tcb = (Tcpctl*)s->ptcl;
652
653 if(waserror()){
654 qunlock(s);
655 nexterror();
656 }
657 qlock(s);
658 if(tcb->state != Closed){
659 tcb->flags |= FORCE;
660 tcpoutput(s);
661 }
662 qunlock(s);
663 poperror();
664 }
665
666 static void
tcpcongestion(Tcpctl * tcb)667 tcpcongestion(Tcpctl *tcb)
668 {
669 ulong inflight;
670
671 inflight = tcb->snd.nxt - tcb->snd.una;
672 if(inflight > tcb->cwind)
673 inflight = tcb->cwind;
674 tcb->ssthresh = inflight / 2;
675 if(tcb->ssthresh < 2*tcb->mss)
676 tcb->ssthresh = 2*tcb->mss;
677 }
678
/*
 * Slow start limit factor: tcpabcincr() grows cwind by at most
 * L*mss per call while in slow start (one mss after a timeout).
 */
enum {
	L		= 2,		/* aggressive slow start; legal values ∈ (1.0, 2.0) */
};
682
683 static void
tcpabcincr(Tcpctl * tcb,uint acked)684 tcpabcincr(Tcpctl *tcb, uint acked)
685 {
686 uint limit;
687
688 tcb->abcbytes += acked;
689 if(tcb->cwind < tcb->ssthresh){
690 /* slow start */
691 if(tcb->snd.rto)
692 limit = tcb->mss;
693 else
694 limit = L*tcb->mss;
695 tcb->cwind += MIN(tcb->abcbytes, limit);
696 tcb->abcbytes = 0;
697 } else {
698 tcb->snd.rto = 0;
699 /* avoidance */
700 if(tcb->abcbytes >= tcb->cwind){
701 tcb->abcbytes -= tcb->cwind;
702 tcb->cwind += tcb->mss;
703 }
704 }
705 }
706
707 static void
tcpcreate(Conv * c)708 tcpcreate(Conv *c)
709 {
710 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
711 c->wq = qopen(QMAX, Qkick, tcpkick, c);
712 }
713
714 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)715 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
716 {
717 if(newstate != TcptimerON){
718 if(t->state == TcptimerON){
719 /* unchain */
720 if(priv->timers == t){
721 priv->timers = t->next;
722 if(t->prev != nil)
723 panic("timerstate1");
724 }
725 if(t->next)
726 t->next->prev = t->prev;
727 if(t->prev)
728 t->prev->next = t->next;
729 t->next = t->prev = nil;
730 }
731 } else {
732 if(t->state != TcptimerON){
733 /* chain */
734 if(t->prev != nil || t->next != nil)
735 panic("timerstate2");
736 t->prev = nil;
737 t->next = priv->timers;
738 if(t->next)
739 t->next->prev = t;
740 priv->timers = t;
741 }
742 }
743 t->state = newstate;
744 }
745
746 static void
tcpackproc(void * a)747 tcpackproc(void *a)
748 {
749 Tcptimer *t, *tp, *timeo;
750 Proto *tcp;
751 Tcppriv *priv;
752 int loop;
753
754 tcp = a;
755 priv = tcp->priv;
756
757 for(;;) {
758 tsleep(&up->sleep, return0, 0, MSPTICK);
759
760 qlock(&priv->tl);
761 timeo = nil;
762 loop = 0;
763 for(t = priv->timers; t != nil; t = tp) {
764 if(loop++ > 10000)
765 panic("tcpackproc1");
766 tp = t->next;
767 if(t->state == TcptimerON) {
768 t->count--;
769 if(t->count == 0) {
770 timerstate(priv, t, TcptimerDONE);
771 t->readynext = timeo;
772 timeo = t;
773 }
774 }
775 }
776 qunlock(&priv->tl);
777
778 loop = 0;
779 for(t = timeo; t != nil; t = t->readynext) {
780 if(loop++ > 10000)
781 panic("tcpackproc2");
782 if(t->state == TcptimerDONE && t->func != nil && !waserror()){
783 (*t->func)(t->arg);
784 poperror();
785 }
786 }
787
788 limborexmit(tcp);
789 }
790 }
791
792 static void
tcpgo(Tcppriv * priv,Tcptimer * t)793 tcpgo(Tcppriv *priv, Tcptimer *t)
794 {
795 if(t == nil || t->start == 0)
796 return;
797
798 qlock(&priv->tl);
799 t->count = t->start;
800 timerstate(priv, t, TcptimerON);
801 qunlock(&priv->tl);
802 }
803
804 static void
tcphalt(Tcppriv * priv,Tcptimer * t)805 tcphalt(Tcppriv *priv, Tcptimer *t)
806 {
807 if(t == nil)
808 return;
809
810 qlock(&priv->tl);
811 timerstate(priv, t, TcptimerOFF);
812 qunlock(&priv->tl);
813 }
814
/*
 * Exponential backoff multiplier: returns 2ⁿ.
 *
 * The shift count is clamped to [0, bits in int - 2] because shifting
 * by a negative amount, or far enough to reach the sign bit, is
 * undefined.  Out-of-range requests therefore saturate at the largest
 * representable power of two (or 1 for negative n) instead of
 * producing garbage.
 */
static int
backoff(int n)
{
	int maxshift;

	maxshift = 8*(int)sizeof(int) - 2;
	if(n < 0)
		n = 0;
	else if(n > maxshift)
		n = maxshift;
	return 1 << n;
}
820
821 static void
localclose(Conv * s,char * reason)822 localclose(Conv *s, char *reason) /* called with tcb locked */
823 {
824 Tcpctl *tcb;
825 Tcppriv *tpriv;
826
827 tpriv = s->p->priv;
828 tcb = (Tcpctl*)s->ptcl;
829
830 iphtrem(&tpriv->ht, s);
831
832 tcphalt(tpriv, &tcb->timer);
833 tcphalt(tpriv, &tcb->rtt_timer);
834 tcphalt(tpriv, &tcb->acktimer);
835 tcphalt(tpriv, &tcb->katimer);
836
837 /* Flush reassembly queue; nothing more can arrive */
838 dumpreseq(tcb);
839
840 if(tcb->state == Syn_sent)
841 Fsconnected(s, reason);
842 if(s->state == Announced)
843 wakeup(&s->listenr);
844
845 qhangup(s->rq, reason);
846 qhangup(s->wq, reason);
847
848 tcpsetstate(s, Closed);
849 }
850
851 /* mtu (- TCP + IP hdr len) of 1st hop */
852 static int
tcpmtu(Proto * tcp,uchar * addr,int version,uint * scale)853 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
854 {
855 Ipifc *ifc;
856 int mtu;
857
858 ifc = findipifc(tcp->f, addr, 0);
859 switch(version){
860 default:
861 case V4:
862 mtu = DEF_MSS;
863 if(ifc != nil)
864 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
865 break;
866 case V6:
867 mtu = DEF_MSS6;
868 if(ifc != nil)
869 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
870 break;
871 }
872 /*
873 * set the ws. it doesn't commit us to anything.
874 * ws is the ultimate limit to the bandwidth-delay product.
875 */
876 *scale = Defadvscale;
877
878 return mtu;
879 }
880
881 static void
inittcpctl(Conv * s,int mode)882 inittcpctl(Conv *s, int mode)
883 {
884 Tcpctl *tcb;
885 Tcp4hdr* h4;
886 Tcp6hdr* h6;
887 Tcppriv *tpriv;
888 int mss;
889
890 tcb = (Tcpctl*)s->ptcl;
891
892 memset(tcb, 0, sizeof(Tcpctl));
893
894 tcb->ssthresh = QMAX; /* reset by tcpsetscale() */
895 tcb->srtt = tcp_irtt<<LOGAGAIN;
896 tcb->mdev = 0;
897
898 /* setup timers */
899 tcb->timer.start = tcp_irtt / MSPTICK;
900 tcb->timer.func = tcptimeout;
901 tcb->timer.arg = s;
902 tcb->rtt_timer.start = MAX_TIME;
903 tcb->acktimer.start = TCP_ACK / MSPTICK;
904 tcb->acktimer.func = tcpacktimer;
905 tcb->acktimer.arg = s;
906 tcb->katimer.start = DEF_KAT / MSPTICK;
907 tcb->katimer.func = tcpkeepalive;
908 tcb->katimer.arg = s;
909
910 mss = DEF_MSS;
911
912 /* create a prototype(pseudo) header */
913 if(mode != TCP_LISTEN){
914 if(ipcmp(s->laddr, IPnoaddr) == 0)
915 findlocalip(s->p->f, s->laddr, s->raddr);
916
917 switch(s->ipversion){
918 case V4:
919 h4 = &tcb->protohdr.tcp4hdr;
920 memset(h4, 0, sizeof(*h4));
921 h4->proto = IP_TCPPROTO;
922 hnputs(h4->tcpsport, s->lport);
923 hnputs(h4->tcpdport, s->rport);
924 v6tov4(h4->tcpsrc, s->laddr);
925 v6tov4(h4->tcpdst, s->raddr);
926 break;
927 case V6:
928 h6 = &tcb->protohdr.tcp6hdr;
929 memset(h6, 0, sizeof(*h6));
930 h6->proto = IP_TCPPROTO;
931 hnputs(h6->tcpsport, s->lport);
932 hnputs(h6->tcpdport, s->rport);
933 ipmove(h6->tcpsrc, s->laddr);
934 ipmove(h6->tcpdst, s->raddr);
935 mss = DEF_MSS6;
936 break;
937 default:
938 panic("inittcpctl: version %d", s->ipversion);
939 }
940 }
941
942 tcb->mss = tcb->cwind = mss;
943 tcb->abcbytes = 0;
944 tpriv = s->p->priv;
945 tpriv->stats[Mss] = tcb->mss;
946
947 /* default is no window scaling */
948 tcpsetscale(s, tcb, 0, 0);
949 }
950
951 /*
952 * called with s qlocked
953 */
954 static void
tcpstart(Conv * s,int mode)955 tcpstart(Conv *s, int mode)
956 {
957 Tcpctl *tcb;
958 Tcppriv *tpriv;
959 char kpname[KNAMELEN];
960
961 tpriv = s->p->priv;
962
963 if(tpriv->ackprocstarted == 0){
964 qlock(&tpriv->apl);
965 if(tpriv->ackprocstarted == 0){
966 snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
967 kproc(kpname, tcpackproc, s->p);
968 tpriv->ackprocstarted = 1;
969 }
970 qunlock(&tpriv->apl);
971 }
972
973 tcb = (Tcpctl*)s->ptcl;
974
975 inittcpctl(s, mode);
976
977 iphtadd(&tpriv->ht, s);
978 switch(mode) {
979 case TCP_LISTEN:
980 tpriv->stats[PassiveOpens]++;
981 tcb->flags |= CLONE;
982 tcpsetstate(s, Listen);
983 break;
984
985 case TCP_CONNECT:
986 tpriv->stats[ActiveOpens]++;
987 tcb->flags |= ACTIVE;
988 tcpsndsyn(s, tcb);
989 tcpsetstate(s, Syn_sent);
990 tcpoutput(s);
991 break;
992 }
993 }
994
995 static char*
tcpflag(char * buf,char * e,ushort flag)996 tcpflag(char *buf, char *e, ushort flag)
997 {
998 char *p;
999
1000 p = seprint(buf, e, "%d", flag>>10); /* Head len */
1001 if(flag & URG)
1002 p = seprint(p, e, " URG");
1003 if(flag & ACK)
1004 p = seprint(p, e, " ACK");
1005 if(flag & PSH)
1006 p = seprint(p, e, " PSH");
1007 if(flag & RST)
1008 p = seprint(p, e, " RST");
1009 if(flag & SYN)
1010 p = seprint(p, e, " SYN");
1011 if(flag & FIN)
1012 p = seprint(p, e, " FIN");
1013 USED(p);
1014 return buf;
1015 }
1016
/*
 * Build an outgoing v6 segment from tcph.  The headers are placed in
 * front of data, or in a freshly allocated block when data is nil.
 * ph is the prototype header that supplies addresses and ports.
 * tcb may be nil (as in sndrst); then the window is not scaled and
 * a checksum is always generated.  MSS and window scale options are
 * added only to SYN segments, and only when non-zero.
 * Returns the block holding the packet, or nil on failure.
 */
static Block*
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp6hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* work out the TCP header length, options padded to a multiple of 4 */
	hdrlen = TCP6_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(tcph->ws)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP6_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP6_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *)(data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is a multiple of 4, so hdrlen<<10 puts hdrlen/4 in the top four bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if(tcph->ws != 0){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen+dlen);
	h->proto = ph->proto;

	return data;
}
1100
/*
 * Build an outgoing v4 segment from tcph.  The headers are placed in
 * front of data, or in a freshly allocated block when data is nil.
 * ph is the prototype header that supplies addresses and ports.
 * tcb may be nil (as in sndrst); then the window is not scaled and
 * a checksum is always generated.  On SYN segments the MSS option is
 * added when non-zero and the window scale option is always added.
 * Returns the block holding the packet, or nil on failure.
 */
static Block*
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp4hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* work out the TCP header length, options padded to a multiple of 4 */
	hdrlen = TCP4_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		/* unconditional, matching the always-offered option written below */
		if(1)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP4_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP4_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *)(data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is a multiple of 4, so hdrlen<<10 puts hdrlen/4 in the top four bits */
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		/* always offer.  rfc1323 §2.2 */
		if(1){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	return data;
}
1175
1176 static int
ntohtcp6(Tcp * tcph,Block ** bpp)1177 ntohtcp6(Tcp *tcph, Block **bpp)
1178 {
1179 Tcp6hdr *h;
1180 uchar *optr;
1181 ushort hdrlen;
1182 ushort optlen;
1183 int n;
1184
1185 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1186 if(*bpp == nil)
1187 return -1;
1188
1189 h = (Tcp6hdr *)((*bpp)->rp);
1190 tcph->source = nhgets(h->tcpsport);
1191 tcph->dest = nhgets(h->tcpdport);
1192 tcph->seq = nhgetl(h->tcpseq);
1193 tcph->ack = nhgetl(h->tcpack);
1194 hdrlen = (h->tcpflag[0]>>2) & ~3;
1195 if(hdrlen < TCP6_HDRSIZE) {
1196 freeblist(*bpp);
1197 return -1;
1198 }
1199
1200 tcph->flags = h->tcpflag[1];
1201 tcph->wnd = nhgets(h->tcpwin);
1202 tcph->urg = nhgets(h->tcpurg);
1203 tcph->mss = 0;
1204 tcph->ws = 0;
1205 tcph->update = 0;
1206 tcph->len = nhgets(h->ploadlen) - hdrlen;
1207
1208 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1209 if(*bpp == nil)
1210 return -1;
1211
1212 optr = h->tcpopt;
1213 n = hdrlen - TCP6_HDRSIZE;
1214 while(n > 0 && *optr != EOLOPT) {
1215 if(*optr == NOOPOPT) {
1216 n--;
1217 optr++;
1218 continue;
1219 }
1220 optlen = optr[1];
1221 if(optlen < 2 || optlen > n)
1222 break;
1223 switch(*optr) {
1224 case MSSOPT:
1225 if(optlen == MSS_LENGTH)
1226 tcph->mss = nhgets(optr+2);
1227 break;
1228 case WSOPT:
1229 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1230 tcph->ws = *(optr+2);
1231 break;
1232 }
1233 n -= optlen;
1234 optr += optlen;
1235 }
1236 return hdrlen;
1237 }
1238
1239 static int
ntohtcp4(Tcp * tcph,Block ** bpp)1240 ntohtcp4(Tcp *tcph, Block **bpp)
1241 {
1242 Tcp4hdr *h;
1243 uchar *optr;
1244 ushort hdrlen;
1245 ushort optlen;
1246 int n;
1247
1248 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1249 if(*bpp == nil)
1250 return -1;
1251
1252 h = (Tcp4hdr *)((*bpp)->rp);
1253 tcph->source = nhgets(h->tcpsport);
1254 tcph->dest = nhgets(h->tcpdport);
1255 tcph->seq = nhgetl(h->tcpseq);
1256 tcph->ack = nhgetl(h->tcpack);
1257
1258 hdrlen = (h->tcpflag[0]>>2) & ~3;
1259 if(hdrlen < TCP4_HDRSIZE) {
1260 freeblist(*bpp);
1261 return -1;
1262 }
1263
1264 tcph->flags = h->tcpflag[1];
1265 tcph->wnd = nhgets(h->tcpwin);
1266 tcph->urg = nhgets(h->tcpurg);
1267 tcph->mss = 0;
1268 tcph->ws = 0;
1269 tcph->update = 0;
1270 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1271
1272 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1273 if(*bpp == nil)
1274 return -1;
1275
1276 optr = h->tcpopt;
1277 n = hdrlen - TCP4_HDRSIZE;
1278 while(n > 0 && *optr != EOLOPT) {
1279 if(*optr == NOOPOPT) {
1280 n--;
1281 optr++;
1282 continue;
1283 }
1284 optlen = optr[1];
1285 if(optlen < 2 || optlen > n)
1286 break;
1287 switch(*optr) {
1288 case MSSOPT:
1289 if(optlen == MSS_LENGTH)
1290 tcph->mss = nhgets(optr+2);
1291 break;
1292 case WSOPT:
1293 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1294 tcph->ws = *(optr+2);
1295 break;
1296 }
1297 n -= optlen;
1298 optr += optlen;
1299 }
1300 return hdrlen;
1301 }
1302
1303 /*
1304 * For outgoing calls, generate an initial sequence
1305 * number and put a SYN on the send queue
1306 */
1307 static void
tcpsndsyn(Conv * s,Tcpctl * tcb)1308 tcpsndsyn(Conv *s, Tcpctl *tcb)
1309 {
1310 Tcppriv *tpriv;
1311
1312 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1313 tcb->rttseq = tcb->iss;
1314 tcb->snd.wl2 = tcb->iss;
1315 tcb->snd.una = tcb->iss;
1316 tcb->snd.rxt = tcb->iss;
1317 tcb->snd.ptr = tcb->rttseq;
1318 tcb->snd.nxt = tcb->rttseq;
1319 tcb->flgcnt++;
1320 tcb->flags |= FORCE;
1321 tcb->sndsyntime = NOW;
1322
1323 /* set desired mss and scale */
1324 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1325 tpriv = s->p->priv;
1326 tpriv->stats[Mss] = tcb->mss;
1327 }
1328
1329 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1330 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1331 {
1332 Block *hbp;
1333 uchar rflags;
1334 Tcppriv *tpriv;
1335 Tcp4hdr ph4;
1336 Tcp6hdr ph6;
1337
1338 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1339
1340 tpriv = tcp->priv;
1341
1342 if(seg->flags & RST)
1343 return;
1344
1345 /* make pseudo header */
1346 switch(version) {
1347 case V4:
1348 memset(&ph4, 0, sizeof(ph4));
1349 ph4.vihl = IP_VER4;
1350 v6tov4(ph4.tcpsrc, dest);
1351 v6tov4(ph4.tcpdst, source);
1352 ph4.proto = IP_TCPPROTO;
1353 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1354 hnputs(ph4.tcpsport, seg->dest);
1355 hnputs(ph4.tcpdport, seg->source);
1356 break;
1357 case V6:
1358 memset(&ph6, 0, sizeof(ph6));
1359 ph6.vcf[0] = IP_VER6;
1360 ipmove(ph6.tcpsrc, dest);
1361 ipmove(ph6.tcpdst, source);
1362 ph6.proto = IP_TCPPROTO;
1363 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1364 hnputs(ph6.tcpsport, seg->dest);
1365 hnputs(ph6.tcpdport, seg->source);
1366 break;
1367 default:
1368 panic("sndrst: version %d", version);
1369 }
1370
1371 tpriv->stats[OutRsts]++;
1372 rflags = RST;
1373
1374 /* convince the other end that this reset is in band */
1375 if(seg->flags & ACK) {
1376 seg->seq = seg->ack;
1377 seg->ack = 0;
1378 }
1379 else {
1380 rflags |= ACK;
1381 seg->ack = seg->seq;
1382 seg->seq = 0;
1383 if(seg->flags & SYN)
1384 seg->ack++;
1385 seg->ack += length;
1386 if(seg->flags & FIN)
1387 seg->ack++;
1388 }
1389 seg->flags = rflags;
1390 seg->wnd = 0;
1391 seg->urg = 0;
1392 seg->mss = 0;
1393 seg->ws = 0;
1394 switch(version) {
1395 case V4:
1396 hbp = htontcp4(seg, nil, &ph4, nil);
1397 if(hbp == nil)
1398 return;
1399 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1400 break;
1401 case V6:
1402 hbp = htontcp6(seg, nil, &ph6, nil);
1403 if(hbp == nil)
1404 return;
1405 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1406 break;
1407 default:
1408 panic("sndrst2: version %d", version);
1409 }
1410 }
1411
1412 /*
1413 * send a reset to the remote side and close the conversation
1414 * called with s qlocked
1415 */
1416 static char*
tcphangup(Conv * s)1417 tcphangup(Conv *s)
1418 {
1419 Tcp seg;
1420 Tcpctl *tcb;
1421 Block *hbp;
1422
1423 tcb = (Tcpctl*)s->ptcl;
1424 if(waserror())
1425 return commonerror();
1426 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1427 if(!waserror()){
1428 memset(&seg, 0, sizeof seg);
1429 seg.flags = RST | ACK;
1430 seg.ack = tcb->rcv.nxt;
1431 tcb->rcv.ackptr = seg.ack;
1432 seg.seq = tcb->snd.ptr;
1433 seg.wnd = 0;
1434 seg.urg = 0;
1435 seg.mss = 0;
1436 seg.ws = 0;
1437 switch(s->ipversion) {
1438 case V4:
1439 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1440 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1441 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1442 break;
1443 case V6:
1444 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1445 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1446 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1447 break;
1448 default:
1449 panic("tcphangup: version %d", s->ipversion);
1450 }
1451 poperror();
1452 }
1453 }
1454 localclose(s, nil);
1455 poperror();
1456 return nil;
1457 }
1458
1459 /*
1460 * (re)send a SYN ACK
1461 */
1462 static int
sndsynack(Proto * tcp,Limbo * lp)1463 sndsynack(Proto *tcp, Limbo *lp)
1464 {
1465 Block *hbp;
1466 Tcp4hdr ph4;
1467 Tcp6hdr ph6;
1468 Tcp seg;
1469 uint scale;
1470
1471 /* make pseudo header */
1472 switch(lp->version) {
1473 case V4:
1474 memset(&ph4, 0, sizeof(ph4));
1475 ph4.vihl = IP_VER4;
1476 v6tov4(ph4.tcpsrc, lp->laddr);
1477 v6tov4(ph4.tcpdst, lp->raddr);
1478 ph4.proto = IP_TCPPROTO;
1479 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1480 hnputs(ph4.tcpsport, lp->lport);
1481 hnputs(ph4.tcpdport, lp->rport);
1482 break;
1483 case V6:
1484 memset(&ph6, 0, sizeof(ph6));
1485 ph6.vcf[0] = IP_VER6;
1486 ipmove(ph6.tcpsrc, lp->laddr);
1487 ipmove(ph6.tcpdst, lp->raddr);
1488 ph6.proto = IP_TCPPROTO;
1489 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1490 hnputs(ph6.tcpsport, lp->lport);
1491 hnputs(ph6.tcpdport, lp->rport);
1492 break;
1493 default:
1494 panic("sndrst: version %d", lp->version);
1495 }
1496
1497 memset(&seg, 0, sizeof seg);
1498 seg.seq = lp->iss;
1499 seg.ack = lp->irs+1;
1500 seg.flags = SYN|ACK;
1501 seg.urg = 0;
1502 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1503 seg.wnd = QMAX;
1504
1505 /* if the other side set scale, we should too */
1506 if(lp->rcvscale){
1507 seg.ws = scale;
1508 lp->sndscale = scale;
1509 } else {
1510 seg.ws = 0;
1511 lp->sndscale = 0;
1512 }
1513
1514 switch(lp->version) {
1515 case V4:
1516 hbp = htontcp4(&seg, nil, &ph4, nil);
1517 if(hbp == nil)
1518 return -1;
1519 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1520 break;
1521 case V6:
1522 hbp = htontcp6(&seg, nil, &ph6, nil);
1523 if(hbp == nil)
1524 return -1;
1525 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1526 break;
1527 default:
1528 panic("sndsnack: version %d", lp->version);
1529 }
1530 lp->lastsend = NOW;
1531 return 0;
1532 }
1533
/* limbo hash: last two bytes of address a plus port p, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1535
1536 /*
1537 * put a call into limbo and respond with a SYN ACK
1538 *
1539 * called with proto locked
1540 */
1541 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1542 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1543 {
1544 Limbo *lp, **l;
1545 Tcppriv *tpriv;
1546 int h;
1547
1548 tpriv = s->p->priv;
1549 h = hashipa(source, seg->source);
1550
1551 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1552 lp = *l;
1553 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1554 continue;
1555 if(ipcmp(lp->raddr, source) != 0)
1556 continue;
1557 if(ipcmp(lp->laddr, dest) != 0)
1558 continue;
1559
1560 /* each new SYN restarts the retransmits */
1561 lp->irs = seg->seq;
1562 break;
1563 }
1564 lp = *l;
1565 if(lp == nil){
1566 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1567 lp = tpriv->lht[h];
1568 tpriv->lht[h] = lp->next;
1569 lp->next = nil;
1570 } else {
1571 lp = malloc(sizeof(*lp));
1572 if(lp == nil)
1573 return;
1574 tpriv->nlimbo++;
1575 }
1576 *l = lp;
1577 lp->version = version;
1578 ipmove(lp->laddr, dest);
1579 ipmove(lp->raddr, source);
1580 lp->lport = seg->dest;
1581 lp->rport = seg->source;
1582 lp->mss = seg->mss;
1583 lp->rcvscale = seg->ws;
1584 lp->irs = seg->seq;
1585 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1586 }
1587
1588 if(sndsynack(s->p, lp) < 0){
1589 *l = lp->next;
1590 tpriv->nlimbo--;
1591 free(lp);
1592 }
1593 }
1594
1595 /*
1596 * resend SYN ACK's once every SYNACK_RXTIMER ms.
1597 */
1598 static void
limborexmit(Proto * tcp)1599 limborexmit(Proto *tcp)
1600 {
1601 Tcppriv *tpriv;
1602 Limbo **l, *lp;
1603 int h;
1604 int seen;
1605 ulong now;
1606
1607 tpriv = tcp->priv;
1608
1609 if(!canqlock(tcp))
1610 return;
1611 seen = 0;
1612 now = NOW;
1613 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1614 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1615 lp = *l;
1616 seen++;
1617 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1618 continue;
1619
1620 /* time it out after 1 second */
1621 if(++(lp->rexmits) > 5){
1622 tpriv->nlimbo--;
1623 *l = lp->next;
1624 free(lp);
1625 continue;
1626 }
1627
1628 /* if we're being attacked, don't bother resending SYN ACK's */
1629 if(tpriv->nlimbo > 100)
1630 continue;
1631
1632 if(sndsynack(tcp, lp) < 0){
1633 tpriv->nlimbo--;
1634 *l = lp->next;
1635 free(lp);
1636 continue;
1637 }
1638
1639 l = &lp->next;
1640 }
1641 }
1642 qunlock(tcp);
1643 }
1644
1645 /*
1646 * lookup call in limbo. if found, throw it out.
1647 *
1648 * called with proto locked
1649 */
1650 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1651 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1652 {
1653 Limbo *lp, **l;
1654 int h;
1655 Tcppriv *tpriv;
1656
1657 tpriv = s->p->priv;
1658
1659 /* find a call in limbo */
1660 h = hashipa(src, segp->source);
1661 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1662 lp = *l;
1663 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1664 continue;
1665 if(ipcmp(lp->laddr, dst) != 0)
1666 continue;
1667 if(ipcmp(lp->raddr, src) != 0)
1668 continue;
1669
1670 /* RST can only follow the SYN */
1671 if(segp->seq == lp->irs+1){
1672 tpriv->nlimbo--;
1673 *l = lp->next;
1674 free(lp);
1675 }
1676 break;
1677 }
1678 }
1679
1680 static void
initialwindow(Tcpctl * tcb)1681 initialwindow(Tcpctl *tcb)
1682 {
1683 /* RFC 3390 initial window */
1684 if(tcb->mss < 1095)
1685 tcb->cwind = 4*tcb->mss;
1686 else if(tcb->mss < 2190)
1687 tcb->cwind = 2*2190;
1688 else
1689 tcb->cwind = 2*tcb->mss;
1690 }
1691
1692 /*
1693 * come here when we finally get an ACK to our SYN-ACK.
1694 * lookup call in limbo. if found, create a new conversation
1695 *
1696 * called with proto locked
1697 */
1698 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1699 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1700 {
1701 Conv *new;
1702 Tcpctl *tcb;
1703 Tcppriv *tpriv;
1704 Tcp4hdr *h4;
1705 Tcp6hdr *h6;
1706 Limbo *lp, **l;
1707 int h;
1708
1709 /* unless it's just an ack, it can't be someone coming out of limbo */
1710 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1711 return nil;
1712
1713 tpriv = s->p->priv;
1714
1715 /* find a call in limbo */
1716 h = hashipa(src, segp->source);
1717 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1718 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1719 src, segp->source, lp->raddr, lp->rport,
1720 dst, segp->dest, lp->laddr, lp->lport,
1721 version, lp->version
1722 );
1723
1724 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1725 continue;
1726 if(ipcmp(lp->laddr, dst) != 0)
1727 continue;
1728 if(ipcmp(lp->raddr, src) != 0)
1729 continue;
1730
1731 /* we're assuming no data with the initial SYN */
1732 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1733 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1734 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1735 lp = nil;
1736 } else {
1737 tpriv->nlimbo--;
1738 *l = lp->next;
1739 }
1740 break;
1741 }
1742 if(lp == nil)
1743 return nil;
1744
1745 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1746 if(new == nil)
1747 return nil;
1748
1749 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1750 tcb = (Tcpctl*)new->ptcl;
1751 tcb->flags &= ~CLONE;
1752 tcb->timer.arg = new;
1753 tcb->timer.state = TcptimerOFF;
1754 tcb->acktimer.arg = new;
1755 tcb->acktimer.state = TcptimerOFF;
1756 tcb->katimer.arg = new;
1757 tcb->katimer.state = TcptimerOFF;
1758 tcb->rtt_timer.arg = new;
1759 tcb->rtt_timer.state = TcptimerOFF;
1760
1761 tcb->irs = lp->irs;
1762 tcb->rcv.nxt = tcb->irs+1;
1763 tcb->rcv.wptr = tcb->rcv.nxt;
1764 tcb->rcv.wsnt = 0;
1765 tcb->rcv.urg = tcb->rcv.nxt;
1766
1767 tcb->iss = lp->iss;
1768 tcb->rttseq = tcb->iss;
1769 tcb->snd.wl2 = tcb->iss;
1770 tcb->snd.una = tcb->iss+1;
1771 tcb->snd.ptr = tcb->iss+1;
1772 tcb->snd.nxt = tcb->iss+1;
1773 tcb->snd.rxt = tcb->iss+1;
1774 tcb->flgcnt = 0;
1775 tcb->flags |= SYNACK;
1776
1777 /* our sending max segment size cannot be bigger than what he asked for */
1778 if(lp->mss != 0 && lp->mss < tcb->mss) {
1779 tcb->mss = lp->mss;
1780 tpriv->stats[Mss] = tcb->mss;
1781 }
1782
1783 /* window scaling */
1784 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1785
1786 /* congestion window */
1787 tcb->snd.wnd = segp->wnd;
1788 initialwindow(tcb);
1789
1790 /* set initial round trip time */
1791 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1792 tcpsynackrtt(new);
1793
1794 free(lp);
1795
1796 /* set up proto header */
1797 switch(version){
1798 case V4:
1799 h4 = &tcb->protohdr.tcp4hdr;
1800 memset(h4, 0, sizeof(*h4));
1801 h4->proto = IP_TCPPROTO;
1802 hnputs(h4->tcpsport, new->lport);
1803 hnputs(h4->tcpdport, new->rport);
1804 v6tov4(h4->tcpsrc, dst);
1805 v6tov4(h4->tcpdst, src);
1806 break;
1807 case V6:
1808 h6 = &tcb->protohdr.tcp6hdr;
1809 memset(h6, 0, sizeof(*h6));
1810 h6->proto = IP_TCPPROTO;
1811 hnputs(h6->tcpsport, new->lport);
1812 hnputs(h6->tcpdport, new->rport);
1813 ipmove(h6->tcpsrc, dst);
1814 ipmove(h6->tcpdst, src);
1815 break;
1816 default:
1817 panic("tcpincoming: version %d", new->ipversion);
1818 }
1819
1820 tcpsetstate(new, Established);
1821
1822 iphtadd(&tpriv->ht, new);
1823
1824 return new;
1825 }
1826
/*
 * Report whether x lies in the closed range [low, high] of sequence
 * space.  When low > high the range is taken to wrap around.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low <= high)
		return low <= x && x <= high;

	/* wrapped range: everything from low up, plus everything up to high */
	return x >= low || x <= high;
}
1840
/* x precedes y in sequence space: the signed difference is negative */
static int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1846
/* x precedes or equals y in sequence space */
static int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1852
/* x follows y in sequence space: the signed difference is positive */
static int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1858
/* x follows or equals y in sequence space */
static int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1864
1865 /*
1866 * use the time between the first SYN and it's ack as the
1867 * initial round trip time
1868 */
1869 static void
tcpsynackrtt(Conv * s)1870 tcpsynackrtt(Conv *s)
1871 {
1872 Tcpctl *tcb;
1873 int delta;
1874 Tcppriv *tpriv;
1875
1876 tcb = (Tcpctl*)s->ptcl;
1877 tpriv = s->p->priv;
1878
1879 delta = NOW - tcb->sndsyntime;
1880 tcb->srtt = delta<<LOGAGAIN;
1881 tcb->mdev = delta<<LOGDGAIN;
1882
1883 /* halt round trip timer */
1884 tcphalt(tpriv, &tcb->rtt_timer);
1885 }
1886
1887 static void
update(Conv * s,Tcp * seg)1888 update(Conv *s, Tcp *seg)
1889 {
1890 int rtt, delta;
1891 Tcpctl *tcb;
1892 ulong acked;
1893 Tcppriv *tpriv;
1894
1895 if(seg->update)
1896 return;
1897 seg->update = 1;
1898
1899 tpriv = s->p->priv;
1900 tcb = (Tcpctl*)s->ptcl;
1901
1902 /* catch zero-window updates, update window & recover */
1903 if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
1904 seq_lt(seg->ack, tcb->snd.ptr)){
1905 netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1906 seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
1907 tcb->snd.wnd = seg->wnd;
1908 goto recovery;
1909 }
1910
1911 /* newreno fast retransmit */
1912 if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
1913 ++tcb->snd.dupacks == 3){ /* was TCPREXMTTHRESH */
1914 recovery:
1915 if(tcb->snd.recovery){
1916 tpriv->stats[RecoveryCwind]++;
1917 tcb->cwind += tcb->mss;
1918 }else if(seq_le(tcb->snd.rxt, seg->ack)){
1919 tpriv->stats[Recovery]++;
1920 tcb->abcbytes = 0;
1921 tcb->snd.recovery = 1;
1922 tcb->snd.partialack = 0;
1923 tcb->snd.rxt = tcb->snd.nxt;
1924 tcpcongestion(tcb);
1925 tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1926 netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1927 tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1928 tcprxmit(s);
1929 }else{
1930 tpriv->stats[RecoveryNoSeq]++;
1931 netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1932 tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1933 /* don't enter fast retransmit, don't change ssthresh */
1934 }
1935 }else if(tcb->snd.recovery){
1936 tpriv->stats[RecoveryCwind]++;
1937 tcb->cwind += tcb->mss;
1938 }
1939
1940 /*
1941 * update window
1942 */
1943 if(seq_gt(seg->ack, tcb->snd.wl2)
1944 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1945 /* clear dupack if we advance wl2 */
1946 if(tcb->snd.wl2 != seg->ack)
1947 tcb->snd.dupacks = 0;
1948 tcb->snd.wnd = seg->wnd;
1949 tcb->snd.wl2 = seg->ack;
1950 }
1951
1952 if(!seq_gt(seg->ack, tcb->snd.una)){
1953 /*
1954 * don't let us hangup if sending into a closed window and
1955 * we're still getting acks
1956 */
1957 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1958 tcb->backedoff = MAXBACKMS/4;
1959 return;
1960 }
1961
1962 /* Compute the new send window size */
1963 acked = seg->ack - tcb->snd.una;
1964
1965 /* avoid slow start and timers for SYN acks */
1966 if((tcb->flags & SYNACK) == 0) {
1967 tcb->flags |= SYNACK;
1968 acked--;
1969 tcb->flgcnt--;
1970 goto done;
1971 }
1972
1973 /*
1974 * congestion control
1975 */
1976 if(tcb->snd.recovery){
1977 if(seq_ge(seg->ack, tcb->snd.rxt)){
1978 /* recovery finished; deflate window */
1979 tpriv->stats[RecoveryDone]++;
1980 tcb->snd.dupacks = 0;
1981 tcb->snd.recovery = 0;
1982 tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
1983 if(tcb->ssthresh < tcb->cwind)
1984 tcb->cwind = tcb->ssthresh;
1985 netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
1986 tcb->cwind, tcb->ssthresh);
1987 } else {
1988 /* partial ack; we lost more than one segment */
1989 tpriv->stats[RecoveryPA]++;
1990 if(tcb->cwind > acked)
1991 tcb->cwind -= acked;
1992 else{
1993 netlog(s->p->f, Logtcpwin, "partial ack neg\n");
1994 tcb->cwind = tcb->mss;
1995 }
1996 netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
1997 acked, tcb->snd.rxt - seg->ack, tcb->cwind);
1998
1999 if(acked >= tcb->mss)
2000 tcb->cwind += tcb->mss;
2001 tcb->snd.partialack++;
2002 }
2003 } else
2004 tcpabcincr(tcb, acked);
2005
2006 /* Adjust the timers according to the round trip time */
2007 /* TODO: fix sloppy treatment of overflow cases here. */
2008 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2009 tcphalt(tpriv, &tcb->rtt_timer);
2010 if((tcb->flags&RETRAN) == 0) {
2011 tcb->backoff = 0;
2012 tcb->backedoff = 0;
2013 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2014 if(rtt == 0)
2015 rtt = 1; /* else all close sys's will rexmit in 0 time */
2016 rtt *= MSPTICK;
2017 if(tcb->srtt == 0) {
2018 tcb->srtt = rtt << LOGAGAIN;
2019 tcb->mdev = rtt << LOGDGAIN;
2020 } else {
2021 delta = rtt - (tcb->srtt>>LOGAGAIN);
2022 tcb->srtt += delta;
2023 if(tcb->srtt <= 0)
2024 tcb->srtt = 1;
2025
2026 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2027 tcb->mdev += delta;
2028 if(tcb->mdev <= 0)
2029 tcb->mdev = 1;
2030 }
2031 tcpsettimer(tcb);
2032 }
2033 }
2034
2035 done:
2036 if(qdiscard(s->wq, acked) < acked)
2037 tcb->flgcnt--;
2038 tcb->snd.una = seg->ack;
2039
2040 /* newreno fast recovery */
2041 if(tcb->snd.recovery)
2042 tcprxmit(s);
2043
2044 if(seq_gt(seg->ack, tcb->snd.urg))
2045 tcb->snd.urg = seg->ack;
2046
2047 if(tcb->snd.una != tcb->snd.nxt){
2048 /* `impatient' variant */
2049 if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2050 tcb->time = NOW;
2051 tcb->timeuna = tcb->snd.una;
2052 tcpgo(tpriv, &tcb->timer);
2053 }
2054 } else
2055 tcphalt(tpriv, &tcb->timer);
2056
2057 if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2058 tcb->snd.ptr = tcb->snd.una;
2059
2060 if(!tcb->snd.recovery)
2061 tcb->flags &= ~RETRAN;
2062 tcb->backoff = 0;
2063 tcb->backedoff = 0;
2064 }
2065
2066 static void
tcpiput(Proto * tcp,Ipifc *,Block * bp)2067 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2068 {
2069 Tcp seg;
2070 Tcp4hdr *h4;
2071 Tcp6hdr *h6;
2072 int hdrlen;
2073 Tcpctl *tcb;
2074 ushort length, csum;
2075 uchar source[IPaddrlen], dest[IPaddrlen];
2076 Conv *s;
2077 Fs *f;
2078 Tcppriv *tpriv;
2079 uchar version;
2080
2081 f = tcp->f;
2082 tpriv = tcp->priv;
2083
2084 tpriv->stats[InSegs]++;
2085
2086 h4 = (Tcp4hdr*)(bp->rp);
2087 h6 = (Tcp6hdr*)(bp->rp);
2088
2089 if((h4->vihl&0xF0)==IP_VER4) {
2090 version = V4;
2091 length = nhgets(h4->length);
2092 v4tov6(dest, h4->tcpdst);
2093 v4tov6(source, h4->tcpsrc);
2094
2095 h4->Unused = 0;
2096 hnputs(h4->tcplen, length-TCP4_PKT);
2097 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2098 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2099 tpriv->stats[CsumErrs]++;
2100 tpriv->stats[InErrs]++;
2101 netlog(f, Logtcp, "bad tcp proto cksum\n");
2102 freeblist(bp);
2103 return;
2104 }
2105
2106 hdrlen = ntohtcp4(&seg, &bp);
2107 if(hdrlen < 0){
2108 tpriv->stats[HlenErrs]++;
2109 tpriv->stats[InErrs]++;
2110 netlog(f, Logtcp, "bad tcp hdr len\n");
2111 return;
2112 }
2113
2114 /* trim the packet to the size claimed by the datagram */
2115 length -= hdrlen+TCP4_PKT;
2116 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2117 if(bp == nil){
2118 tpriv->stats[LenErrs]++;
2119 tpriv->stats[InErrs]++;
2120 netlog(f, Logtcp, "tcp len < 0 after trim\n");
2121 return;
2122 }
2123 }
2124 else {
2125 int ttl = h6->ttl;
2126 int proto = h6->proto;
2127
2128 version = V6;
2129 length = nhgets(h6->ploadlen);
2130 ipmove(dest, h6->tcpdst);
2131 ipmove(source, h6->tcpsrc);
2132
2133 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2134 h6->ttl = proto;
2135 hnputl(h6->vcf, length);
2136 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2137 (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2138 tpriv->stats[CsumErrs]++;
2139 tpriv->stats[InErrs]++;
2140 netlog(f, Logtcp,
2141 "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2142 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2143 freeblist(bp);
2144 return;
2145 }
2146 h6->ttl = ttl;
2147 h6->proto = proto;
2148 hnputs(h6->ploadlen, length);
2149
2150 hdrlen = ntohtcp6(&seg, &bp);
2151 if(hdrlen < 0){
2152 tpriv->stats[HlenErrs]++;
2153 tpriv->stats[InErrs]++;
2154 netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2155 return;
2156 }
2157
2158 /* trim the packet to the size claimed by the datagram */
2159 length -= hdrlen;
2160 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2161 if(bp == nil){
2162 tpriv->stats[LenErrs]++;
2163 tpriv->stats[InErrs]++;
2164 netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2165 return;
2166 }
2167 }
2168
2169 /* lock protocol while searching for a conversation */
2170 qlock(tcp);
2171
2172 /* Look for a matching conversation */
2173 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2174 if(s == nil){
2175 netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2176 source, seg.source, dest, seg.dest);
2177 reset:
2178 qunlock(tcp);
2179 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2180 freeblist(bp);
2181 return;
2182 }
2183
2184 /* if it's a listener, look for the right flags and get a new conv */
2185 tcb = (Tcpctl*)s->ptcl;
2186 if(tcb->state == Listen){
2187 if(seg.flags & RST){
2188 limborst(s, &seg, source, dest, version);
2189 qunlock(tcp);
2190 freeblist(bp);
2191 return;
2192 }
2193
2194 /* if this is a new SYN, put the call into limbo */
2195 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2196 limbo(s, source, dest, &seg, version);
2197 qunlock(tcp);
2198 freeblist(bp);
2199 return;
2200 }
2201
2202 /*
2203 * if there's a matching call in limbo, tcpincoming will
2204 * return it in state Syn_received
2205 */
2206 s = tcpincoming(s, &seg, source, dest, version);
2207 if(s == nil)
2208 goto reset;
2209 }
2210
2211 /* The rest of the input state machine is run with the control block
2212 * locked and implements the state machine directly out of the RFC.
2213 * Out-of-band data is ignored - it was always a bad idea.
2214 */
2215 tcb = (Tcpctl*)s->ptcl;
2216 if(waserror()){
2217 qunlock(s);
2218 nexterror();
2219 }
2220 qlock(s);
2221 qunlock(tcp);
2222
2223 /* fix up window */
2224 seg.wnd <<= tcb->rcv.scale;
2225
2226 /* every input packet in puts off the keep alive time out */
2227 tcpsetkacounter(tcb);
2228
2229 switch(tcb->state) {
2230 case Closed:
2231 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2232 goto raise;
2233 case Syn_sent:
2234 if(seg.flags & ACK) {
2235 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2236 sndrst(tcp, source, dest, length, &seg, version,
2237 "bad seq in Syn_sent");
2238 goto raise;
2239 }
2240 }
2241 if(seg.flags & RST) {
2242 if(seg.flags & ACK)
2243 localclose(s, Econrefused);
2244 goto raise;
2245 }
2246
2247 if(seg.flags & SYN) {
2248 procsyn(s, &seg);
2249 if(seg.flags & ACK){
2250 update(s, &seg);
2251 tcpsynackrtt(s);
2252 tcpsetstate(s, Established);
2253 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2254 }
2255 else {
2256 tcb->time = NOW;
2257 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */
2258 }
2259
2260 if(length != 0 || (seg.flags & FIN))
2261 break;
2262
2263 freeblist(bp);
2264 goto output;
2265 }
2266 else
2267 freeblist(bp);
2268
2269 qunlock(s);
2270 poperror();
2271 return;
2272 case Syn_received:
2273 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2274 if(seg.flags & ACK)
2275 tcpsynackrtt(s);
2276 break;
2277 }
2278
2279 /*
2280 * One DOS attack is to open connections to us and then forget about them,
2281 * thereby tying up a conv at no long term cost to the attacker.
2282 * This is an attempt to defeat these stateless DOS attacks. See
2283 * corresponding code in tcpsendka().
2284 */
2285 if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2286 if(tcpporthogdefense
2287 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2288 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2289 source, seg.source, dest, seg.dest, seg.flags,
2290 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2291 localclose(s, "stateless hog");
2292 }
2293 }
2294
2295 /* Cut the data to fit the receive window */
2296 tcprcvwin(s);
2297 if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2298 if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2299 netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
2300 "%lud-%lud l %d from %I\n", seg.seq,
2301 seg.seq + length - 1, tcb->rcv.nxt,
2302 tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
2303 update(s, &seg);
2304 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2305 tcphalt(tpriv, &tcb->rtt_timer);
2306 tcphalt(tpriv, &tcb->acktimer);
2307 tcphalt(tpriv, &tcb->katimer);
2308 tcpsetstate(s, Time_wait);
2309 tcb->timer.start = MSL2*(1000 / MSPTICK);
2310 tcpgo(tpriv, &tcb->timer);
2311 }
2312 if(!(seg.flags & RST)) {
2313 tcb->flags |= FORCE;
2314 goto output;
2315 }
2316 qunlock(s);
2317 poperror();
2318 return;
2319 }
2320
2321 /* Cannot accept so answer with a rst */
2322 if(length && tcb->state == Closed) {
2323 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2324 goto raise;
2325 }
2326
2327 /* The segment is beyond the current receive pointer so
2328 * queue the data in the resequence queue
2329 */
2330 if(seg.seq != tcb->rcv.nxt)
2331 if(length != 0 || (seg.flags & (SYN|FIN))) {
2332 update(s, &seg);
2333 if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2334 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
2335 s->laddr, s->lport);
2336 tcb->flags |= FORCE; /* force duplicate ack; RFC 5681 §3.2 */
2337 goto output;
2338 }
2339
2340 if(tcb->nreseq > 0)
2341 tcb->flags |= FORCE; /* filled hole in seq. space; RFC 5681 §3.2 */
2342
2343 /*
2344 * keep looping till we've processed this packet plus any
2345 * adjacent packets in the resequence queue
2346 */
2347 for(;;) {
2348 if(seg.flags & RST) {
2349 if(tcb->state == Established) {
2350 tpriv->stats[EstabResets]++;
2351 if(tcb->rcv.nxt != seg.seq)
2352 netlog(f, Logtcp, "out of order RST "
2353 "rcvd: %I.%d -> %I.%d, rcv.nxt "
2354 "%lux seq %lux\n",
2355 s->raddr, s->rport, s->laddr,
2356 s->lport, tcb->rcv.nxt, seg.seq);
2357 }
2358 localclose(s, Econrefused);
2359 goto raise;
2360 }
2361
2362 if((seg.flags&ACK) == 0)
2363 goto raise;
2364
2365 switch(tcb->state) {
2366 case Syn_received:
2367 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2368 sndrst(tcp, source, dest, length, &seg, version,
2369 "bad seq in Syn_received");
2370 goto raise;
2371 }
2372 update(s, &seg);
2373 tcpsetstate(s, Established);
2374 case Established:
2375 case Close_wait:
2376 update(s, &seg);
2377 break;
2378 case Finwait1:
2379 update(s, &seg);
2380 if(qlen(s->wq)+tcb->flgcnt == 0){
2381 tcphalt(tpriv, &tcb->rtt_timer);
2382 tcphalt(tpriv, &tcb->acktimer);
2383 tcpsetkacounter(tcb);
2384 tcb->time = NOW;
2385 tcpsetstate(s, Finwait2);
2386 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2387 tcpgo(tpriv, &tcb->katimer);
2388 }
2389 break;
2390 case Finwait2:
2391 update(s, &seg);
2392 break;
2393 case Closing:
2394 update(s, &seg);
2395 if(qlen(s->wq)+tcb->flgcnt == 0) {
2396 tcphalt(tpriv, &tcb->rtt_timer);
2397 tcphalt(tpriv, &tcb->acktimer);
2398 tcphalt(tpriv, &tcb->katimer);
2399 tcpsetstate(s, Time_wait);
2400 tcb->timer.start = MSL2*(1000 / MSPTICK);
2401 tcpgo(tpriv, &tcb->timer);
2402 }
2403 break;
2404 case Last_ack:
2405 update(s, &seg);
2406 if(qlen(s->wq)+tcb->flgcnt == 0) {
2407 localclose(s, nil);
2408 goto raise;
2409 }
2410 case Time_wait:
2411 tcb->flags |= FORCE;
2412 if(tcb->timer.state != TcptimerON)
2413 tcpgo(tpriv, &tcb->timer);
2414 }
2415
2416 if((seg.flags&URG) && seg.urg) {
2417 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2418 tcb->rcv.urg = seg.urg + seg.seq;
2419 pullblock(&bp, seg.urg);
2420 }
2421 }
2422 else
2423 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2424 tcb->rcv.urg = tcb->rcv.nxt;
2425
2426 if(length == 0) {
2427 if(bp != nil)
2428 freeblist(bp);
2429 }
2430 else {
2431 switch(tcb->state){
2432 default:
2433 /* Ignore segment text */
2434 if(bp != nil)
2435 freeblist(bp);
2436 break;
2437
2438 case Syn_received:
2439 case Established:
2440 case Finwait1:
2441 /* If we still have some data place on
2442 * receive queue
2443 */
2444 if(bp) {
2445 bp = packblock(bp);
2446 if(bp == nil)
2447 panic("tcp packblock");
2448 qpassnolim(s->rq, bp);
2449 bp = nil;
2450 }
2451 tcb->rcv.nxt += length;
2452
2453 /*
2454 * turn on the acktimer if there's something
2455 * to ack
2456 */
2457 if(tcb->acktimer.state != TcptimerON)
2458 tcpgo(tpriv, &tcb->acktimer);
2459
2460 break;
2461 case Finwait2:
2462 /* no process to read the data, send a reset */
2463 if(bp != nil)
2464 freeblist(bp);
2465 sndrst(tcp, source, dest, length, &seg, version,
2466 "send to Finwait2");
2467 qunlock(s);
2468 poperror();
2469 return;
2470 }
2471 }
2472
2473 if(seg.flags & FIN) {
2474 tcb->flags |= FORCE;
2475
2476 switch(tcb->state) {
2477 case Syn_received:
2478 case Established:
2479 tcb->rcv.nxt++;
2480 tcpsetstate(s, Close_wait);
2481 break;
2482 case Finwait1:
2483 tcb->rcv.nxt++;
2484 if(qlen(s->wq)+tcb->flgcnt == 0) {
2485 tcphalt(tpriv, &tcb->rtt_timer);
2486 tcphalt(tpriv, &tcb->acktimer);
2487 tcphalt(tpriv, &tcb->katimer);
2488 tcpsetstate(s, Time_wait);
2489 tcb->timer.start = MSL2*(1000/MSPTICK);
2490 tcpgo(tpriv, &tcb->timer);
2491 }
2492 else
2493 tcpsetstate(s, Closing);
2494 break;
2495 case Finwait2:
2496 tcb->rcv.nxt++;
2497 tcphalt(tpriv, &tcb->rtt_timer);
2498 tcphalt(tpriv, &tcb->acktimer);
2499 tcphalt(tpriv, &tcb->katimer);
2500 tcpsetstate(s, Time_wait);
2501 tcb->timer.start = MSL2 * (1000/MSPTICK);
2502 tcpgo(tpriv, &tcb->timer);
2503 break;
2504 case Close_wait:
2505 case Closing:
2506 case Last_ack:
2507 break;
2508 case Time_wait:
2509 tcpgo(tpriv, &tcb->timer);
2510 break;
2511 }
2512 }
2513
2514 /*
2515 * get next adjacent segment from the resequence queue.
2516 * dump/trim any overlapping segments
2517 */
2518 for(;;) {
2519 if(tcb->reseq == nil)
2520 goto output;
2521
2522 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2523 goto output;
2524
2525 getreseq(tcb, &seg, &bp, &length);
2526
2527 tcprcvwin(s);
2528 if(tcptrim(tcb, &seg, &bp, &length) == 0){
2529 tcb->flags |= FORCE;
2530 break;
2531 }
2532 }
2533 }
2534 output:
2535 tcpoutput(s);
2536 qunlock(s);
2537 poperror();
2538 return;
2539 raise:
2540 qunlock(s);
2541 poperror();
2542 freeblist(bp);
2543 tcpkick(s);
2544 }
2545
2546 /*
2547 * always enters and exits with the s locked. We drop
2548 * the lock to ipoput the packet so some care has to be
2549 * taken by callers.
2550 */
2551 static void
tcpoutput(Conv * s)2552 tcpoutput(Conv *s)
2553 {
2554 Tcp seg;
2555 uint msgs;
2556 Tcpctl *tcb;
2557 Block *hbp, *bp;
2558 int sndcnt;
2559 ulong ssize, dsize, sent;
2560 Fs *f;
2561 Tcppriv *tpriv;
2562 uchar version;
2563
2564 f = s->p->f;
2565 tpriv = s->p->priv;
2566 version = s->ipversion;
2567
2568 tcb = (Tcpctl*)s->ptcl;
2569
2570 /* force ack every 2*mss */
2571 if((tcb->flags & FORCE) == 0 &&
2572 tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2573 tpriv->stats[Delayack]++;
2574 tcb->flags |= FORCE;
2575 }
2576
2577 /* force ack if window opening */
2578 if((tcb->flags & FORCE) == 0){
2579 tcprcvwin(s);
2580 if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2581 tpriv->stats[Wopenack]++;
2582 tcb->flags |= FORCE;
2583 }
2584 }
2585
2586 for(msgs = 0; msgs < 100; msgs++) {
2587 switch(tcb->state) {
2588 case Listen:
2589 case Closed:
2590 case Finwait2:
2591 return;
2592 }
2593
2594 /* Don't send anything else until our SYN has been acked */
2595 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2596 break;
2597
2598 /* force an ack when a window has opened up */
2599 tcprcvwin(s);
2600 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2601 tcb->rcv.blocked = 0;
2602 tcb->flags |= FORCE;
2603 }
2604
2605 sndcnt = qlen(s->wq)+tcb->flgcnt;
2606 sent = tcb->snd.ptr - tcb->snd.una;
2607 ssize = sndcnt;
2608 if(tcb->snd.wnd == 0){
2609 /* zero window probe */
2610 if(sent > 0 && !(tcb->flags & FORCE))
2611 break; /* already probing, rto re-probes */
2612 if(ssize < sent)
2613 ssize = 0;
2614 else{
2615 ssize -= sent;
2616 if(ssize > 0)
2617 ssize = 1;
2618 }
2619 } else {
2620 /* calculate usable segment size */
2621 if(ssize > tcb->cwind)
2622 ssize = tcb->cwind;
2623 if(ssize > tcb->snd.wnd)
2624 ssize = tcb->snd.wnd;
2625
2626 if(ssize < sent)
2627 ssize = 0;
2628 else {
2629 ssize -= sent;
2630 if(ssize > tcb->mss)
2631 ssize = tcb->mss;
2632 }
2633 }
2634
2635 dsize = ssize;
2636 seg.urg = 0;
2637
2638 if(!(tcb->flags & FORCE))
2639 if(ssize == 0 ||
2640 ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
2641 sent > TCPREXMTTHRESH * tcb->mss)
2642 break;
2643
2644 tcb->flags &= ~FORCE;
2645
2646 /* By default we will generate an ack */
2647 tcphalt(tpriv, &tcb->acktimer);
2648 seg.source = s->lport;
2649 seg.dest = s->rport;
2650 seg.flags = ACK;
2651 seg.mss = 0;
2652 seg.ws = 0;
2653 seg.update = 0;
2654 switch(tcb->state){
2655 case Syn_sent:
2656 seg.flags = 0;
2657 if(tcb->snd.ptr == tcb->iss){
2658 seg.flags |= SYN;
2659 dsize--;
2660 seg.mss = tcb->mss;
2661 seg.ws = tcb->scale;
2662 }
2663 break;
2664 case Syn_received:
2665 /*
2666 * don't send any data with a SYN/ACK packet
2667 * because Linux rejects the packet in its
2668 * attempt to solve the SYN attack problem
2669 */
2670 if(tcb->snd.ptr == tcb->iss){
2671 seg.flags |= SYN;
2672 dsize = 0;
2673 ssize = 1;
2674 seg.mss = tcb->mss;
2675 seg.ws = tcb->scale;
2676 }
2677 break;
2678 }
2679 seg.seq = tcb->snd.ptr;
2680 seg.ack = tcb->rcv.nxt;
2681 seg.wnd = tcb->rcv.wnd;
2682
2683 /* Pull out data to send */
2684 bp = nil;
2685 if(dsize != 0) {
2686 bp = qcopy(s->wq, dsize, sent);
2687 if(BLEN(bp) != dsize) {
2688 seg.flags |= FIN;
2689 dsize--;
2690 }
2691 }
2692
2693 if(sent+dsize == sndcnt && dsize)
2694 seg.flags |= PSH;
2695
2696 tcb->snd.ptr += ssize;
2697
2698 /* Pull up the send pointer so we can accept acks
2699 * for this window
2700 */
2701 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2702 tcb->snd.nxt = tcb->snd.ptr;
2703
2704 /* Build header, link data and compute cksum */
2705 switch(version){
2706 case V4:
2707 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2708 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2709 if(hbp == nil) {
2710 freeblist(bp);
2711 return;
2712 }
2713 break;
2714 case V6:
2715 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2716 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2717 if(hbp == nil) {
2718 freeblist(bp);
2719 return;
2720 }
2721 break;
2722 default:
2723 hbp = nil; /* to suppress a warning */
2724 panic("tcpoutput: version %d", version);
2725 }
2726
2727 /* Start the transmission timers if there is new data and we
2728 * expect acknowledges
2729 */
2730 if(ssize != 0){
2731 if(tcb->timer.state != TcptimerON){
2732 tcb->time = NOW;
2733 tcb->timeuna = tcb->snd.una;
2734 tcpgo(tpriv, &tcb->timer);
2735 }
2736
2737 /* If round trip timer isn't running, start it.
2738 * measure the longest packet only in case the
2739 * transmission time dominates RTT
2740 */
2741 if(tcb->snd.retransmit == 0)
2742 if(tcb->rtt_timer.state != TcptimerON)
2743 if(ssize == tcb->mss) {
2744 tcpgo(tpriv, &tcb->rtt_timer);
2745 tcb->rttseq = tcb->snd.ptr;
2746 }
2747 }
2748
2749 tpriv->stats[OutSegs]++;
2750 if(tcb->snd.retransmit)
2751 tpriv->stats[RetransSegsSent]++;
2752 tcb->rcv.ackptr = seg.ack;
2753 tcb->rcv.wsnt = tcb->rcv.wptr;
2754
2755 /* put off the next keep alive */
2756 tcpgo(tpriv, &tcb->katimer);
2757
2758 switch(version){
2759 case V4:
2760 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2761 /* a negative return means no route */
2762 localclose(s, "no route");
2763 }
2764 break;
2765 case V6:
2766 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2767 /* a negative return means no route */
2768 localclose(s, "no route");
2769 }
2770 break;
2771 default:
2772 panic("tcpoutput2: version %d", version);
2773 }
2774 if((msgs%4) == 3){
2775 qunlock(s);
2776 qlock(s);
2777 }
2778 }
2779 }
2780
2781 /*
2782 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2783 */
2784 static void
tcpsendka(Conv * s)2785 tcpsendka(Conv *s)
2786 {
2787 Tcp seg;
2788 Tcpctl *tcb;
2789 Block *hbp,*dbp;
2790
2791 tcb = (Tcpctl*)s->ptcl;
2792
2793 dbp = nil;
2794 memset(&seg, 0, sizeof seg);
2795 seg.urg = 0;
2796 seg.source = s->lport;
2797 seg.dest = s->rport;
2798 seg.flags = ACK|PSH;
2799 seg.mss = 0;
2800 seg.ws = 0;
2801 if(tcpporthogdefense)
2802 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2803 else
2804 seg.seq = tcb->snd.una-1;
2805 seg.ack = tcb->rcv.nxt;
2806 tcb->rcv.ackptr = seg.ack;
2807 tcprcvwin(s);
2808 seg.wnd = tcb->rcv.wnd;
2809 if(tcb->state == Finwait2){
2810 seg.flags |= FIN;
2811 } else {
2812 dbp = allocb(1);
2813 dbp->wp++;
2814 }
2815
2816 if(isv4(s->raddr)) {
2817 /* Build header, link data and compute cksum */
2818 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2819 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2820 if(hbp == nil) {
2821 freeblist(dbp);
2822 return;
2823 }
2824 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2825 }
2826 else {
2827 /* Build header, link data and compute cksum */
2828 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2829 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2830 if(hbp == nil) {
2831 freeblist(dbp);
2832 return;
2833 }
2834 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2835 }
2836 }
2837
2838 /*
2839 * set connection to time out after 12 minutes
2840 */
2841 static void
tcpsetkacounter(Tcpctl * tcb)2842 tcpsetkacounter(Tcpctl *tcb)
2843 {
2844 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2845 if(tcb->kacounter < 3)
2846 tcb->kacounter = 3;
2847 }
2848
2849 /*
2850 * if we've timed out, close the connection
2851 * otherwise, send a keepalive and restart the timer
2852 */
2853 static void
tcpkeepalive(void * v)2854 tcpkeepalive(void *v)
2855 {
2856 Tcpctl *tcb;
2857 Conv *s;
2858
2859 s = v;
2860 tcb = (Tcpctl*)s->ptcl;
2861 if(waserror()){
2862 qunlock(s);
2863 nexterror();
2864 }
2865 qlock(s);
2866 if(tcb->state != Closed){
2867 if(--(tcb->kacounter) <= 0) {
2868 localclose(s, Etimedout);
2869 } else {
2870 tcpsendka(s);
2871 tcpgo(s->p->priv, &tcb->katimer);
2872 }
2873 }
2874 qunlock(s);
2875 poperror();
2876 }
2877
2878 /*
2879 * start keepalive timer
2880 */
2881 static char*
tcpstartka(Conv * s,char ** f,int n)2882 tcpstartka(Conv *s, char **f, int n)
2883 {
2884 Tcpctl *tcb;
2885 int x;
2886
2887 tcb = (Tcpctl*)s->ptcl;
2888 if(tcb->state != Established)
2889 return "connection must be in Establised state";
2890 if(n > 1){
2891 x = atoi(f[1]);
2892 if(x >= MSPTICK)
2893 tcb->katimer.start = x/MSPTICK;
2894 }
2895 tcpsetkacounter(tcb);
2896 tcpgo(s->p->priv, &tcb->katimer);
2897
2898 return nil;
2899 }
2900
2901 /*
2902 * turn checksums on/off
2903 */
2904 static char*
tcpsetchecksum(Conv * s,char ** f,int)2905 tcpsetchecksum(Conv *s, char **f, int)
2906 {
2907 Tcpctl *tcb;
2908
2909 tcb = (Tcpctl*)s->ptcl;
2910 tcb->nochecksum = !atoi(f[1]);
2911
2912 return nil;
2913 }
2914
2915 /*
2916 * retransmit (at most) one segment at snd.una.
2917 * preserve cwind & snd.ptr
2918 */
2919 static void
tcprxmit(Conv * s)2920 tcprxmit(Conv *s)
2921 {
2922 Tcpctl *tcb;
2923 Tcppriv *tpriv;
2924 ulong tcwind, tptr;
2925
2926 tcb = (Tcpctl*)s->ptcl;
2927 tcb->flags |= RETRAN|FORCE;
2928
2929 tptr = tcb->snd.ptr;
2930 tcwind = tcb->cwind;
2931 tcb->snd.ptr = tcb->snd.una;
2932 tcb->cwind = tcb->mss;
2933 tcb->snd.retransmit = 1;
2934 tcpoutput(s);
2935 tcb->snd.retransmit = 0;
2936 tcb->cwind = tcwind;
2937 tcb->snd.ptr = tptr;
2938
2939 tpriv = s->p->priv;
2940 tpriv->stats[RetransSegs]++;
2941 }
2942
2943 /*
2944 * TODO: RFC 4138 F-RTO
2945 */
2946 static void
tcptimeout(void * arg)2947 tcptimeout(void *arg)
2948 {
2949 Conv *s;
2950 Tcpctl *tcb;
2951 int maxback;
2952 Tcppriv *tpriv;
2953
2954 s = (Conv*)arg;
2955 tpriv = s->p->priv;
2956 tcb = (Tcpctl*)s->ptcl;
2957
2958 if(waserror()){
2959 qunlock(s);
2960 nexterror();
2961 }
2962 qlock(s);
2963 switch(tcb->state){
2964 default:
2965 tcb->backoff++;
2966 if(tcb->state == Syn_sent)
2967 maxback = MAXBACKMS/2;
2968 else
2969 maxback = MAXBACKMS;
2970 tcb->backedoff += tcb->timer.start * MSPTICK;
2971 if(tcb->backedoff >= maxback) {
2972 localclose(s, Etimedout);
2973 break;
2974 }
2975 netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2976 tcb->srtt, tcb->mdev, NOW - tcb->time,
2977 tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2978 tcpstates[s->state]);
2979 tcpsettimer(tcb);
2980 if(tcb->snd.rto == 0)
2981 tcpcongestion(tcb);
2982 tcprxmit(s);
2983 tcb->snd.ptr = tcb->snd.una;
2984 tcb->cwind = tcb->mss;
2985 tcb->snd.rto = 1;
2986 tpriv->stats[RetransTimeouts]++;
2987
2988 if(tcb->snd.recovery){
2989 tcb->snd.dupacks = 0; /* reno rto */
2990 tcb->snd.recovery = 0;
2991 tpriv->stats[RecoveryRTO]++;
2992 tcb->snd.rxt = tcb->snd.nxt;
2993 netlog(s->p->f, Logtcpwin,
2994 "rto recovery rxt @%lud\n", tcb->snd.nxt);
2995 }
2996
2997 tcb->abcbytes = 0;
2998 break;
2999 case Time_wait:
3000 localclose(s, nil);
3001 break;
3002 case Closed:
3003 break;
3004 }
3005 qunlock(s);
3006 poperror();
3007 }
3008
3009 static int
inwindow(Tcpctl * tcb,int seq)3010 inwindow(Tcpctl *tcb, int seq)
3011 {
3012 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3013 }
3014
3015 /*
3016 * set up state for a received SYN (or SYN ACK) packet
3017 */
3018 static void
procsyn(Conv * s,Tcp * seg)3019 procsyn(Conv *s, Tcp *seg)
3020 {
3021 Tcpctl *tcb;
3022 Tcppriv *tpriv;
3023
3024 tcb = (Tcpctl*)s->ptcl;
3025 tcb->flags |= FORCE;
3026
3027 tcb->rcv.nxt = seg->seq + 1;
3028 tcb->rcv.wptr = tcb->rcv.nxt;
3029 tcb->rcv.wsnt = 0;
3030 tcb->rcv.urg = tcb->rcv.nxt;
3031 tcb->irs = seg->seq;
3032
3033 /* our sending max segment size cannot be bigger than what he asked for */
3034 if(seg->mss != 0 && seg->mss < tcb->mss) {
3035 tcb->mss = seg->mss;
3036 tpriv = s->p->priv;
3037 tpriv->stats[Mss] = tcb->mss;
3038 }
3039
3040 tcb->snd.wnd = seg->wnd;
3041 initialwindow(tcb);
3042 }
3043
3044 static int
dumpreseq(Tcpctl * tcb)3045 dumpreseq(Tcpctl *tcb)
3046 {
3047 Reseq *r, *next;
3048
3049 for(r = tcb->reseq; r != nil; r = next){
3050 next = r->next;
3051 freeblist(r->bp);
3052 free(r);
3053 }
3054 tcb->reseq = nil;
3055 tcb->nreseq = 0;
3056 tcb->reseqlen = 0;
3057 return -1;
3058 }
3059
3060 static void
logreseq(Fs * f,Reseq * r,ulong n)3061 logreseq(Fs *f, Reseq *r, ulong n)
3062 {
3063 char *s;
3064
3065 for(; r != nil; r = r->next){
3066 s = nil;
3067 if(r->next == nil && r->seg.seq != n)
3068 s = "hole/end";
3069 else if(r->next == nil)
3070 s = "end";
3071 else if(r->seg.seq != n)
3072 s = "hole";
3073 if(s != nil)
3074 netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3075 n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3076 n = r->seg.seq + r->seg.len;
3077 }
3078 }
3079
3080 static int
addreseq(Fs * f,Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)3081 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3082 {
3083 Reseq *rp, **rr;
3084 int qmax;
3085
3086 rp = malloc(sizeof *rp);
3087 if(rp == nil){
3088 freeblist(bp); /* bp always consumed by addreseq */
3089 return 0;
3090 }
3091
3092 rp->seg = *seg;
3093 rp->bp = bp;
3094 rp->length = length;
3095
3096 tcb->reseqlen += length;
3097 tcb->nreseq++;
3098
3099 /* Place on reassembly list sorting by starting seq number */
3100 for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3101 if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3102 rp->next = *rr;
3103 *rr = rp;
3104 tpriv->stats[Resequenced]++;
3105 if(rp->next != nil)
3106 tpriv->stats[OutOfOrder]++;
3107 break;
3108 }
3109
3110 qmax = tcb->window;
3111 if(tcb->reseqlen > qmax){
3112 netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3113 tcb->reseqlen, qmax, tcb->nreseq);
3114 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3115 tpriv->stats[ReseqBytelim]++;
3116 return dumpreseq(tcb);
3117 }
3118 qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3119 if(tcb->nreseq > qmax){
3120 netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3121 tcb->nreseq, qmax, tcb->reseqlen);
3122 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3123 tpriv->stats[ReseqPktlim]++;
3124 return dumpreseq(tcb);
3125 }
3126 return 0;
3127 }
3128
3129 static void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3130 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3131 {
3132 Reseq *rp;
3133
3134 rp = tcb->reseq;
3135 if(rp == nil)
3136 return;
3137
3138 tcb->reseq = rp->next;
3139
3140 *seg = rp->seg;
3141 *bp = rp->bp;
3142 *length = rp->length;
3143
3144 tcb->nreseq--;
3145 tcb->reseqlen -= rp->length;
3146
3147 free(rp);
3148 }
3149
3150 static int
tcptrim(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3151 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3152 {
3153 ushort len;
3154 uchar accept;
3155 int dupcnt, excess;
3156
3157 accept = 0;
3158 len = *length;
3159 if(seg->flags & SYN)
3160 len++;
3161 if(seg->flags & FIN)
3162 len++;
3163
3164 if(tcb->rcv.wnd == 0) {
3165 if(len == 0 && seg->seq == tcb->rcv.nxt)
3166 return 0;
3167 }
3168 else {
3169 /* Some part of the segment should be in the window */
3170 if(inwindow(tcb,seg->seq))
3171 accept++;
3172 else
3173 if(len != 0) {
3174 if(inwindow(tcb, seg->seq+len-1) ||
3175 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3176 accept++;
3177 }
3178 }
3179 if(!accept) {
3180 freeblist(*bp);
3181 return -1;
3182 }
3183 dupcnt = tcb->rcv.nxt - seg->seq;
3184 if(dupcnt > 0){
3185 tcb->rerecv += dupcnt;
3186 if(seg->flags & SYN){
3187 seg->flags &= ~SYN;
3188 seg->seq++;
3189
3190 if(seg->urg > 1)
3191 seg->urg--;
3192 else
3193 seg->flags &= ~URG;
3194 dupcnt--;
3195 }
3196 if(dupcnt > 0){
3197 pullblock(bp, (ushort)dupcnt);
3198 seg->seq += dupcnt;
3199 *length -= dupcnt;
3200
3201 if(seg->urg > dupcnt)
3202 seg->urg -= dupcnt;
3203 else {
3204 seg->flags &= ~URG;
3205 seg->urg = 0;
3206 }
3207 }
3208 }
3209 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3210 if(excess > 0) {
3211 tcb->rerecv += excess;
3212 *length -= excess;
3213 *bp = trimblock(*bp, 0, *length);
3214 if(*bp == nil)
3215 panic("presotto is a boofhead");
3216 seg->flags &= ~FIN;
3217 }
3218 return 0;
3219 }
3220
3221 static void
tcpadvise(Proto * tcp,Block * bp,char * msg)3222 tcpadvise(Proto *tcp, Block *bp, char *msg)
3223 {
3224 Tcp4hdr *h4;
3225 Tcp6hdr *h6;
3226 Tcpctl *tcb;
3227 uchar source[IPaddrlen];
3228 uchar dest[IPaddrlen];
3229 ushort psource, pdest;
3230 Conv *s, **p;
3231
3232 h4 = (Tcp4hdr*)(bp->rp);
3233 h6 = (Tcp6hdr*)(bp->rp);
3234
3235 if((h4->vihl&0xF0)==IP_VER4) {
3236 v4tov6(dest, h4->tcpdst);
3237 v4tov6(source, h4->tcpsrc);
3238 psource = nhgets(h4->tcpsport);
3239 pdest = nhgets(h4->tcpdport);
3240 }
3241 else {
3242 ipmove(dest, h6->tcpdst);
3243 ipmove(source, h6->tcpsrc);
3244 psource = nhgets(h6->tcpsport);
3245 pdest = nhgets(h6->tcpdport);
3246 }
3247
3248 /* Look for a connection */
3249 qlock(tcp);
3250 for(p = tcp->conv; *p; p++) {
3251 s = *p;
3252 tcb = (Tcpctl*)s->ptcl;
3253 if(s->rport == pdest)
3254 if(s->lport == psource)
3255 if(tcb->state != Closed)
3256 if(ipcmp(s->raddr, dest) == 0)
3257 if(ipcmp(s->laddr, source) == 0){
3258 qlock(s);
3259 qunlock(tcp);
3260 switch(tcb->state){
3261 case Syn_sent:
3262 localclose(s, msg);
3263 break;
3264 }
3265 qunlock(s);
3266 freeblist(bp);
3267 return;
3268 }
3269 }
3270 qunlock(tcp);
3271 freeblist(bp);
3272 }
3273
3274 static char*
tcpporthogdefensectl(char * val)3275 tcpporthogdefensectl(char *val)
3276 {
3277 if(strcmp(val, "on") == 0)
3278 tcpporthogdefense = 1;
3279 else if(strcmp(val, "off") == 0)
3280 tcpporthogdefense = 0;
3281 else
3282 return "unknown value for tcpporthogdefense";
3283 return nil;
3284 }
3285
3286 /* called with c qlocked */
3287 static char*
tcpctl(Conv * c,char ** f,int n)3288 tcpctl(Conv* c, char** f, int n)
3289 {
3290 if(n == 1 && strcmp(f[0], "hangup") == 0)
3291 return tcphangup(c);
3292 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3293 return tcpstartka(c, f, n);
3294 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3295 return tcpsetchecksum(c, f, n);
3296 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3297 return tcpporthogdefensectl(f[1]);
3298 return "unknown control request";
3299 }
3300
3301 static int
tcpstats(Proto * tcp,char * buf,int len)3302 tcpstats(Proto *tcp, char *buf, int len)
3303 {
3304 Tcppriv *priv;
3305 char *p, *e;
3306 int i;
3307
3308 priv = tcp->priv;
3309 p = buf;
3310 e = p+len;
3311 for(i = 0; i < Nstats; i++)
3312 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3313 return p - buf;
3314 }
3315
3316 /*
3317 * garbage collect any stale conversations:
3318 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3319 * - Finwait2 after 5 minutes
3320 *
3321 * this is called whenever we run out of channels. Both checks are
3322 * of questionable validity so we try to use them only when we're
3323 * up against the wall.
3324 */
3325 static int
tcpgc(Proto * tcp)3326 tcpgc(Proto *tcp)
3327 {
3328 Conv *c, **pp, **ep;
3329 int n;
3330 Tcpctl *tcb;
3331
3332
3333 n = 0;
3334 ep = &tcp->conv[tcp->nc];
3335 for(pp = tcp->conv; pp < ep; pp++) {
3336 c = *pp;
3337 if(c == nil)
3338 break;
3339 if(!canqlock(c))
3340 continue;
3341 tcb = (Tcpctl*)c->ptcl;
3342 switch(tcb->state){
3343 case Syn_received:
3344 if(NOW - tcb->time > 5000){
3345 localclose(c, Etimedout);
3346 n++;
3347 }
3348 break;
3349 case Finwait2:
3350 if(NOW - tcb->time > 5*60*1000){
3351 localclose(c, Etimedout);
3352 n++;
3353 }
3354 break;
3355 }
3356 qunlock(c);
3357 }
3358 return n;
3359 }
3360
3361 static void
tcpsettimer(Tcpctl * tcb)3362 tcpsettimer(Tcpctl *tcb)
3363 {
3364 int x;
3365
3366 /* round trip dependency */
3367 x = backoff(tcb->backoff) *
3368 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3369
3370 /* bounded twixt 0.3 and 64 seconds */
3371 if(x < 300/MSPTICK)
3372 x = 300/MSPTICK;
3373 else if(x > (64000/MSPTICK))
3374 x = 64000/MSPTICK;
3375 tcb->timer.start = x;
3376 }
3377
3378 void
tcpinit(Fs * fs)3379 tcpinit(Fs *fs)
3380 {
3381 Proto *tcp;
3382 Tcppriv *tpriv;
3383
3384 tcp = smalloc(sizeof(Proto));
3385 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3386 tcp->name = "tcp";
3387 tcp->connect = tcpconnect;
3388 tcp->announce = tcpannounce;
3389 tcp->ctl = tcpctl;
3390 tcp->state = tcpstate;
3391 tcp->create = tcpcreate;
3392 tcp->close = tcpclose;
3393 tcp->rcv = tcpiput;
3394 tcp->advise = tcpadvise;
3395 tcp->stats = tcpstats;
3396 tcp->inuse = tcpinuse;
3397 tcp->gc = tcpgc;
3398 tcp->ipproto = IP_TCPPROTO;
3399 tcp->nc = scalednconv();
3400 tcp->ptclsize = sizeof(Tcpctl);
3401 tpriv->stats[MaxConn] = tcp->nc;
3402
3403 Fsproto(fs, tcp);
3404 }
3405
3406 static void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3407 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3408 {
3409 /*
3410 * guess at reasonable queue sizes. there's no current way
3411 * to know how many nic receive buffers we can safely tie up in the
3412 * tcp stack, and we don't adjust our queues to maximize throughput
3413 * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be
3414 * respected, but we still control our own buffer commitment by
3415 * keeping a seperate qscale.
3416 */
3417 tcb->rcv.scale = rcvscale & 0xff;
3418 tcb->snd.scale = sndscale & 0xff;
3419 tcb->qscale = rcvscale & 0xff;
3420 if(rcvscale > Maxqscale)
3421 tcb->qscale = Maxqscale;
3422
3423 if(rcvscale != tcb->rcv.scale)
3424 netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3425 "qlen %d >> window %ud lport %d\n",
3426 tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3427 tcb->window = QMAX << tcb->qscale;
3428 tcb->ssthresh = tcb->window;
3429
3430 /*
3431 * it's important to set wq large enough to cover the full
3432 * bandwidth-delay product. it's possible to be in loss
3433 * recovery with a big window, and we need to keep sending
3434 * into the inflated window. the difference can be huge
3435 * for even modest (70ms) ping times.
3436 */
3437 qsetlimit(s->rq, tcb->window);
3438 qsetlimit(s->wq, tcb->window);
3439 tcprcvwin(s);
3440 }
3441