1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7
8 #include "ip.h"
9
/*
 * Protocol constants: header geometry, timer parameters, TCP flag
 * bits, option codes, tcb flag bits and the connection states.
 */
enum
{
	QMAX		= 64*1024-1,	/* limit given to qopen() for rq/wq; initial ssthresh */
	IP_TCPPROTO	= 6,		/* value stored in the header's proto field */

	TCP4_IPLEN	= 8,		/* offset handed to ptclcsum() for v4 */
	TCP4_PHDRSIZE	= 12,		/* v4 pseudo header bytes included in the checksum */
	TCP4_HDRSIZE	= 20,		/* TCP header without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes copied from the tcb prototype header */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	TCP6_IPLEN	= 0,		/* offset handed to ptclcsum() for v6 */
	TCP6_PHDRSIZE	= 40,		/* v6 pseudo header bytes included in the checksum */
	TCP6_HDRSIZE	= 20,		/* TCP header without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes copied from the tcb prototype header */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits kept in Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	/* shifts applied to the rtt estimators (srtt is kept as rtt<<LOGAGAIN) */
	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb * 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 *
	 * NOTE(review): both scales below are 4, which gives
	 * 64kb * 2⁴ = 1mb, not the 2⁵ figure quoted above; confirm
	 * which of the two is intended.
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};
92
/*
 * Printable names of the connection states, indexed by state value
 * (Closed .. Time_wait).  Must correspond to the enumeration above.
 */
char *tcpstates[] =
{
	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
	"Closing", 	"Last_ack", 	"Time_wait"
};
100
/*
 * A countdown timer driven by tcpackproc(), which decrements
 * count once per MSPTICK ms.  Active timers are linked through
 * next/prev on Tcppriv.timers.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* active timer list */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of timers that expired this tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* value count is loaded with by tcpgo() */
	int	count;			/* ticks left */
	void	(*func)(void*);		/* called with arg when count reaches 0 */
	void	*arg;
};
113
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;
	uchar	proto;
	uchar	tcplen[2];
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* (header length<<10) | flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
142
/*
 * IPv6 header followed by the TCP header.  htontcp6() temporarily
 * reuses the leading fields as a pseudo header while checksumming.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];
	uchar	ploadlen[2];
	uchar	proto;
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* (header length<<10) | flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};
163
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;		/* source port */
	ushort	dest;		/* destination port */
	ulong	seq;		/* sequence number */
	ulong	ack;		/* acknowledgement number */
	uchar	flags;		/* URG, ACK, PSH, RST, SYN, FIN bits */
	uchar	update;
	ushort	ws;		/* window scale option */
	ulong	wnd;		/* prescaled window*/
	ushort	urg;
	ushort	mss;		/* max segment size option (if not zero) */
	ushort	len;		/* size of data */
};
185
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;
	Tcp	seg;		/* control info of the queued segment */
	Block	*bp;		/* its data */
	ushort	length;
};
198
/*
 *  per-conversation TCP control block, stored in Conv.ptcl.
 *  the qlock in the Conv locks this structure
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		uint	scale;		/* how much to right shift window */
					/* in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		ulong	partialack;
		int	recovery;	/* loss recovery flag */
		int	retransmit;	/* retransmit 1 packet @ una flag */
		int	rto;
		ulong	rxt;		/* right window marker for recovery */
					/* "recover" rfc3782 */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	wsnt;		/* Last wptr sent.  important to */
					/* track for large bdp */
		ulong	wptr;
		ulong	urg;		/* Urgent pointer */
		ulong	ackptr;		/* last acked sequence */
		int	blocked;
		uint	scale;		/* how much to left shift window in */
					/* rcv'd packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	ulong	cwind;			/* Congestion window */
	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
	uint	scale;			/* desired snd.scale */
	ulong	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data rereceived */
	ulong	window;			/* Our receive window (queue) */
	uint	qscale;			/* Log2 of our receive window (queue) */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags */
	Reseq	*reseq;			/* Resequencing queue */
	int	nreseq;
	int	reseqlen;
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	ulong	timeuna;		/* snd.una when time was set */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (SYN,FIN) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};
274
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};
306
/* seeds srtt and the activity timer of every new tcb in inittcpctl() */
int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
308
/*
 * Indices into Tcppriv.stats[] and statnames[].
 * Nstats is the number of counters.
 */
enum {
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats
};
345
/*
 * Printable name of each counter, indexed by the stats enum.
 * Entries are placed by index, so the textual order here need
 * not follow the enum order.
 */
static char *statnames[Nstats] =
{
[MaxConn]	"MaxConn",
[Mss]		"MaxSegment",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransSegsSent]	"RetransSegsSent",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
[Resequenced]	"Resequenced",
[ReseqBytelim]	"ReseqBytelim",
[ReseqPktlim]	"ReseqPktlim",
[Delayack]	"Delayack",
[Wopenack]	"Wopenack",

[Recovery]	"Recovery",
[RecoveryDone]	"RecoveryDone",
[RecoveryRTO]	"RecoveryRTO",

[RecoveryNoSeq]	"RecoveryNoSeq",
[RecoveryCwind]	"RecoveryCwind",
[RecoveryPA]	"RecoveryPA",
};
379
/*
 * Per-protocol-instance private state, reached through Proto.priv.
 */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* held while the timer list is walked or changed */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;		/* serializes starting the kproc in tcpstart() */
	int	ackprocstarted;

	uvlong	stats[Nstats];
};
400
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
411
/* forward declarations of file-local routines used before they are defined */
static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
static	int	dumpreseq(Tcpctl*);
static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
static	void	limborexmit(Proto*);
static	void	localclose(Conv*, char*);
static	void	procsyn(Conv*, Tcp*);
static	void	tcpacktimer(void*);
static	void	tcpiput(Proto*, Ipifc*, Block*);
static	void	tcpkeepalive(void*);
static	void	tcpoutput(Conv*);
static	void	tcprcvwin(Conv*);
static	void	tcprxmit(Conv*);
static	void	tcpsetkacounter(Tcpctl*);
static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
static	void	tcpsettimer(Tcpctl*);
static	void	tcpsndsyn(Conv*, Tcpctl*);
static	void	tcpstart(Conv*, int);
static	void	tcpsynackrtt(Conv*);
static	void	tcptimeout(void*);
static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
433
434 static void
tcpsetstate(Conv * s,uchar newstate)435 tcpsetstate(Conv *s, uchar newstate)
436 {
437 Tcpctl *tcb;
438 uchar oldstate;
439 Tcppriv *tpriv;
440
441 tpriv = s->p->priv;
442
443 tcb = (Tcpctl*)s->ptcl;
444
445 oldstate = tcb->state;
446 if(oldstate == newstate)
447 return;
448
449 if(oldstate == Established)
450 tpriv->stats[CurrEstab]--;
451 if(newstate == Established)
452 tpriv->stats[CurrEstab]++;
453
454 switch(newstate) {
455 case Closed:
456 qclose(s->rq);
457 qclose(s->wq);
458 qclose(s->eq);
459 break;
460
461 case Close_wait: /* Remote closes */
462 qhangup(s->rq, nil);
463 break;
464 }
465
466 tcb->state = newstate;
467
468 if(oldstate == Syn_sent && newstate != Closed)
469 Fsconnected(s, nil);
470 }
471
472 static char*
tcpconnect(Conv * c,char ** argv,int argc)473 tcpconnect(Conv *c, char **argv, int argc)
474 {
475 char *e;
476 Tcpctl *tcb;
477
478 tcb = (Tcpctl*)(c->ptcl);
479 if(tcb->state != Closed)
480 return Econinuse;
481
482 e = Fsstdconnect(c, argv, argc);
483 if(e != nil)
484 return e;
485 tcpstart(c, TCP_CONNECT);
486
487 return nil;
488 }
489
490 static int
tcpstate(Conv * c,char * state,int n)491 tcpstate(Conv *c, char *state, int n)
492 {
493 Tcpctl *s;
494
495 s = (Tcpctl*)(c->ptcl);
496
497 return snprint(state, n,
498 "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
499 "swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
500 "timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
501 tcpstates[s->state],
502 c->rq ? qlen(c->rq) : 0,
503 c->wq ? qlen(c->wq) : 0,
504 s->nreseq, s->reseqlen,
505 s->srtt, s->mdev, s->ssthresh,
506 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
507 s->qscale,
508 s->timer.start, s->timer.count, s->rerecv,
509 s->katimer.start, s->katimer.count);
510 }
511
512 static int
tcpinuse(Conv * c)513 tcpinuse(Conv *c)
514 {
515 Tcpctl *s;
516
517 s = (Tcpctl*)(c->ptcl);
518 return s->state != Closed;
519 }
520
521 static char*
tcpannounce(Conv * c,char ** argv,int argc)522 tcpannounce(Conv *c, char **argv, int argc)
523 {
524 char *e;
525 Tcpctl *tcb;
526
527 tcb = (Tcpctl*)(c->ptcl);
528 if(tcb->state != Closed)
529 return Econinuse;
530
531 e = Fsstdannounce(c, argv, argc);
532 if(e != nil)
533 return e;
534 tcpstart(c, TCP_LISTEN);
535 Fsconnected(c, nil);
536
537 return nil;
538 }
539
/*
 * Account for one more flag in the send sequence (flgcnt and
 * snd.nxt both advance by one), switch the conversation to the
 * given state and run the output routine.  The callers in this
 * file use it when closing.
 */
static void
tcpclosestate(Conv *c, Tcpctl *tcb, int state)
{
	tcb->flgcnt++;
	tcb->snd.nxt++;
	tcpsetstate(c, state);
	tcpoutput(c);
}
548
549 /* close the output half of a tcp connection */
550 static char *
tcpxmitclose(Conv * c)551 tcpxmitclose(Conv *c)
552 {
553 Tcpctl *tcb;
554
555 qhangup(c->wq, nil);
556
557 tcb = (Tcpctl*)c->ptcl;
558 switch(tcb->state) {
559 case Listen:
560 /*
561 * reset any incoming calls to this listener
562 */
563 Fsconnected(c, "Hangup");
564 /* fall through */
565 case Closed:
566 case Syn_sent:
567 localclose(c, nil);
568 break;
569 case Syn_received:
570 case Established:
571 case Close_wait:
572 tcpclosestate(c, tcb, tcb->state);
573 break;
574 }
575 return nil;
576 }
577
578 /*
579 * tcpclose is always called with the q locked
580 */
581 static void
tcpclose(Conv * c)582 tcpclose(Conv *c)
583 {
584 Tcpctl *tcb;
585
586 tcb = (Tcpctl*)c->ptcl;
587
588 qhangup(c->rq, nil);
589 qhangup(c->wq, nil);
590 qhangup(c->eq, nil);
591 qflush(c->rq);
592
593 switch(tcb->state) {
594 case Listen:
595 /*
596 * reset any incoming calls to this listener
597 */
598 Fsconnected(c, "Hangup");
599 /* fall through */
600 case Closed:
601 case Syn_sent:
602 localclose(c, nil);
603 break;
604 case Syn_received:
605 case Established:
606 tcpclosestate(c, tcb, Finwait1);
607 break;
608 case Close_wait:
609 tcpclosestate(c, tcb, Last_ack);
610 break;
611 }
612 }
613
614 static void
tcpkick(void * x)615 tcpkick(void *x)
616 {
617 Conv *s = x;
618 Tcpctl *tcb;
619
620 tcb = (Tcpctl*)s->ptcl;
621
622 if(waserror()){
623 qunlock(s);
624 nexterror();
625 }
626 qlock(s);
627
628 switch(tcb->state) {
629 case Syn_sent:
630 case Syn_received:
631 case Established:
632 case Close_wait:
633 /*
634 * Push data
635 */
636 tcpoutput(s);
637 break;
638 default:
639 localclose(s, "Hangup");
640 break;
641 }
642
643 qunlock(s);
644 poperror();
645 }
646
647 static int seq_lt(ulong, ulong);
648
649 static void
tcprcvwin(Conv * s)650 tcprcvwin(Conv *s) /* Call with tcb locked */
651 {
652 int w;
653 Tcpctl *tcb;
654
655 tcb = (Tcpctl*)s->ptcl;
656 w = tcb->window - qlen(s->rq);
657 if(w < 0)
658 w = 0;
659 /* RFC 1122 § 4.2.2.17 do not move right edge of window left */
660 if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
661 w = tcb->rcv.wptr - tcb->rcv.nxt;
662 if(w != tcb->rcv.wnd)
663 if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
664 tcb->rcv.blocked = 1;
665 netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
666 tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
667 }
668 tcb->rcv.wnd = w;
669 tcb->rcv.wptr = tcb->rcv.nxt + w;
670 }
671
672 static void
tcpacktimer(void * v)673 tcpacktimer(void *v)
674 {
675 Tcpctl *tcb;
676 Conv *s;
677
678 s = v;
679 tcb = (Tcpctl*)s->ptcl;
680
681 if(waserror()){
682 qunlock(s);
683 nexterror();
684 }
685 qlock(s);
686 if(tcb->state != Closed){
687 tcb->flags |= FORCE;
688 tcpoutput(s);
689 }
690 qunlock(s);
691 poperror();
692 }
693
694 static void
tcpcongestion(Tcpctl * tcb)695 tcpcongestion(Tcpctl *tcb)
696 {
697 ulong inflight;
698
699 inflight = tcb->snd.nxt - tcb->snd.una;
700 if(inflight > tcb->cwind)
701 inflight = tcb->cwind;
702 tcb->ssthresh = inflight / 2;
703 if(tcb->ssthresh < 2*tcb->mss)
704 tcb->ssthresh = 2*tcb->mss;
705 }
706
/* multiplier of mss that caps the per-ack cwind growth in tcpabcincr() */
enum {
	L		= 2,		/* aggressive slow start; legal values ∈ (1.0, 2.0) */
};
710
711 static void
tcpabcincr(Tcpctl * tcb,uint acked)712 tcpabcincr(Tcpctl *tcb, uint acked)
713 {
714 uint limit;
715
716 tcb->abcbytes += acked;
717 if(tcb->cwind < tcb->ssthresh){
718 /* slow start */
719 if(tcb->snd.rto)
720 limit = tcb->mss;
721 else
722 limit = L*tcb->mss;
723 tcb->cwind += MIN(tcb->abcbytes, limit);
724 tcb->abcbytes = 0;
725 } else {
726 tcb->snd.rto = 0;
727 /* avoidance */
728 if(tcb->abcbytes >= tcb->cwind){
729 tcb->abcbytes -= tcb->cwind;
730 tcb->cwind += tcb->mss;
731 }
732 }
733 }
734
/*
 * Open the read and write queues of a new conversation, each
 * limited to QMAX.  tcpacktimer() is attached to the read queue
 * and tcpkick() to the write queue; both get the Conv as argument.
 */
static void
tcpcreate(Conv *c)
{
	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
	c->wq = qopen(QMAX, Qkick, tcpkick, c);
}
741
742 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)743 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
744 {
745 if(newstate != TcptimerON){
746 if(t->state == TcptimerON){
747 /* unchain */
748 if(priv->timers == t){
749 priv->timers = t->next;
750 if(t->prev != nil)
751 panic("timerstate1");
752 }
753 if(t->next)
754 t->next->prev = t->prev;
755 if(t->prev)
756 t->prev->next = t->next;
757 t->next = t->prev = nil;
758 }
759 } else {
760 if(t->state != TcptimerON){
761 /* chain */
762 if(t->prev != nil || t->next != nil)
763 panic("timerstate2");
764 t->prev = nil;
765 t->next = priv->timers;
766 if(t->next)
767 t->next->prev = t;
768 priv->timers = t;
769 }
770 }
771 t->state = newstate;
772 }
773
/*
 * Timer kproc, started once per protocol instance by tcpstart().
 * Every MSPTICK ms it counts down all active timers, collects the
 * ones that reach zero, calls their handlers outside the timer
 * lock, and then runs limborexmit().  a is the Proto.  Never returns.
 */
static void
tcpackproc(void *a)
{
	Tcptimer *t, *tp, *timeo;
	Proto *tcp;
	Tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for(;;) {
		tsleep(&up->sleep, return0, 0, MSPTICK);

		qlock(&priv->tl);
		timeo = nil;
		loop = 0;
		for(t = priv->timers; t != nil; t = tp) {
			/* guard against a corrupted (looping) list */
			if(loop++ > 10000)
				panic("tcpackproc1");
			/* timerstate() below clears t->next, so fetch it first */
			tp = t->next;
 			if(t->state == TcptimerON) {
				t->count--;
				if(t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		loop = 0;
		for(t = timeo; t != nil; t = t->readynext) {
			if(loop++ > 10000)
				panic("tcpackproc2");
			/* skip timers restarted or halted since they expired */
			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
				(*t->func)(t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
819
820 static void
tcpgo(Tcppriv * priv,Tcptimer * t)821 tcpgo(Tcppriv *priv, Tcptimer *t)
822 {
823 if(t == nil || t->start == 0)
824 return;
825
826 qlock(&priv->tl);
827 t->count = t->start;
828 timerstate(priv, t, TcptimerON);
829 qunlock(&priv->tl);
830 }
831
832 static void
tcphalt(Tcppriv * priv,Tcptimer * t)833 tcphalt(Tcppriv *priv, Tcptimer *t)
834 {
835 if(t == nil)
836 return;
837
838 qlock(&priv->tl);
839 timerstate(priv, t, TcptimerOFF);
840 qunlock(&priv->tl);
841 }
842
/*
 * Exponential backoff multiplier: 2ⁿ.
 *
 * The exponent is clamped to [0, Maxshift] because shifting by a
 * negative count, or far enough to reach the sign bit, is
 * undefined.  Values in that range give exactly 1<<n as before.
 */
static int
backoff(int n)
{
	enum { Maxshift = 8*sizeof(int) - 2 };	/* largest shift that stays positive */

	if(n < 0)
		n = 0;
	if(n > Maxshift)
		n = Maxshift;
	return 1 << n;
}
848
849 static void
localclose(Conv * s,char * reason)850 localclose(Conv *s, char *reason) /* called with tcb locked */
851 {
852 Tcpctl *tcb;
853 Tcppriv *tpriv;
854
855 tpriv = s->p->priv;
856 tcb = (Tcpctl*)s->ptcl;
857
858 iphtrem(&tpriv->ht, s);
859
860 tcphalt(tpriv, &tcb->timer);
861 tcphalt(tpriv, &tcb->rtt_timer);
862 tcphalt(tpriv, &tcb->acktimer);
863 tcphalt(tpriv, &tcb->katimer);
864
865 /* Flush reassembly queue; nothing more can arrive */
866 dumpreseq(tcb);
867
868 if(tcb->state == Syn_sent)
869 Fsconnected(s, reason);
870 if(s->state == Announced)
871 wakeup(&s->listenr);
872
873 qhangup(s->rq, reason);
874 qhangup(s->wq, reason);
875
876 tcpsetstate(s, Closed);
877 }
878
879 /* mtu (- TCP + IP hdr len) of 1st hop */
880 static int
tcpmtu(Proto * tcp,uchar * addr,int version,uint * scale)881 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
882 {
883 Ipifc *ifc;
884 int mtu;
885
886 ifc = findipifc(tcp->f, addr, 0);
887 switch(version){
888 default:
889 case V4:
890 mtu = DEF_MSS;
891 if(ifc != nil)
892 mtu = ifc->maxtu - ifc->medium->hsize - (TCP4_PKT + TCP4_HDRSIZE);
893 break;
894 case V6:
895 mtu = DEF_MSS6;
896 if(ifc != nil)
897 mtu = ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE);
898 break;
899 }
900 /*
901 * set the ws. it doesn't commit us to anything.
902 * ws is the ultimate limit to the bandwidth-delay product.
903 */
904 *scale = Defadvscale;
905
906 return mtu;
907 }
908
909 static void
inittcpctl(Conv * s,int mode)910 inittcpctl(Conv *s, int mode)
911 {
912 Tcpctl *tcb;
913 Tcp4hdr* h4;
914 Tcp6hdr* h6;
915 Tcppriv *tpriv;
916 int mss;
917
918 tcb = (Tcpctl*)s->ptcl;
919
920 memset(tcb, 0, sizeof(Tcpctl));
921
922 tcb->ssthresh = QMAX; /* reset by tcpsetscale() */
923 tcb->srtt = tcp_irtt<<LOGAGAIN;
924 tcb->mdev = 0;
925
926 /* setup timers */
927 tcb->timer.start = tcp_irtt / MSPTICK;
928 tcb->timer.func = tcptimeout;
929 tcb->timer.arg = s;
930 tcb->rtt_timer.start = MAX_TIME;
931 tcb->acktimer.start = TCP_ACK / MSPTICK;
932 tcb->acktimer.func = tcpacktimer;
933 tcb->acktimer.arg = s;
934 tcb->katimer.start = DEF_KAT / MSPTICK;
935 tcb->katimer.func = tcpkeepalive;
936 tcb->katimer.arg = s;
937
938 mss = DEF_MSS;
939
940 /* create a prototype(pseudo) header */
941 if(mode != TCP_LISTEN){
942 if(ipcmp(s->laddr, IPnoaddr) == 0)
943 findlocalip(s->p->f, s->laddr, s->raddr);
944
945 switch(s->ipversion){
946 case V4:
947 h4 = &tcb->protohdr.tcp4hdr;
948 memset(h4, 0, sizeof(*h4));
949 h4->proto = IP_TCPPROTO;
950 hnputs(h4->tcpsport, s->lport);
951 hnputs(h4->tcpdport, s->rport);
952 v6tov4(h4->tcpsrc, s->laddr);
953 v6tov4(h4->tcpdst, s->raddr);
954 break;
955 case V6:
956 h6 = &tcb->protohdr.tcp6hdr;
957 memset(h6, 0, sizeof(*h6));
958 h6->proto = IP_TCPPROTO;
959 hnputs(h6->tcpsport, s->lport);
960 hnputs(h6->tcpdport, s->rport);
961 ipmove(h6->tcpsrc, s->laddr);
962 ipmove(h6->tcpdst, s->raddr);
963 mss = DEF_MSS6;
964 break;
965 default:
966 panic("inittcpctl: version %d", s->ipversion);
967 }
968 }
969
970 tcb->mss = tcb->cwind = mss;
971 tcb->abcbytes = 0;
972 tpriv = s->p->priv;
973 tpriv->stats[Mss] = tcb->mss;
974
975 /* default is no window scaling */
976 tcpsetscale(s, tcb, 0, 0);
977 }
978
979 /*
980 * called with s qlocked
981 */
982 static void
tcpstart(Conv * s,int mode)983 tcpstart(Conv *s, int mode)
984 {
985 Tcpctl *tcb;
986 Tcppriv *tpriv;
987 char kpname[KNAMELEN];
988
989 tpriv = s->p->priv;
990
991 if(tpriv->ackprocstarted == 0){
992 qlock(&tpriv->apl);
993 if(tpriv->ackprocstarted == 0){
994 snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
995 kproc(kpname, tcpackproc, s->p);
996 tpriv->ackprocstarted = 1;
997 }
998 qunlock(&tpriv->apl);
999 }
1000
1001 tcb = (Tcpctl*)s->ptcl;
1002
1003 inittcpctl(s, mode);
1004
1005 iphtadd(&tpriv->ht, s);
1006 switch(mode) {
1007 case TCP_LISTEN:
1008 tpriv->stats[PassiveOpens]++;
1009 tcb->flags |= CLONE;
1010 tcpsetstate(s, Listen);
1011 break;
1012
1013 case TCP_CONNECT:
1014 tpriv->stats[ActiveOpens]++;
1015 tcb->flags |= ACTIVE;
1016 tcpsndsyn(s, tcb);
1017 tcpsetstate(s, Syn_sent);
1018 tcpoutput(s);
1019 break;
1020 }
1021 }
1022
1023 static char*
tcpflag(char * buf,char * e,ushort flag)1024 tcpflag(char *buf, char *e, ushort flag)
1025 {
1026 char *p;
1027
1028 p = seprint(buf, e, "%d", flag>>10); /* Head len */
1029 if(flag & URG)
1030 p = seprint(p, e, " URG");
1031 if(flag & ACK)
1032 p = seprint(p, e, " ACK");
1033 if(flag & PSH)
1034 p = seprint(p, e, " PSH");
1035 if(flag & RST)
1036 p = seprint(p, e, " RST");
1037 if(flag & SYN)
1038 p = seprint(p, e, " SYN");
1039 if(flag & FIN)
1040 p = seprint(p, e, " FIN");
1041 USED(p);
1042 return buf;
1043 }
1044
/*
 * Build an IPv6 TCP packet from the control info in tcph.
 *
 * data is the payload or nil for a bare segment; the headers go in
 * front of it.  ph is the prototype header that supplies addresses,
 * ports and protocol.  tcb may be nil, in which case the window is
 * not scaled and a checksum is always computed.
 * Returns the block holding the packet, or nil if padblock() or
 * allocb() returns nil.
 */
static Block*
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp6hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* options are only sent on SYN segments; pad them to a multiple of 4 */
	hdrlen = TCP6_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(tcph->ws)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP6_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP6_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *)(data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if(tcph->ws != 0){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen+dlen);
	h->proto = ph->proto;

	return data;
}
1128
/*
 * Build an IPv4 TCP packet from the control info in tcph.
 *
 * data is the payload or nil for a bare segment; the headers go in
 * front of it.  ph is the prototype header that supplies addresses,
 * ports and protocol.  tcb may be nil, in which case the window is
 * not scaled and a checksum is always computed.
 * Unlike htontcp6(), a SYN always carries the window scale option,
 * even when tcph->ws is 0.
 * Returns the block holding the packet, or nil if padblock() or
 * allocb() returns nil.
 */
static Block*
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
{
	int dlen;
	Tcp4hdr *h;
	ushort csum;
	ushort hdrlen, optpad = 0;
	uchar *opt;

	/* options are only sent on SYN segments; pad them to a multiple of 4 */
	hdrlen = TCP4_HDRSIZE;
	if(tcph->flags & SYN){
		if(tcph->mss)
			hdrlen += MSS_LENGTH;
		if(1)
			hdrlen += WS_LENGTH;
		optpad = hdrlen & 3;
		if(optpad)
			optpad = 4 - optpad;
		hdrlen += optpad;
	}

	if(data) {
		dlen = blocklen(data);
		data = padblock(data, hdrlen + TCP4_PKT);
		if(data == nil)
			return nil;
	}
	else {
		dlen = 0;
		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
		if(data == nil)
			return nil;
		data->wp += hdrlen + TCP4_PKT;
	}

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *)(data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	if(tcph->flags & SYN){
		opt = h->tcpopt;
		if(tcph->mss != 0){
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		/* always offer.  rfc1323 §2.2 */
		if(1){
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		while(optpad-- > 0)
			*opt++ = NOOPOPT;
	}

	/* the checksum covers the pseudo header, which starts TCP4_IPLEN bytes in */
	if(tcb != nil && tcb->nochecksum){
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	return data;
}
1203
1204 static int
ntohtcp6(Tcp * tcph,Block ** bpp)1205 ntohtcp6(Tcp *tcph, Block **bpp)
1206 {
1207 Tcp6hdr *h;
1208 uchar *optr;
1209 ushort hdrlen;
1210 ushort optlen;
1211 int n;
1212
1213 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1214 if(*bpp == nil)
1215 return -1;
1216
1217 h = (Tcp6hdr *)((*bpp)->rp);
1218 tcph->source = nhgets(h->tcpsport);
1219 tcph->dest = nhgets(h->tcpdport);
1220 tcph->seq = nhgetl(h->tcpseq);
1221 tcph->ack = nhgetl(h->tcpack);
1222 hdrlen = (h->tcpflag[0]>>2) & ~3;
1223 if(hdrlen < TCP6_HDRSIZE) {
1224 freeblist(*bpp);
1225 return -1;
1226 }
1227
1228 tcph->flags = h->tcpflag[1];
1229 tcph->wnd = nhgets(h->tcpwin);
1230 tcph->urg = nhgets(h->tcpurg);
1231 tcph->mss = 0;
1232 tcph->ws = 0;
1233 tcph->update = 0;
1234 tcph->len = nhgets(h->ploadlen) - hdrlen;
1235
1236 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1237 if(*bpp == nil)
1238 return -1;
1239
1240 optr = h->tcpopt;
1241 n = hdrlen - TCP6_HDRSIZE;
1242 while(n > 0 && *optr != EOLOPT) {
1243 if(*optr == NOOPOPT) {
1244 n--;
1245 optr++;
1246 continue;
1247 }
1248 optlen = optr[1];
1249 if(optlen < 2 || optlen > n)
1250 break;
1251 switch(*optr) {
1252 case MSSOPT:
1253 if(optlen == MSS_LENGTH)
1254 tcph->mss = nhgets(optr+2);
1255 break;
1256 case WSOPT:
1257 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1258 tcph->ws = *(optr+2);
1259 break;
1260 }
1261 n -= optlen;
1262 optr += optlen;
1263 }
1264 return hdrlen;
1265 }
1266
1267 static int
ntohtcp4(Tcp * tcph,Block ** bpp)1268 ntohtcp4(Tcp *tcph, Block **bpp)
1269 {
1270 Tcp4hdr *h;
1271 uchar *optr;
1272 ushort hdrlen;
1273 ushort optlen;
1274 int n;
1275
1276 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1277 if(*bpp == nil)
1278 return -1;
1279
1280 h = (Tcp4hdr *)((*bpp)->rp);
1281 tcph->source = nhgets(h->tcpsport);
1282 tcph->dest = nhgets(h->tcpdport);
1283 tcph->seq = nhgetl(h->tcpseq);
1284 tcph->ack = nhgetl(h->tcpack);
1285
1286 hdrlen = (h->tcpflag[0]>>2) & ~3;
1287 if(hdrlen < TCP4_HDRSIZE) {
1288 freeblist(*bpp);
1289 return -1;
1290 }
1291
1292 tcph->flags = h->tcpflag[1];
1293 tcph->wnd = nhgets(h->tcpwin);
1294 tcph->urg = nhgets(h->tcpurg);
1295 tcph->mss = 0;
1296 tcph->ws = 0;
1297 tcph->update = 0;
1298 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1299
1300 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1301 if(*bpp == nil)
1302 return -1;
1303
1304 optr = h->tcpopt;
1305 n = hdrlen - TCP4_HDRSIZE;
1306 while(n > 0 && *optr != EOLOPT) {
1307 if(*optr == NOOPOPT) {
1308 n--;
1309 optr++;
1310 continue;
1311 }
1312 optlen = optr[1];
1313 if(optlen < 2 || optlen > n)
1314 break;
1315 switch(*optr) {
1316 case MSSOPT:
1317 if(optlen == MSS_LENGTH)
1318 tcph->mss = nhgets(optr+2);
1319 break;
1320 case WSOPT:
1321 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1322 tcph->ws = *(optr+2);
1323 break;
1324 }
1325 n -= optlen;
1326 optr += optlen;
1327 }
1328 return hdrlen;
1329 }
1330
1331 /*
1332 * For outgoing calls, generate an initial sequence
1333 * number and put a SYN on the send queue
1334 */
1335 static void
tcpsndsyn(Conv * s,Tcpctl * tcb)1336 tcpsndsyn(Conv *s, Tcpctl *tcb)
1337 {
1338 Tcppriv *tpriv;
1339
1340 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1341 tcb->rttseq = tcb->iss;
1342 tcb->snd.wl2 = tcb->iss;
1343 tcb->snd.una = tcb->iss;
1344 tcb->snd.rxt = tcb->iss;
1345 tcb->snd.ptr = tcb->rttseq;
1346 tcb->snd.nxt = tcb->rttseq;
1347 tcb->flgcnt++;
1348 tcb->flags |= FORCE;
1349 tcb->sndsyntime = NOW;
1350
1351 /* set desired mss and scale */
1352 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1353 tpriv = s->p->priv;
1354 tpriv->stats[Mss] = tcb->mss;
1355 }
1356
1357 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1358 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1359 {
1360 Block *hbp;
1361 uchar rflags;
1362 Tcppriv *tpriv;
1363 Tcp4hdr ph4;
1364 Tcp6hdr ph6;
1365
1366 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1367
1368 tpriv = tcp->priv;
1369
1370 if(seg->flags & RST)
1371 return;
1372
1373 /* make pseudo header */
1374 switch(version) {
1375 case V4:
1376 memset(&ph4, 0, sizeof(ph4));
1377 ph4.vihl = IP_VER4;
1378 v6tov4(ph4.tcpsrc, dest);
1379 v6tov4(ph4.tcpdst, source);
1380 ph4.proto = IP_TCPPROTO;
1381 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1382 hnputs(ph4.tcpsport, seg->dest);
1383 hnputs(ph4.tcpdport, seg->source);
1384 break;
1385 case V6:
1386 memset(&ph6, 0, sizeof(ph6));
1387 ph6.vcf[0] = IP_VER6;
1388 ipmove(ph6.tcpsrc, dest);
1389 ipmove(ph6.tcpdst, source);
1390 ph6.proto = IP_TCPPROTO;
1391 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1392 hnputs(ph6.tcpsport, seg->dest);
1393 hnputs(ph6.tcpdport, seg->source);
1394 break;
1395 default:
1396 panic("sndrst: version %d", version);
1397 }
1398
1399 tpriv->stats[OutRsts]++;
1400 rflags = RST;
1401
1402 /* convince the other end that this reset is in band */
1403 if(seg->flags & ACK) {
1404 seg->seq = seg->ack;
1405 seg->ack = 0;
1406 }
1407 else {
1408 rflags |= ACK;
1409 seg->ack = seg->seq;
1410 seg->seq = 0;
1411 if(seg->flags & SYN)
1412 seg->ack++;
1413 seg->ack += length;
1414 if(seg->flags & FIN)
1415 seg->ack++;
1416 }
1417 seg->flags = rflags;
1418 seg->wnd = 0;
1419 seg->urg = 0;
1420 seg->mss = 0;
1421 seg->ws = 0;
1422 switch(version) {
1423 case V4:
1424 hbp = htontcp4(seg, nil, &ph4, nil);
1425 if(hbp == nil)
1426 return;
1427 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1428 break;
1429 case V6:
1430 hbp = htontcp6(seg, nil, &ph6, nil);
1431 if(hbp == nil)
1432 return;
1433 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1434 break;
1435 default:
1436 panic("sndrst2: version %d", version);
1437 }
1438 }
1439
1440 /*
1441 * close the conversation
1442 */
1443 static char*
tcpclose2(Conv * s)1444 tcpclose2(Conv *s)
1445 {
1446 tcpclose(s);
1447 return nil;
1448 }
1449
1450 /*
1451 * send a reset to the remote side and close the conversation
1452 * called with s qlocked
1453 */
1454 static char*
tcphangup(Conv * s)1455 tcphangup(Conv *s)
1456 {
1457 Tcp seg;
1458 Tcpctl *tcb;
1459 Block *hbp;
1460
1461 tcb = (Tcpctl*)s->ptcl;
1462 if(waserror())
1463 return up->errstr;
1464 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1465 if(!waserror()){
1466 memset(&seg, 0, sizeof seg);
1467 seg.flags = RST | ACK;
1468 seg.ack = tcb->rcv.nxt;
1469 tcb->rcv.ackptr = seg.ack;
1470 seg.seq = tcb->snd.ptr;
1471 seg.wnd = 0;
1472 seg.urg = 0;
1473 seg.mss = 0;
1474 seg.ws = 0;
1475 switch(s->ipversion) {
1476 case V4:
1477 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1478 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1479 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1480 break;
1481 case V6:
1482 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1483 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1484 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1485 break;
1486 default:
1487 panic("tcphangup: version %d", s->ipversion);
1488 }
1489 poperror();
1490 }
1491 }
1492 localclose(s, nil);
1493 poperror();
1494 return nil;
1495 }
1496
1497 /*
1498 * (re)send a SYN ACK
1499 */
1500 static int
sndsynack(Proto * tcp,Limbo * lp)1501 sndsynack(Proto *tcp, Limbo *lp)
1502 {
1503 Block *hbp;
1504 Tcp4hdr ph4;
1505 Tcp6hdr ph6;
1506 Tcp seg;
1507 uint scale;
1508
1509 /* make pseudo header */
1510 switch(lp->version) {
1511 case V4:
1512 memset(&ph4, 0, sizeof(ph4));
1513 ph4.vihl = IP_VER4;
1514 v6tov4(ph4.tcpsrc, lp->laddr);
1515 v6tov4(ph4.tcpdst, lp->raddr);
1516 ph4.proto = IP_TCPPROTO;
1517 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1518 hnputs(ph4.tcpsport, lp->lport);
1519 hnputs(ph4.tcpdport, lp->rport);
1520 break;
1521 case V6:
1522 memset(&ph6, 0, sizeof(ph6));
1523 ph6.vcf[0] = IP_VER6;
1524 ipmove(ph6.tcpsrc, lp->laddr);
1525 ipmove(ph6.tcpdst, lp->raddr);
1526 ph6.proto = IP_TCPPROTO;
1527 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1528 hnputs(ph6.tcpsport, lp->lport);
1529 hnputs(ph6.tcpdport, lp->rport);
1530 break;
1531 default:
1532 panic("sndrst: version %d", lp->version);
1533 }
1534
1535 memset(&seg, 0, sizeof seg);
1536 seg.seq = lp->iss;
1537 seg.ack = lp->irs+1;
1538 seg.flags = SYN|ACK;
1539 seg.urg = 0;
1540 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1541 seg.wnd = QMAX;
1542
1543 /* if the other side set scale, we should too */
1544 if(lp->rcvscale){
1545 seg.ws = scale;
1546 lp->sndscale = scale;
1547 } else {
1548 seg.ws = 0;
1549 lp->sndscale = 0;
1550 }
1551
1552 switch(lp->version) {
1553 case V4:
1554 hbp = htontcp4(&seg, nil, &ph4, nil);
1555 if(hbp == nil)
1556 return -1;
1557 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1558 break;
1559 case V6:
1560 hbp = htontcp6(&seg, nil, &ph6, nil);
1561 if(hbp == nil)
1562 return -1;
1563 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1564 break;
1565 default:
1566 panic("sndsnack: version %d", lp->version);
1567 }
1568 lp->lastsend = NOW;
1569 return 0;
1570 }
1571
/* limbo hash: last two address bytes plus the port, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1573
1574 /*
1575 * put a call into limbo and respond with a SYN ACK
1576 *
1577 * called with proto locked
1578 */
1579 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1580 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1581 {
1582 Limbo *lp, **l;
1583 Tcppriv *tpriv;
1584 int h;
1585
1586 tpriv = s->p->priv;
1587 h = hashipa(source, seg->source);
1588
1589 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1590 lp = *l;
1591 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1592 continue;
1593 if(ipcmp(lp->raddr, source) != 0)
1594 continue;
1595 if(ipcmp(lp->laddr, dest) != 0)
1596 continue;
1597
1598 /* each new SYN restarts the retransmits */
1599 lp->irs = seg->seq;
1600 break;
1601 }
1602 lp = *l;
1603 if(lp == nil){
1604 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1605 lp = tpriv->lht[h];
1606 tpriv->lht[h] = lp->next;
1607 lp->next = nil;
1608 } else {
1609 lp = malloc(sizeof(*lp));
1610 if(lp == nil)
1611 return;
1612 tpriv->nlimbo++;
1613 }
1614 *l = lp;
1615 lp->version = version;
1616 ipmove(lp->laddr, dest);
1617 ipmove(lp->raddr, source);
1618 lp->lport = seg->dest;
1619 lp->rport = seg->source;
1620 lp->mss = seg->mss;
1621 lp->rcvscale = seg->ws;
1622 lp->irs = seg->seq;
1623 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1624 }
1625
1626 if(sndsynack(s->p, lp) < 0){
1627 *l = lp->next;
1628 tpriv->nlimbo--;
1629 free(lp);
1630 }
1631 }
1632
1633 /*
1634 * resend SYN ACK's once every SYNACK_RXTIMER ms.
1635 */
1636 static void
limborexmit(Proto * tcp)1637 limborexmit(Proto *tcp)
1638 {
1639 Tcppriv *tpriv;
1640 Limbo **l, *lp;
1641 int h;
1642 int seen;
1643 ulong now;
1644
1645 tpriv = tcp->priv;
1646
1647 if(!canqlock(tcp))
1648 return;
1649 seen = 0;
1650 now = NOW;
1651 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1652 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1653 lp = *l;
1654 seen++;
1655 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1656 continue;
1657
1658 /* time it out after 1 second */
1659 if(++(lp->rexmits) > 5){
1660 tpriv->nlimbo--;
1661 *l = lp->next;
1662 free(lp);
1663 continue;
1664 }
1665
1666 /* if we're being attacked, don't bother resending SYN ACK's */
1667 if(tpriv->nlimbo > 100)
1668 continue;
1669
1670 if(sndsynack(tcp, lp) < 0){
1671 tpriv->nlimbo--;
1672 *l = lp->next;
1673 free(lp);
1674 continue;
1675 }
1676
1677 l = &lp->next;
1678 }
1679 }
1680 qunlock(tcp);
1681 }
1682
1683 /*
1684 * lookup call in limbo. if found, throw it out.
1685 *
1686 * called with proto locked
1687 */
1688 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1689 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1690 {
1691 Limbo *lp, **l;
1692 int h;
1693 Tcppriv *tpriv;
1694
1695 tpriv = s->p->priv;
1696
1697 /* find a call in limbo */
1698 h = hashipa(src, segp->source);
1699 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1700 lp = *l;
1701 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1702 continue;
1703 if(ipcmp(lp->laddr, dst) != 0)
1704 continue;
1705 if(ipcmp(lp->raddr, src) != 0)
1706 continue;
1707
1708 /* RST can only follow the SYN */
1709 if(segp->seq == lp->irs+1){
1710 tpriv->nlimbo--;
1711 *l = lp->next;
1712 free(lp);
1713 }
1714 break;
1715 }
1716 }
1717
1718 static void
initialwindow(Tcpctl * tcb)1719 initialwindow(Tcpctl *tcb)
1720 {
1721 /* RFC 3390 initial window */
1722 if(tcb->mss < 1095)
1723 tcb->cwind = 4*tcb->mss;
1724 else if(tcb->mss < 2190)
1725 tcb->cwind = 2*2190;
1726 else
1727 tcb->cwind = 2*tcb->mss;
1728 }
1729
1730 /*
1731 * come here when we finally get an ACK to our SYN-ACK.
1732 * lookup call in limbo. if found, create a new conversation
1733 *
1734 * called with proto locked
1735 */
1736 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1737 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1738 {
1739 Conv *new;
1740 Tcpctl *tcb;
1741 Tcppriv *tpriv;
1742 Tcp4hdr *h4;
1743 Tcp6hdr *h6;
1744 Limbo *lp, **l;
1745 int h;
1746
1747 /* unless it's just an ack, it can't be someone coming out of limbo */
1748 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1749 return nil;
1750
1751 tpriv = s->p->priv;
1752
1753 /* find a call in limbo */
1754 h = hashipa(src, segp->source);
1755 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1756 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1757 src, segp->source, lp->raddr, lp->rport,
1758 dst, segp->dest, lp->laddr, lp->lport,
1759 version, lp->version
1760 );
1761
1762 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1763 continue;
1764 if(ipcmp(lp->laddr, dst) != 0)
1765 continue;
1766 if(ipcmp(lp->raddr, src) != 0)
1767 continue;
1768
1769 /* we're assuming no data with the initial SYN */
1770 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1771 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1772 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1773 lp = nil;
1774 } else {
1775 tpriv->nlimbo--;
1776 *l = lp->next;
1777 }
1778 break;
1779 }
1780 if(lp == nil)
1781 return nil;
1782
1783 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1784 if(new == nil)
1785 return nil;
1786
1787 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1788 tcb = (Tcpctl*)new->ptcl;
1789 tcb->flags &= ~CLONE;
1790 tcb->timer.arg = new;
1791 tcb->timer.state = TcptimerOFF;
1792 tcb->acktimer.arg = new;
1793 tcb->acktimer.state = TcptimerOFF;
1794 tcb->katimer.arg = new;
1795 tcb->katimer.state = TcptimerOFF;
1796 tcb->rtt_timer.arg = new;
1797 tcb->rtt_timer.state = TcptimerOFF;
1798
1799 tcb->irs = lp->irs;
1800 tcb->rcv.nxt = tcb->irs+1;
1801 tcb->rcv.wptr = tcb->rcv.nxt;
1802 tcb->rcv.wsnt = 0;
1803 tcb->rcv.urg = tcb->rcv.nxt;
1804
1805 tcb->iss = lp->iss;
1806 tcb->rttseq = tcb->iss;
1807 tcb->snd.wl2 = tcb->iss;
1808 tcb->snd.una = tcb->iss+1;
1809 tcb->snd.ptr = tcb->iss+1;
1810 tcb->snd.nxt = tcb->iss+1;
1811 tcb->snd.rxt = tcb->iss+1;
1812 tcb->flgcnt = 0;
1813 tcb->flags |= SYNACK;
1814
1815 /* set desired mss and scale */
1816 tcb->mss = tcpmtu(s->p, dst, s->ipversion, &tcb->scale);
1817
1818 /* our sending max segment size cannot be bigger than what he asked for */
1819 if(lp->mss != 0 && lp->mss < tcb->mss)
1820 tcb->mss = lp->mss;
1821 tpriv->stats[Mss] = tcb->mss;
1822
1823 /* window scaling */
1824 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1825
1826 /* congestion window */
1827 tcb->snd.wnd = segp->wnd;
1828 initialwindow(tcb);
1829
1830 /* set initial round trip time */
1831 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1832 tcpsynackrtt(new);
1833
1834 free(lp);
1835
1836 /* set up proto header */
1837 switch(version){
1838 case V4:
1839 h4 = &tcb->protohdr.tcp4hdr;
1840 memset(h4, 0, sizeof(*h4));
1841 h4->proto = IP_TCPPROTO;
1842 hnputs(h4->tcpsport, new->lport);
1843 hnputs(h4->tcpdport, new->rport);
1844 v6tov4(h4->tcpsrc, dst);
1845 v6tov4(h4->tcpdst, src);
1846 break;
1847 case V6:
1848 h6 = &tcb->protohdr.tcp6hdr;
1849 memset(h6, 0, sizeof(*h6));
1850 h6->proto = IP_TCPPROTO;
1851 hnputs(h6->tcpsport, new->lport);
1852 hnputs(h6->tcpdport, new->rport);
1853 ipmove(h6->tcpsrc, dst);
1854 ipmove(h6->tcpdst, src);
1855 break;
1856 default:
1857 panic("tcpincoming: version %d", new->ipversion);
1858 }
1859
1860 tcpsetstate(new, Established);
1861
1862 iphtadd(&tpriv->ht, new);
1863
1864 return new;
1865 }
1866
1867 static int
seq_within(ulong x,ulong low,ulong high)1868 seq_within(ulong x, ulong low, ulong high)
1869 {
1870 if(low <= high){
1871 if(low <= x && x <= high)
1872 return 1;
1873 }
1874 else {
1875 if(x >= low || x <= high)
1876 return 1;
1877 }
1878 return 0;
1879 }
1880
1881 static int
seq_lt(ulong x,ulong y)1882 seq_lt(ulong x, ulong y)
1883 {
1884 return (int)(x-y) < 0;
1885 }
1886
1887 static int
seq_le(ulong x,ulong y)1888 seq_le(ulong x, ulong y)
1889 {
1890 return (int)(x-y) <= 0;
1891 }
1892
1893 static int
seq_gt(ulong x,ulong y)1894 seq_gt(ulong x, ulong y)
1895 {
1896 return (int)(x-y) > 0;
1897 }
1898
1899 static int
seq_ge(ulong x,ulong y)1900 seq_ge(ulong x, ulong y)
1901 {
1902 return (int)(x-y) >= 0;
1903 }
1904
1905 /*
1906 * use the time between the first SYN and it's ack as the
1907 * initial round trip time
1908 */
1909 static void
tcpsynackrtt(Conv * s)1910 tcpsynackrtt(Conv *s)
1911 {
1912 Tcpctl *tcb;
1913 int delta;
1914 Tcppriv *tpriv;
1915
1916 tcb = (Tcpctl*)s->ptcl;
1917 tpriv = s->p->priv;
1918
1919 delta = NOW - tcb->sndsyntime;
1920 tcb->srtt = delta<<LOGAGAIN;
1921 tcb->mdev = delta<<LOGDGAIN;
1922
1923 /* halt round trip timer */
1924 tcphalt(tpriv, &tcb->rtt_timer);
1925 }
1926
/*
 *  Process the acknowledgement and window information carried
 *  by *seg for conversation s: fast retransmit/recovery, send
 *  window update, congestion window adjustment, round trip
 *  timing, and release of acknowledged data from the write queue.
 *
 *  seg->update marks a segment that has already been through
 *  here, so calling this more than once per segment is harmless.
 */
static void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	Tcppriv *tpriv;

	/* only the first call for a given segment does any work */
	if(seg->update)
		return;
	seg->update = 1;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/* catch zero-window updates, update window & recover */
	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
	   seq_lt(seg->ack, tcb->snd.ptr)){
		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
			seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
		tcb->snd.wnd = seg->wnd;
		goto recovery;
	}

	/* newreno fast retransmit */
	/* a duplicate ack only counts while data is outstanding (una != nxt) */
	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
	   ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
recovery:
		if(tcb->snd.recovery){
			/* already recovering: just inflate the window by one segment */
			tpriv->stats[RecoveryCwind]++;
			tcb->cwind += tcb->mss;
		}else if(seq_le(tcb->snd.rxt, seg->ack)){
			/* enter recovery; snd.rxt records where recovery will end */
			tpriv->stats[Recovery]++;
			tcb->abcbytes = 0;
			tcb->snd.recovery = 1;
			tcb->snd.partialack = 0;
			tcb->snd.rxt = tcb->snd.nxt;
			tcpcongestion(tcb);
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
			tcprxmit(s);
		}else{
			tpriv->stats[RecoveryNoSeq]++;
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
			/* don't enter fast retransmit, don't change ssthresh */
		}
	}else if(tcb->snd.recovery){
		tpriv->stats[RecoveryCwind]++;
		tcb->cwind += tcb->mss;
	}

	/*
	 *  update window
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		/* clear dupack if we advance wl2 */
		if(tcb->snd.wl2 != seg->ack)
			tcb->snd.dupacks = 0;
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing new acknowledged: the rest of the routine does not apply */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
			tcb->backedoff = MAXBACKMS/4;
		return;
	}

	/* Compute the new send window size */
	acked = seg->ack - tcb->snd.una;

	/* avoid slow start and timers for SYN acks */
	if((tcb->flags & SYNACK) == 0) {
		/* this ack covers our SYN: one unit of sequence space, not queue data */
		tcb->flags |= SYNACK;
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/*
	 *  congestion control
	 */
	if(tcb->snd.recovery){
		if(seq_ge(seg->ack, tcb->snd.rxt)){
			/* recovery finished; deflate window */
			tpriv->stats[RecoveryDone]++;
			tcb->snd.dupacks = 0;
			tcb->snd.recovery = 0;
			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
			if(tcb->ssthresh < tcb->cwind)
				tcb->cwind = tcb->ssthresh;
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
				tcb->cwind, tcb->ssthresh);
		} else {
			/* partial ack; we lost more than one segment */
			tpriv->stats[RecoveryPA]++;
			if(tcb->cwind > acked)
				tcb->cwind -= acked;
			else{
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
				tcb->cwind = tcb->mss;
			}
			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);

			if(acked >= tcb->mss)
				tcb->cwind += tcb->mss;
			tcb->snd.partialack++;
		}
	} else
		tcpabcincr(tcb, acked);

	/* Adjust the timers according to the round trip time */
	/* TODO: fix sloppy treatment of overflow cases here. */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* a retransmitted segment gives no usable round trip sample */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* elapsed timer ticks, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1;	/* else all close sys's will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				/* first sample seeds both estimators */
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				/* srtt and mdev are kept scaled by their gain shifts */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/*
	 *  drop the acknowledged bytes from the write queue; if the
	 *  queue held fewer than were acked, the flag count is
	 *  decremented as well.
	 */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;
	tcb->snd.una = seg->ack;

	/* newreno fast recovery */
	if(tcb->snd.recovery)
		tcprxmit(s);

	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* restart the retransmit timer while data is outstanding, else stop it */
	if(tcb->snd.una != tcb->snd.nxt){
		/* `impatient' variant */
		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
			tcb->time = NOW;
			tcb->timeuna = tcb->snd.una;
			tcpgo(tpriv, &tcb->timer);
		}
	} else
		tcphalt(tpriv, &tcb->timer);

	/* never leave the send pointer behind the acknowledged point */
	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	if(!tcb->snd.recovery)
		tcb->flags &= ~RETRAN;
	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2105
/*
 *  Receive one TCP segment.  bp starts at the IP header; the
 *  version is taken from the first header byte.
 *
 *  The steps are: verify the checksum, convert the header,
 *  trim the block to the claimed length, find the conversation
 *  (handing SYNs, RSTs and final ACKs for listeners to the limbo
 *  code), then run the input state machine with the conversation
 *  locked.  Segments for which no conversation exists are
 *  answered with a reset.
 *
 *  Exit paths: `output' calls tcpoutput before unlocking;
 *  `raise' unlocks, frees bp and calls tcpkick.
 */
static void
tcpiput(Proto *tcp, Ipifc*, Block *bp)
{
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	ushort length, csum;
	uchar source[IPaddrlen], dest[IPaddrlen];
	Conv *s;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* same bytes viewed both ways until the version is known */
	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0)==IP_VER4) {
		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* fill in the pseudo header fields before checksumming */
		h4->Unused = 0;
		hnputs(h4->tcplen, length-TCP4_PKT);
		/* skipped when the block is flagged Btcpck or the checksum field is zero */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}

		hdrlen = ntohtcp4(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen+TCP4_PKT;
		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}
	else {
		/* saved because the header is temporarily rewritten for the checksum */
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/* rearrange the header for checksumming; restored below */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp,
			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
			return;
		}
	}

	/* from here on, length is the number of data bytes in the segment */

	/* lock protocol while searching for a conversation */
	qlock(tcp);

	/* Look for a matching conversation */
	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
	if(s == nil){
		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
			source, seg.source, dest, seg.dest);
reset:
		/* also reached from the listener code below, with tcp still locked */
		qunlock(tcp);
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state == Listen){
		if(seg.flags & RST){
			limborst(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
			limbo(s, source, dest, &seg, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/*
		 *  if there's a matching call in limbo, tcpincoming will
		 *  return a new conversation for it, already set to
		 *  Established
		 */
		s = tcpincoming(s, &seg, source, dest, version);
		if(s == nil)
			goto reset;
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl*)s->ptcl;
	if(waserror()){
		qunlock(s);
		nexterror();
	}
	/* take the conversation lock before letting go of the protocol lock */
	qlock(s);
	qunlock(tcp);

	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch(tcb->state) {
	case Closed:
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	case Syn_sent:
		if(seg.flags & ACK) {
			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
				sndrst(tcp, source, dest, length, &seg, version,
					 "bad seq in Syn_sent");
				goto raise;
			}
		}
		if(seg.flags & RST) {
			/* only a reset that acks our SYN refuses the connection */
			if(seg.flags & ACK)
				localclose(s, Econrefused);
			goto raise;
		}

		if(seg.flags & SYN) {
			procsyn(s, &seg);
			if(seg.flags & ACK){
				update(s, &seg);
				tcpsynackrtt(s);
				tcpsetstate(s, Established);
				tcpsetscale(s, tcb, seg.ws, tcb->scale);
			}
			else {
				tcb->time = NOW;
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
			}

			/* data or FIN came with the SYN: process it below */
			if(length != 0 || (seg.flags & FIN))
				break;

			freeblist(bp);
			goto output;
		}
		else
			freeblist(bp);

		qunlock(s);
		poperror();
		return;
	case Syn_received:
		/* doesn't matter if it's the correct ack, we're just trying to set timing */
		if(seg.flags & ACK)
			tcpsynackrtt(s);
		break;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
		if(tcpporthogdefense
		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
				source, seg.source, dest, seg.dest, seg.flags,
				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	tcprcvwin(s);
	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* the one case not logged is a 1-byte segment just below rcv.nxt */
		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
			netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
				"%lud-%lud l %d from %I\n", seg.seq,
				seg.seq + length - 1, tcb->rcv.nxt,
				tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
		/* the ack information is still used even though the data is not */
		update(s, &seg);
		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2*(1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		if(!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(s);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if(length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if(seg.seq != tcb->rcv.nxt)
	if(length != 0 || (seg.flags & (SYN|FIN))) {
		update(s, &seg);
		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
				s->laddr, s->lport);
		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
		goto output;
	}

	if(tcb->nreseq > 0)
		tcb->flags |= FORCE;	/* filled hole in seq. space; RFC 5681 §3.2 */

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for(;;) {
		if(seg.flags & RST) {
			if(tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if(tcb->rcv.nxt != seg.seq)
					print("out of order RST rcvd: %I.%d -> "
						"%I.%d, rcv.nxt %lux seq %lux\n",
						s->raddr, s->rport, s->laddr,
						s->lport, tcb->rcv.nxt, seg.seq);
			}
			localclose(s, Econrefused);
			goto raise;
		}

		/* past this point every segment must carry an ack */
		if((seg.flags&ACK) == 0)
			goto raise;

		switch(tcb->state) {
		case Syn_received:
			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
				sndrst(tcp, source, dest, length, &seg, version,
					"bad seq in Syn_received");
				goto raise;
			}
			update(s, &seg);
			tcpsetstate(s, Established);
			/* falls through; the second update is a no-op (seg.update is set) */
		case Established:
		case Close_wait:
			update(s, &seg);
			break;
		case Finwait1:
			update(s, &seg);
			/* nothing left unacknowledged: our FIN has been acked */
			if(qlen(s->wq)+tcb->flgcnt == 0){
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcpsetkacounter(tcb);
				tcb->time = NOW;
				tcpsetstate(s, Finwait2);
				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
				tcpgo(tpriv, &tcb->katimer);
			}
			break;
		case Finwait2:
			update(s, &seg);
			break;
		case Closing:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->timer);
			}
			break;
		case Last_ack:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				localclose(s, nil);
				goto raise;
			}
			/* falls through to Time_wait */
		case Time_wait:
			tcb->flags |= FORCE;
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);
		}

		/* urgent data is not delivered: it is pulled off the front of the block */
		if((seg.flags&URG) && seg.urg) {
			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		}
		else
			if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
				tcb->rcv.urg = tcb->rcv.nxt;

		if(length == 0) {
			if(bp != nil)
				freeblist(bp);
		}
		else {
			switch(tcb->state){
			default:
				/* Ignore segment text */
				if(bp != nil)
					freeblist(bp);
				break;

			case Syn_received:
			case Established:
			case Finwait1:
				/* If we still have some data place on
				 * receive queue
				 */
				if(bp) {
					bp = packblock(bp);
					if(bp == nil)
						panic("tcp packblock");
					qpassnolim(s->rq, bp);
					bp = nil;
				}
				tcb->rcv.nxt += length;

				/*
				 *  turn on the acktimer if there's something
				 *  to ack
				 */
				if(tcb->acktimer.state != TcptimerON)
					tcpgo(tpriv, &tcb->acktimer);

				break;
			case Finwait2:
				/* no process to read the data, send a reset */
				if(bp != nil)
					freeblist(bp);
				sndrst(tcp, source, dest, length, &seg, version,
					"send to Finwait2");
				qunlock(s);
				poperror();
				return;
			}
		}

		if(seg.flags & FIN) {
			tcb->flags |= FORCE;

			/* the FIN takes one unit of sequence space, hence rcv.nxt++ */
			switch(tcb->state) {
			case Syn_received:
			case Established:
				tcb->rcv.nxt++;
				tcpsetstate(s, Close_wait);
				break;
			case Finwait1:
				tcb->rcv.nxt++;
				if(qlen(s->wq)+tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2*(1000/MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				else
					tcpsetstate(s, Closing);
				break;
			case Finwait2:
				tcb->rcv.nxt++;
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2 * (1000/MSPTICK);
				tcpgo(tpriv, &tcb->timer);
				break;
			case Close_wait:
			case Closing:
			case Last_ack:
				break;
			case Time_wait:
				tcpgo(tpriv, &tcb->timer);
				break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for(;;) {
			if(tcb->reseq == nil)
				goto output;

			/* stop at the first queued segment that starts beyond rcv.nxt */
			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			tcprcvwin(s);
			if(tcptrim(tcb, &seg, &bp, &length) == 0){
				/* usable segment: process it in the outer loop */
				tcb->flags |= FORCE;
				break;
			}
		}
	}
output:
	tcpoutput(s);
	qunlock(s);
	poperror();
	return;
raise:
	qunlock(s);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
2584
/*
 *  Send whatever segments the connection state currently allows:
 *  new data, retransmissions (when the caller has rewound snd.ptr),
 *  SYN/FIN flags and bare acks.
 *
 *  Always enters and exits with s locked.  The lock is released
 *  and immediately re-acquired after every fourth segment sent
 *  within one call (see the bottom of the loop), so callers must
 *  not assume that the connection is unchanged across the call.
 *
 *  At most 100 segments are produced per call.
 */
static void
tcpoutput(Conv *s)
{
	Tcp seg;
	uint msgs;
	Tcpctl *tcb;
	Block *hbp, *bp;
	int sndcnt;
	ulong ssize, dsize, sent;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	tcb = (Tcpctl*)s->ptcl;

	/* force ack every 2*mss */
	if((tcb->flags & FORCE) == 0 &&
	   tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
		tpriv->stats[Delayack]++;
		tcb->flags |= FORCE;
	}

	/* force ack if window opening */
	if((tcb->flags & FORCE) == 0){
		tcprcvwin(s);
		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
			tpriv->stats[Wopenack]++;
			tcb->flags |= FORCE;
		}
	}

	for(msgs = 0; msgs < 100; msgs++) {
		/* nothing is ever sent from these states */
		switch(tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* Don't send anything else until our SYN has been acked */
		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* force an ack when a window has opened up */
		tcprcvwin(s);
		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/*
		 *  sndcnt: bytes queued for writing plus tcb->flgcnt;
		 *  sent:   amount between snd.una and snd.ptr;
		 *  ssize:  how far snd.ptr will advance for this segment.
		 */
		sndcnt = qlen(s->wq)+tcb->flgcnt;
		sent = tcb->snd.ptr - tcb->snd.una;
		ssize = sndcnt;
		if(tcb->snd.wnd == 0){
			/* zero window probe */
			if(sent > 0 && !(tcb->flags & FORCE))
				break;	/* already probing, rto re-probes */
			if(ssize < sent)
				ssize = 0;
			else{
				ssize -= sent;
				/* a probe advances by at most one */
				if(ssize > 0)
					ssize = 1;
			}
		} else {
			/* calculate usable segment size */
			if(ssize > tcb->cwind)
				ssize = tcb->cwind;
			if(ssize > tcb->snd.wnd)
				ssize = tcb->snd.wnd;

			if(ssize < sent)
				ssize = 0;
			else {
				ssize -= sent;
				if(ssize > tcb->mss)
					ssize = tcb->mss;
			}
		}

		/* dsize is the number of data bytes to copy; adjusted below for SYN/FIN */
		dsize = ssize;
		seg.urg = 0;

		/*
		 *  unless an ack is being forced, stop when there is
		 *  nothing to send, or when the segment would be smaller
		 *  than mss while snd.ptr is level with snd.nxt and more
		 *  than TCPREXMTTHRESH*mss is already outstanding.
		 */
		if(!(tcb->flags & FORCE))
			if(ssize == 0 ||
			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
			    sent > TCPREXMTTHRESH * tcb->mss)
				break;

		tcb->flags &= ~FORCE;

		/* By default we will generate an ack */
		tcphalt(tpriv, &tcb->acktimer);
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		seg.update = 0;
		switch(tcb->state){
		case Syn_sent:
			/* no ACK flag in this state; SYN only if snd.ptr is still at iss */
			seg.flags = 0;
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				/* presumably: the SYN is counted in ssize but is not a data byte */
				dsize--;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		case Syn_received:
			/*
			 *  don't send any data with a SYN/ACK packet
			 *  because Linux rejects the packet in its
			 *  attempt to solve the SYN attack problem
			 */
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize = 0;
				ssize = 1;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		}
		seg.seq = tcb->snd.ptr;
		seg.ack = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd;

		/* Pull out data to send */
		bp = nil;
		if(dsize != 0) {
			bp = qcopy(s->wq, dsize, sent);
			/*
			 *  fewer bytes were copied than asked for: the
			 *  difference is treated as a FIN rather than data.
			 */
			if(BLEN(bp) != dsize) {
				seg.flags |= FIN;
				dsize--;
			}
		}

		/* set PSH when this segment's data reaches the end of what is queued */
		if(sent+dsize == sndcnt && dsize)
			seg.flags |= PSH;

		tcb->snd.ptr += ssize;

		/* Pull up the send pointer so we can accept acks
		 * for this window
		 */
		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
			tcb->snd.nxt = tcb->snd.ptr;

		/* Build header, link data and compute cksum */
		switch(version){
		case V4:
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
			if(hbp == nil) {
				/* NOTE(review): returns with snd.ptr already advanced */
				freeblist(bp);
				return;
			}
			break;
		case V6:
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		default:
			hbp = nil;	/* to suppress a warning */
			panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if(ssize != 0){
			if(tcb->timer.state != TcptimerON){
				tcb->time = NOW;
				tcb->timeuna = tcb->snd.una;
				tcpgo(tpriv, &tcb->timer);
			}

			/*  If round trip timer isn't running, start it.
			 *  measure the longest packet only in case the
			 *  transmission time dominates RTT
			 */
			if(tcb->snd.retransmit == 0)
			if(tcb->rtt_timer.state != TcptimerON)
			if(ssize == tcb->mss) {
				tcpgo(tpriv, &tcb->rtt_timer);
				tcb->rttseq = tcb->snd.ptr;
			}
		}

		tpriv->stats[OutSegs]++;
		if(tcb->snd.retransmit)
			tpriv->stats[RetransSegsSent]++;
		/* remember what was acked and advertised in this segment */
		tcb->rcv.ackptr = seg.ack;
		tcb->rcv.wsnt = tcb->rcv.wptr;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch(version){
		case V4:
			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		case V6:
			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		default:
			panic("tcpoutput2: version %d", version);
		}
		/* after every fourth segment, let go of the lock for a moment */
		if((msgs%4) == 3){
			qunlock(s);
			qlock(s);
		}
	}
}
2819
2820 /*
2821 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2822 */
2823 static void
tcpsendka(Conv * s)2824 tcpsendka(Conv *s)
2825 {
2826 Tcp seg;
2827 Tcpctl *tcb;
2828 Block *hbp,*dbp;
2829
2830 tcb = (Tcpctl*)s->ptcl;
2831
2832 dbp = nil;
2833 memset(&seg, 0, sizeof seg);
2834 seg.urg = 0;
2835 seg.source = s->lport;
2836 seg.dest = s->rport;
2837 seg.flags = ACK|PSH;
2838 seg.mss = 0;
2839 seg.ws = 0;
2840 if(tcpporthogdefense)
2841 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2842 else
2843 seg.seq = tcb->snd.una-1;
2844 seg.ack = tcb->rcv.nxt;
2845 tcb->rcv.ackptr = seg.ack;
2846 tcprcvwin(s);
2847 seg.wnd = tcb->rcv.wnd;
2848 if(tcb->state == Finwait2){
2849 seg.flags |= FIN;
2850 } else {
2851 dbp = allocb(1);
2852 dbp->wp++;
2853 }
2854
2855 if(isv4(s->raddr)) {
2856 /* Build header, link data and compute cksum */
2857 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2858 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2859 if(hbp == nil) {
2860 freeblist(dbp);
2861 return;
2862 }
2863 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2864 }
2865 else {
2866 /* Build header, link data and compute cksum */
2867 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2868 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2869 if(hbp == nil) {
2870 freeblist(dbp);
2871 return;
2872 }
2873 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2874 }
2875 }
2876
2877 /*
2878 * set connection to time out after 12 minutes
2879 */
2880 static void
tcpsetkacounter(Tcpctl * tcb)2881 tcpsetkacounter(Tcpctl *tcb)
2882 {
2883 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2884 if(tcb->kacounter < 3)
2885 tcb->kacounter = 3;
2886 }
2887
2888 /*
2889 * if we've timed out, close the connection
2890 * otherwise, send a keepalive and restart the timer
2891 */
2892 static void
tcpkeepalive(void * v)2893 tcpkeepalive(void *v)
2894 {
2895 Tcpctl *tcb;
2896 Conv *s;
2897
2898 s = v;
2899 tcb = (Tcpctl*)s->ptcl;
2900 if(waserror()){
2901 qunlock(s);
2902 nexterror();
2903 }
2904 qlock(s);
2905 if(tcb->state != Closed){
2906 if(--(tcb->kacounter) <= 0) {
2907 localclose(s, Etimedout);
2908 } else {
2909 tcpsendka(s);
2910 tcpgo(s->p->priv, &tcb->katimer);
2911 }
2912 }
2913 qunlock(s);
2914 poperror();
2915 }
2916
2917 /*
2918 * start keepalive timer
2919 */
2920 static char*
tcpstartka(Conv * s,char ** f,int n)2921 tcpstartka(Conv *s, char **f, int n)
2922 {
2923 Tcpctl *tcb;
2924 int x;
2925
2926 tcb = (Tcpctl*)s->ptcl;
2927 if(tcb->state != Established)
2928 return "connection must be in Establised state";
2929 if(n > 1){
2930 x = atoi(f[1]);
2931 if(x >= MSPTICK)
2932 tcb->katimer.start = x/MSPTICK;
2933 }
2934 tcpsetkacounter(tcb);
2935 tcpgo(s->p->priv, &tcb->katimer);
2936
2937 return nil;
2938 }
2939
2940 /*
2941 * turn checksums on/off
2942 */
2943 static char*
tcpsetchecksum(Conv * s,char ** f,int)2944 tcpsetchecksum(Conv *s, char **f, int)
2945 {
2946 Tcpctl *tcb;
2947
2948 tcb = (Tcpctl*)s->ptcl;
2949 tcb->nochecksum = !atoi(f[1]);
2950
2951 return nil;
2952 }
2953
2954 /*
2955 * retransmit (at most) one segment at snd.una.
2956 * preserve cwind & snd.ptr
2957 */
2958 static void
tcprxmit(Conv * s)2959 tcprxmit(Conv *s)
2960 {
2961 Tcpctl *tcb;
2962 Tcppriv *tpriv;
2963 ulong tcwind, tptr;
2964
2965 tcb = (Tcpctl*)s->ptcl;
2966 tcb->flags |= RETRAN|FORCE;
2967
2968 tptr = tcb->snd.ptr;
2969 tcwind = tcb->cwind;
2970 tcb->snd.ptr = tcb->snd.una;
2971 tcb->cwind = tcb->mss;
2972 tcb->snd.retransmit = 1;
2973 tcpoutput(s);
2974 tcb->snd.retransmit = 0;
2975 tcb->cwind = tcwind;
2976 tcb->snd.ptr = tptr;
2977
2978 tpriv = s->p->priv;
2979 tpriv->stats[RetransSegs]++;
2980 }
2981
2982 /*
2983 * TODO: RFC 4138 F-RTO
2984 */
2985 static void
tcptimeout(void * arg)2986 tcptimeout(void *arg)
2987 {
2988 Conv *s;
2989 Tcpctl *tcb;
2990 int maxback;
2991 Tcppriv *tpriv;
2992
2993 s = (Conv*)arg;
2994 tpriv = s->p->priv;
2995 tcb = (Tcpctl*)s->ptcl;
2996
2997 if(waserror()){
2998 qunlock(s);
2999 nexterror();
3000 }
3001 qlock(s);
3002 switch(tcb->state){
3003 default:
3004 tcb->backoff++;
3005 if(tcb->state == Syn_sent)
3006 maxback = MAXBACKMS/2;
3007 else
3008 maxback = MAXBACKMS;
3009 tcb->backedoff += tcb->timer.start * MSPTICK;
3010 if(tcb->backedoff >= maxback) {
3011 localclose(s, Etimedout);
3012 break;
3013 }
3014 netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
3015 tcb->srtt, tcb->mdev, NOW - tcb->time,
3016 tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
3017 tcpstates[s->state]);
3018 tcpsettimer(tcb);
3019 if(tcb->snd.rto == 0)
3020 tcpcongestion(tcb);
3021 tcprxmit(s);
3022 tcb->snd.ptr = tcb->snd.una;
3023 tcb->cwind = tcb->mss;
3024 tcb->snd.rto = 1;
3025 tpriv->stats[RetransTimeouts]++;
3026
3027 if(tcb->snd.recovery){
3028 tcb->snd.dupacks = 0; /* reno rto */
3029 tcb->snd.recovery = 0;
3030 tpriv->stats[RecoveryRTO]++;
3031 tcb->snd.rxt = tcb->snd.nxt;
3032 netlog(s->p->f, Logtcpwin,
3033 "rto recovery rxt @%lud\n", tcb->snd.nxt);
3034 }
3035
3036 tcb->abcbytes = 0;
3037 break;
3038 case Time_wait:
3039 localclose(s, nil);
3040 break;
3041 case Closed:
3042 break;
3043 }
3044 qunlock(s);
3045 poperror();
3046 }
3047
3048 static int
inwindow(Tcpctl * tcb,int seq)3049 inwindow(Tcpctl *tcb, int seq)
3050 {
3051 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3052 }
3053
3054 /*
3055 * set up state for a received SYN (or SYN ACK) packet
3056 */
3057 static void
procsyn(Conv * s,Tcp * seg)3058 procsyn(Conv *s, Tcp *seg)
3059 {
3060 Tcpctl *tcb;
3061 Tcppriv *tpriv;
3062
3063 tcb = (Tcpctl*)s->ptcl;
3064 tcb->flags |= FORCE;
3065
3066 tcb->rcv.nxt = seg->seq + 1;
3067 tcb->rcv.wptr = tcb->rcv.nxt;
3068 tcb->rcv.wsnt = 0;
3069 tcb->rcv.urg = tcb->rcv.nxt;
3070 tcb->irs = seg->seq;
3071
3072 /* our sending max segment size cannot be bigger than what he asked for */
3073 if(seg->mss != 0 && seg->mss < tcb->mss) {
3074 tcb->mss = seg->mss;
3075 tpriv = s->p->priv;
3076 tpriv->stats[Mss] = tcb->mss;
3077 }
3078
3079 tcb->snd.wnd = seg->wnd;
3080 initialwindow(tcb);
3081 }
3082
3083 static int
dumpreseq(Tcpctl * tcb)3084 dumpreseq(Tcpctl *tcb)
3085 {
3086 Reseq *r, *next;
3087
3088 for(r = tcb->reseq; r != nil; r = next){
3089 next = r->next;
3090 freeblist(r->bp);
3091 free(r);
3092 }
3093 tcb->reseq = nil;
3094 tcb->nreseq = 0;
3095 tcb->reseqlen = 0;
3096 return -1;
3097 }
3098
3099 static void
logreseq(Fs * f,Reseq * r,ulong n)3100 logreseq(Fs *f, Reseq *r, ulong n)
3101 {
3102 char *s;
3103
3104 for(; r != nil; r = r->next){
3105 s = nil;
3106 if(r->next == nil && r->seg.seq != n)
3107 s = "hole/end";
3108 else if(r->next == nil)
3109 s = "end";
3110 else if(r->seg.seq != n)
3111 s = "hole";
3112 if(s != nil)
3113 netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3114 n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3115 n = r->seg.seq + r->seg.len;
3116 }
3117 }
3118
3119 static int
addreseq(Fs * f,Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)3120 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3121 {
3122 Reseq *rp, **rr;
3123 int qmax;
3124
3125 rp = malloc(sizeof *rp);
3126 if(rp == nil){
3127 freeblist(bp); /* bp always consumed by addreseq */
3128 return 0;
3129 }
3130
3131 rp->seg = *seg;
3132 rp->bp = bp;
3133 rp->length = length;
3134
3135 tcb->reseqlen += length;
3136 tcb->nreseq++;
3137
3138 /* Place on reassembly list sorting by starting seq number */
3139 for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3140 if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3141 rp->next = *rr;
3142 *rr = rp;
3143 tpriv->stats[Resequenced]++;
3144 if(rp->next != nil)
3145 tpriv->stats[OutOfOrder]++;
3146 break;
3147 }
3148
3149 qmax = tcb->window;
3150 if(tcb->reseqlen > qmax){
3151 netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3152 tcb->reseqlen, qmax, tcb->nreseq);
3153 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3154 tpriv->stats[ReseqBytelim]++;
3155 return dumpreseq(tcb);
3156 }
3157 qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3158 if(tcb->nreseq > qmax){
3159 netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3160 tcb->nreseq, qmax, tcb->reseqlen);
3161 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3162 tpriv->stats[ReseqPktlim]++;
3163 return dumpreseq(tcb);
3164 }
3165 return 0;
3166 }
3167
3168 static void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3169 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3170 {
3171 Reseq *rp;
3172
3173 rp = tcb->reseq;
3174 if(rp == nil)
3175 return;
3176
3177 tcb->reseq = rp->next;
3178
3179 *seg = rp->seg;
3180 *bp = rp->bp;
3181 *length = rp->length;
3182
3183 tcb->nreseq--;
3184 tcb->reseqlen -= rp->length;
3185
3186 free(rp);
3187 }
3188
/*
 *  decide whether a segment is acceptable given the receive
 *  window and, if so, trim it to fit.
 *
 *  Returns 0 if the segment is to be processed; *bp, *length
 *  and the fields of seg may have been adjusted to drop data
 *  before rcv.nxt or beyond the end of the window.
 *  Returns -1 if the segment is rejected; *bp has then been
 *  freed.
 */
static int
tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
	ushort len;
	uchar accept;
	int dupcnt, excess;

	accept = 0;
	/* len is the segment length with SYN and FIN each counted as one */
	len = *length;
	if(seg->flags & SYN)
		len++;
	if(seg->flags & FIN)
		len++;

	if(tcb->rcv.wnd == 0) {
		/*
		 *  closed window: only an empty segment exactly at
		 *  rcv.nxt gets through (untrimmed); anything else
		 *  leaves accept at 0 and is rejected below.
		 */
		if(len == 0 && seg->seq == tcb->rcv.nxt)
			return 0;
	}
	else {
		/* Some part of the segment should be in the window */
		if(inwindow(tcb,seg->seq))
			accept++;
		else
		if(len != 0) {
			/* its last byte is in the window, or it straddles rcv.nxt */
			if(inwindow(tcb, seg->seq+len-1) ||
			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
				accept++;
		}
	}
	if(!accept) {
		freeblist(*bp);
		return -1;
	}
	/* dupcnt: how much of the front of the segment lies before rcv.nxt */
	dupcnt = tcb->rcv.nxt - seg->seq;
	if(dupcnt > 0){
		tcb->rerecv += dupcnt;
		if(seg->flags & SYN){
			/* the duplicated SYN accounts for the first unit */
			seg->flags &= ~SYN;
			seg->seq++;

			if(seg->urg > 1)
				seg->urg--;
			else
				seg->flags &= ~URG;
			dupcnt--;
		}
		if(dupcnt > 0){
			/* drop the duplicate data from the front */
			pullblock(bp, (ushort)dupcnt);
			seg->seq += dupcnt;
			*length -= dupcnt;

			if(seg->urg > dupcnt)
				seg->urg -= dupcnt;
			else {
				seg->flags &= ~URG;
				seg->urg = 0;
			}
		}
	}
	/* excess: how much of the tail lies beyond the end of the window */
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
	if(excess > 0) {
		tcb->rerecv += excess;
		*length -= excess;
		*bp = trimblock(*bp, 0, *length);
		if(*bp == nil)
			panic("presotto is a boofhead");
		/* the end of the segment was cut off, so its FIN no longer applies */
		seg->flags &= ~FIN;
	}
	return 0;
}
3259
3260 static void
tcpadvise(Proto * tcp,Block * bp,char * msg)3261 tcpadvise(Proto *tcp, Block *bp, char *msg)
3262 {
3263 Tcp4hdr *h4;
3264 Tcp6hdr *h6;
3265 Tcpctl *tcb;
3266 uchar source[IPaddrlen];
3267 uchar dest[IPaddrlen];
3268 ushort psource, pdest;
3269 Conv *s, **p;
3270
3271 h4 = (Tcp4hdr*)(bp->rp);
3272 h6 = (Tcp6hdr*)(bp->rp);
3273
3274 if((h4->vihl&0xF0)==IP_VER4) {
3275 v4tov6(dest, h4->tcpdst);
3276 v4tov6(source, h4->tcpsrc);
3277 psource = nhgets(h4->tcpsport);
3278 pdest = nhgets(h4->tcpdport);
3279 }
3280 else {
3281 ipmove(dest, h6->tcpdst);
3282 ipmove(source, h6->tcpsrc);
3283 psource = nhgets(h6->tcpsport);
3284 pdest = nhgets(h6->tcpdport);
3285 }
3286
3287 /* Look for a connection */
3288 qlock(tcp);
3289 for(p = tcp->conv; *p; p++) {
3290 s = *p;
3291 tcb = (Tcpctl*)s->ptcl;
3292 if(s->rport == pdest)
3293 if(s->lport == psource)
3294 if(tcb->state != Closed)
3295 if(ipcmp(s->raddr, dest) == 0)
3296 if(ipcmp(s->laddr, source) == 0){
3297 qlock(s);
3298 qunlock(tcp);
3299 switch(tcb->state){
3300 case Syn_sent:
3301 localclose(s, msg);
3302 break;
3303 }
3304 qunlock(s);
3305 freeblist(bp);
3306 return;
3307 }
3308 }
3309 qunlock(tcp);
3310 freeblist(bp);
3311 }
3312
3313 static char*
tcpporthogdefensectl(char * val)3314 tcpporthogdefensectl(char *val)
3315 {
3316 if(strcmp(val, "on") == 0)
3317 tcpporthogdefense = 1;
3318 else if(strcmp(val, "off") == 0)
3319 tcpporthogdefense = 0;
3320 else
3321 return "unknown value for tcpporthogdefense";
3322 return nil;
3323 }
3324
3325 /* called with c qlocked */
3326 static char*
tcpctl(Conv * c,char ** f,int n)3327 tcpctl(Conv* c, char** f, int n)
3328 {
3329 if(n == 1 && strcmp(f[0], "close") == 0)
3330 return tcpclose2(c);
3331 if(n == 1 && strcmp(f[0], "hangup") == 0)
3332 return tcphangup(c);
3333 if(n == 1 && strcmp(f[0], "hangupxmit") == 0)
3334 return tcpxmitclose(c);
3335 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3336 return tcpstartka(c, f, n);
3337 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3338 return tcpsetchecksum(c, f, n);
3339 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3340 return tcpporthogdefensectl(f[1]);
3341 return "unknown control request";
3342 }
3343
3344 static int
tcpstats(Proto * tcp,char * buf,int len)3345 tcpstats(Proto *tcp, char *buf, int len)
3346 {
3347 Tcppriv *priv;
3348 char *p, *e;
3349 int i;
3350
3351 priv = tcp->priv;
3352 p = buf;
3353 e = p+len;
3354 for(i = 0; i < Nstats; i++)
3355 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3356 return p - buf;
3357 }
3358
3359 /*
3360 * garbage collect any stale conversations:
3361 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3362 * - Finwait2 after 5 minutes
3363 *
3364 * this is called whenever we run out of channels. Both checks are
3365 * of questionable validity so we try to use them only when we're
3366 * up against the wall.
3367 */
3368 static int
tcpgc(Proto * tcp)3369 tcpgc(Proto *tcp)
3370 {
3371 Conv *c, **pp, **ep;
3372 int n;
3373 Tcpctl *tcb;
3374
3375
3376 n = 0;
3377 ep = &tcp->conv[tcp->nc];
3378 for(pp = tcp->conv; pp < ep; pp++) {
3379 c = *pp;
3380 if(c == nil)
3381 break;
3382 if(!canqlock(c))
3383 continue;
3384 tcb = (Tcpctl*)c->ptcl;
3385 switch(tcb->state){
3386 case Syn_received:
3387 if(NOW - tcb->time > 5000){
3388 localclose(c, Etimedout);
3389 n++;
3390 }
3391 break;
3392 case Finwait2:
3393 if(NOW - tcb->time > 5*60*1000){
3394 localclose(c, Etimedout);
3395 n++;
3396 }
3397 break;
3398 }
3399 qunlock(c);
3400 }
3401 return n;
3402 }
3403
3404 static void
tcpsettimer(Tcpctl * tcb)3405 tcpsettimer(Tcpctl *tcb)
3406 {
3407 int x;
3408
3409 /* round trip dependency */
3410 x = backoff(tcb->backoff) *
3411 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3412
3413 /* bounded twixt 0.3 and 64 seconds */
3414 if(x < 300/MSPTICK)
3415 x = 300/MSPTICK;
3416 else if(x > (64000/MSPTICK))
3417 x = 64000/MSPTICK;
3418 tcb->timer.start = x;
3419 }
3420
3421 void
tcpinit(Fs * fs)3422 tcpinit(Fs *fs)
3423 {
3424 Proto *tcp;
3425 Tcppriv *tpriv;
3426
3427 tcp = smalloc(sizeof(Proto));
3428 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3429 tcp->name = "tcp";
3430 tcp->connect = tcpconnect;
3431 tcp->announce = tcpannounce;
3432 tcp->ctl = tcpctl;
3433 tcp->state = tcpstate;
3434 tcp->create = tcpcreate;
3435 tcp->close = tcpclose;
3436 tcp->rcv = tcpiput;
3437 tcp->advise = tcpadvise;
3438 tcp->stats = tcpstats;
3439 tcp->inuse = tcpinuse;
3440 tcp->gc = tcpgc;
3441 tcp->ipproto = IP_TCPPROTO;
3442 tcp->nc = scalednconv();
3443 tcp->ptclsize = sizeof(Tcpctl);
3444 tpriv->stats[MaxConn] = tcp->nc;
3445
3446 Fsproto(fs, tcp);
3447 }
3448
3449 static void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3450 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3451 {
3452 /*
3453 * guess at reasonable queue sizes. there's no current way
3454 * to know how many nic receive buffers we can safely tie up in the
3455 * tcp stack, and we don't adjust our queues to maximize throughput
3456 * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be
3457 * respected, but we still control our own buffer commitment by
3458 * keeping a seperate qscale.
3459 */
3460 tcb->rcv.scale = rcvscale & 0xff;
3461 tcb->snd.scale = sndscale & 0xff;
3462 tcb->qscale = rcvscale & 0xff;
3463 if(rcvscale > Maxqscale)
3464 tcb->qscale = Maxqscale;
3465
3466 if(rcvscale != tcb->rcv.scale)
3467 netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3468 "qlen %d >> window %ud lport %d\n",
3469 tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3470 tcb->window = QMAX << tcb->qscale;
3471 tcb->ssthresh = tcb->window;
3472
3473 /*
3474 * it's important to set wq large enough to cover the full
3475 * bandwidth-delay product. it's possible to be in loss
3476 * recovery with a big window, and we need to keep sending
3477 * into the inflated window. the difference can be huge
3478 * for even modest (70ms) ping times.
3479 */
3480 qsetlimit(s->rq, tcb->window);
3481 qsetlimit(s->wq, tcb->window);
3482 tcprcvwin(s);
3483 }
3484