1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7
8 #include "ip.h"
9
/*
 * Protocol-wide constants: header geometry, timer parameters,
 * TCP flag bits, option codes, tcpoutput/tcb flag bits and the
 * connection state numbers.
 */
enum
{
	QMAX		= 64*1024-1,
	IP_TCPPROTO	= 6,

	/* IPv4: offsets/sizes within Tcp4hdr */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* IPv6: offsets/sizes within Tcp6hdr */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* flag bits of the TCP header */
	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* option kinds and lengths */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits kept in Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	/* shifts used when scaling srtt and mdev */
	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb * 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 *
	 * NOTE(review): with Maxqscale = 4 the product is 64kb * 2⁴ = 1mb;
	 * the 2⁵ figure above looks stale -- confirm.
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};
92
93 /* Must correspond to the enumeration above */
/* printable state names, indexed by the connection state number */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
100
/*
 * One countdown timer.  Running timers are linked through
 * next/prev on Tcppriv.timers; readynext threads the timers
 * that expired during one pass of tcpackproc().
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* neighbours on the active list */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* next expired timer to run */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* value count is reloaded with by tcpgo() */
	int	count;			/* ticks left; decremented by tcpackproc() */
	void	(*func)(void*);		/* called with arg on expiry */
	void	*arg;
};
113
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
118 typedef struct Tcp4hdr Tcp4hdr;
119 struct Tcp4hdr
120 {
121 uchar vihl; /* Version and header length */
122 uchar tos; /* Type of service */
123 uchar length[2]; /* packet length */
124 uchar id[2]; /* Identification */
125 uchar frag[2]; /* Fragment information */
126 uchar Unused;
127 uchar proto;
128 uchar tcplen[2];
129 uchar tcpsrc[4];
130 uchar tcpdst[4];
131 /* same as v6 from here on */
132 uchar tcpsport[2];
133 uchar tcpdport[2];
134 uchar tcpseq[4];
135 uchar tcpack[4];
136 uchar tcpflag[2];
137 uchar tcpwin[2];
138 uchar tcpcksum[2];
139 uchar tcpurg[2];
140 /* Options segment */
141 uchar tcpopt[1];
142 };
143
144 typedef struct Tcp6hdr Tcp6hdr;
145 struct Tcp6hdr
146 {
147 uchar vcf[4];
148 uchar ploadlen[2];
149 uchar proto;
150 uchar ttl;
151 uchar tcpsrc[IPaddrlen];
152 uchar tcpdst[IPaddrlen];
153 /* same as v4 from here on */
154 uchar tcpsport[2];
155 uchar tcpdport[2];
156 uchar tcpseq[4];
157 uchar tcpack[4];
158 uchar tcpflag[2];
159 uchar tcpwin[2];
160 uchar tcpcksum[2];
161 uchar tcpurg[2];
162 /* Options segment */
163 uchar tcpopt[1];
164 };
165
166 /*
167 * this represents the control info
168 * for a single packet. It is derived from
169 * a packet in ntohtcp{4,6}() and stuck into
170 * a packet in htontcp{4,6}().
171 */
172 typedef struct Tcp Tcp;
173 struct Tcp
174 {
175 ushort source;
176 ushort dest;
177 ulong seq;
178 ulong ack;
179 uchar flags;
180 uchar update;
181 ushort ws; /* window scale option */
182 ulong wnd; /* prescaled window*/
183 ushort urg;
184 ushort mss; /* max segment size option (if not zero) */
185 ushort len; /* size of data */
186 };
187
188 /*
189 * this header is malloc'd to thread together fragments
190 * waiting to be coalesced
191 */
192 typedef struct Reseq Reseq;
193 struct Reseq
194 {
195 Reseq *next;
196 Tcp seg;
197 Block *bp;
198 ushort length;
199 };
200
201 /*
202 * the qlock in the Conv locks this structure
203 */
204 typedef struct Tcpctl Tcpctl;
205 struct Tcpctl
206 {
207 uchar state; /* Connection state */
208 uchar type; /* Listening or active connection */
209 uchar code; /* Icmp code */
210 struct {
211 ulong una; /* Unacked data pointer */
212 ulong nxt; /* Next sequence expected */
213 ulong ptr; /* Data pointer */
214 ulong wnd; /* Tcp send window */
215 ulong urg; /* Urgent data pointer */
216 ulong wl2;
217 uint scale; /* how much to right shift window */
218 /* in xmitted packets */
219 /* to implement tahoe and reno TCP */
220 ulong dupacks; /* number of duplicate acks rcvd */
221 ulong partialack;
222 int recovery; /* loss recovery flag */
223 int retransmit; /* retransmit 1 packet @ una flag */
224 int rto;
225 ulong rxt; /* right window marker for recovery */
226 /* "recover" rfc3782 */
227 } snd;
228 struct {
229 ulong nxt; /* Receive pointer to next uchar slot */
230 ulong wnd; /* Receive window incoming */
231 ulong wsnt; /* Last wptr sent. important to */
232 /* track for large bdp */
233 ulong wptr;
234 ulong urg; /* Urgent pointer */
235 ulong ackptr; /* last acked sequence */
236 int blocked;
237 uint scale; /* how much to left shift window in */
238 /* rcv'd packets */
239 } rcv;
240 ulong iss; /* Initial sequence number */
241 ulong cwind; /* Congestion window */
242 ulong abcbytes; /* appropriate byte counting rfc 3465 */
243 uint scale; /* desired snd.scale */
244 ulong ssthresh; /* Slow start threshold */
245 int resent; /* Bytes just resent */
246 int irs; /* Initial received squence */
247 ushort mss; /* Maximum segment size */
248 int rerecv; /* Overlap of data rerecevived */
249 ulong window; /* Our receive window (queue) */
250 uint qscale; /* Log2 of our receive window (queue) */
251 uchar backoff; /* Exponential backoff counter */
252 int backedoff; /* ms we've backed off for rexmits */
253 uchar flags; /* State flags */
254 Reseq *reseq; /* Resequencing queue */
255 int nreseq;
256 int reseqlen;
257 Tcptimer timer; /* Activity timer */
258 Tcptimer acktimer; /* Acknowledge timer */
259 Tcptimer rtt_timer; /* Round trip timer */
260 Tcptimer katimer; /* keep alive timer */
261 ulong rttseq; /* Round trip sequence */
262 int srtt; /* Smoothed round trip */
263 int mdev; /* Mean deviation of round trip */
264 int kacounter; /* count down for keep alive */
265 uint sndsyntime; /* time syn sent */
266 ulong time; /* time Finwait2 or Syn_received was sent */
267 ulong timeuna; /* snd.una when time was set */
268 int nochecksum; /* non-zero means don't send checksums */
269 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
270
271 union {
272 Tcp4hdr tcp4hdr;
273 Tcp6hdr tcp6hdr;
274 } protohdr; /* prototype header */
275 };
276
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
290 typedef struct Limbo Limbo;
291 struct Limbo
292 {
293 Limbo *next;
294
295 uchar laddr[IPaddrlen];
296 uchar raddr[IPaddrlen];
297 ushort lport;
298 ushort rport;
299 ulong irs; /* initial received sequence */
300 ulong iss; /* initial sent sequence */
301 ushort mss; /* mss from the other end */
302 ushort rcvscale; /* how much to scale rcvd windows */
303 ushort sndscale; /* how much to scale sent windows */
304 ulong lastsend; /* last time we sent a synack */
305 uchar version; /* v4 or v6 */
306 uchar rexmits; /* number of retransmissions */
307 };
308
309 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
310
/* indices into Tcppriv.stats[] and statnames[] */
enum {
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	/* loss recovery */
	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats		/* number of counters; must stay last */
};
347
348 static char *statnames[Nstats] =
349 {
350 [MaxConn] "MaxConn",
351 [Mss] "MaxSegment",
352 [ActiveOpens] "ActiveOpens",
353 [PassiveOpens] "PassiveOpens",
354 [EstabResets] "EstabResets",
355 [CurrEstab] "CurrEstab",
356 [InSegs] "InSegs",
357 [OutSegs] "OutSegs",
358 [RetransSegs] "RetransSegs",
359 [RetransSegsSent] "RetransSegsSent",
360 [RetransTimeouts] "RetransTimeouts",
361 [InErrs] "InErrs",
362 [OutRsts] "OutRsts",
363 [CsumErrs] "CsumErrs",
364 [HlenErrs] "HlenErrs",
365 [LenErrs] "LenErrs",
366 [OutOfOrder] "OutOfOrder",
367 [Resequenced] "Resequenced",
368 [ReseqBytelim] "ReseqBytelim",
369 [ReseqPktlim] "ReseqPktlim",
370 [Delayack] "Delayack",
371 [Wopenack] "Wopenack",
372
373 [Recovery] "Recovery",
374 [RecoveryDone] "RecoveryDone",
375 [RecoveryRTO] "RecoveryRTO",
376
377 [RecoveryNoSeq] "RecoveryNoSeq",
378 [RecoveryCwind] "RecoveryCwind",
379 [RecoveryPA] "RecoveryPA",
380 };
381
382 typedef struct Tcppriv Tcppriv;
383 struct Tcppriv
384 {
385 /* List of active timers */
386 QLock tl;
387 Tcptimer *timers;
388
389 /* hash table for matching conversations */
390 Ipht ht;
391
392 /* calls in limbo waiting for an ACK to our SYN ACK */
393 int nlimbo;
394 Limbo *lht[NLHT];
395
396 /* for keeping track of tcpackproc */
397 QLock apl;
398 int ackprocstarted;
399
400 uvlong stats[Nstats];
401 };
402
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int	tcpporthogdefense = 0;		/* zero: defense disabled */
413
414 static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
415 static int dumpreseq(Tcpctl*);
416 static void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
417 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
418 static void limborexmit(Proto*);
419 static void localclose(Conv*, char*);
420 static void procsyn(Conv*, Tcp*);
421 static void tcpacktimer(void*);
422 static void tcpiput(Proto*, Ipifc*, Block*);
423 static void tcpkeepalive(void*);
424 static void tcpoutput(Conv*);
425 static void tcprcvwin(Conv*);
426 static void tcprxmit(Conv*);
427 static void tcpsetkacounter(Tcpctl*);
428 static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
429 static void tcpsettimer(Tcpctl*);
430 static void tcpsndsyn(Conv*, Tcpctl*);
431 static void tcpstart(Conv*, int);
432 static void tcpsynackrtt(Conv*);
433 static void tcptimeout(void*);
434 static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
435
436 static void
tcpsetstate(Conv * s,uchar newstate)437 tcpsetstate(Conv *s, uchar newstate)
438 {
439 Tcpctl *tcb;
440 uchar oldstate;
441 Tcppriv *tpriv;
442
443 tpriv = s->p->priv;
444
445 tcb = (Tcpctl*)s->ptcl;
446
447 oldstate = tcb->state;
448 if(oldstate == newstate)
449 return;
450
451 if(oldstate == Established)
452 tpriv->stats[CurrEstab]--;
453 if(newstate == Established)
454 tpriv->stats[CurrEstab]++;
455
456 switch(newstate) {
457 case Closed:
458 qclose(s->rq);
459 qclose(s->wq);
460 qclose(s->eq);
461 break;
462
463 case Close_wait: /* Remote closes */
464 qhangup(s->rq, nil);
465 break;
466 }
467
468 tcb->state = newstate;
469
470 if(oldstate == Syn_sent && newstate != Closed)
471 Fsconnected(s, nil);
472 }
473
474 static char*
tcpconnect(Conv * c,char ** argv,int argc)475 tcpconnect(Conv *c, char **argv, int argc)
476 {
477 char *e;
478 Tcpctl *tcb;
479
480 tcb = (Tcpctl*)(c->ptcl);
481 if(tcb->state != Closed)
482 return Econinuse;
483
484 e = Fsstdconnect(c, argv, argc);
485 if(e != nil)
486 return e;
487 tcpstart(c, TCP_CONNECT);
488
489 return nil;
490 }
491
492 static int
tcpstate(Conv * c,char * state,int n)493 tcpstate(Conv *c, char *state, int n)
494 {
495 Tcpctl *s;
496
497 s = (Tcpctl*)(c->ptcl);
498
499 return snprint(state, n,
500 "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
501 "swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
502 "timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
503 tcpstates[s->state],
504 c->rq ? qlen(c->rq) : 0,
505 c->wq ? qlen(c->wq) : 0,
506 s->nreseq, s->reseqlen,
507 s->srtt, s->mdev, s->ssthresh,
508 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
509 s->qscale,
510 s->timer.start, s->timer.count, s->rerecv,
511 s->katimer.start, s->katimer.count);
512 }
513
514 static int
tcpinuse(Conv * c)515 tcpinuse(Conv *c)
516 {
517 Tcpctl *s;
518
519 s = (Tcpctl*)(c->ptcl);
520 return s->state != Closed;
521 }
522
523 static char*
tcpannounce(Conv * c,char ** argv,int argc)524 tcpannounce(Conv *c, char **argv, int argc)
525 {
526 char *e;
527 Tcpctl *tcb;
528
529 tcb = (Tcpctl*)(c->ptcl);
530 if(tcb->state != Closed)
531 return Econinuse;
532
533 e = Fsstdannounce(c, argv, argc);
534 if(e != nil)
535 return e;
536 tcpstart(c, TCP_LISTEN);
537 Fsconnected(c, nil);
538
539 return nil;
540 }
541
542 static void
tcpclosestate(Conv * c,Tcpctl * tcb,int state)543 tcpclosestate(Conv *c, Tcpctl *tcb, int state)
544 {
545 tcb->flgcnt++;
546 tcb->snd.nxt++;
547 tcpsetstate(c, state);
548 tcpoutput(c);
549 }
550
551 /* close the output half of a tcp connection */
552 static char *
tcpxmitclose(Conv * c)553 tcpxmitclose(Conv *c)
554 {
555 Tcpctl *tcb;
556
557 qhangup(c->wq, nil);
558
559 tcb = (Tcpctl*)c->ptcl;
560 switch(tcb->state) {
561 case Listen:
562 /*
563 * reset any incoming calls to this listener
564 */
565 Fsconnected(c, "Hangup");
566 /* fall through */
567 case Closed:
568 case Syn_sent:
569 localclose(c, nil);
570 break;
571 case Syn_received:
572 case Established:
573 case Close_wait:
574 tcpclosestate(c, tcb, tcb->state);
575 break;
576 }
577 return nil;
578 }
579
580 /*
581 * tcpclose is always called with the q locked
582 */
583 static void
tcpclose(Conv * c)584 tcpclose(Conv *c)
585 {
586 Tcpctl *tcb;
587
588 tcb = (Tcpctl*)c->ptcl;
589
590 qhangup(c->rq, nil);
591 qhangup(c->wq, nil);
592 qhangup(c->eq, nil);
593 qflush(c->rq);
594
595 switch(tcb->state) {
596 case Listen:
597 /*
598 * reset any incoming calls to this listener
599 */
600 Fsconnected(c, "Hangup");
601 /* fall through */
602 case Closed:
603 case Syn_sent:
604 localclose(c, nil);
605 break;
606 case Syn_received:
607 case Established:
608 tcpclosestate(c, tcb, Finwait1);
609 break;
610 case Close_wait:
611 tcpclosestate(c, tcb, Last_ack);
612 break;
613 }
614 }
615
616 static void
tcpkick(void * x)617 tcpkick(void *x)
618 {
619 Conv *s = x;
620 Tcpctl *tcb;
621
622 tcb = (Tcpctl*)s->ptcl;
623
624 if(waserror()){
625 qunlock(s);
626 nexterror();
627 }
628 qlock(s);
629
630 switch(tcb->state) {
631 case Syn_sent:
632 case Syn_received:
633 case Established:
634 case Close_wait:
635 /*
636 * Push data
637 */
638 tcpoutput(s);
639 break;
640 default:
641 localclose(s, "Hangup");
642 break;
643 }
644
645 qunlock(s);
646 poperror();
647 }
648
/* forward declaration: tcprcvwin() needs seq_lt() before its definition */
static int	seq_lt(ulong, ulong);
650
651 static void
tcprcvwin(Conv * s)652 tcprcvwin(Conv *s) /* Call with tcb locked */
653 {
654 int w;
655 Tcpctl *tcb;
656
657 tcb = (Tcpctl*)s->ptcl;
658 w = tcb->window - qlen(s->rq);
659 if(w < 0)
660 w = 0;
661 /* RFC 1122 § 4.2.2.17 do not move right edge of window left */
662 if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
663 w = tcb->rcv.wptr - tcb->rcv.nxt;
664 if(w != tcb->rcv.wnd)
665 if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
666 tcb->rcv.blocked = 1;
667 netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
668 tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
669 }
670 tcb->rcv.wnd = w;
671 tcb->rcv.wptr = tcb->rcv.nxt + w;
672 }
673
674 static void
tcpacktimer(void * v)675 tcpacktimer(void *v)
676 {
677 Tcpctl *tcb;
678 Conv *s;
679
680 s = v;
681 tcb = (Tcpctl*)s->ptcl;
682
683 if(waserror()){
684 qunlock(s);
685 nexterror();
686 }
687 qlock(s);
688 if(tcb->state != Closed){
689 tcb->flags |= FORCE;
690 tcpoutput(s);
691 }
692 qunlock(s);
693 poperror();
694 }
695
696 static void
tcpcongestion(Tcpctl * tcb)697 tcpcongestion(Tcpctl *tcb)
698 {
699 ulong inflight;
700
701 inflight = tcb->snd.nxt - tcb->snd.una;
702 if(inflight > tcb->cwind)
703 inflight = tcb->cwind;
704 tcb->ssthresh = inflight / 2;
705 if(tcb->ssthresh < 2*tcb->mss)
706 tcb->ssthresh = 2*tcb->mss;
707 }
708
/* multiple of mss that tcpabcincr() lets cwind grow by per call in slow start */
enum {
	L	= 2,	/* aggressive slow start; legal values ∈ (1.0, 2.0) */
};
712
713 static void
tcpabcincr(Tcpctl * tcb,uint acked)714 tcpabcincr(Tcpctl *tcb, uint acked)
715 {
716 uint limit;
717
718 tcb->abcbytes += acked;
719 if(tcb->cwind < tcb->ssthresh){
720 /* slow start */
721 if(tcb->snd.rto)
722 limit = tcb->mss;
723 else
724 limit = L*tcb->mss;
725 tcb->cwind += MIN(tcb->abcbytes, limit);
726 tcb->abcbytes = 0;
727 } else {
728 tcb->snd.rto = 0;
729 /* avoidance */
730 if(tcb->abcbytes >= tcb->cwind){
731 tcb->abcbytes -= tcb->cwind;
732 tcb->cwind += tcb->mss;
733 }
734 }
735 }
736
737 static void
tcpcreate(Conv * c)738 tcpcreate(Conv *c)
739 {
740 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
741 c->wq = qopen(QMAX, Qkick, tcpkick, c);
742 }
743
744 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)745 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
746 {
747 if(newstate != TcptimerON){
748 if(t->state == TcptimerON){
749 /* unchain */
750 if(priv->timers == t){
751 priv->timers = t->next;
752 if(t->prev != nil)
753 panic("timerstate1");
754 }
755 if(t->next)
756 t->next->prev = t->prev;
757 if(t->prev)
758 t->prev->next = t->next;
759 t->next = t->prev = nil;
760 }
761 } else {
762 if(t->state != TcptimerON){
763 /* chain */
764 if(t->prev != nil || t->next != nil)
765 panic("timerstate2");
766 t->prev = nil;
767 t->next = priv->timers;
768 if(t->next)
769 t->next->prev = t;
770 priv->timers = t;
771 }
772 }
773 t->state = newstate;
774 }
775
776 static void
tcpackproc(void * a)777 tcpackproc(void *a)
778 {
779 Tcptimer *t, *tp, *timeo;
780 Proto *tcp;
781 Tcppriv *priv;
782 int loop;
783
784 tcp = a;
785 priv = tcp->priv;
786
787 for(;;) {
788 tsleep(&up->sleep, return0, 0, MSPTICK);
789
790 qlock(&priv->tl);
791 timeo = nil;
792 loop = 0;
793 for(t = priv->timers; t != nil; t = tp) {
794 if(loop++ > 10000)
795 panic("tcpackproc1");
796 tp = t->next;
797 if(t->state == TcptimerON) {
798 t->count--;
799 if(t->count == 0) {
800 timerstate(priv, t, TcptimerDONE);
801 t->readynext = timeo;
802 timeo = t;
803 }
804 }
805 }
806 qunlock(&priv->tl);
807
808 loop = 0;
809 for(t = timeo; t != nil; t = t->readynext) {
810 if(loop++ > 10000)
811 panic("tcpackproc2");
812 if(t->state == TcptimerDONE && t->func != nil && !waserror()){
813 (*t->func)(t->arg);
814 poperror();
815 }
816 }
817
818 limborexmit(tcp);
819 }
820 }
821
822 static void
tcpgo(Tcppriv * priv,Tcptimer * t)823 tcpgo(Tcppriv *priv, Tcptimer *t)
824 {
825 if(t == nil || t->start == 0)
826 return;
827
828 qlock(&priv->tl);
829 t->count = t->start;
830 timerstate(priv, t, TcptimerON);
831 qunlock(&priv->tl);
832 }
833
834 static void
tcphalt(Tcppriv * priv,Tcptimer * t)835 tcphalt(Tcppriv *priv, Tcptimer *t)
836 {
837 if(t == nil)
838 return;
839
840 qlock(&priv->tl);
841 timerstate(priv, t, TcptimerOFF);
842 qunlock(&priv->tl);
843 }
844
/*
 * Exponential backoff multiplier: 2ⁿ.
 *
 * Shifting an int by a negative count, or by a count that reaches
 * the sign bit or beyond, is undefined.  The count is therefore
 * clamped to [0, Maxshift]: negative n yields 1 and large n
 * saturates at the largest positive power of two an int holds.
 * The caller still has to bound any product it forms with the
 * result.
 */
static int
backoff(int n)
{
	enum { Maxshift = 8*sizeof(int) - 2 };

	if(n < 0)
		n = 0;
	else if(n > Maxshift)
		n = Maxshift;
	return 1 << n;
}
850
851 static void
localclose(Conv * s,char * reason)852 localclose(Conv *s, char *reason) /* called with tcb locked */
853 {
854 Tcpctl *tcb;
855 Tcppriv *tpriv;
856
857 tpriv = s->p->priv;
858 tcb = (Tcpctl*)s->ptcl;
859
860 iphtrem(&tpriv->ht, s);
861
862 tcphalt(tpriv, &tcb->timer);
863 tcphalt(tpriv, &tcb->rtt_timer);
864 tcphalt(tpriv, &tcb->acktimer);
865 tcphalt(tpriv, &tcb->katimer);
866
867 /* Flush reassembly queue; nothing more can arrive */
868 dumpreseq(tcb);
869
870 if(tcb->state == Syn_sent)
871 Fsconnected(s, reason);
872 if(s->state == Announced)
873 wakeup(&s->listenr);
874
875 qhangup(s->rq, reason);
876 qhangup(s->wq, reason);
877
878 tcpsetstate(s, Closed);
879 }
880
881 /* mtu (- TCP + IP hdr len) of 1st hop */
882 static int
tcpmtu(Proto * tcp,uchar * addr,int version,uint * scale)883 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
884 {
885 Ipifc *ifc;
886 int mtu;
887
888 ifc = findipifc(tcp->f, addr, 0);
889 switch(version){
890 default:
891 case V4:
892 mtu = DEF_MSS;
893 if(ifc != nil)
894 mtu = ifc->maxtu - ifc->medium->hsize - (TCP4_PKT + TCP4_HDRSIZE);
895 break;
896 case V6:
897 mtu = DEF_MSS6;
898 if(ifc != nil)
899 mtu = ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE);
900 break;
901 }
902 /*
903 * set the ws. it doesn't commit us to anything.
904 * ws is the ultimate limit to the bandwidth-delay product.
905 */
906 *scale = Defadvscale;
907
908 return mtu;
909 }
910
911 static void
inittcpctl(Conv * s,int mode)912 inittcpctl(Conv *s, int mode)
913 {
914 Tcpctl *tcb;
915 Tcp4hdr* h4;
916 Tcp6hdr* h6;
917 Tcppriv *tpriv;
918 int mss;
919
920 tcb = (Tcpctl*)s->ptcl;
921
922 memset(tcb, 0, sizeof(Tcpctl));
923
924 tcb->ssthresh = QMAX; /* reset by tcpsetscale() */
925 tcb->srtt = tcp_irtt<<LOGAGAIN;
926 tcb->mdev = 0;
927
928 /* setup timers */
929 tcb->timer.start = tcp_irtt / MSPTICK;
930 tcb->timer.func = tcptimeout;
931 tcb->timer.arg = s;
932 tcb->rtt_timer.start = MAX_TIME;
933 tcb->acktimer.start = TCP_ACK / MSPTICK;
934 tcb->acktimer.func = tcpacktimer;
935 tcb->acktimer.arg = s;
936 tcb->katimer.start = DEF_KAT / MSPTICK;
937 tcb->katimer.func = tcpkeepalive;
938 tcb->katimer.arg = s;
939
940 mss = DEF_MSS;
941
942 /* create a prototype(pseudo) header */
943 if(mode != TCP_LISTEN){
944 if(ipcmp(s->laddr, IPnoaddr) == 0)
945 findlocalip(s->p->f, s->laddr, s->raddr);
946
947 switch(s->ipversion){
948 case V4:
949 h4 = &tcb->protohdr.tcp4hdr;
950 memset(h4, 0, sizeof(*h4));
951 h4->proto = IP_TCPPROTO;
952 hnputs(h4->tcpsport, s->lport);
953 hnputs(h4->tcpdport, s->rport);
954 v6tov4(h4->tcpsrc, s->laddr);
955 v6tov4(h4->tcpdst, s->raddr);
956 break;
957 case V6:
958 h6 = &tcb->protohdr.tcp6hdr;
959 memset(h6, 0, sizeof(*h6));
960 h6->proto = IP_TCPPROTO;
961 hnputs(h6->tcpsport, s->lport);
962 hnputs(h6->tcpdport, s->rport);
963 ipmove(h6->tcpsrc, s->laddr);
964 ipmove(h6->tcpdst, s->raddr);
965 mss = DEF_MSS6;
966 break;
967 default:
968 panic("inittcpctl: version %d", s->ipversion);
969 }
970 }
971
972 tcb->mss = tcb->cwind = mss;
973 tcb->abcbytes = 0;
974 tpriv = s->p->priv;
975 tpriv->stats[Mss] = tcb->mss;
976
977 /* default is no window scaling */
978 tcpsetscale(s, tcb, 0, 0);
979 }
980
981 /*
982 * called with s qlocked
983 */
984 static void
tcpstart(Conv * s,int mode)985 tcpstart(Conv *s, int mode)
986 {
987 Tcpctl *tcb;
988 Tcppriv *tpriv;
989 char kpname[KNAMELEN];
990
991 tpriv = s->p->priv;
992
993 if(tpriv->ackprocstarted == 0){
994 qlock(&tpriv->apl);
995 if(tpriv->ackprocstarted == 0){
996 snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
997 kproc(kpname, tcpackproc, s->p);
998 tpriv->ackprocstarted = 1;
999 }
1000 qunlock(&tpriv->apl);
1001 }
1002
1003 tcb = (Tcpctl*)s->ptcl;
1004
1005 inittcpctl(s, mode);
1006
1007 iphtadd(&tpriv->ht, s);
1008 switch(mode) {
1009 case TCP_LISTEN:
1010 tpriv->stats[PassiveOpens]++;
1011 tcb->flags |= CLONE;
1012 tcpsetstate(s, Listen);
1013 break;
1014
1015 case TCP_CONNECT:
1016 tpriv->stats[ActiveOpens]++;
1017 tcb->flags |= ACTIVE;
1018 tcpsndsyn(s, tcb);
1019 tcpsetstate(s, Syn_sent);
1020 tcpoutput(s);
1021 break;
1022 }
1023 }
1024
1025 static char*
tcpflag(char * buf,char * e,ushort flag)1026 tcpflag(char *buf, char *e, ushort flag)
1027 {
1028 char *p;
1029
1030 p = seprint(buf, e, "%d", flag>>10); /* Head len */
1031 if(flag & URG)
1032 p = seprint(p, e, " URG");
1033 if(flag & ACK)
1034 p = seprint(p, e, " ACK");
1035 if(flag & PSH)
1036 p = seprint(p, e, " PSH");
1037 if(flag & RST)
1038 p = seprint(p, e, " RST");
1039 if(flag & SYN)
1040 p = seprint(p, e, " SYN");
1041 if(flag & FIN)
1042 p = seprint(p, e, " FIN");
1043 USED(p);
1044 return buf;
1045 }
1046
1047 static Block*
htontcp6(Tcp * tcph,Block * data,Tcp6hdr * ph,Tcpctl * tcb)1048 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1049 {
1050 int dlen;
1051 Tcp6hdr *h;
1052 ushort csum;
1053 ushort hdrlen, optpad = 0;
1054 uchar *opt;
1055
1056 hdrlen = TCP6_HDRSIZE;
1057 if(tcph->flags & SYN){
1058 if(tcph->mss)
1059 hdrlen += MSS_LENGTH;
1060 if(tcph->ws)
1061 hdrlen += WS_LENGTH;
1062 optpad = hdrlen & 3;
1063 if(optpad)
1064 optpad = 4 - optpad;
1065 hdrlen += optpad;
1066 }
1067
1068 if(data) {
1069 dlen = blocklen(data);
1070 data = padblock(data, hdrlen + TCP6_PKT);
1071 if(data == nil)
1072 return nil;
1073 }
1074 else {
1075 dlen = 0;
1076 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
1077 if(data == nil)
1078 return nil;
1079 data->wp += hdrlen + TCP6_PKT;
1080 }
1081
1082 /* copy in pseudo ip header plus port numbers */
1083 h = (Tcp6hdr *)(data->rp);
1084 memmove(h, ph, TCP6_TCBPHDRSZ);
1085
1086 /* compose pseudo tcp header, do cksum calculation */
1087 hnputl(h->vcf, hdrlen + dlen);
1088 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1089 h->ttl = ph->proto;
1090
1091 /* copy in variable bits */
1092 hnputl(h->tcpseq, tcph->seq);
1093 hnputl(h->tcpack, tcph->ack);
1094 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1095 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1096 hnputs(h->tcpurg, tcph->urg);
1097
1098 if(tcph->flags & SYN){
1099 opt = h->tcpopt;
1100 if(tcph->mss != 0){
1101 *opt++ = MSSOPT;
1102 *opt++ = MSS_LENGTH;
1103 hnputs(opt, tcph->mss);
1104 opt += 2;
1105 }
1106 if(tcph->ws != 0){
1107 *opt++ = WSOPT;
1108 *opt++ = WS_LENGTH;
1109 *opt++ = tcph->ws;
1110 }
1111 while(optpad-- > 0)
1112 *opt++ = NOOPOPT;
1113 }
1114
1115 if(tcb != nil && tcb->nochecksum){
1116 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1117 } else {
1118 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1119 hnputs(h->tcpcksum, csum);
1120 }
1121
1122 /* move from pseudo header back to normal ip header */
1123 memset(h->vcf, 0, 4);
1124 h->vcf[0] = IP_VER6;
1125 hnputs(h->ploadlen, hdrlen+dlen);
1126 h->proto = ph->proto;
1127
1128 return data;
1129 }
1130
1131 static Block*
htontcp4(Tcp * tcph,Block * data,Tcp4hdr * ph,Tcpctl * tcb)1132 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1133 {
1134 int dlen;
1135 Tcp4hdr *h;
1136 ushort csum;
1137 ushort hdrlen, optpad = 0;
1138 uchar *opt;
1139
1140 hdrlen = TCP4_HDRSIZE;
1141 if(tcph->flags & SYN){
1142 if(tcph->mss)
1143 hdrlen += MSS_LENGTH;
1144 if(1)
1145 hdrlen += WS_LENGTH;
1146 optpad = hdrlen & 3;
1147 if(optpad)
1148 optpad = 4 - optpad;
1149 hdrlen += optpad;
1150 }
1151
1152 if(data) {
1153 dlen = blocklen(data);
1154 data = padblock(data, hdrlen + TCP4_PKT);
1155 if(data == nil)
1156 return nil;
1157 }
1158 else {
1159 dlen = 0;
1160 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */
1161 if(data == nil)
1162 return nil;
1163 data->wp += hdrlen + TCP4_PKT;
1164 }
1165
1166 /* copy in pseudo ip header plus port numbers */
1167 h = (Tcp4hdr *)(data->rp);
1168 memmove(h, ph, TCP4_TCBPHDRSZ);
1169
1170 /* copy in variable bits */
1171 hnputs(h->tcplen, hdrlen + dlen);
1172 hnputl(h->tcpseq, tcph->seq);
1173 hnputl(h->tcpack, tcph->ack);
1174 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1175 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1176 hnputs(h->tcpurg, tcph->urg);
1177
1178 if(tcph->flags & SYN){
1179 opt = h->tcpopt;
1180 if(tcph->mss != 0){
1181 *opt++ = MSSOPT;
1182 *opt++ = MSS_LENGTH;
1183 hnputs(opt, tcph->mss);
1184 opt += 2;
1185 }
1186 /* always offer. rfc1323 §2.2 */
1187 if(1){
1188 *opt++ = WSOPT;
1189 *opt++ = WS_LENGTH;
1190 *opt++ = tcph->ws;
1191 }
1192 while(optpad-- > 0)
1193 *opt++ = NOOPOPT;
1194 }
1195
1196 if(tcb != nil && tcb->nochecksum){
1197 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1198 } else {
1199 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1200 hnputs(h->tcpcksum, csum);
1201 }
1202
1203 return data;
1204 }
1205
1206 static int
ntohtcp6(Tcp * tcph,Block ** bpp)1207 ntohtcp6(Tcp *tcph, Block **bpp)
1208 {
1209 Tcp6hdr *h;
1210 uchar *optr;
1211 ushort hdrlen;
1212 ushort optlen;
1213 int n;
1214
1215 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1216 if(*bpp == nil)
1217 return -1;
1218
1219 h = (Tcp6hdr *)((*bpp)->rp);
1220 tcph->source = nhgets(h->tcpsport);
1221 tcph->dest = nhgets(h->tcpdport);
1222 tcph->seq = nhgetl(h->tcpseq);
1223 tcph->ack = nhgetl(h->tcpack);
1224 hdrlen = (h->tcpflag[0]>>2) & ~3;
1225 if(hdrlen < TCP6_HDRSIZE) {
1226 freeblist(*bpp);
1227 return -1;
1228 }
1229
1230 tcph->flags = h->tcpflag[1];
1231 tcph->wnd = nhgets(h->tcpwin);
1232 tcph->urg = nhgets(h->tcpurg);
1233 tcph->mss = 0;
1234 tcph->ws = 0;
1235 tcph->update = 0;
1236 tcph->len = nhgets(h->ploadlen) - hdrlen;
1237
1238 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1239 if(*bpp == nil)
1240 return -1;
1241
1242 optr = h->tcpopt;
1243 n = hdrlen - TCP6_HDRSIZE;
1244 while(n > 0 && *optr != EOLOPT) {
1245 if(*optr == NOOPOPT) {
1246 n--;
1247 optr++;
1248 continue;
1249 }
1250 optlen = optr[1];
1251 if(optlen < 2 || optlen > n)
1252 break;
1253 switch(*optr) {
1254 case MSSOPT:
1255 if(optlen == MSS_LENGTH)
1256 tcph->mss = nhgets(optr+2);
1257 break;
1258 case WSOPT:
1259 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1260 tcph->ws = *(optr+2);
1261 break;
1262 }
1263 n -= optlen;
1264 optr += optlen;
1265 }
1266 return hdrlen;
1267 }
1268
1269 static int
ntohtcp4(Tcp * tcph,Block ** bpp)1270 ntohtcp4(Tcp *tcph, Block **bpp)
1271 {
1272 Tcp4hdr *h;
1273 uchar *optr;
1274 ushort hdrlen;
1275 ushort optlen;
1276 int n;
1277
1278 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1279 if(*bpp == nil)
1280 return -1;
1281
1282 h = (Tcp4hdr *)((*bpp)->rp);
1283 tcph->source = nhgets(h->tcpsport);
1284 tcph->dest = nhgets(h->tcpdport);
1285 tcph->seq = nhgetl(h->tcpseq);
1286 tcph->ack = nhgetl(h->tcpack);
1287
1288 hdrlen = (h->tcpflag[0]>>2) & ~3;
1289 if(hdrlen < TCP4_HDRSIZE) {
1290 freeblist(*bpp);
1291 return -1;
1292 }
1293
1294 tcph->flags = h->tcpflag[1];
1295 tcph->wnd = nhgets(h->tcpwin);
1296 tcph->urg = nhgets(h->tcpurg);
1297 tcph->mss = 0;
1298 tcph->ws = 0;
1299 tcph->update = 0;
1300 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1301
1302 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1303 if(*bpp == nil)
1304 return -1;
1305
1306 optr = h->tcpopt;
1307 n = hdrlen - TCP4_HDRSIZE;
1308 while(n > 0 && *optr != EOLOPT) {
1309 if(*optr == NOOPOPT) {
1310 n--;
1311 optr++;
1312 continue;
1313 }
1314 optlen = optr[1];
1315 if(optlen < 2 || optlen > n)
1316 break;
1317 switch(*optr) {
1318 case MSSOPT:
1319 if(optlen == MSS_LENGTH)
1320 tcph->mss = nhgets(optr+2);
1321 break;
1322 case WSOPT:
1323 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1324 tcph->ws = *(optr+2);
1325 break;
1326 }
1327 n -= optlen;
1328 optr += optlen;
1329 }
1330 return hdrlen;
1331 }
1332
1333 /*
1334 * For outgoing calls, generate an initial sequence
1335 * number and put a SYN on the send queue
1336 */
1337 static void
tcpsndsyn(Conv * s,Tcpctl * tcb)1338 tcpsndsyn(Conv *s, Tcpctl *tcb)
1339 {
1340 Tcppriv *tpriv;
1341
1342 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1343 tcb->rttseq = tcb->iss;
1344 tcb->snd.wl2 = tcb->iss;
1345 tcb->snd.una = tcb->iss;
1346 tcb->snd.rxt = tcb->iss;
1347 tcb->snd.ptr = tcb->rttseq;
1348 tcb->snd.nxt = tcb->rttseq;
1349 tcb->flgcnt++;
1350 tcb->flags |= FORCE;
1351 tcb->sndsyntime = NOW;
1352
1353 /* set desired mss and scale */
1354 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1355 tpriv = s->p->priv;
1356 tpriv->stats[Mss] = tcb->mss;
1357 }
1358
1359 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1360 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1361 {
1362 Block *hbp;
1363 uchar rflags;
1364 Tcppriv *tpriv;
1365 Tcp4hdr ph4;
1366 Tcp6hdr ph6;
1367
1368 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1369
1370 tpriv = tcp->priv;
1371
1372 if(seg->flags & RST)
1373 return;
1374
1375 /* make pseudo header */
1376 switch(version) {
1377 case V4:
1378 memset(&ph4, 0, sizeof(ph4));
1379 ph4.vihl = IP_VER4;
1380 v6tov4(ph4.tcpsrc, dest);
1381 v6tov4(ph4.tcpdst, source);
1382 ph4.proto = IP_TCPPROTO;
1383 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1384 hnputs(ph4.tcpsport, seg->dest);
1385 hnputs(ph4.tcpdport, seg->source);
1386 break;
1387 case V6:
1388 memset(&ph6, 0, sizeof(ph6));
1389 ph6.vcf[0] = IP_VER6;
1390 ipmove(ph6.tcpsrc, dest);
1391 ipmove(ph6.tcpdst, source);
1392 ph6.proto = IP_TCPPROTO;
1393 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1394 hnputs(ph6.tcpsport, seg->dest);
1395 hnputs(ph6.tcpdport, seg->source);
1396 break;
1397 default:
1398 panic("sndrst: version %d", version);
1399 }
1400
1401 tpriv->stats[OutRsts]++;
1402 rflags = RST;
1403
1404 /* convince the other end that this reset is in band */
1405 if(seg->flags & ACK) {
1406 seg->seq = seg->ack;
1407 seg->ack = 0;
1408 }
1409 else {
1410 rflags |= ACK;
1411 seg->ack = seg->seq;
1412 seg->seq = 0;
1413 if(seg->flags & SYN)
1414 seg->ack++;
1415 seg->ack += length;
1416 if(seg->flags & FIN)
1417 seg->ack++;
1418 }
1419 seg->flags = rflags;
1420 seg->wnd = 0;
1421 seg->urg = 0;
1422 seg->mss = 0;
1423 seg->ws = 0;
1424 switch(version) {
1425 case V4:
1426 hbp = htontcp4(seg, nil, &ph4, nil);
1427 if(hbp == nil)
1428 return;
1429 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1430 break;
1431 case V6:
1432 hbp = htontcp6(seg, nil, &ph6, nil);
1433 if(hbp == nil)
1434 return;
1435 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1436 break;
1437 default:
1438 panic("sndrst2: version %d", version);
1439 }
1440 }
1441
1442 /*
1443 * close the conversation
1444 */
1445 static char*
tcpclose2(Conv * s)1446 tcpclose2(Conv *s)
1447 {
1448 tcpclose(s);
1449 return nil;
1450 }
1451
1452 /*
1453 * send a reset to the remote side and close the conversation
1454 * called with s qlocked
1455 */
1456 static char*
tcphangup(Conv * s)1457 tcphangup(Conv *s)
1458 {
1459 Tcp seg;
1460 Tcpctl *tcb;
1461 Block *hbp;
1462
1463 tcb = (Tcpctl*)s->ptcl;
1464 if(waserror())
1465 return commonerror();
1466 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1467 if(!waserror()){
1468 memset(&seg, 0, sizeof seg);
1469 seg.flags = RST | ACK;
1470 seg.ack = tcb->rcv.nxt;
1471 tcb->rcv.ackptr = seg.ack;
1472 seg.seq = tcb->snd.ptr;
1473 seg.wnd = 0;
1474 seg.urg = 0;
1475 seg.mss = 0;
1476 seg.ws = 0;
1477 switch(s->ipversion) {
1478 case V4:
1479 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1480 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1481 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1482 break;
1483 case V6:
1484 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1485 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1486 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1487 break;
1488 default:
1489 panic("tcphangup: version %d", s->ipversion);
1490 }
1491 poperror();
1492 }
1493 }
1494 localclose(s, nil);
1495 poperror();
1496 return nil;
1497 }
1498
1499 /*
1500 * (re)send a SYN ACK
1501 */
1502 static int
sndsynack(Proto * tcp,Limbo * lp)1503 sndsynack(Proto *tcp, Limbo *lp)
1504 {
1505 Block *hbp;
1506 Tcp4hdr ph4;
1507 Tcp6hdr ph6;
1508 Tcp seg;
1509 uint scale;
1510
1511 /* make pseudo header */
1512 switch(lp->version) {
1513 case V4:
1514 memset(&ph4, 0, sizeof(ph4));
1515 ph4.vihl = IP_VER4;
1516 v6tov4(ph4.tcpsrc, lp->laddr);
1517 v6tov4(ph4.tcpdst, lp->raddr);
1518 ph4.proto = IP_TCPPROTO;
1519 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1520 hnputs(ph4.tcpsport, lp->lport);
1521 hnputs(ph4.tcpdport, lp->rport);
1522 break;
1523 case V6:
1524 memset(&ph6, 0, sizeof(ph6));
1525 ph6.vcf[0] = IP_VER6;
1526 ipmove(ph6.tcpsrc, lp->laddr);
1527 ipmove(ph6.tcpdst, lp->raddr);
1528 ph6.proto = IP_TCPPROTO;
1529 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1530 hnputs(ph6.tcpsport, lp->lport);
1531 hnputs(ph6.tcpdport, lp->rport);
1532 break;
1533 default:
1534 panic("sndrst: version %d", lp->version);
1535 }
1536
1537 memset(&seg, 0, sizeof seg);
1538 seg.seq = lp->iss;
1539 seg.ack = lp->irs+1;
1540 seg.flags = SYN|ACK;
1541 seg.urg = 0;
1542 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1543 seg.wnd = QMAX;
1544
1545 /* if the other side set scale, we should too */
1546 if(lp->rcvscale){
1547 seg.ws = scale;
1548 lp->sndscale = scale;
1549 } else {
1550 seg.ws = 0;
1551 lp->sndscale = 0;
1552 }
1553
1554 switch(lp->version) {
1555 case V4:
1556 hbp = htontcp4(&seg, nil, &ph4, nil);
1557 if(hbp == nil)
1558 return -1;
1559 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1560 break;
1561 case V6:
1562 hbp = htontcp6(&seg, nil, &ph6, nil);
1563 if(hbp == nil)
1564 return -1;
1565 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1566 break;
1567 default:
1568 panic("sndsnack: version %d", lp->version);
1569 }
1570 lp->lastsend = NOW;
1571 return 0;
1572 }
1573
/* limbo hash: last two address bytes plus the port, reduced to a table index */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1575
1576 /*
1577 * put a call into limbo and respond with a SYN ACK
1578 *
1579 * called with proto locked
1580 */
1581 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1582 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1583 {
1584 Limbo *lp, **l;
1585 Tcppriv *tpriv;
1586 int h;
1587
1588 tpriv = s->p->priv;
1589 h = hashipa(source, seg->source);
1590
1591 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1592 lp = *l;
1593 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1594 continue;
1595 if(ipcmp(lp->raddr, source) != 0)
1596 continue;
1597 if(ipcmp(lp->laddr, dest) != 0)
1598 continue;
1599
1600 /* each new SYN restarts the retransmits */
1601 lp->irs = seg->seq;
1602 break;
1603 }
1604 lp = *l;
1605 if(lp == nil){
1606 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1607 lp = tpriv->lht[h];
1608 tpriv->lht[h] = lp->next;
1609 lp->next = nil;
1610 } else {
1611 lp = malloc(sizeof(*lp));
1612 if(lp == nil)
1613 return;
1614 tpriv->nlimbo++;
1615 }
1616 *l = lp;
1617 lp->version = version;
1618 ipmove(lp->laddr, dest);
1619 ipmove(lp->raddr, source);
1620 lp->lport = seg->dest;
1621 lp->rport = seg->source;
1622 lp->mss = seg->mss;
1623 lp->rcvscale = seg->ws;
1624 lp->irs = seg->seq;
1625 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1626 }
1627
1628 if(sndsynack(s->p, lp) < 0){
1629 *l = lp->next;
1630 tpriv->nlimbo--;
1631 free(lp);
1632 }
1633 }
1634
1635 /*
1636 * resend SYN ACK's once every SYNACK_RXTIMER ms.
1637 */
1638 static void
limborexmit(Proto * tcp)1639 limborexmit(Proto *tcp)
1640 {
1641 Tcppriv *tpriv;
1642 Limbo **l, *lp;
1643 int h;
1644 int seen;
1645 ulong now;
1646
1647 tpriv = tcp->priv;
1648
1649 if(!canqlock(tcp))
1650 return;
1651 seen = 0;
1652 now = NOW;
1653 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1654 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1655 lp = *l;
1656 seen++;
1657 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1658 continue;
1659
1660 /* time it out after 1 second */
1661 if(++(lp->rexmits) > 5){
1662 tpriv->nlimbo--;
1663 *l = lp->next;
1664 free(lp);
1665 continue;
1666 }
1667
1668 /* if we're being attacked, don't bother resending SYN ACK's */
1669 if(tpriv->nlimbo > 100)
1670 continue;
1671
1672 if(sndsynack(tcp, lp) < 0){
1673 tpriv->nlimbo--;
1674 *l = lp->next;
1675 free(lp);
1676 continue;
1677 }
1678
1679 l = &lp->next;
1680 }
1681 }
1682 qunlock(tcp);
1683 }
1684
1685 /*
1686 * lookup call in limbo. if found, throw it out.
1687 *
1688 * called with proto locked
1689 */
1690 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1691 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1692 {
1693 Limbo *lp, **l;
1694 int h;
1695 Tcppriv *tpriv;
1696
1697 tpriv = s->p->priv;
1698
1699 /* find a call in limbo */
1700 h = hashipa(src, segp->source);
1701 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1702 lp = *l;
1703 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1704 continue;
1705 if(ipcmp(lp->laddr, dst) != 0)
1706 continue;
1707 if(ipcmp(lp->raddr, src) != 0)
1708 continue;
1709
1710 /* RST can only follow the SYN */
1711 if(segp->seq == lp->irs+1){
1712 tpriv->nlimbo--;
1713 *l = lp->next;
1714 free(lp);
1715 }
1716 break;
1717 }
1718 }
1719
1720 static void
initialwindow(Tcpctl * tcb)1721 initialwindow(Tcpctl *tcb)
1722 {
1723 /* RFC 3390 initial window */
1724 if(tcb->mss < 1095)
1725 tcb->cwind = 4*tcb->mss;
1726 else if(tcb->mss < 2190)
1727 tcb->cwind = 2*2190;
1728 else
1729 tcb->cwind = 2*tcb->mss;
1730 }
1731
1732 /*
1733 * come here when we finally get an ACK to our SYN-ACK.
1734 * lookup call in limbo. if found, create a new conversation
1735 *
1736 * called with proto locked
1737 */
1738 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1739 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1740 {
1741 Conv *new;
1742 Tcpctl *tcb;
1743 Tcppriv *tpriv;
1744 Tcp4hdr *h4;
1745 Tcp6hdr *h6;
1746 Limbo *lp, **l;
1747 int h;
1748
1749 /* unless it's just an ack, it can't be someone coming out of limbo */
1750 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1751 return nil;
1752
1753 tpriv = s->p->priv;
1754
1755 /* find a call in limbo */
1756 h = hashipa(src, segp->source);
1757 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1758 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1759 src, segp->source, lp->raddr, lp->rport,
1760 dst, segp->dest, lp->laddr, lp->lport,
1761 version, lp->version
1762 );
1763
1764 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1765 continue;
1766 if(ipcmp(lp->laddr, dst) != 0)
1767 continue;
1768 if(ipcmp(lp->raddr, src) != 0)
1769 continue;
1770
1771 /* we're assuming no data with the initial SYN */
1772 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1773 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1774 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1775 lp = nil;
1776 } else {
1777 tpriv->nlimbo--;
1778 *l = lp->next;
1779 }
1780 break;
1781 }
1782 if(lp == nil)
1783 return nil;
1784
1785 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1786 if(new == nil)
1787 return nil;
1788
1789 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1790 tcb = (Tcpctl*)new->ptcl;
1791 tcb->flags &= ~CLONE;
1792 tcb->timer.arg = new;
1793 tcb->timer.state = TcptimerOFF;
1794 tcb->acktimer.arg = new;
1795 tcb->acktimer.state = TcptimerOFF;
1796 tcb->katimer.arg = new;
1797 tcb->katimer.state = TcptimerOFF;
1798 tcb->rtt_timer.arg = new;
1799 tcb->rtt_timer.state = TcptimerOFF;
1800
1801 tcb->irs = lp->irs;
1802 tcb->rcv.nxt = tcb->irs+1;
1803 tcb->rcv.wptr = tcb->rcv.nxt;
1804 tcb->rcv.wsnt = 0;
1805 tcb->rcv.urg = tcb->rcv.nxt;
1806
1807 tcb->iss = lp->iss;
1808 tcb->rttseq = tcb->iss;
1809 tcb->snd.wl2 = tcb->iss;
1810 tcb->snd.una = tcb->iss+1;
1811 tcb->snd.ptr = tcb->iss+1;
1812 tcb->snd.nxt = tcb->iss+1;
1813 tcb->snd.rxt = tcb->iss+1;
1814 tcb->flgcnt = 0;
1815 tcb->flags |= SYNACK;
1816
1817 /* set desired mss and scale */
1818 tcb->mss = tcpmtu(s->p, dst, s->ipversion, &tcb->scale);
1819
1820 /* our sending max segment size cannot be bigger than what he asked for */
1821 if(lp->mss != 0 && lp->mss < tcb->mss)
1822 tcb->mss = lp->mss;
1823 tpriv->stats[Mss] = tcb->mss;
1824
1825 /* window scaling */
1826 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1827
1828 /* congestion window */
1829 tcb->snd.wnd = segp->wnd;
1830 initialwindow(tcb);
1831
1832 /* set initial round trip time */
1833 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1834 tcpsynackrtt(new);
1835
1836 free(lp);
1837
1838 /* set up proto header */
1839 switch(version){
1840 case V4:
1841 h4 = &tcb->protohdr.tcp4hdr;
1842 memset(h4, 0, sizeof(*h4));
1843 h4->proto = IP_TCPPROTO;
1844 hnputs(h4->tcpsport, new->lport);
1845 hnputs(h4->tcpdport, new->rport);
1846 v6tov4(h4->tcpsrc, dst);
1847 v6tov4(h4->tcpdst, src);
1848 break;
1849 case V6:
1850 h6 = &tcb->protohdr.tcp6hdr;
1851 memset(h6, 0, sizeof(*h6));
1852 h6->proto = IP_TCPPROTO;
1853 hnputs(h6->tcpsport, new->lport);
1854 hnputs(h6->tcpdport, new->rport);
1855 ipmove(h6->tcpsrc, dst);
1856 ipmove(h6->tcpdst, src);
1857 break;
1858 default:
1859 panic("tcpincoming: version %d", new->ipversion);
1860 }
1861
1862 tcpsetstate(new, Established);
1863
1864 iphtadd(&tpriv->ht, new);
1865
1866 return new;
1867 }
1868
/*
 * Is x inside the closed range [low, high]?  When low > high the
 * range is taken to wrap around, so x qualifies if it is at or
 * above low, or at or below high.  Returns 1 or 0.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return (x >= low || x <= high) ? 1 : 0;
	return (low <= x && x <= high) ? 1 : 0;
}
1882
/* x precedes y: the difference x-y, taken as an int, is negative */
static int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1888
/* x precedes or equals y: the difference x-y, taken as an int, is not positive */
static int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1894
/* x follows y: the difference x-y, taken as an int, is positive */
static int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1900
/* x follows or equals y: the difference x-y, taken as an int, is not negative */
static int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1906
1907 /*
1908 * use the time between the first SYN and it's ack as the
1909 * initial round trip time
1910 */
1911 static void
tcpsynackrtt(Conv * s)1912 tcpsynackrtt(Conv *s)
1913 {
1914 Tcpctl *tcb;
1915 int delta;
1916 Tcppriv *tpriv;
1917
1918 tcb = (Tcpctl*)s->ptcl;
1919 tpriv = s->p->priv;
1920
1921 delta = NOW - tcb->sndsyntime;
1922 tcb->srtt = delta<<LOGAGAIN;
1923 tcb->mdev = delta<<LOGDGAIN;
1924
1925 /* halt round trip timer */
1926 tcphalt(tpriv, &tcb->rtt_timer);
1927 }
1928
/*
 * Apply the acknowledgement and window information carried by seg
 * to the send side of conversation s: duplicate-ack counting and
 * entry into fast recovery, send window update, congestion window
 * adjustment, round trip time estimation, removal of acknowledged
 * bytes from the write queue, and (re)arming or halting of the
 * retransmit timer.
 *
 * A segment is only processed once; seg->update is set on the
 * first call and later calls with the same seg return at once.
 */
static void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	Tcppriv *tpriv;

	/* already applied this segment */
	if(seg->update)
		return;
	seg->update = 1;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/* catch zero-window updates, update window & recover */
	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
	   seq_lt(seg->ack, tcb->snd.ptr)){
		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
			seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
		tcb->snd.wnd = seg->wnd;
		goto recovery;
	}

	/* newreno fast retransmit */
	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
	   ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
recovery:
		if(tcb->snd.recovery){
			/* already recovering: just inflate the window by one segment */
			tpriv->stats[RecoveryCwind]++;
			tcb->cwind += tcb->mss;
		}else if(seq_le(tcb->snd.rxt, seg->ack)){
			/* enter recovery; snd.rxt remembers where recovery ends */
			tpriv->stats[Recovery]++;
			tcb->abcbytes = 0;
			tcb->snd.recovery = 1;
			tcb->snd.partialack = 0;
			tcb->snd.rxt = tcb->snd.nxt;
			tcpcongestion(tcb);
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
			tcprxmit(s);
		}else{
			tpriv->stats[RecoveryNoSeq]++;
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
			/* don't enter fast retransmit, don't change ssthresh */
		}
	}else if(tcb->snd.recovery){
		/* any other ack during recovery also inflates the window */
		tpriv->stats[RecoveryCwind]++;
		tcb->cwind += tcb->mss;
	}

	/*
	 *  update window
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		/* clear dupack if we advance wl2 */
		if(tcb->snd.wl2 != seg->ack)
			tcb->snd.dupacks = 0;
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing new is acknowledged: the rest of the routine does not apply */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
			tcb->backedoff = MAXBACKMS/4;
		return;
	}

	/* Compute the new send window size */
	acked = seg->ack - tcb->snd.una;

	/* avoid slow start and timers for SYN acks */
	if((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* one of the acknowledged sequence numbers is not queue data; take it off acked and flgcnt */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/*
	 *  congestion control
	 */
	if(tcb->snd.recovery){
		if(seq_ge(seg->ack, tcb->snd.rxt)){
			/* recovery finished; deflate window */
			tpriv->stats[RecoveryDone]++;
			tcb->snd.dupacks = 0;
			tcb->snd.recovery = 0;
			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
			if(tcb->ssthresh < tcb->cwind)
				tcb->cwind = tcb->ssthresh;
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
				tcb->cwind, tcb->ssthresh);
		} else {
			/* partial ack; we lost more than one segment */
			tpriv->stats[RecoveryPA]++;
			if(tcb->cwind > acked)
				tcb->cwind -= acked;
			else{
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
				tcb->cwind = tcb->mss;
			}
			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);

			if(acked >= tcb->mss)
				tcb->cwind += tcb->mss;
			tcb->snd.partialack++;
		}
	} else
		tcpabcincr(tcb, acked);

	/* Adjust the timers according to the round trip time */
	/* TODO: fix sloppy treatment of overflow cases here. */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* no rtt sample is taken while RETRAN is set */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* timer ticks elapsed, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1; /* else all close sys's will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				/* first sample seeds both estimators */
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				/* srtt and mdev are kept scaled by their gain shifts */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/* qdiscard removed fewer bytes than were acked: count the remainder against flgcnt */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;
	tcb->snd.una = seg->ack;

	/* newreno fast recovery */
	if(tcb->snd.recovery)
		tcprxmit(s);

	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* data still outstanding: restart the timer; otherwise stop it */
	if(tcb->snd.una != tcb->snd.nxt){
		/* `impatient' variant */
		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
			tcb->time = NOW;
			tcb->timeuna = tcb->snd.una;
			tcpgo(tpriv, &tcb->timer);
		}
	} else
		tcphalt(tpriv, &tcb->timer);

	/* the send pointer never trails what has been acknowledged */
	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	if(!tcb->snd.recovery)
		tcb->flags &= ~RETRAN;
	/* the backoff state is reset whether or not we are in recovery */
	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2107
/*
 * Input routine for a TCP segment in bp (IPv4 or IPv6, chosen by
 * the version nibble at the front of the block).
 *
 * Verifies the checksum, converts the header with ntohtcp4/6 and
 * trims the block to the payload.  Then, with the proto locked,
 * finds the conversation for the segment.  For a listener a RST
 * removes a call from limbo, a new SYN puts one into limbo, and a
 * plain ACK may turn a call in limbo into a new conversation.
 * Segments with no conversation are answered with a reset.
 *
 * The remainder runs with the conversation locked and processes
 * the segment, plus any adjacent segments waiting on the
 * resequence queue, according to the connection state.
 */
static void
tcpiput(Proto *tcp, Ipifc*, Block *bp)
{
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	ushort length, csum;
	uchar source[IPaddrlen], dest[IPaddrlen];
	Conv *s;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* both views of the same bytes; which one applies is decided next */
	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0)==IP_VER4) {
		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* fill in the fields the checksum below runs over */
		h4->Unused = 0;
		hnputs(h4->tcplen, length-TCP4_PKT);
		/* skipped when the block is flagged Btcpck or the checksum field is zero */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}

		hdrlen = ntohtcp4(&seg, &bp);
		if(hdrlen < 0){
			/* bp is not freed here; ntohtcp4 frees it on its short-header path */
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen+TCP4_PKT;
		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}
	else {
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/*
		 * rewrite header fields in place for the checksum
		 * (presumably into pseudo-header form); ttl, proto and
		 * ploadlen are put back afterwards
		 */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp,
			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
			return;
		}
	}

	/* lock protocol while searching for a conversation */
	qlock(tcp);

	/* Look for a matching conversation */
	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
	if(s == nil){
		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
			source, seg.source, dest, seg.dest);
reset:
		/* also reached by goto when a listener has no matching call in limbo */
		qunlock(tcp);
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state == Listen){
		if(seg.flags & RST){
			limborst(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
			limbo(s, source, dest, &seg, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/*
		 *  if there's a matching call in limbo, tcpincoming will
		 *  return it in state Syn_received
		 */
		/* NOTE(review): tcpincoming as visible in this file sets the new conv to Established */
		s = tcpincoming(s, &seg, source, dest, version);
		if(s == nil)
			goto reset;
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl*)s->ptcl;
	if(waserror()){
		qunlock(s);
		nexterror();
	}
	/* take the conversation lock before letting go of the proto lock */
	qlock(s);
	qunlock(tcp);

	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch(tcb->state) {
	case Closed:
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	case Syn_sent:
		if(seg.flags & ACK) {
			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
				sndrst(tcp, source, dest, length, &seg, version,
					 "bad seq in Syn_sent");
				goto raise;
			}
		}
		if(seg.flags & RST) {
			/* a RST without an ACK is dropped without closing */
			if(seg.flags & ACK)
				localclose(s, Econrefused);
			goto raise;
		}

		if(seg.flags & SYN) {
			procsyn(s, &seg);
			if(seg.flags & ACK){
				update(s, &seg);
				tcpsynackrtt(s);
				tcpsetstate(s, Established);
				tcpsetscale(s, tcb, seg.ws, tcb->scale);
			}
			else {
				tcb->time = NOW;
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
			}

			/* data or a FIN came with the SYN: keep processing below */
			if(length != 0 || (seg.flags & FIN))
				break;

			freeblist(bp);
			goto output;
		}
		else
			freeblist(bp);

		qunlock(s);
		poperror();
		return;
	case Syn_received:
		/* doesn't matter if it's the correct ack, we're just trying to set timing */
		if(seg.flags & ACK)
			tcpsynackrtt(s);
		break;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
		if(tcpporthogdefense
		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
				source, seg.source, dest, seg.dest, seg.flags,
				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	tcprcvwin(s);
	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* the one case not logged: a single byte just below rcv.nxt */
		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
			netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
				"%lud-%lud l %d from %I\n", seg.seq,
				seg.seq + length - 1, tcb->rcv.nxt,
				tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
		/* the ack and window fields are still applied */
		update(s, &seg);
		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2*(1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		if(!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(s);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if(length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if(seg.seq != tcb->rcv.nxt)
	if(length != 0 || (seg.flags & (SYN|FIN))) {
		update(s, &seg);
		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
				s->laddr, s->lport);
		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
		goto output;
	}

	if(tcb->nreseq > 0)
		tcb->flags |= FORCE;	/* filled hole in seq. space; RFC 5681 §3.2 */

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for(;;) {
		if(seg.flags & RST) {
			if(tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if(tcb->rcv.nxt != seg.seq)
					netlog(f, Logtcp, "out of order RST "
						"rcvd: %I.%d -> %I.%d, rcv.nxt "
						"%lux seq %lux\n",
						s->raddr, s->rport, s->laddr,
						s->lport, tcb->rcv.nxt, seg.seq);
			}
			localclose(s, Econrefused);
			goto raise;
		}

		/* segments without an ACK are dropped from here on */
		if((seg.flags&ACK) == 0)
			goto raise;

		switch(tcb->state) {
		case Syn_received:
			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
				sndrst(tcp, source, dest, length, &seg, version,
					"bad seq in Syn_received");
				goto raise;
			}
			update(s, &seg);
			tcpsetstate(s, Established);
			/* falls through; the update below returns at once since seg.update is set */
		case Established:
		case Close_wait:
			update(s, &seg);
			break;
		case Finwait1:
			update(s, &seg);
			/* write queue empty and no flags pending: move on to Finwait2 */
			if(qlen(s->wq)+tcb->flgcnt == 0){
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcpsetkacounter(tcb);
				tcb->time = NOW;
				tcpsetstate(s, Finwait2);
				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
				tcpgo(tpriv, &tcb->katimer);
			}
			break;
		case Finwait2:
			update(s, &seg);
			break;
		case Closing:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->timer);
			}
			break;
		case Last_ack:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				localclose(s, nil);
				goto raise;
			}
			/* falls through to Time_wait */
		case Time_wait:
			tcb->flags |= FORCE;
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);
		}

		if((seg.flags&URG) && seg.urg) {
			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		}
		else
			if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
				tcb->rcv.urg = tcb->rcv.nxt;

		if(length == 0) {
			if(bp != nil)
				freeblist(bp);
		}
		else {
			switch(tcb->state){
			default:
				/* Ignore segment text */
				if(bp != nil)
					freeblist(bp);
				break;

			case Syn_received:
			case Established:
			case Finwait1:
				/* If we still have some data place on
				 * receive queue
				 */
				if(bp) {
					bp = packblock(bp);
					if(bp == nil)
						panic("tcp packblock");
					qpassnolim(s->rq, bp);
					bp = nil;
				}
				tcb->rcv.nxt += length;

				/*
				 *  turn on the acktimer if there's something
				 *  to ack
				 */
				if(tcb->acktimer.state != TcptimerON)
					tcpgo(tpriv, &tcb->acktimer);

				break;
			case Finwait2:
				/* no process to read the data, send a reset */
				if(bp != nil)
					freeblist(bp);
				sndrst(tcp, source, dest, length, &seg, version,
					"send to Finwait2");
				qunlock(s);
				poperror();
				return;
			}
		}

		if(seg.flags & FIN) {
			tcb->flags |= FORCE;

			switch(tcb->state) {
			case Syn_received:
			case Established:
				/* the FIN takes one sequence number */
				tcb->rcv.nxt++;
				tcpsetstate(s, Close_wait);
				break;
			case Finwait1:
				tcb->rcv.nxt++;
				if(qlen(s->wq)+tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2*(1000/MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				else
					tcpsetstate(s, Closing);
				break;
			case Finwait2:
				tcb->rcv.nxt++;
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2 * (1000/MSPTICK);
				tcpgo(tpriv, &tcb->timer);
				break;
			case Close_wait:
			case Closing:
			case Last_ack:
				break;
			case Time_wait:
				tcpgo(tpriv, &tcb->timer);
				break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for(;;) {
			if(tcb->reseq == nil)
				goto output;

			/* the first queued segment starts beyond rcv.nxt: still a hole */
			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			tcprcvwin(s);
			/* usable segment: go round the outer loop with it */
			if(tcptrim(tcb, &seg, &bp, &length) == 0){
				tcb->flags |= FORCE;
				break;
			}
		}
	}
output:
	tcpoutput(s);
	qunlock(s);
	poperror();
	return;
raise:
	/* drop the segment and kick the conversation */
	qunlock(s);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
2587
/*
 *  Send whatever the conversation may currently send: data copied
 *  from s->wq, SYN/FIN flags, or a bare ACK when FORCE is set.
 *  At most 100 segments are emitted per call.
 *
 *  always enters and exits with the s locked.  We drop
 *  the lock to ipoput the packet so some care has to be
 *  taken by callers.
 *  NOTE(review): in the code below the lock is released and
 *  immediately retaken after every fourth segment (see the end of
 *  the loop), not around each ipoput call.
 *
 *  Returns early, without sending, in the Listen, Closed and Finwait2
 *  states and when header construction (htontcp4/htontcp6) fails.
 */
static void
tcpoutput(Conv *s)
{
	Tcp seg;
	uint msgs;
	Tcpctl *tcb;
	Block *hbp, *bp;
	int sndcnt;
	ulong ssize, dsize, sent;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	tcb = (Tcpctl*)s->ptcl;

	/* force ack every 2*mss */
	if((tcb->flags & FORCE) == 0 &&
	    tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
		tpriv->stats[Delayack]++;
		tcb->flags |= FORCE;
	}

	/* force ack if window opening */
	if((tcb->flags & FORCE) == 0){
		tcprcvwin(s);
		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
			tpriv->stats[Wopenack]++;
			tcb->flags |= FORCE;
		}
	}

	for(msgs = 0; msgs < 100; msgs++) {
		/* nothing is ever transmitted from these states */
		switch(tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* Don't send anything else until our SYN has been acked */
		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* force an ack when a window has opened up */
		tcprcvwin(s);
		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/*
		 * sndcnt: bytes queued in wq plus flgcnt (presumably the
		 * sequence space taken by pending SYN/FIN -- confirm).
		 * sent: distance from snd.una to snd.ptr, i.e. what has
		 * been handed out but not yet acknowledged.
		 */
		sndcnt = qlen(s->wq)+tcb->flgcnt;
		sent = tcb->snd.ptr - tcb->snd.una;
		ssize = sndcnt;
		if(tcb->snd.wnd == 0){
			/* zero window probe */
			if(sent > 0 && !(tcb->flags & FORCE))
				break;	/* already probing, rto re-probes */
			/* probe with at most one unit of sequence space */
			if(ssize < sent)
				ssize = 0;
			else{
				ssize -= sent;
				if(ssize > 0)
					ssize = 1;
			}
		} else {
			/* calculate usable segment size */
			if(ssize > tcb->cwind)
				ssize = tcb->cwind;
			if(ssize > tcb->snd.wnd)
				ssize = tcb->snd.wnd;

			/* subtract what is already in flight, cap at one mss */
			if(ssize < sent)
				ssize = 0;
			else {
				ssize -= sent;
				if(ssize > tcb->mss)
					ssize = tcb->mss;
			}
		}

		/*
		 * ssize is the sequence space this segment consumes;
		 * dsize is the number of bytes to copy from wq.  They
		 * diverge below when a SYN or FIN occupies a slot.
		 */
		dsize = ssize;
		seg.urg = 0;

		/*
		 * Unless an ack is being forced, stop when there is
		 * nothing to send, or when the segment would be smaller
		 * than mss while snd.ptr is at snd.nxt and more than
		 * TCPREXMTTHRESH*mss bytes are still outstanding.
		 */
		if(!(tcb->flags & FORCE))
			if(ssize == 0 ||
			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
			    sent > TCPREXMTTHRESH * tcb->mss)
				break;

		tcb->flags &= ~FORCE;

		/* By default we will generate an ack */
		tcphalt(tpriv, &tcb->acktimer);
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		seg.update = 0;
		switch(tcb->state){
		case Syn_sent:
			/* no ACK flag here; the SYN takes one unit of ssize */
			seg.flags = 0;
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize--;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		case Syn_received:
			/*
			 * don't send any data with a SYN/ACK packet
			 * because Linux rejects the packet in its
			 * attempt to solve the SYN attack problem
			 */
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize = 0;
				ssize = 1;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		}
		seg.seq = tcb->snd.ptr;
		seg.ack = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd;

		/* Pull out data to send */
		bp = nil;
		if(dsize != 0) {
			bp = qcopy(s->wq, dsize, sent);
			/*
			 * fewer bytes came back than were asked for:
			 * the missing unit is sent as a FIN
			 */
			if(BLEN(bp) != dsize) {
				seg.flags |= FIN;
				dsize--;
			}
		}

		/* set PSH when this segment carries the last queued data */
		if(sent+dsize == sndcnt && dsize)
			seg.flags |= PSH;

		tcb->snd.ptr += ssize;

		/* Pull up the send pointer so we can accept acks
		 * for this window
		 */
		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
			tcb->snd.nxt = tcb->snd.ptr;

		/* Build header, link data and compute cksum */
		switch(version){
		case V4:
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
			if(hbp == nil) {
				/* note: snd.ptr has already been advanced */
				freeblist(bp);
				return;
			}
			break;
		case V6:
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
			if(hbp == nil) {
				/* note: snd.ptr has already been advanced */
				freeblist(bp);
				return;
			}
			break;
		default:
			hbp = nil;	/* to suppress a warning */
			panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if(ssize != 0){
			if(tcb->timer.state != TcptimerON){
				tcb->time = NOW;
				tcb->timeuna = tcb->snd.una;
				tcpgo(tpriv, &tcb->timer);
			}

			/*  If round trip timer isn't running, start it.
			 *  measure the longest packet only in case the
			 *  transmission time dominates RTT
			 */
			if(tcb->snd.retransmit == 0)
			if(tcb->rtt_timer.state != TcptimerON)
			if(ssize == tcb->mss) {
				tcpgo(tpriv, &tcb->rtt_timer);
				tcb->rttseq = tcb->snd.ptr;
			}
		}

		tpriv->stats[OutSegs]++;
		if(tcb->snd.retransmit)
			tpriv->stats[RetransSegsSent]++;
		/* remember what was acked and advertised by this segment */
		tcb->rcv.ackptr = seg.ack;
		tcb->rcv.wsnt = tcb->rcv.wptr;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch(version){
		case V4:
			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		case V6:
			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		default:
			panic("tcpoutput2: version %d", version);
		}
		/* after every fourth segment, let other lock waiters in */
		if((msgs%4) == 3){
			qunlock(s);
			qlock(s);
		}
	}
}
2822
2823 /*
2824 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2825 */
2826 static void
tcpsendka(Conv * s)2827 tcpsendka(Conv *s)
2828 {
2829 Tcp seg;
2830 Tcpctl *tcb;
2831 Block *hbp,*dbp;
2832
2833 tcb = (Tcpctl*)s->ptcl;
2834
2835 dbp = nil;
2836 memset(&seg, 0, sizeof seg);
2837 seg.urg = 0;
2838 seg.source = s->lport;
2839 seg.dest = s->rport;
2840 seg.flags = ACK|PSH;
2841 seg.mss = 0;
2842 seg.ws = 0;
2843 if(tcpporthogdefense)
2844 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2845 else
2846 seg.seq = tcb->snd.una-1;
2847 seg.ack = tcb->rcv.nxt;
2848 tcb->rcv.ackptr = seg.ack;
2849 tcprcvwin(s);
2850 seg.wnd = tcb->rcv.wnd;
2851 if(tcb->state == Finwait2){
2852 seg.flags |= FIN;
2853 } else {
2854 dbp = allocb(1);
2855 dbp->wp++;
2856 }
2857
2858 if(isv4(s->raddr)) {
2859 /* Build header, link data and compute cksum */
2860 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2861 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2862 if(hbp == nil) {
2863 freeblist(dbp);
2864 return;
2865 }
2866 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2867 }
2868 else {
2869 /* Build header, link data and compute cksum */
2870 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2871 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2872 if(hbp == nil) {
2873 freeblist(dbp);
2874 return;
2875 }
2876 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2877 }
2878 }
2879
2880 /*
2881 * set connection to time out after 12 minutes
2882 */
2883 static void
tcpsetkacounter(Tcpctl * tcb)2884 tcpsetkacounter(Tcpctl *tcb)
2885 {
2886 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2887 if(tcb->kacounter < 3)
2888 tcb->kacounter = 3;
2889 }
2890
2891 /*
2892 * if we've timed out, close the connection
2893 * otherwise, send a keepalive and restart the timer
2894 */
2895 static void
tcpkeepalive(void * v)2896 tcpkeepalive(void *v)
2897 {
2898 Tcpctl *tcb;
2899 Conv *s;
2900
2901 s = v;
2902 tcb = (Tcpctl*)s->ptcl;
2903 if(waserror()){
2904 qunlock(s);
2905 nexterror();
2906 }
2907 qlock(s);
2908 if(tcb->state != Closed){
2909 if(--(tcb->kacounter) <= 0) {
2910 localclose(s, Etimedout);
2911 } else {
2912 tcpsendka(s);
2913 tcpgo(s->p->priv, &tcb->katimer);
2914 }
2915 }
2916 qunlock(s);
2917 poperror();
2918 }
2919
2920 /*
2921 * start keepalive timer
2922 */
2923 static char*
tcpstartka(Conv * s,char ** f,int n)2924 tcpstartka(Conv *s, char **f, int n)
2925 {
2926 Tcpctl *tcb;
2927 int x;
2928
2929 tcb = (Tcpctl*)s->ptcl;
2930 if(tcb->state != Established)
2931 return "connection must be in Establised state";
2932 if(n > 1){
2933 x = atoi(f[1]);
2934 if(x >= MSPTICK)
2935 tcb->katimer.start = x/MSPTICK;
2936 }
2937 tcpsetkacounter(tcb);
2938 tcpgo(s->p->priv, &tcb->katimer);
2939
2940 return nil;
2941 }
2942
2943 /*
2944 * turn checksums on/off
2945 */
2946 static char*
tcpsetchecksum(Conv * s,char ** f,int)2947 tcpsetchecksum(Conv *s, char **f, int)
2948 {
2949 Tcpctl *tcb;
2950
2951 tcb = (Tcpctl*)s->ptcl;
2952 tcb->nochecksum = !atoi(f[1]);
2953
2954 return nil;
2955 }
2956
2957 /*
2958 * retransmit (at most) one segment at snd.una.
2959 * preserve cwind & snd.ptr
2960 */
2961 static void
tcprxmit(Conv * s)2962 tcprxmit(Conv *s)
2963 {
2964 Tcpctl *tcb;
2965 Tcppriv *tpriv;
2966 ulong tcwind, tptr;
2967
2968 tcb = (Tcpctl*)s->ptcl;
2969 tcb->flags |= RETRAN|FORCE;
2970
2971 tptr = tcb->snd.ptr;
2972 tcwind = tcb->cwind;
2973 tcb->snd.ptr = tcb->snd.una;
2974 tcb->cwind = tcb->mss;
2975 tcb->snd.retransmit = 1;
2976 tcpoutput(s);
2977 tcb->snd.retransmit = 0;
2978 tcb->cwind = tcwind;
2979 tcb->snd.ptr = tptr;
2980
2981 tpriv = s->p->priv;
2982 tpriv->stats[RetransSegs]++;
2983 }
2984
2985 /*
2986 * TODO: RFC 4138 F-RTO
2987 */
2988 static void
tcptimeout(void * arg)2989 tcptimeout(void *arg)
2990 {
2991 Conv *s;
2992 Tcpctl *tcb;
2993 int maxback;
2994 Tcppriv *tpriv;
2995
2996 s = (Conv*)arg;
2997 tpriv = s->p->priv;
2998 tcb = (Tcpctl*)s->ptcl;
2999
3000 if(waserror()){
3001 qunlock(s);
3002 nexterror();
3003 }
3004 qlock(s);
3005 switch(tcb->state){
3006 default:
3007 tcb->backoff++;
3008 if(tcb->state == Syn_sent)
3009 maxback = MAXBACKMS/2;
3010 else
3011 maxback = MAXBACKMS;
3012 tcb->backedoff += tcb->timer.start * MSPTICK;
3013 if(tcb->backedoff >= maxback) {
3014 localclose(s, Etimedout);
3015 break;
3016 }
3017 netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
3018 tcb->srtt, tcb->mdev, NOW - tcb->time,
3019 tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
3020 tcpstates[s->state]);
3021 tcpsettimer(tcb);
3022 if(tcb->snd.rto == 0)
3023 tcpcongestion(tcb);
3024 tcprxmit(s);
3025 tcb->snd.ptr = tcb->snd.una;
3026 tcb->cwind = tcb->mss;
3027 tcb->snd.rto = 1;
3028 tpriv->stats[RetransTimeouts]++;
3029
3030 if(tcb->snd.recovery){
3031 tcb->snd.dupacks = 0; /* reno rto */
3032 tcb->snd.recovery = 0;
3033 tpriv->stats[RecoveryRTO]++;
3034 tcb->snd.rxt = tcb->snd.nxt;
3035 netlog(s->p->f, Logtcpwin,
3036 "rto recovery rxt @%lud\n", tcb->snd.nxt);
3037 }
3038
3039 tcb->abcbytes = 0;
3040 break;
3041 case Time_wait:
3042 localclose(s, nil);
3043 break;
3044 case Closed:
3045 break;
3046 }
3047 qunlock(s);
3048 poperror();
3049 }
3050
3051 static int
inwindow(Tcpctl * tcb,int seq)3052 inwindow(Tcpctl *tcb, int seq)
3053 {
3054 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3055 }
3056
3057 /*
3058 * set up state for a received SYN (or SYN ACK) packet
3059 */
3060 static void
procsyn(Conv * s,Tcp * seg)3061 procsyn(Conv *s, Tcp *seg)
3062 {
3063 Tcpctl *tcb;
3064 Tcppriv *tpriv;
3065
3066 tcb = (Tcpctl*)s->ptcl;
3067 tcb->flags |= FORCE;
3068
3069 tcb->rcv.nxt = seg->seq + 1;
3070 tcb->rcv.wptr = tcb->rcv.nxt;
3071 tcb->rcv.wsnt = 0;
3072 tcb->rcv.urg = tcb->rcv.nxt;
3073 tcb->irs = seg->seq;
3074
3075 /* our sending max segment size cannot be bigger than what he asked for */
3076 if(seg->mss != 0 && seg->mss < tcb->mss) {
3077 tcb->mss = seg->mss;
3078 tpriv = s->p->priv;
3079 tpriv->stats[Mss] = tcb->mss;
3080 }
3081
3082 tcb->snd.wnd = seg->wnd;
3083 initialwindow(tcb);
3084 }
3085
3086 static int
dumpreseq(Tcpctl * tcb)3087 dumpreseq(Tcpctl *tcb)
3088 {
3089 Reseq *r, *next;
3090
3091 for(r = tcb->reseq; r != nil; r = next){
3092 next = r->next;
3093 freeblist(r->bp);
3094 free(r);
3095 }
3096 tcb->reseq = nil;
3097 tcb->nreseq = 0;
3098 tcb->reseqlen = 0;
3099 return -1;
3100 }
3101
3102 static void
logreseq(Fs * f,Reseq * r,ulong n)3103 logreseq(Fs *f, Reseq *r, ulong n)
3104 {
3105 char *s;
3106
3107 for(; r != nil; r = r->next){
3108 s = nil;
3109 if(r->next == nil && r->seg.seq != n)
3110 s = "hole/end";
3111 else if(r->next == nil)
3112 s = "end";
3113 else if(r->seg.seq != n)
3114 s = "hole";
3115 if(s != nil)
3116 netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3117 n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3118 n = r->seg.seq + r->seg.len;
3119 }
3120 }
3121
3122 static int
addreseq(Fs * f,Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)3123 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3124 {
3125 Reseq *rp, **rr;
3126 int qmax;
3127
3128 rp = malloc(sizeof *rp);
3129 if(rp == nil){
3130 freeblist(bp); /* bp always consumed by addreseq */
3131 return 0;
3132 }
3133
3134 rp->seg = *seg;
3135 rp->bp = bp;
3136 rp->length = length;
3137
3138 tcb->reseqlen += length;
3139 tcb->nreseq++;
3140
3141 /* Place on reassembly list sorting by starting seq number */
3142 for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3143 if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3144 rp->next = *rr;
3145 *rr = rp;
3146 tpriv->stats[Resequenced]++;
3147 if(rp->next != nil)
3148 tpriv->stats[OutOfOrder]++;
3149 break;
3150 }
3151
3152 qmax = tcb->window;
3153 if(tcb->reseqlen > qmax){
3154 netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3155 tcb->reseqlen, qmax, tcb->nreseq);
3156 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3157 tpriv->stats[ReseqBytelim]++;
3158 return dumpreseq(tcb);
3159 }
3160 qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3161 if(tcb->nreseq > qmax){
3162 netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3163 tcb->nreseq, qmax, tcb->reseqlen);
3164 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3165 tpriv->stats[ReseqPktlim]++;
3166 return dumpreseq(tcb);
3167 }
3168 return 0;
3169 }
3170
3171 static void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3172 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3173 {
3174 Reseq *rp;
3175
3176 rp = tcb->reseq;
3177 if(rp == nil)
3178 return;
3179
3180 tcb->reseq = rp->next;
3181
3182 *seg = rp->seg;
3183 *bp = rp->bp;
3184 *length = rp->length;
3185
3186 tcb->nreseq--;
3187 tcb->reseqlen -= rp->length;
3188
3189 free(rp);
3190 }
3191
3192 static int
tcptrim(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)3193 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3194 {
3195 ushort len;
3196 uchar accept;
3197 int dupcnt, excess;
3198
3199 accept = 0;
3200 len = *length;
3201 if(seg->flags & SYN)
3202 len++;
3203 if(seg->flags & FIN)
3204 len++;
3205
3206 if(tcb->rcv.wnd == 0) {
3207 if(len == 0 && seg->seq == tcb->rcv.nxt)
3208 return 0;
3209 }
3210 else {
3211 /* Some part of the segment should be in the window */
3212 if(inwindow(tcb,seg->seq))
3213 accept++;
3214 else
3215 if(len != 0) {
3216 if(inwindow(tcb, seg->seq+len-1) ||
3217 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3218 accept++;
3219 }
3220 }
3221 if(!accept) {
3222 freeblist(*bp);
3223 return -1;
3224 }
3225 dupcnt = tcb->rcv.nxt - seg->seq;
3226 if(dupcnt > 0){
3227 tcb->rerecv += dupcnt;
3228 if(seg->flags & SYN){
3229 seg->flags &= ~SYN;
3230 seg->seq++;
3231
3232 if(seg->urg > 1)
3233 seg->urg--;
3234 else
3235 seg->flags &= ~URG;
3236 dupcnt--;
3237 }
3238 if(dupcnt > 0){
3239 pullblock(bp, (ushort)dupcnt);
3240 seg->seq += dupcnt;
3241 *length -= dupcnt;
3242
3243 if(seg->urg > dupcnt)
3244 seg->urg -= dupcnt;
3245 else {
3246 seg->flags &= ~URG;
3247 seg->urg = 0;
3248 }
3249 }
3250 }
3251 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3252 if(excess > 0) {
3253 tcb->rerecv += excess;
3254 *length -= excess;
3255 *bp = trimblock(*bp, 0, *length);
3256 if(*bp == nil)
3257 panic("presotto is a boofhead");
3258 seg->flags &= ~FIN;
3259 }
3260 return 0;
3261 }
3262
3263 static void
tcpadvise(Proto * tcp,Block * bp,char * msg)3264 tcpadvise(Proto *tcp, Block *bp, char *msg)
3265 {
3266 Tcp4hdr *h4;
3267 Tcp6hdr *h6;
3268 Tcpctl *tcb;
3269 uchar source[IPaddrlen];
3270 uchar dest[IPaddrlen];
3271 ushort psource, pdest;
3272 Conv *s, **p;
3273
3274 h4 = (Tcp4hdr*)(bp->rp);
3275 h6 = (Tcp6hdr*)(bp->rp);
3276
3277 if((h4->vihl&0xF0)==IP_VER4) {
3278 v4tov6(dest, h4->tcpdst);
3279 v4tov6(source, h4->tcpsrc);
3280 psource = nhgets(h4->tcpsport);
3281 pdest = nhgets(h4->tcpdport);
3282 }
3283 else {
3284 ipmove(dest, h6->tcpdst);
3285 ipmove(source, h6->tcpsrc);
3286 psource = nhgets(h6->tcpsport);
3287 pdest = nhgets(h6->tcpdport);
3288 }
3289
3290 /* Look for a connection */
3291 qlock(tcp);
3292 for(p = tcp->conv; *p; p++) {
3293 s = *p;
3294 tcb = (Tcpctl*)s->ptcl;
3295 if(s->rport == pdest)
3296 if(s->lport == psource)
3297 if(tcb->state != Closed)
3298 if(ipcmp(s->raddr, dest) == 0)
3299 if(ipcmp(s->laddr, source) == 0){
3300 qlock(s);
3301 qunlock(tcp);
3302 switch(tcb->state){
3303 case Syn_sent:
3304 localclose(s, msg);
3305 break;
3306 }
3307 qunlock(s);
3308 freeblist(bp);
3309 return;
3310 }
3311 }
3312 qunlock(tcp);
3313 freeblist(bp);
3314 }
3315
3316 static char*
tcpporthogdefensectl(char * val)3317 tcpporthogdefensectl(char *val)
3318 {
3319 if(strcmp(val, "on") == 0)
3320 tcpporthogdefense = 1;
3321 else if(strcmp(val, "off") == 0)
3322 tcpporthogdefense = 0;
3323 else
3324 return "unknown value for tcpporthogdefense";
3325 return nil;
3326 }
3327
3328 /* called with c qlocked */
3329 static char*
tcpctl(Conv * c,char ** f,int n)3330 tcpctl(Conv* c, char** f, int n)
3331 {
3332 if(n == 1 && strcmp(f[0], "close") == 0)
3333 return tcpclose2(c);
3334 if(n == 1 && strcmp(f[0], "hangup") == 0)
3335 return tcphangup(c);
3336 if(n == 1 && strcmp(f[0], "hangupxmit") == 0)
3337 return tcpxmitclose(c);
3338 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3339 return tcpstartka(c, f, n);
3340 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3341 return tcpsetchecksum(c, f, n);
3342 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3343 return tcpporthogdefensectl(f[1]);
3344 return "unknown control request";
3345 }
3346
3347 static int
tcpstats(Proto * tcp,char * buf,int len)3348 tcpstats(Proto *tcp, char *buf, int len)
3349 {
3350 Tcppriv *priv;
3351 char *p, *e;
3352 int i;
3353
3354 priv = tcp->priv;
3355 p = buf;
3356 e = p+len;
3357 for(i = 0; i < Nstats; i++)
3358 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3359 return p - buf;
3360 }
3361
3362 /*
3363 * garbage collect any stale conversations:
3364 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3365 * - Finwait2 after 5 minutes
3366 *
3367 * this is called whenever we run out of channels. Both checks are
3368 * of questionable validity so we try to use them only when we're
3369 * up against the wall.
3370 */
3371 static int
tcpgc(Proto * tcp)3372 tcpgc(Proto *tcp)
3373 {
3374 Conv *c, **pp, **ep;
3375 int n;
3376 Tcpctl *tcb;
3377
3378
3379 n = 0;
3380 ep = &tcp->conv[tcp->nc];
3381 for(pp = tcp->conv; pp < ep; pp++) {
3382 c = *pp;
3383 if(c == nil)
3384 break;
3385 if(!canqlock(c))
3386 continue;
3387 tcb = (Tcpctl*)c->ptcl;
3388 switch(tcb->state){
3389 case Syn_received:
3390 if(NOW - tcb->time > 5000){
3391 localclose(c, Etimedout);
3392 n++;
3393 }
3394 break;
3395 case Finwait2:
3396 if(NOW - tcb->time > 5*60*1000){
3397 localclose(c, Etimedout);
3398 n++;
3399 }
3400 break;
3401 }
3402 qunlock(c);
3403 }
3404 return n;
3405 }
3406
3407 static void
tcpsettimer(Tcpctl * tcb)3408 tcpsettimer(Tcpctl *tcb)
3409 {
3410 int x;
3411
3412 /* round trip dependency */
3413 x = backoff(tcb->backoff) *
3414 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3415
3416 /* bounded twixt 0.3 and 64 seconds */
3417 if(x < 300/MSPTICK)
3418 x = 300/MSPTICK;
3419 else if(x > (64000/MSPTICK))
3420 x = 64000/MSPTICK;
3421 tcb->timer.start = x;
3422 }
3423
3424 void
tcpinit(Fs * fs)3425 tcpinit(Fs *fs)
3426 {
3427 Proto *tcp;
3428 Tcppriv *tpriv;
3429
3430 tcp = smalloc(sizeof(Proto));
3431 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3432 tcp->name = "tcp";
3433 tcp->connect = tcpconnect;
3434 tcp->announce = tcpannounce;
3435 tcp->ctl = tcpctl;
3436 tcp->state = tcpstate;
3437 tcp->create = tcpcreate;
3438 tcp->close = tcpclose;
3439 tcp->rcv = tcpiput;
3440 tcp->advise = tcpadvise;
3441 tcp->stats = tcpstats;
3442 tcp->inuse = tcpinuse;
3443 tcp->gc = tcpgc;
3444 tcp->ipproto = IP_TCPPROTO;
3445 tcp->nc = scalednconv();
3446 tcp->ptclsize = sizeof(Tcpctl);
3447 tpriv->stats[MaxConn] = tcp->nc;
3448
3449 Fsproto(fs, tcp);
3450 }
3451
3452 static void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3453 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3454 {
3455 /*
3456 * guess at reasonable queue sizes. there's no current way
3457 * to know how many nic receive buffers we can safely tie up in the
3458 * tcp stack, and we don't adjust our queues to maximize throughput
3459 * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be
3460 * respected, but we still control our own buffer commitment by
3461 * keeping a seperate qscale.
3462 */
3463 tcb->rcv.scale = rcvscale & 0xff;
3464 tcb->snd.scale = sndscale & 0xff;
3465 tcb->qscale = rcvscale & 0xff;
3466 if(rcvscale > Maxqscale)
3467 tcb->qscale = Maxqscale;
3468
3469 if(rcvscale != tcb->rcv.scale)
3470 netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3471 "qlen %d >> window %ud lport %d\n",
3472 tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3473 tcb->window = QMAX << tcb->qscale;
3474 tcb->ssthresh = tcb->window;
3475
3476 /*
3477 * it's important to set wq large enough to cover the full
3478 * bandwidth-delay product. it's possible to be in loss
3479 * recovery with a big window, and we need to keep sending
3480 * into the inflated window. the difference can be huge
3481 * for even modest (70ms) ping times.
3482 */
3483 qsetlimit(s->rq, tcb->window);
3484 qsetlimit(s->wq, tcb->window);
3485 tcprcvwin(s);
3486 }
3487