1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7
8 #include "ip.h"
9
/*
 * Constants shared by the TCP code: queue sizing, v4/v6 header
 * geometry, timer parameters, header flag bits, option codes,
 * per-connection flag bits, RTT filter gains and connection states.
 */
enum
{
	QMAX		= 64*1024-1,	/* queue limit; also the default receive window */
	IP_TCPPROTO	= 6,		/* value stored in the IP header's proto field */

	/* IPv4 header geometry, in bytes */
	TCP4_IPLEN	= 8,		/* offset at which the checksum starts */
	TCP4_PHDRSIZE	= 12,		/* pseudo header */
	TCP4_HDRSIZE	= 20,		/* TCP header without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes copied from the tcb's prototype header */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* IPv6 header geometry, in bytes */
	TCP6_IPLEN	= 0,		/* offset at which the checksum starts */
	TCP6_PHDRSIZE	= 40,		/* pseudo header */
	TCP6_HDRSIZE	= 20,		/* TCP header without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes copied from the tcb's prototype header */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer states */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG		= 1<<5,		/* Data marked urgent */
	ACK		= 1<<4,		/* Acknowledge is valid */
	PSH		= 1<<3,		/* Whole data pipe is pushed */
	RST		= 1<<2,		/* Reset connection */
	SYN		= 1<<1,		/* Pkt. is synchronise */
	FIN		= 1<<0,		/* Start close down */

	/* TCP option kinds and lengths */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* length of the segment size option */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* length of the window scale option */

	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default segment size */
	DEF_MSS6	= 1280,		/* Default segment size (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits kept in Tcpctl.flags */
	FORCE		= 1<<0,
	CLONE		= 1<<1,
	RETRAN		= 1<<2,
	ACTIVE		= 1<<3,
	SYNACK		= 1<<4,

	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	/* Connection states; tcpstates[] is indexed by these */
	Closed		= 0,
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into a scale value to mark it as present */
};
86
/*
 * Printable names of the connection states.
 * Must correspond to the enumeration above: the array is indexed
 * by state value, starting at Closed.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
94
/*
 * One tick-driven timer.  Running timers are chained through
 * next/prev on the protocol's timer list; expired ones are collected
 * through readynext before their callback is run.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* list of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of timers that just expired */
	int		state;		/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int		start;		/* tick count loaded into count when started */
	int		count;		/* ticks left */
	void		(*func)(void*);	/* called on expiry */
	void		*arg;		/* passed to func */
};
107
108 /*
109 * v4 and v6 pseudo headers used for
110 * checksuming tcp
111 */
112 typedef struct Tcp4hdr Tcp4hdr;
113 struct Tcp4hdr
114 {
115 uchar vihl; /* Version and header length */
116 uchar tos; /* Type of service */
117 uchar length[2]; /* packet length */
118 uchar id[2]; /* Identification */
119 uchar frag[2]; /* Fragment information */
120 uchar Unused;
121 uchar proto;
122 uchar tcplen[2];
123 uchar tcpsrc[4];
124 uchar tcpdst[4];
125 uchar tcpsport[2];
126 uchar tcpdport[2];
127 uchar tcpseq[4];
128 uchar tcpack[4];
129 uchar tcpflag[2];
130 uchar tcpwin[2];
131 uchar tcpcksum[2];
132 uchar tcpurg[2];
133 /* Options segment */
134 uchar tcpopt[1];
135 };
136
137 typedef struct Tcp6hdr Tcp6hdr;
138 struct Tcp6hdr
139 {
140 uchar vcf[4];
141 uchar ploadlen[2];
142 uchar proto;
143 uchar ttl;
144 uchar tcpsrc[IPaddrlen];
145 uchar tcpdst[IPaddrlen];
146 uchar tcpsport[2];
147 uchar tcpdport[2];
148 uchar tcpseq[4];
149 uchar tcpack[4];
150 uchar tcpflag[2];
151 uchar tcpwin[2];
152 uchar tcpcksum[2];
153 uchar tcpurg[2];
154 /* Options segment */
155 uchar tcpopt[1];
156 };
157
158 /*
159 * this represents the control info
160 * for a single packet. It is derived from
161 * a packet in ntohtcp{4,6}() and stuck into
162 * a packet in htontcp{4,6}().
163 */
164 typedef struct Tcp Tcp;
165 struct Tcp
166 {
167 ushort source;
168 ushort dest;
169 ulong seq;
170 ulong ack;
171 uchar flags;
172 ushort ws; /* window scale option (if not zero) */
173 ulong wnd;
174 ushort urg;
175 ushort mss; /* max segment size option (if not zero) */
176 ushort len; /* size of data */
177 };
178
179 /*
180 * this header is malloc'd to thread together fragments
181 * waiting to be coalesced
182 */
183 typedef struct Reseq Reseq;
184 struct Reseq
185 {
186 Reseq *next;
187 Tcp seg;
188 Block *bp;
189 ushort length;
190 };
191
192 /*
193 * the qlock in the Conv locks this structure
194 */
195 typedef struct Tcpctl Tcpctl;
196 struct Tcpctl
197 {
198 uchar state; /* Connection state */
199 uchar type; /* Listening or active connection */
200 uchar code; /* Icmp code */
201 struct {
202 ulong una; /* Unacked data pointer */
203 ulong nxt; /* Next sequence expected */
204 ulong ptr; /* Data pointer */
205 ulong wnd; /* Tcp send window */
206 ulong urg; /* Urgent data pointer */
207 ulong wl2;
208 int scale; /* how much to right shift window in xmitted packets */
209 /* to implement tahoe and reno TCP */
210 ulong dupacks; /* number of duplicate acks rcvd */
211 int recovery; /* loss recovery flag */
212 ulong rxt; /* right window marker for recovery */
213 } snd;
214 struct {
215 ulong nxt; /* Receive pointer to next uchar slot */
216 ulong wnd; /* Receive window incoming */
217 ulong urg; /* Urgent pointer */
218 int blocked;
219 int una; /* unacked data segs */
220 int scale; /* how much to left shift window in rcved packets */
221 } rcv;
222 ulong iss; /* Initial sequence number */
223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */
224 ulong cwind; /* Congestion window */
225 int scale; /* desired snd.scale */
226 ushort ssthresh; /* Slow start threshold */
227 int resent; /* Bytes just resent */
228 int irs; /* Initial received squence */
229 ushort mss; /* Mean segment size */
230 int rerecv; /* Overlap of data rerecevived */
231 ulong window; /* Recevive window */
232 uchar backoff; /* Exponential backoff counter */
233 int backedoff; /* ms we've backed off for rexmits */
234 uchar flags; /* State flags */
235 Reseq *reseq; /* Resequencing queue */
236 Tcptimer timer; /* Activity timer */
237 Tcptimer acktimer; /* Acknowledge timer */
238 Tcptimer rtt_timer; /* Round trip timer */
239 Tcptimer katimer; /* keep alive timer */
240 ulong rttseq; /* Round trip sequence */
241 int srtt; /* Shortened round trip */
242 int mdev; /* Mean deviation of round trip */
243 int kacounter; /* count down for keep alive */
244 uint sndsyntime; /* time syn sent */
245 ulong time; /* time Finwait2 or Syn_received was sent */
246 int nochecksum; /* non-zero means don't send checksums */
247 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
248
249 union {
250 Tcp4hdr tcp4hdr;
251 Tcp6hdr tcp6hdr;
252 } protohdr; /* prototype header */
253 };
254
255 /*
256 * New calls are put in limbo rather than having a conversation structure
257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not
258 * any real Conv structures mucking things up. Calls in limbo rexmit their
259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260 *
261 * In particular they aren't on a listener's queue so that they don't figure
262 * in the input queue limit.
263 *
264 * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore
266 * there is no hashing of this list.
267 */
268 typedef struct Limbo Limbo;
269 struct Limbo
270 {
271 Limbo *next;
272
273 uchar laddr[IPaddrlen];
274 uchar raddr[IPaddrlen];
275 ushort lport;
276 ushort rport;
277 ulong irs; /* initial received sequence */
278 ulong iss; /* initial sent sequence */
279 ushort mss; /* mss from the other end */
280 ushort rcvscale; /* how much to scale rcvd windows */
281 ushort sndscale; /* how much to scale sent windows */
282 ulong lastsend; /* last time we sent a synack */
283 uchar version; /* v4 or v6 */
284 uchar rexmits; /* number of retransmissions */
285 };
286
287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */
289
/*
 * Indices into Tcppriv.stats[] and statnames[].
 * Nstats must stay last: it sizes both arrays.
 */
enum {
	/* MIB stats */
	MaxConn = 0,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};
312
313 static char *statnames[] =
314 {
315 [MaxConn] "MaxConn",
316 [ActiveOpens] "ActiveOpens",
317 [PassiveOpens] "PassiveOpens",
318 [EstabResets] "EstabResets",
319 [CurrEstab] "CurrEstab",
320 [InSegs] "InSegs",
321 [OutSegs] "OutSegs",
322 [RetransSegs] "RetransSegs",
323 [RetransTimeouts] "RetransTimeouts",
324 [InErrs] "InErrs",
325 [OutRsts] "OutRsts",
326 [CsumErrs] "CsumErrs",
327 [HlenErrs] "HlenErrs",
328 [LenErrs] "LenErrs",
329 [OutOfOrder] "OutOfOrder",
330 };
331
332 typedef struct Tcppriv Tcppriv;
333 struct Tcppriv
334 {
335 /* List of active timers */
336 QLock tl;
337 Tcptimer *timers;
338
339 /* hash table for matching conversations */
340 Ipht ht;
341
342 /* calls in limbo waiting for an ACK to our SYN ACK */
343 int nlimbo;
344 Limbo *lht[NLHT];
345
346 /* for keeping track of tcpackproc */
347 QLock apl;
348 int ackprocstarted;
349
350 ulong stats[Nstats];
351 };
352
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.
 *  If that number gets acked by the other end, we shut down the
 *  connection.  Look for tcpporthogdefense in the code.
 */
int	tcpporthogdefense = 0;
363
364 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
365 void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
366 void localclose(Conv*, char*);
367 void procsyn(Conv*, Tcp*);
368 void tcpiput(Proto*, Ipifc*, Block*);
369 void tcpoutput(Conv*);
370 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
371 void tcpstart(Conv*, int);
372 void tcptimeout(void*);
373 void tcpsndsyn(Conv*, Tcpctl*);
374 void tcprcvwin(Conv*);
375 void tcpacktimer(void*);
376 void tcpkeepalive(void*);
377 void tcpsetkacounter(Tcpctl*);
378 void tcprxmit(Conv*);
379 void tcpsettimer(Tcpctl*);
380 void tcpsynackrtt(Conv*);
381 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
382
383 static void limborexmit(Proto*);
384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
385
386 void
tcpsetstate(Conv * s,uchar newstate)387 tcpsetstate(Conv *s, uchar newstate)
388 {
389 Tcpctl *tcb;
390 uchar oldstate;
391 Tcppriv *tpriv;
392
393 tpriv = s->p->priv;
394
395 tcb = (Tcpctl*)s->ptcl;
396
397 oldstate = tcb->state;
398 if(oldstate == newstate)
399 return;
400
401 if(oldstate == Established)
402 tpriv->stats[CurrEstab]--;
403 if(newstate == Established)
404 tpriv->stats[CurrEstab]++;
405
406 /**
407 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
408 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
409 **/
410
411 switch(newstate) {
412 case Closed:
413 qclose(s->rq);
414 qclose(s->wq);
415 qclose(s->eq);
416 break;
417
418 case Close_wait: /* Remote closes */
419 qhangup(s->rq, nil);
420 break;
421 }
422
423 tcb->state = newstate;
424
425 if(oldstate == Syn_sent && newstate != Closed)
426 Fsconnected(s, nil);
427 }
428
429 static char*
tcpconnect(Conv * c,char ** argv,int argc)430 tcpconnect(Conv *c, char **argv, int argc)
431 {
432 char *e;
433
434 e = Fsstdconnect(c, argv, argc);
435 if(e != nil)
436 return e;
437 tcpstart(c, TCP_CONNECT);
438
439 return nil;
440 }
441
442 static int
tcpstate(Conv * c,char * state,int n)443 tcpstate(Conv *c, char *state, int n)
444 {
445 Tcpctl *s;
446
447 s = (Tcpctl*)(c->ptcl);
448
449 return snprint(state, n,
450 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
451 tcpstates[s->state],
452 c->rq ? qlen(c->rq) : 0,
453 c->wq ? qlen(c->wq) : 0,
454 s->srtt, s->mdev,
455 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
456 s->timer.start, s->timer.count, s->rerecv,
457 s->katimer.start, s->katimer.count);
458 }
459
460 static int
tcpinuse(Conv * c)461 tcpinuse(Conv *c)
462 {
463 Tcpctl *s;
464
465 s = (Tcpctl*)(c->ptcl);
466 return s->state != Closed;
467 }
468
469 static char*
tcpannounce(Conv * c,char ** argv,int argc)470 tcpannounce(Conv *c, char **argv, int argc)
471 {
472 char *e;
473
474 e = Fsstdannounce(c, argv, argc);
475 if(e != nil)
476 return e;
477 tcpstart(c, TCP_LISTEN);
478 Fsconnected(c, nil);
479
480 return nil;
481 }
482
483 /*
484 * tcpclose is always called with the q locked
485 */
486 static void
tcpclose(Conv * c)487 tcpclose(Conv *c)
488 {
489 Tcpctl *tcb;
490
491 tcb = (Tcpctl*)c->ptcl;
492
493 qhangup(c->rq, nil);
494 qhangup(c->wq, nil);
495 qhangup(c->eq, nil);
496 qflush(c->rq);
497
498 switch(tcb->state) {
499 case Listen:
500 /*
501 * reset any incoming calls to this listener
502 */
503 Fsconnected(c, "Hangup");
504
505 localclose(c, nil);
506 break;
507 case Closed:
508 case Syn_sent:
509 localclose(c, nil);
510 break;
511 case Syn_received:
512 case Established:
513 tcb->flgcnt++;
514 tcb->snd.nxt++;
515 tcpsetstate(c, Finwait1);
516 tcpoutput(c);
517 break;
518 case Close_wait:
519 tcb->flgcnt++;
520 tcb->snd.nxt++;
521 tcpsetstate(c, Last_ack);
522 tcpoutput(c);
523 break;
524 }
525 }
526
527 void
tcpkick(void * x)528 tcpkick(void *x)
529 {
530 Conv *s = x;
531 Tcpctl *tcb;
532
533 tcb = (Tcpctl*)s->ptcl;
534
535 if(waserror()){
536 qunlock(s);
537 nexterror();
538 }
539 qlock(s);
540
541 switch(tcb->state) {
542 case Syn_sent:
543 case Syn_received:
544 case Established:
545 case Close_wait:
546 /*
547 * Push data
548 */
549 tcprcvwin(s);
550 tcpoutput(s);
551 break;
552 default:
553 localclose(s, "Hangup");
554 break;
555 }
556
557 qunlock(s);
558 poperror();
559 }
560
561 void
tcprcvwin(Conv * s)562 tcprcvwin(Conv *s) /* Call with tcb locked */
563 {
564 int w;
565 Tcpctl *tcb;
566
567 tcb = (Tcpctl*)s->ptcl;
568 w = tcb->window - qlen(s->rq);
569 if(w < 0)
570 w = 0;
571 tcb->rcv.wnd = w;
572 if(w == 0)
573 tcb->rcv.blocked = 1;
574 }
575
576 void
tcpacktimer(void * v)577 tcpacktimer(void *v)
578 {
579 Tcpctl *tcb;
580 Conv *s;
581
582 s = v;
583 tcb = (Tcpctl*)s->ptcl;
584
585 if(waserror()){
586 qunlock(s);
587 nexterror();
588 }
589 qlock(s);
590 if(tcb->state != Closed){
591 tcb->flags |= FORCE;
592 tcprcvwin(s);
593 tcpoutput(s);
594 }
595 qunlock(s);
596 poperror();
597 }
598
599 static void
tcpcreate(Conv * c)600 tcpcreate(Conv *c)
601 {
602 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
603 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
604 }
605
606 static void
timerstate(Tcppriv * priv,Tcptimer * t,int newstate)607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
608 {
609 if(newstate != TcptimerON){
610 if(t->state == TcptimerON){
611 // unchain
612 if(priv->timers == t){
613 priv->timers = t->next;
614 if(t->prev != nil)
615 panic("timerstate1");
616 }
617 if(t->next)
618 t->next->prev = t->prev;
619 if(t->prev)
620 t->prev->next = t->next;
621 t->next = t->prev = nil;
622 }
623 } else {
624 if(t->state != TcptimerON){
625 // chain
626 if(t->prev != nil || t->next != nil)
627 panic("timerstate2");
628 t->prev = nil;
629 t->next = priv->timers;
630 if(t->next)
631 t->next->prev = t;
632 priv->timers = t;
633 }
634 }
635 t->state = newstate;
636 }
637
638 void
tcpackproc(void * a)639 tcpackproc(void *a)
640 {
641 Tcptimer *t, *tp, *timeo;
642 Proto *tcp;
643 Tcppriv *priv;
644 int loop;
645
646 tcp = a;
647 priv = tcp->priv;
648
649 for(;;) {
650 tsleep(&up->sleep, return0, 0, MSPTICK);
651
652 qlock(&priv->tl);
653 timeo = nil;
654 loop = 0;
655 for(t = priv->timers; t != nil; t = tp) {
656 if(loop++ > 10000)
657 panic("tcpackproc1");
658 tp = t->next;
659 if(t->state == TcptimerON) {
660 t->count--;
661 if(t->count == 0) {
662 timerstate(priv, t, TcptimerDONE);
663 t->readynext = timeo;
664 timeo = t;
665 }
666 }
667 }
668 qunlock(&priv->tl);
669
670 loop = 0;
671 for(t = timeo; t != nil; t = t->readynext) {
672 if(loop++ > 10000)
673 panic("tcpackproc2");
674 if(t->state == TcptimerDONE && t->func != nil && !waserror()){
675 (*t->func)(t->arg);
676 poperror();
677 }
678 }
679
680 limborexmit(tcp);
681 }
682 }
683
684 void
tcpgo(Tcppriv * priv,Tcptimer * t)685 tcpgo(Tcppriv *priv, Tcptimer *t)
686 {
687 if(t == nil || t->start == 0)
688 return;
689
690 qlock(&priv->tl);
691 t->count = t->start;
692 timerstate(priv, t, TcptimerON);
693 qunlock(&priv->tl);
694 }
695
696 void
tcphalt(Tcppriv * priv,Tcptimer * t)697 tcphalt(Tcppriv *priv, Tcptimer *t)
698 {
699 if(t == nil)
700 return;
701
702 qlock(&priv->tl);
703 timerstate(priv, t, TcptimerOFF);
704 qunlock(&priv->tl);
705 }
706
/* Return the backoff multiplier for step n, i.e. 2 to the power n. */
int
backoff(int n)
{
	int factor;

	factor = 1;
	factor <<= n;
	return factor;
}
712
713 void
localclose(Conv * s,char * reason)714 localclose(Conv *s, char *reason) /* called with tcb locked */
715 {
716 Tcpctl *tcb;
717 Reseq *rp,*rp1;
718 Tcppriv *tpriv;
719
720 tpriv = s->p->priv;
721 tcb = (Tcpctl*)s->ptcl;
722
723 iphtrem(&tpriv->ht, s);
724
725 tcphalt(tpriv, &tcb->timer);
726 tcphalt(tpriv, &tcb->rtt_timer);
727 tcphalt(tpriv, &tcb->acktimer);
728 tcphalt(tpriv, &tcb->katimer);
729
730 /* Flush reassembly queue; nothing more can arrive */
731 for(rp = tcb->reseq; rp != nil; rp = rp1) {
732 rp1 = rp->next;
733 freeblist(rp->bp);
734 free(rp);
735 }
736 tcb->reseq = nil;
737
738 if(tcb->state == Syn_sent)
739 Fsconnected(s, reason);
740 if(s->state == Announced)
741 wakeup(&s->listenr);
742
743 qhangup(s->rq, reason);
744 qhangup(s->wq, reason);
745
746 tcpsetstate(s, Closed);
747 }
748
749 /* mtu (- TCP + IP hdr len) of 1st hop */
750 int
tcpmtu(Proto * tcp,uchar * addr,int version,int * scale)751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
752 {
753 Ipifc *ifc;
754 int mtu;
755
756 ifc = findipifc(tcp->f, addr, 0);
757 switch(version){
758 default:
759 case V4:
760 mtu = DEF_MSS;
761 if(ifc != nil)
762 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
763 break;
764 case V6:
765 mtu = DEF_MSS6;
766 if(ifc != nil)
767 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
768 break;
769 }
770 if(ifc != nil){
771 if(ifc->mbps > 100)
772 *scale = HaveWS | 3;
773 else if(ifc->mbps > 10)
774 *scale = HaveWS | 1;
775 else
776 *scale = HaveWS | 0;
777 } else
778 *scale = HaveWS | 0;
779
780 return mtu;
781 }
782
783 void
inittcpctl(Conv * s,int mode)784 inittcpctl(Conv *s, int mode)
785 {
786 Tcpctl *tcb;
787 Tcp4hdr* h4;
788 Tcp6hdr* h6;
789 int mss;
790
791 tcb = (Tcpctl*)s->ptcl;
792
793 memset(tcb, 0, sizeof(Tcpctl));
794
795 tcb->ssthresh = 65535;
796 tcb->srtt = tcp_irtt<<LOGAGAIN;
797 tcb->mdev = 0;
798
799 /* setup timers */
800 tcb->timer.start = tcp_irtt / MSPTICK;
801 tcb->timer.func = tcptimeout;
802 tcb->timer.arg = s;
803 tcb->rtt_timer.start = MAX_TIME;
804 tcb->acktimer.start = TCP_ACK / MSPTICK;
805 tcb->acktimer.func = tcpacktimer;
806 tcb->acktimer.arg = s;
807 tcb->katimer.start = DEF_KAT / MSPTICK;
808 tcb->katimer.func = tcpkeepalive;
809 tcb->katimer.arg = s;
810
811 mss = DEF_MSS;
812
813 /* create a prototype(pseudo) header */
814 if(mode != TCP_LISTEN){
815 if(ipcmp(s->laddr, IPnoaddr) == 0)
816 findlocalip(s->p->f, s->laddr, s->raddr);
817
818 switch(s->ipversion){
819 case V4:
820 h4 = &tcb->protohdr.tcp4hdr;
821 memset(h4, 0, sizeof(*h4));
822 h4->proto = IP_TCPPROTO;
823 hnputs(h4->tcpsport, s->lport);
824 hnputs(h4->tcpdport, s->rport);
825 v6tov4(h4->tcpsrc, s->laddr);
826 v6tov4(h4->tcpdst, s->raddr);
827 break;
828 case V6:
829 h6 = &tcb->protohdr.tcp6hdr;
830 memset(h6, 0, sizeof(*h6));
831 h6->proto = IP_TCPPROTO;
832 hnputs(h6->tcpsport, s->lport);
833 hnputs(h6->tcpdport, s->rport);
834 ipmove(h6->tcpsrc, s->laddr);
835 ipmove(h6->tcpdst, s->raddr);
836 mss = DEF_MSS6;
837 break;
838 default:
839 panic("inittcpctl: version %d", s->ipversion);
840 }
841 }
842
843 tcb->mss = tcb->cwind = mss;
844
845 /* default is no window scaling */
846 tcb->window = QMAX;
847 tcb->rcv.wnd = QMAX;
848 tcb->rcv.scale = 0;
849 tcb->snd.scale = 0;
850 qsetlimit(s->rq, QMAX);
851 }
852
853 /*
854 * called with s qlocked
855 */
856 void
tcpstart(Conv * s,int mode)857 tcpstart(Conv *s, int mode)
858 {
859 Tcpctl *tcb;
860 Tcppriv *tpriv;
861 char kpname[KNAMELEN];
862
863 tpriv = s->p->priv;
864
865 if(tpriv->ackprocstarted == 0){
866 qlock(&tpriv->apl);
867 if(tpriv->ackprocstarted == 0){
868 sprint(kpname, "#I%dtcpack", s->p->f->dev);
869 kproc(kpname, tcpackproc, s->p, 0);
870 tpriv->ackprocstarted = 1;
871 }
872 qunlock(&tpriv->apl);
873 }
874
875 tcb = (Tcpctl*)s->ptcl;
876
877 inittcpctl(s, mode);
878
879 iphtadd(&tpriv->ht, s);
880 switch(mode) {
881 case TCP_LISTEN:
882 tpriv->stats[PassiveOpens]++;
883 tcb->flags |= CLONE;
884 tcpsetstate(s, Listen);
885 break;
886
887 case TCP_CONNECT:
888 tpriv->stats[ActiveOpens]++;
889 tcb->flags |= ACTIVE;
890 tcpsndsyn(s, tcb);
891 tcpsetstate(s, Syn_sent);
892 tcpoutput(s);
893 break;
894 }
895 }
896
897 static char*
tcpflag(ushort flag)898 tcpflag(ushort flag)
899 {
900 static char buf[128];
901
902 sprint(buf, "%d", flag>>10); /* Head len */
903 if(flag & URG)
904 strcat(buf, " URG");
905 if(flag & ACK)
906 strcat(buf, " ACK");
907 if(flag & PSH)
908 strcat(buf, " PSH");
909 if(flag & RST)
910 strcat(buf, " RST");
911 if(flag & SYN)
912 strcat(buf, " SYN");
913 if(flag & FIN)
914 strcat(buf, " FIN");
915
916 return buf;
917 }
918
919 Block *
htontcp6(Tcp * tcph,Block * data,Tcp6hdr * ph,Tcpctl * tcb)920 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
921 {
922 int dlen;
923 Tcp6hdr *h;
924 ushort csum;
925 ushort hdrlen, optpad = 0;
926 uchar *opt;
927
928 hdrlen = TCP6_HDRSIZE;
929 if(tcph->flags & SYN){
930 if(tcph->mss)
931 hdrlen += MSS_LENGTH;
932 if(tcph->ws)
933 hdrlen += WS_LENGTH;
934 optpad = hdrlen & 3;
935 if(optpad)
936 optpad = 4 - optpad;
937 hdrlen += optpad;
938 }
939
940 if(data) {
941 dlen = blocklen(data);
942 data = padblock(data, hdrlen + TCP6_PKT);
943 if(data == nil)
944 return nil;
945 }
946 else {
947 dlen = 0;
948 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
949 if(data == nil)
950 return nil;
951 data->wp += hdrlen + TCP6_PKT;
952 }
953
954 /* copy in pseudo ip header plus port numbers */
955 h = (Tcp6hdr *)(data->rp);
956 memmove(h, ph, TCP6_TCBPHDRSZ);
957
958 /* compose pseudo tcp header, do cksum calculation */
959 hnputl(h->vcf, hdrlen + dlen);
960 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
961 h->ttl = ph->proto;
962
963 /* copy in variable bits */
964 hnputl(h->tcpseq, tcph->seq);
965 hnputl(h->tcpack, tcph->ack);
966 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
967 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
968 hnputs(h->tcpurg, tcph->urg);
969
970 if(tcph->flags & SYN){
971 opt = h->tcpopt;
972 if(tcph->mss != 0){
973 *opt++ = MSSOPT;
974 *opt++ = MSS_LENGTH;
975 hnputs(opt, tcph->mss);
976 opt += 2;
977 }
978 if(tcph->ws != 0){
979 *opt++ = WSOPT;
980 *opt++ = WS_LENGTH;
981 *opt++ = tcph->ws;
982 }
983 while(optpad-- > 0)
984 *opt++ = NOOPOPT;
985 }
986
987 if(tcb != nil && tcb->nochecksum){
988 h->tcpcksum[0] = h->tcpcksum[1] = 0;
989 } else {
990 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
991 hnputs(h->tcpcksum, csum);
992 }
993
994 /* move from pseudo header back to normal ip header */
995 memset(h->vcf, 0, 4);
996 h->vcf[0] = IP_VER6;
997 hnputs(h->ploadlen, hdrlen+dlen);
998 h->proto = ph->proto;
999
1000 return data;
1001 }
1002
1003 Block *
htontcp4(Tcp * tcph,Block * data,Tcp4hdr * ph,Tcpctl * tcb)1004 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1005 {
1006 int dlen;
1007 Tcp4hdr *h;
1008 ushort csum;
1009 ushort hdrlen, optpad = 0;
1010 uchar *opt;
1011
1012 hdrlen = TCP4_HDRSIZE;
1013 if(tcph->flags & SYN){
1014 if(tcph->mss)
1015 hdrlen += MSS_LENGTH;
1016 if(tcph->ws)
1017 hdrlen += WS_LENGTH;
1018 optpad = hdrlen & 3;
1019 if(optpad)
1020 optpad = 4 - optpad;
1021 hdrlen += optpad;
1022 }
1023
1024 if(data) {
1025 dlen = blocklen(data);
1026 data = padblock(data, hdrlen + TCP4_PKT);
1027 if(data == nil)
1028 return nil;
1029 }
1030 else {
1031 dlen = 0;
1032 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */
1033 if(data == nil)
1034 return nil;
1035 data->wp += hdrlen + TCP4_PKT;
1036 }
1037
1038 /* copy in pseudo ip header plus port numbers */
1039 h = (Tcp4hdr *)(data->rp);
1040 memmove(h, ph, TCP4_TCBPHDRSZ);
1041
1042 /* copy in variable bits */
1043 hnputs(h->tcplen, hdrlen + dlen);
1044 hnputl(h->tcpseq, tcph->seq);
1045 hnputl(h->tcpack, tcph->ack);
1046 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1047 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1048 hnputs(h->tcpurg, tcph->urg);
1049
1050 if(tcph->flags & SYN){
1051 opt = h->tcpopt;
1052 if(tcph->mss != 0){
1053 *opt++ = MSSOPT;
1054 *opt++ = MSS_LENGTH;
1055 hnputs(opt, tcph->mss);
1056 opt += 2;
1057 }
1058 if(tcph->ws != 0){
1059 *opt++ = WSOPT;
1060 *opt++ = WS_LENGTH;
1061 *opt++ = tcph->ws;
1062 }
1063 while(optpad-- > 0)
1064 *opt++ = NOOPOPT;
1065 }
1066
1067 if(tcb != nil && tcb->nochecksum){
1068 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1069 } else {
1070 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1071 hnputs(h->tcpcksum, csum);
1072 }
1073
1074 return data;
1075 }
1076
1077 int
ntohtcp6(Tcp * tcph,Block ** bpp)1078 ntohtcp6(Tcp *tcph, Block **bpp)
1079 {
1080 Tcp6hdr *h;
1081 uchar *optr;
1082 ushort hdrlen;
1083 ushort optlen;
1084 int n;
1085
1086 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1087 if(*bpp == nil)
1088 return -1;
1089
1090 h = (Tcp6hdr *)((*bpp)->rp);
1091 tcph->source = nhgets(h->tcpsport);
1092 tcph->dest = nhgets(h->tcpdport);
1093 tcph->seq = nhgetl(h->tcpseq);
1094 tcph->ack = nhgetl(h->tcpack);
1095 hdrlen = (h->tcpflag[0]>>2) & ~3;
1096 if(hdrlen < TCP6_HDRSIZE) {
1097 freeblist(*bpp);
1098 return -1;
1099 }
1100
1101 tcph->flags = h->tcpflag[1];
1102 tcph->wnd = nhgets(h->tcpwin);
1103 tcph->urg = nhgets(h->tcpurg);
1104 tcph->mss = 0;
1105 tcph->ws = 0;
1106 tcph->len = nhgets(h->ploadlen) - hdrlen;
1107
1108 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1109 if(*bpp == nil)
1110 return -1;
1111
1112 optr = h->tcpopt;
1113 n = hdrlen - TCP6_HDRSIZE;
1114 while(n > 0 && *optr != EOLOPT) {
1115 if(*optr == NOOPOPT) {
1116 n--;
1117 optr++;
1118 continue;
1119 }
1120 optlen = optr[1];
1121 if(optlen < 2 || optlen > n)
1122 break;
1123 switch(*optr) {
1124 case MSSOPT:
1125 if(optlen == MSS_LENGTH)
1126 tcph->mss = nhgets(optr+2);
1127 break;
1128 case WSOPT:
1129 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1130 tcph->ws = HaveWS | *(optr+2);
1131 break;
1132 }
1133 n -= optlen;
1134 optr += optlen;
1135 }
1136 return hdrlen;
1137 }
1138
1139 int
ntohtcp4(Tcp * tcph,Block ** bpp)1140 ntohtcp4(Tcp *tcph, Block **bpp)
1141 {
1142 Tcp4hdr *h;
1143 uchar *optr;
1144 ushort hdrlen;
1145 ushort optlen;
1146 int n;
1147
1148 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1149 if(*bpp == nil)
1150 return -1;
1151
1152 h = (Tcp4hdr *)((*bpp)->rp);
1153 tcph->source = nhgets(h->tcpsport);
1154 tcph->dest = nhgets(h->tcpdport);
1155 tcph->seq = nhgetl(h->tcpseq);
1156 tcph->ack = nhgetl(h->tcpack);
1157
1158 hdrlen = (h->tcpflag[0]>>2) & ~3;
1159 if(hdrlen < TCP4_HDRSIZE) {
1160 freeblist(*bpp);
1161 return -1;
1162 }
1163
1164 tcph->flags = h->tcpflag[1];
1165 tcph->wnd = nhgets(h->tcpwin);
1166 tcph->urg = nhgets(h->tcpurg);
1167 tcph->mss = 0;
1168 tcph->ws = 0;
1169 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1170
1171 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1172 if(*bpp == nil)
1173 return -1;
1174
1175 optr = h->tcpopt;
1176 n = hdrlen - TCP4_HDRSIZE;
1177 while(n > 0 && *optr != EOLOPT) {
1178 if(*optr == NOOPOPT) {
1179 n--;
1180 optr++;
1181 continue;
1182 }
1183 optlen = optr[1];
1184 if(optlen < 2 || optlen > n)
1185 break;
1186 switch(*optr) {
1187 case MSSOPT:
1188 if(optlen == MSS_LENGTH)
1189 tcph->mss = nhgets(optr+2);
1190 break;
1191 case WSOPT:
1192 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1193 tcph->ws = HaveWS | *(optr+2);
1194 break;
1195 }
1196 n -= optlen;
1197 optr += optlen;
1198 }
1199 return hdrlen;
1200 }
1201
1202 /*
1203 * For outgiing calls, generate an initial sequence
1204 * number and put a SYN on the send queue
1205 */
1206 void
tcpsndsyn(Conv * s,Tcpctl * tcb)1207 tcpsndsyn(Conv *s, Tcpctl *tcb)
1208 {
1209 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1210 tcb->rttseq = tcb->iss;
1211 tcb->snd.wl2 = tcb->iss;
1212 tcb->snd.una = tcb->iss;
1213 tcb->snd.ptr = tcb->rttseq;
1214 tcb->snd.nxt = tcb->rttseq;
1215 tcb->flgcnt++;
1216 tcb->flags |= FORCE;
1217 tcb->sndsyntime = NOW;
1218
1219 /* set desired mss and scale */
1220 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1221 }
1222
1223 void
sndrst(Proto * tcp,uchar * source,uchar * dest,ushort length,Tcp * seg,uchar version,char * reason)1224 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1225 {
1226 Block *hbp;
1227 uchar rflags;
1228 Tcppriv *tpriv;
1229 Tcp4hdr ph4;
1230 Tcp6hdr ph6;
1231
1232 netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1233
1234 tpriv = tcp->priv;
1235
1236 if(seg->flags & RST)
1237 return;
1238
1239 /* make pseudo header */
1240 switch(version) {
1241 case V4:
1242 memset(&ph4, 0, sizeof(ph4));
1243 ph4.vihl = IP_VER4;
1244 v6tov4(ph4.tcpsrc, dest);
1245 v6tov4(ph4.tcpdst, source);
1246 ph4.proto = IP_TCPPROTO;
1247 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1248 hnputs(ph4.tcpsport, seg->dest);
1249 hnputs(ph4.tcpdport, seg->source);
1250 break;
1251 case V6:
1252 memset(&ph6, 0, sizeof(ph6));
1253 ph6.vcf[0] = IP_VER6;
1254 ipmove(ph6.tcpsrc, dest);
1255 ipmove(ph6.tcpdst, source);
1256 ph6.proto = IP_TCPPROTO;
1257 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1258 hnputs(ph6.tcpsport, seg->dest);
1259 hnputs(ph6.tcpdport, seg->source);
1260 break;
1261 default:
1262 panic("sndrst: version %d", version);
1263 }
1264
1265 tpriv->stats[OutRsts]++;
1266 rflags = RST;
1267
1268 /* convince the other end that this reset is in band */
1269 if(seg->flags & ACK) {
1270 seg->seq = seg->ack;
1271 seg->ack = 0;
1272 }
1273 else {
1274 rflags |= ACK;
1275 seg->ack = seg->seq;
1276 seg->seq = 0;
1277 if(seg->flags & SYN)
1278 seg->ack++;
1279 seg->ack += length;
1280 if(seg->flags & FIN)
1281 seg->ack++;
1282 }
1283 seg->flags = rflags;
1284 seg->wnd = 0;
1285 seg->urg = 0;
1286 seg->mss = 0;
1287 seg->ws = 0;
1288 switch(version) {
1289 case V4:
1290 hbp = htontcp4(seg, nil, &ph4, nil);
1291 if(hbp == nil)
1292 return;
1293 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1294 break;
1295 case V6:
1296 hbp = htontcp6(seg, nil, &ph6, nil);
1297 if(hbp == nil)
1298 return;
1299 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1300 break;
1301 default:
1302 panic("sndrst2: version %d", version);
1303 }
1304 }
1305
1306 /*
1307 * send a reset to the remote side and close the conversation
1308 * called with s qlocked
1309 */
1310 char*
tcphangup(Conv * s)1311 tcphangup(Conv *s)
1312 {
1313 Tcp seg;
1314 Tcpctl *tcb;
1315 Block *hbp;
1316
1317 tcb = (Tcpctl*)s->ptcl;
1318 if(waserror())
1319 return commonerror();
1320 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1321 if(!waserror()){
1322 memset(&seg, 0, sizeof seg);
1323 seg.flags = RST | ACK;
1324 seg.ack = tcb->rcv.nxt;
1325 tcb->rcv.una = 0;
1326 seg.seq = tcb->snd.ptr;
1327 seg.wnd = 0;
1328 seg.urg = 0;
1329 seg.mss = 0;
1330 seg.ws = 0;
1331 switch(s->ipversion) {
1332 case V4:
1333 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1334 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1335 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1336 break;
1337 case V6:
1338 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1339 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1340 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1341 break;
1342 default:
1343 panic("tcphangup: version %d", s->ipversion);
1344 }
1345 poperror();
1346 }
1347 }
1348 localclose(s, nil);
1349 poperror();
1350 return nil;
1351 }
1352
1353 /*
1354 * (re)send a SYN ACK
1355 */
1356 int
sndsynack(Proto * tcp,Limbo * lp)1357 sndsynack(Proto *tcp, Limbo *lp)
1358 {
1359 Block *hbp;
1360 Tcp4hdr ph4;
1361 Tcp6hdr ph6;
1362 Tcp seg;
1363 int scale;
1364
1365 /* make pseudo header */
1366 switch(lp->version) {
1367 case V4:
1368 memset(&ph4, 0, sizeof(ph4));
1369 ph4.vihl = IP_VER4;
1370 v6tov4(ph4.tcpsrc, lp->laddr);
1371 v6tov4(ph4.tcpdst, lp->raddr);
1372 ph4.proto = IP_TCPPROTO;
1373 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1374 hnputs(ph4.tcpsport, lp->lport);
1375 hnputs(ph4.tcpdport, lp->rport);
1376 break;
1377 case V6:
1378 memset(&ph6, 0, sizeof(ph6));
1379 ph6.vcf[0] = IP_VER6;
1380 ipmove(ph6.tcpsrc, lp->laddr);
1381 ipmove(ph6.tcpdst, lp->raddr);
1382 ph6.proto = IP_TCPPROTO;
1383 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1384 hnputs(ph6.tcpsport, lp->lport);
1385 hnputs(ph6.tcpdport, lp->rport);
1386 break;
1387 default:
1388 panic("sndrst: version %d", lp->version);
1389 }
1390
1391 seg.seq = lp->iss;
1392 seg.ack = lp->irs+1;
1393 seg.flags = SYN|ACK;
1394 seg.urg = 0;
1395 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1396 seg.wnd = QMAX;
1397
1398 /* if the other side set scale, we should too */
1399 if(lp->rcvscale){
1400 seg.ws = scale;
1401 lp->sndscale = scale;
1402 } else {
1403 seg.ws = 0;
1404 lp->sndscale = 0;
1405 }
1406
1407 switch(lp->version) {
1408 case V4:
1409 hbp = htontcp4(&seg, nil, &ph4, nil);
1410 if(hbp == nil)
1411 return -1;
1412 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1413 break;
1414 case V6:
1415 hbp = htontcp6(&seg, nil, &ph6, nil);
1416 if(hbp == nil)
1417 return -1;
1418 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1419 break;
1420 default:
1421 panic("sndsnack: version %d", lp->version);
1422 }
1423 lp->lastsend = NOW;
1424 return 0;
1425 }
1426
/*
 * Hash an IP address and port into a limbo hash table index: the sum of
 * the last two address bytes and the port, masked with LHTMASK.
 * The port argument is parenthesized so that any expression may be passed.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1428
1429 /*
1430 * put a call into limbo and respond with a SYN ACK
1431 *
1432 * called with proto locked
1433 */
1434 static void
limbo(Conv * s,uchar * source,uchar * dest,Tcp * seg,int version)1435 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1436 {
1437 Limbo *lp, **l;
1438 Tcppriv *tpriv;
1439 int h;
1440
1441 tpriv = s->p->priv;
1442 h = hashipa(source, seg->source);
1443
1444 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1445 lp = *l;
1446 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1447 continue;
1448 if(ipcmp(lp->raddr, source) != 0)
1449 continue;
1450 if(ipcmp(lp->laddr, dest) != 0)
1451 continue;
1452
1453 /* each new SYN restarts the retransmits */
1454 lp->irs = seg->seq;
1455 break;
1456 }
1457 lp = *l;
1458 if(lp == nil){
1459 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1460 lp = tpriv->lht[h];
1461 tpriv->lht[h] = lp->next;
1462 lp->next = nil;
1463 } else {
1464 lp = malloc(sizeof(*lp));
1465 if(lp == nil)
1466 return;
1467 tpriv->nlimbo++;
1468 }
1469 *l = lp;
1470 lp->version = version;
1471 ipmove(lp->laddr, dest);
1472 ipmove(lp->raddr, source);
1473 lp->lport = seg->dest;
1474 lp->rport = seg->source;
1475 lp->mss = seg->mss;
1476 lp->rcvscale = seg->ws;
1477 lp->irs = seg->seq;
1478 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1479 }
1480
1481 if(sndsynack(s->p, lp) < 0){
1482 *l = lp->next;
1483 tpriv->nlimbo--;
1484 free(lp);
1485 }
1486 }
1487
1488 /*
1489 * resend SYN ACK's once every SYNACK_RXTIMER ms.
1490 */
1491 static void
limborexmit(Proto * tcp)1492 limborexmit(Proto *tcp)
1493 {
1494 Tcppriv *tpriv;
1495 Limbo **l, *lp;
1496 int h;
1497 int seen;
1498 ulong now;
1499
1500 tpriv = tcp->priv;
1501
1502 if(!canqlock(tcp))
1503 return;
1504 seen = 0;
1505 now = NOW;
1506 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1507 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1508 lp = *l;
1509 seen++;
1510 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1511 continue;
1512
1513 /* time it out after 1 second */
1514 if(++(lp->rexmits) > 5){
1515 tpriv->nlimbo--;
1516 *l = lp->next;
1517 free(lp);
1518 continue;
1519 }
1520
1521 /* if we're being attacked, don't bother resending SYN ACK's */
1522 if(tpriv->nlimbo > 100)
1523 continue;
1524
1525 if(sndsynack(tcp, lp) < 0){
1526 tpriv->nlimbo--;
1527 *l = lp->next;
1528 free(lp);
1529 continue;
1530 }
1531
1532 l = &lp->next;
1533 }
1534 }
1535 qunlock(tcp);
1536 }
1537
1538 /*
1539 * lookup call in limbo. if found, throw it out.
1540 *
1541 * called with proto locked
1542 */
1543 static void
limborst(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1544 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1545 {
1546 Limbo *lp, **l;
1547 int h;
1548 Tcppriv *tpriv;
1549
1550 tpriv = s->p->priv;
1551
1552 /* find a call in limbo */
1553 h = hashipa(src, segp->source);
1554 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1555 lp = *l;
1556 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1557 continue;
1558 if(ipcmp(lp->laddr, dst) != 0)
1559 continue;
1560 if(ipcmp(lp->raddr, src) != 0)
1561 continue;
1562
1563 /* RST can only follow the SYN */
1564 if(segp->seq == lp->irs+1){
1565 tpriv->nlimbo--;
1566 *l = lp->next;
1567 free(lp);
1568 }
1569 break;
1570 }
1571 }
1572
1573 /*
1574 * come here when we finally get an ACK to our SYN-ACK.
1575 * lookup call in limbo. if found, create a new conversation
1576 *
1577 * called with proto locked
1578 */
1579 static Conv*
tcpincoming(Conv * s,Tcp * segp,uchar * src,uchar * dst,uchar version)1580 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1581 {
1582 Conv *new;
1583 Tcpctl *tcb;
1584 Tcppriv *tpriv;
1585 Tcp4hdr *h4;
1586 Tcp6hdr *h6;
1587 Limbo *lp, **l;
1588 int h;
1589
1590 /* unless it's just an ack, it can't be someone coming out of limbo */
1591 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1592 return nil;
1593
1594 tpriv = s->p->priv;
1595
1596 /* find a call in limbo */
1597 h = hashipa(src, segp->source);
1598 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1599 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d",
1600 src, segp->source, lp->raddr, lp->rport,
1601 dst, segp->dest, lp->laddr, lp->lport,
1602 version, lp->version
1603 );
1604
1605 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1606 continue;
1607 if(ipcmp(lp->laddr, dst) != 0)
1608 continue;
1609 if(ipcmp(lp->raddr, src) != 0)
1610 continue;
1611
1612 /* we're assuming no data with the initial SYN */
1613 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1614 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux",
1615 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1616 lp = nil;
1617 } else {
1618 tpriv->nlimbo--;
1619 *l = lp->next;
1620 }
1621 break;
1622 }
1623 if(lp == nil)
1624 return nil;
1625
1626 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1627 if(new == nil)
1628 return nil;
1629
1630 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1631 tcb = (Tcpctl*)new->ptcl;
1632 tcb->flags &= ~CLONE;
1633 tcb->timer.arg = new;
1634 tcb->timer.state = TcptimerOFF;
1635 tcb->acktimer.arg = new;
1636 tcb->acktimer.state = TcptimerOFF;
1637 tcb->katimer.arg = new;
1638 tcb->katimer.state = TcptimerOFF;
1639 tcb->rtt_timer.arg = new;
1640 tcb->rtt_timer.state = TcptimerOFF;
1641
1642 tcb->irs = lp->irs;
1643 tcb->rcv.nxt = tcb->irs+1;
1644 tcb->rcv.urg = tcb->rcv.nxt;
1645
1646 tcb->iss = lp->iss;
1647 tcb->rttseq = tcb->iss;
1648 tcb->snd.wl2 = tcb->iss;
1649 tcb->snd.una = tcb->iss+1;
1650 tcb->snd.ptr = tcb->iss+1;
1651 tcb->snd.nxt = tcb->iss+1;
1652 tcb->flgcnt = 0;
1653 tcb->flags |= SYNACK;
1654
1655 /* our sending max segment size cannot be bigger than what he asked for */
1656 if(lp->mss != 0 && lp->mss < tcb->mss)
1657 tcb->mss = lp->mss;
1658
1659 /* window scaling */
1660 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1661
1662 /* the congestion window always starts out as a single segment */
1663 tcb->snd.wnd = segp->wnd;
1664 tcb->cwind = tcb->mss;
1665
1666 /* set initial round trip time */
1667 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1668 tcpsynackrtt(new);
1669
1670 free(lp);
1671
1672 /* set up proto header */
1673 switch(version){
1674 case V4:
1675 h4 = &tcb->protohdr.tcp4hdr;
1676 memset(h4, 0, sizeof(*h4));
1677 h4->proto = IP_TCPPROTO;
1678 hnputs(h4->tcpsport, new->lport);
1679 hnputs(h4->tcpdport, new->rport);
1680 v6tov4(h4->tcpsrc, dst);
1681 v6tov4(h4->tcpdst, src);
1682 break;
1683 case V6:
1684 h6 = &tcb->protohdr.tcp6hdr;
1685 memset(h6, 0, sizeof(*h6));
1686 h6->proto = IP_TCPPROTO;
1687 hnputs(h6->tcpsport, new->lport);
1688 hnputs(h6->tcpdport, new->rport);
1689 ipmove(h6->tcpsrc, dst);
1690 ipmove(h6->tcpdst, src);
1691 break;
1692 default:
1693 panic("tcpincoming: version %d", new->ipversion);
1694 }
1695
1696 tcpsetstate(new, Established);
1697
1698 iphtadd(&tpriv->ht, new);
1699
1700 return new;
1701 }
1702
/*
 * Return 1 if x lies in the inclusive range [low, high], 0 otherwise.
 * When low > high the range is taken to wrap around, i.e. x matches
 * if it is at or above low or at or below high.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	/* ordinary, non-wrapping range */
	if(low <= high)
		return low <= x && x <= high;

	/* range wraps past the top of the sequence space */
	return x >= low || x <= high;
}
1716
/*
 * Sequence-space "less than": true when the difference x-y, taken as a
 * signed int, is negative.
 */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return diff < 0;
}
1722
/*
 * Sequence-space "less than or equal": true when the difference x-y,
 * taken as a signed int, is zero or negative.
 */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return diff <= 0;
}
1728
/*
 * Sequence-space "greater than": true when the difference x-y, taken
 * as a signed int, is positive.
 */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return diff > 0;
}
1734
/*
 * Sequence-space "greater than or equal": true when the difference
 * x-y, taken as a signed int, is zero or positive.
 */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x - y);
	return diff >= 0;
}
1740
1741 /*
1742 * use the time between the first SYN and it's ack as the
1743 * initial round trip time
1744 */
1745 void
tcpsynackrtt(Conv * s)1746 tcpsynackrtt(Conv *s)
1747 {
1748 Tcpctl *tcb;
1749 int delta;
1750 Tcppriv *tpriv;
1751
1752 tcb = (Tcpctl*)s->ptcl;
1753 tpriv = s->p->priv;
1754
1755 delta = NOW - tcb->sndsyntime;
1756 tcb->srtt = delta<<LOGAGAIN;
1757 tcb->mdev = delta<<LOGDGAIN;
1758
1759 /* halt round trip timer */
1760 tcphalt(tpriv, &tcb->rtt_timer);
1761 }
1762
/*
 * Process the acknowledgement carried by segment seg for conversation s.
 *
 * In order: ignore acks beyond snd.nxt, count duplicate acks and start
 * a fast retransmit at TCPREXMTTHRESH, update the send window, grow
 * the congestion window, take a round trip sample if the rtt timer is
 * running, discard the acked bytes from the write queue, advance
 * snd.una and restart or halt the retransmit timer.
 */
void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	ulong expand;
	Tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/*
	 * the ack is beyond anything we have sent (snd.nxt):
	 * ignore it, but set FORCE so a segment goes out
	 */
	if(seq_gt(seg->ack, tcb->snd.nxt)) {
		tcb->flags |= FORCE;
		return;
	}

	/* added by Dong Lin for fast retransmission */
	if(seg->ack == tcb->snd.una
	&& tcb->snd.una != tcb->snd.nxt
	&& seg->len == 0
	&& seg->wnd == tcb->snd.wnd) {

		/* this is a pure ack w/o window update */
		netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
			tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);

		if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
			/*
			 * tahoe tcp: retransmit the packet.  Halving
			 * ssthresh and setting cwind to one packet is
			 * presumably done by tcprxmit() - not visible
			 * here, confirm.
			 */
			tcb->snd.recovery = 1;
			tcb->snd.rxt = tcb->snd.nxt;
			netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
			tcprxmit(s);
		} else {
			/* do reno tcp here. */
		}
	}

	/*
	 *  update window: take the advertised window from a newer ack,
	 *  or from an equally new ack that opens the window further
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing new is acknowledged */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
			tcb->backedoff = MAXBACKMS/4;
		}
		return;
	}

	/*
	 *  any positive ack turns off fast rxt,
	 *  (should we do new-reno on partial acks?)
	 */
	if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
		tcb->snd.dupacks = 0;
		tcb->snd.recovery = 0;
	} else
		netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);

	/* number of sequence numbers newly acknowledged */
	acked = seg->ack - tcb->snd.una;

	/*
	 * avoid slow start and timers for SYN acks: the first ack
	 * covers our SYN, which occupies one sequence number but no
	 * queue data, so it is taken out of acked and flgcnt
	 */
	if((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if(tcb->cwind < tcb->ssthresh) {
			/* below ssthresh: grow by what was acked, at most one mss */
			expand = tcb->mss;
			if(acked < expand)
				expand = acked;
		}
		else
			/* at or above ssthresh: grow by mss*mss/cwind */
			expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;

		/* on wraparound, or if it would exceed snd.wnd, stop at snd.wnd */
		if(tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if(tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}

	/* Adjust the timers according to the round trip time */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* only sample when the RETRAN flag is clear */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* elapsed ticks, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				/* first sample: seed both estimators */
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				/* srtt and mdev are kept scaled by LOGAGAIN/LOGDGAIN */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/*
	 * drop the acked bytes from the write queue; if the queue held
	 * fewer than acked, the difference is charged to flgcnt
	 */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;

	tcb->snd.una = seg->ack;
	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* keep the retransmit timer running only while data is outstanding */
	if(tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	/* never leave the send pointer behind the acked data */
	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	tcb->flags &= ~RETRAN;
	tcb->backoff = 0;
	tcb->backedoff = 0;
}
1911
/*
 * Input routine for a TCP segment held in bp.
 *
 * The checksum is verified, the header is converted with
 * ntohtcp4/ntohtcp6 and the block is trimmed to the length claimed by
 * the datagram.  The conversation is then looked up with the proto
 * locked.  Listeners handle RST and SYN through the limbo table, and
 * an ACK that matches a limbo entry yields a new conversation.  The
 * rest of the state machine runs with the conversation locked.
 * Segments with no matching conversation are answered with a reset.
 */
void
tcpiput(Proto *tcp, Ipifc*, Block *bp)
{
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	ushort length;
	uchar source[IPaddrlen], dest[IPaddrlen];
	Conv *s;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* both views alias the start of the block; the version nibble picks one */
	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0)==IP_VER4) {
		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* fill in the pseudo header fields used by the checksum */
		h4->Unused = 0;
		hnputs(h4->tcplen, length-TCP4_PKT);
		/*
		 * the checksum is verified only when the block does not
		 * carry Btcpck and the checksum field is non-zero.
		 * NOTE(review): Btcpck presumably means the checksum was
		 * already verified elsewhere - confirm.
		 */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}

		hdrlen = ntohtcp4(&seg, &bp);
		if(hdrlen < 0){
			/*
			 * NOTE(review): bp is not freed here; presumably
			 * ntohtcp4 disposes of it on failure - confirm.
			 */
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen+TCP4_PKT;
		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}
	else {
		/* saved so they can be restored after the checksum */
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/*
		 * temporarily rewrite header fields for the checksum:
		 * ploadlen and proto are cleared, the protocol number is
		 * moved into ttl and the length is stored in vcf.
		 * ttl, proto and ploadlen are put back afterwards.
		 */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
			ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if(hdrlen < 0){
			/*
			 * NOTE(review): bp is not freed here; presumably
			 * ntohtcp6 disposes of it on failure - confirm.
			 */
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}

	/* lock protocol while searching for a conversation */
	qlock(tcp);

	/* Look for a matching conversation */
	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
	if(s == nil){
		netlog(f, Logtcp, "iphtlook failed");
		/* also reached by goto when a listener has no matching call in limbo */
reset:
		qunlock(tcp);
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state == Listen){
		if(seg.flags & RST){
			limborst(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
			limbo(s, source, dest, &seg, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/*
		 *  if there's a matching call in limbo, tcpincoming will
		 *  return a new conversation for it (tcpincoming sets its
		 *  state to Established)
		 */
		s = tcpincoming(s, &seg, source, dest, version);
		if(s == nil)
			goto reset;
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl*)s->ptcl;
	if(waserror()){
		qunlock(s);
		nexterror();
	}
	/* take the conversation lock before letting go of the proto lock */
	qlock(s);
	qunlock(tcp);

	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch(tcb->state) {
	case Closed:
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	case Syn_sent:
		if(seg.flags & ACK) {
			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
				sndrst(tcp, source, dest, length, &seg, version,
					 "bad seq in Syn_sent");
				goto raise;
			}
		}
		if(seg.flags & RST) {
			/* a RST is honoured only if it carries an (acceptable) ack */
			if(seg.flags & ACK)
				localclose(s, Econrefused);
			goto raise;
		}

		if(seg.flags & SYN) {
			procsyn(s, &seg);
			if(seg.flags & ACK){
				/* SYN+ACK: our SYN was acknowledged */
				update(s, &seg);
				tcpsynackrtt(s);
				tcpsetstate(s, Established);
				tcpsetscale(s, tcb, seg.ws, tcb->scale);
			}
			else {
				/* SYN without ACK */
				tcb->time = NOW;
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
			}

			/* data or FIN came with the SYN: process it below */
			if(length != 0 || (seg.flags & FIN))
				break;

			freeblist(bp);
			goto output;
		}
		else
			freeblist(bp);

		qunlock(s);
		poperror();
		return;
	case Syn_received:
		/* doesn't matter if it's the correct ack, we're just trying to set timing */
		if(seg.flags & ACK)
			tcpsynackrtt(s);
		break;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
		if(tcpporthogdefense
		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
				source, seg.source, dest, seg.dest, seg.flags,
				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* tcptrim rejected the segment; the ack it carries is still processed */
		netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
		update(s, &seg);
		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2*(1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		/* answer with a forced segment unless it was a RST */
		if(!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(s);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if(length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if(seg.seq != tcb->rcv.nxt)
	if(length != 0 || (seg.flags & (SYN|FIN))) {
		update(s, &seg);
		if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
		tcb->flags |= FORCE;
		goto output;
	}

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for(;;) {
		if(seg.flags & RST) {
			if(tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if(tcb->rcv.nxt != seg.seq)
					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
			}
			localclose(s, Econrefused);
			goto raise;
		}

		/* segments without ACK are dropped from here on */
		if((seg.flags&ACK) == 0)
			goto raise;

		switch(tcb->state) {
		case Syn_received:
			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
				sndrst(tcp, source, dest, length, &seg, version,
					"bad seq in Syn_received");
				goto raise;
			}
			update(s, &seg);
			tcpsetstate(s, Established);
			/* falls through: update() is called a second time below */
		case Established:
		case Close_wait:
			update(s, &seg);
			break;
		case Finwait1:
			update(s, &seg);
			/* everything we sent, FIN included, has been acked */
			if(qlen(s->wq)+tcb->flgcnt == 0){
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcpsetkacounter(tcb);
				tcb->time = NOW;
				tcpsetstate(s, Finwait2);
				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
				tcpgo(tpriv, &tcb->katimer);
			}
			break;
		case Finwait2:
			update(s, &seg);
			break;
		case Closing:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->timer);
			}
			break;
		case Last_ack:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				localclose(s, nil);
				goto raise;
			}
			/* falls through when data is still unacked */
		case Time_wait:
			tcb->flags |= FORCE;
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);
		}

		/* urgent data is not delivered: it is pulled off the front of the block */
		if((seg.flags&URG) && seg.urg) {
			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		}
		else
		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
			tcb->rcv.urg = tcb->rcv.nxt;

		if(length == 0) {
			if(bp != nil)
				freeblist(bp);
		}
		else {
			switch(tcb->state){
			default:
				/* Ignore segment text */
				if(bp != nil)
					freeblist(bp);
				break;

			case Syn_received:
			case Established:
			case Finwait1:
				/* If we still have some data place on
				 * receive queue
				 */
				if(bp) {
					bp = packblock(bp);
					if(bp == nil)
						panic("tcp packblock");
					qpassnolim(s->rq, bp);
					bp = nil;

					/*
					 *  Force an ack every 2 data messages.  This is
					 *  a hack for rob to make his home system run
					 *  faster.
					 *
					 *  this also keeps the standard TCP congestion
					 *  control working since it needs an ack every
					 *  2 max segs worth.  This is not quite that,
					 *  but under a real stream is equivalent since
					 *  every packet has a max seg in it.
					 */
					if(++(tcb->rcv.una) >= 2)
						tcb->flags |= FORCE;
				}
				tcb->rcv.nxt += length;

				/*
				 *  update our rcv window
				 */
				tcprcvwin(s);

				/*
				 *  turn on the acktimer if there's something
				 *  to ack
				 */
				if(tcb->acktimer.state != TcptimerON)
					tcpgo(tpriv, &tcb->acktimer);

				break;
			case Finwait2:
				/* no process to read the data, send a reset */
				if(bp != nil)
					freeblist(bp);
				sndrst(tcp, source, dest, length, &seg, version,
					"send to Finwait2");
				qunlock(s);
				poperror();
				return;
			}
		}

		if(seg.flags & FIN) {
			tcb->flags |= FORCE;

			/* the FIN occupies one sequence number where it is accepted */
			switch(tcb->state) {
			case Syn_received:
			case Established:
				tcb->rcv.nxt++;
				tcpsetstate(s, Close_wait);
				break;
			case Finwait1:
				tcb->rcv.nxt++;
				if(qlen(s->wq)+tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2*(1000/MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				else
					tcpsetstate(s, Closing);
				break;
			case Finwait2:
				tcb->rcv.nxt++;
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2 * (1000/MSPTICK);
				tcpgo(tpriv, &tcb->timer);
				break;
			case Close_wait:
			case Closing:
			case Last_ack:
				break;
			case Time_wait:
				tcpgo(tpriv, &tcb->timer);
				break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for(;;) {
			if(tcb->reseq == nil)
				goto output;

			/* the queued segment starts beyond rcv.nxt: not adjacent yet */
			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			/* a segment that trims cleanly is processed by the outer loop */
			if(tcptrim(tcb, &seg, &bp, &length) == 0)
				break;
		}
	}
output:
	/* send whatever is due, then release the conversation */
	tcpoutput(s);
	qunlock(s);
	poperror();
	return;
raise:
	/* drop the segment, release the conversation and kick it */
	qunlock(s);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
2391
/*
 *  Send as many segments as the send window, the congestion window
 *  and the queued data allow, up to 100 per call.
 *
 *  always enters and exits with the s locked.  We drop
 *  the lock to ipoput the packet so some care has to be
 *  taken by callers.
 */
void
tcpoutput(Conv *s)
{
	Tcp seg;
	int msgs;
	Tcpctl *tcb;
	Block *hbp, *bp;
	int sndcnt, n;
	ulong ssize, dsize, usable, sent;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	for(msgs = 0; msgs < 100; msgs++) {
		tcb = (Tcpctl*)s->ptcl;

		/* nothing is ever sent in these states */
		switch(tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* force an ack when a window has opened up */
		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/*
		 * sndcnt: queued bytes plus flgcnt; sent: the part of
		 * that already sent but not yet acked
		 */
		sndcnt = qlen(s->wq)+tcb->flgcnt;
		sent = tcb->snd.ptr - tcb->snd.una;

		/* Don't send anything else until our SYN has been acked */
		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* Compute usable segment based on offered window and limit
		 * window probes to one
		 */
		if(tcb->snd.wnd == 0){
			if(sent != 0) {
				if((tcb->flags&FORCE) == 0)
					break;
//				tcb->snd.ptr = tcb->snd.una;
			}
			/* closed window: probe with a single byte */
			usable = 1;
		}
		else {
			/* the smaller of the congestion and offered windows, less what is in flight */
			usable = tcb->cwind;
			if(tcb->snd.wnd < usable)
				usable = tcb->snd.wnd;
			usable -= sent;
		}
		/* ssize: sequence space this segment will occupy */
		ssize = sndcnt-sent;
		if(ssize && usable < 2)
			netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
				tcb->snd.wnd, tcb->cwind);
		if(usable < ssize)
			ssize = usable;
		if(tcb->mss < ssize)
			ssize = tcb->mss;
		/* dsize: bytes of queue data in this segment */
		dsize = ssize;
		seg.urg = 0;

		/* nothing to send and no forced segment pending */
		if(ssize == 0)
		if((tcb->flags&FORCE) == 0)
			break;

		tcb->flags &= ~FORCE;
		tcprcvwin(s);

		/* By default we will generate an ack */
		tcphalt(tpriv, &tcb->acktimer);
		tcb->rcv.una = 0;
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		switch(tcb->state){
		case Syn_sent:
			seg.flags = 0;
			if(tcb->snd.ptr == tcb->iss){
				/* the SYN takes one sequence number out of ssize */
				seg.flags |= SYN;
				dsize--;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		case Syn_received:
			/*
			 *  don't send any data with a SYN/ACK packet
			 *  because Linux rejects the packet in its
			 *  attempt to solve the SYN attack problem
			 */
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize = 0;
				ssize = 1;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		}
		seg.seq = tcb->snd.ptr;
		seg.ack = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd;

		/* Pull out data to send */
		bp = nil;
		if(dsize != 0) {
			bp = qcopy(s->wq, dsize, sent);
			/*
			 * the queue held less than dsize: the missing
			 * sequence number is taken to be our FIN
			 */
			if(BLEN(bp) != dsize) {
				seg.flags |= FIN;
				dsize--;
			}
		}

		/* this segment empties the send queue */
		if(sent+dsize == sndcnt)
			seg.flags |= PSH;

		/* keep track of balance of resent data */
		if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
			n = tcb->snd.nxt - tcb->snd.ptr;
			if(ssize < n)
				n = ssize;
			tcb->resent += n;
			netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
				s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
			tpriv->stats[RetransSegs]++;
		}

		tcb->snd.ptr += ssize;

		/* Pull up the send pointer so we can accept acks
		 * for this window
		 */
		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
			tcb->snd.nxt = tcb->snd.ptr;

		/* Build header, link data and compute cksum */
		switch(version){
		case V4:
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		case V6:
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		default:
			hbp = nil;	/* to suppress a warning */
			panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if(ssize != 0){
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);

			/*  If round trip timer isn't running, start it.
			 *  measure the longest packet only in case the
			 *  transmission time dominates RTT
			 */
			if(tcb->rtt_timer.state != TcptimerON)
			if(ssize == tcb->mss) {
				tcpgo(tpriv, &tcb->rtt_timer);
				tcb->rttseq = tcb->snd.ptr;
			}
		}

		tpriv->stats[OutSegs]++;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch(version){
		case V4:
			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		case V6:
			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		default:
			panic("tcpoutput2: version %d", version);
		}
		/* periodically drop the lock and yield so a long burst does not monopolise s */
		if((msgs%4) == 1){
			qunlock(s);
			sched();
			qlock(s);
		}
	}
}
2610
2611 /*
2612 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2613 */
2614 void
tcpsendka(Conv * s)2615 tcpsendka(Conv *s)
2616 {
2617 Tcp seg;
2618 Tcpctl *tcb;
2619 Block *hbp,*dbp;
2620
2621 tcb = (Tcpctl*)s->ptcl;
2622
2623 dbp = nil;
2624 seg.urg = 0;
2625 seg.source = s->lport;
2626 seg.dest = s->rport;
2627 seg.flags = ACK|PSH;
2628 seg.mss = 0;
2629 seg.ws = 0;
2630 if(tcpporthogdefense)
2631 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2632 else
2633 seg.seq = tcb->snd.una-1;
2634 seg.ack = tcb->rcv.nxt;
2635 tcb->rcv.una = 0;
2636 seg.wnd = tcb->rcv.wnd;
2637 if(tcb->state == Finwait2){
2638 seg.flags |= FIN;
2639 } else {
2640 dbp = allocb(1);
2641 dbp->wp++;
2642 }
2643
2644 if(isv4(s->raddr)) {
2645 /* Build header, link data and compute cksum */
2646 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2647 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2648 if(hbp == nil) {
2649 freeblist(dbp);
2650 return;
2651 }
2652 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2653 }
2654 else {
2655 /* Build header, link data and compute cksum */
2656 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2657 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2658 if(hbp == nil) {
2659 freeblist(dbp);
2660 return;
2661 }
2662 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2663 }
2664 }
2665
2666 /*
2667 * set connection to time out after 12 minutes
2668 */
2669 void
tcpsetkacounter(Tcpctl * tcb)2670 tcpsetkacounter(Tcpctl *tcb)
2671 {
2672 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2673 if(tcb->kacounter < 3)
2674 tcb->kacounter = 3;
2675 }
2676
2677 /*
2678 * if we've timed out, close the connection
2679 * otherwise, send a keepalive and restart the timer
2680 */
2681 void
tcpkeepalive(void * v)2682 tcpkeepalive(void *v)
2683 {
2684 Tcpctl *tcb;
2685 Conv *s;
2686
2687 s = v;
2688 tcb = (Tcpctl*)s->ptcl;
2689 if(waserror()){
2690 qunlock(s);
2691 nexterror();
2692 }
2693 qlock(s);
2694 if(tcb->state != Closed){
2695 if(--(tcb->kacounter) <= 0) {
2696 localclose(s, Etimedout);
2697 } else {
2698 tcpsendka(s);
2699 tcpgo(s->p->priv, &tcb->katimer);
2700 }
2701 }
2702 qunlock(s);
2703 poperror();
2704 }
2705
2706 /*
2707 * start keepalive timer
2708 */
2709 char*
tcpstartka(Conv * s,char ** f,int n)2710 tcpstartka(Conv *s, char **f, int n)
2711 {
2712 Tcpctl *tcb;
2713 int x;
2714
2715 tcb = (Tcpctl*)s->ptcl;
2716 if(tcb->state != Established)
2717 return "connection must be in Establised state";
2718 if(n > 1){
2719 x = atoi(f[1]);
2720 if(x >= MSPTICK)
2721 tcb->katimer.start = x/MSPTICK;
2722 }
2723 tcpsetkacounter(tcb);
2724 tcpgo(s->p->priv, &tcb->katimer);
2725
2726 return nil;
2727 }
2728
2729 /*
2730 * turn checksums on/off
2731 */
2732 char*
tcpsetchecksum(Conv * s,char ** f,int)2733 tcpsetchecksum(Conv *s, char **f, int)
2734 {
2735 Tcpctl *tcb;
2736
2737 tcb = (Tcpctl*)s->ptcl;
2738 tcb->nochecksum = !atoi(f[1]);
2739
2740 return nil;
2741 }
2742
2743 void
tcprxmit(Conv * s)2744 tcprxmit(Conv *s)
2745 {
2746 Tcpctl *tcb;
2747
2748 tcb = (Tcpctl*)s->ptcl;
2749
2750 tcb->flags |= RETRAN|FORCE;
2751 tcb->snd.ptr = tcb->snd.una;
2752
2753 /*
2754 * We should be halving the slow start threshhold (down to one
2755 * mss) but leaving it at mss seems to work well enough
2756 */
2757 tcb->ssthresh = tcb->mss;
2758
2759 /*
2760 * pull window down to a single packet
2761 */
2762 tcb->cwind = tcb->mss;
2763 tcpoutput(s);
2764 }
2765
2766 void
tcptimeout(void * arg)2767 tcptimeout(void *arg)
2768 {
2769 Conv *s;
2770 Tcpctl *tcb;
2771 int maxback;
2772 Tcppriv *tpriv;
2773
2774 s = (Conv*)arg;
2775 tpriv = s->p->priv;
2776 tcb = (Tcpctl*)s->ptcl;
2777
2778 if(waserror()){
2779 qunlock(s);
2780 nexterror();
2781 }
2782 qlock(s);
2783 switch(tcb->state){
2784 default:
2785 tcb->backoff++;
2786 if(tcb->state == Syn_sent)
2787 maxback = MAXBACKMS/2;
2788 else
2789 maxback = MAXBACKMS;
2790 tcb->backedoff += tcb->timer.start * MSPTICK;
2791 if(tcb->backedoff >= maxback) {
2792 localclose(s, Etimedout);
2793 break;
2794 }
2795 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
2796 tcpsettimer(tcb);
2797 tcprxmit(s);
2798 tpriv->stats[RetransTimeouts]++;
2799 tcb->snd.dupacks = 0;
2800 break;
2801 case Time_wait:
2802 localclose(s, nil);
2803 break;
2804 case Closed:
2805 break;
2806 }
2807 qunlock(s);
2808 poperror();
2809 }
2810
2811 int
inwindow(Tcpctl * tcb,int seq)2812 inwindow(Tcpctl *tcb, int seq)
2813 {
2814 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2815 }
2816
2817 /*
2818 * set up state for a received SYN (or SYN ACK) packet
2819 */
2820 void
procsyn(Conv * s,Tcp * seg)2821 procsyn(Conv *s, Tcp *seg)
2822 {
2823 Tcpctl *tcb;
2824
2825 tcb = (Tcpctl*)s->ptcl;
2826 tcb->flags |= FORCE;
2827
2828 tcb->rcv.nxt = seg->seq + 1;
2829 tcb->rcv.urg = tcb->rcv.nxt;
2830 tcb->irs = seg->seq;
2831
2832 /* our sending max segment size cannot be bigger than what he asked for */
2833 if(seg->mss != 0 && seg->mss < tcb->mss)
2834 tcb->mss = seg->mss;
2835
2836 /* the congestion window always starts out as a single segment */
2837 tcb->snd.wnd = seg->wnd;
2838 tcb->cwind = tcb->mss;
2839 }
2840
2841 int
addreseq(Tcpctl * tcb,Tcppriv * tpriv,Tcp * seg,Block * bp,ushort length)2842 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2843 {
2844 Reseq *rp, *rp1;
2845 int i, rqlen, qmax;
2846
2847 rp = malloc(sizeof(Reseq));
2848 if(rp == nil){
2849 freeblist(bp); /* bp always consumed by add_reseq */
2850 return 0;
2851 }
2852
2853 rp->seg = *seg;
2854 rp->bp = bp;
2855 rp->length = length;
2856
2857 /* Place on reassembly list sorting by starting seq number */
2858 rp1 = tcb->reseq;
2859 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2860 rp->next = rp1;
2861 tcb->reseq = rp;
2862 if(rp->next != nil)
2863 tpriv->stats[OutOfOrder]++;
2864 return 0;
2865 }
2866
2867 rqlen = 0;
2868 for(i = 0;; i++) {
2869 rqlen += rp1->length;
2870 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2871 rp->next = rp1->next;
2872 rp1->next = rp;
2873 if(rp->next != nil)
2874 tpriv->stats[OutOfOrder]++;
2875 break;
2876 }
2877 rp1 = rp1->next;
2878 }
2879 qmax = QMAX<<tcb->rcv.scale;
2880 if(rqlen > qmax){
2881 print("resequence queue > window: %d > %d\n", rqlen, qmax);
2882 i = 0;
2883 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2884 print("%#lux %#lux %#ux\n", rp1->seg.seq,
2885 rp1->seg.ack, rp1->seg.flags);
2886 if(i++ > 10){
2887 print("...\n");
2888 break;
2889 }
2890 }
2891
2892 // delete entire reassembly queue; wait for retransmit.
2893 // - should we be smarter and only delete the tail?
2894 for(rp = tcb->reseq; rp != nil; rp = rp1){
2895 rp1 = rp->next;
2896 freeblist(rp->bp);
2897 free(rp);
2898 }
2899 tcb->reseq = nil;
2900
2901 return -1;
2902 }
2903 return 0;
2904 }
2905
2906 void
getreseq(Tcpctl * tcb,Tcp * seg,Block ** bp,ushort * length)2907 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2908 {
2909 Reseq *rp;
2910
2911 rp = tcb->reseq;
2912 if(rp == nil)
2913 return;
2914
2915 tcb->reseq = rp->next;
2916
2917 *seg = rp->seg;
2918 *bp = rp->bp;
2919 *length = rp->length;
2920
2921 free(rp);
2922 }
2923
/*
 *  trim an incoming segment so that only the part lying inside the
 *  receive window [rcv.nxt, rcv.nxt+rcv.wnd) remains.
 *
 *  seg, *bp and *length are updated in place: data already received
 *  is pulled off the front, data beyond the window is cut off the end.
 *  tcb->rerecv accumulates the number of sequence units discarded
 *  either way.
 *
 *  returns 0 when the segment (or what is left of it) is acceptable,
 *  and -1 when no part of it is; in that case *bp has been freed.
 */
int
tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
	ushort len;
	uchar accept;
	int dupcnt, excess;

	accept = 0;

	/* len is the sequence space used: data plus one each for SYN and FIN */
	len = *length;
	if(seg->flags & SYN)
		len++;
	if(seg->flags & FIN)
		len++;

	if(tcb->rcv.wnd == 0) {
		/* closed window: only an empty segment exactly at rcv.nxt is taken */
		if(len == 0 && seg->seq == tcb->rcv.nxt)
			return 0;
	}
	else {
		/* Some part of the segment should be in the window */
		if(inwindow(tcb,seg->seq))
			accept++;
		else
		if(len != 0) {
			/* last unit inside the window, or the segment straddles rcv.nxt */
			if(inwindow(tcb, seg->seq+len-1) ||
			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
				accept++;
		}
	}
	if(!accept) {
		freeblist(*bp);
		return -1;
	}

	/* dupcnt > 0: the segment starts before rcv.nxt, so its head is a duplicate */
	dupcnt = tcb->rcv.nxt - seg->seq;
	if(dupcnt > 0){
		tcb->rerecv += dupcnt;
		if(seg->flags & SYN){
			/* the SYN accounts for the first duplicated sequence number */
			seg->flags &= ~SYN;
			seg->seq++;

			if(seg->urg > 1)
				seg->urg--;
			else
				seg->flags &= ~URG;
			dupcnt--;
		}
		if(dupcnt > 0){
			/* drop the duplicated data bytes and shift seq/urg to match */
			pullblock(bp, (ushort)dupcnt);
			seg->seq += dupcnt;
			*length -= dupcnt;

			if(seg->urg > dupcnt)
				seg->urg -= dupcnt;
			else {
				seg->flags &= ~URG;
				seg->urg = 0;
			}
		}
	}

	/* excess > 0: the segment runs past the right edge of the window */
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
	if(excess > 0) {
		tcb->rerecv += excess;
		*length -= excess;
		*bp = trimblock(*bp, 0, *length);
		if(*bp == nil)
			panic("presotto is a boofhead");
		/* a FIN would follow the data that was just cut off */
		seg->flags &= ~FIN;
	}
	return 0;
}
2994
2995 void
tcpadvise(Proto * tcp,Block * bp,char * msg)2996 tcpadvise(Proto *tcp, Block *bp, char *msg)
2997 {
2998 Tcp4hdr *h4;
2999 Tcp6hdr *h6;
3000 Tcpctl *tcb;
3001 uchar source[IPaddrlen];
3002 uchar dest[IPaddrlen];
3003 ushort psource, pdest;
3004 Conv *s, **p;
3005
3006 h4 = (Tcp4hdr*)(bp->rp);
3007 h6 = (Tcp6hdr*)(bp->rp);
3008
3009 if((h4->vihl&0xF0)==IP_VER4) {
3010 v4tov6(dest, h4->tcpdst);
3011 v4tov6(source, h4->tcpsrc);
3012 psource = nhgets(h4->tcpsport);
3013 pdest = nhgets(h4->tcpdport);
3014 }
3015 else {
3016 ipmove(dest, h6->tcpdst);
3017 ipmove(source, h6->tcpsrc);
3018 psource = nhgets(h6->tcpsport);
3019 pdest = nhgets(h6->tcpdport);
3020 }
3021
3022 /* Look for a connection */
3023 qlock(tcp);
3024 for(p = tcp->conv; *p; p++) {
3025 s = *p;
3026 tcb = (Tcpctl*)s->ptcl;
3027 if(s->rport == pdest)
3028 if(s->lport == psource)
3029 if(tcb->state != Closed)
3030 if(ipcmp(s->raddr, dest) == 0)
3031 if(ipcmp(s->laddr, source) == 0){
3032 qlock(s);
3033 qunlock(tcp);
3034 switch(tcb->state){
3035 case Syn_sent:
3036 localclose(s, msg);
3037 break;
3038 }
3039 qunlock(s);
3040 freeblist(bp);
3041 return;
3042 }
3043 }
3044 qunlock(tcp);
3045 freeblist(bp);
3046 }
3047
3048 static char*
tcpporthogdefensectl(char * val)3049 tcpporthogdefensectl(char *val)
3050 {
3051 if(strcmp(val, "on") == 0)
3052 tcpporthogdefense = 1;
3053 else if(strcmp(val, "off") == 0)
3054 tcpporthogdefense = 0;
3055 else
3056 return "unknown value for tcpporthogdefense";
3057 return nil;
3058 }
3059
3060 /* called with c qlocked */
3061 char*
tcpctl(Conv * c,char ** f,int n)3062 tcpctl(Conv* c, char** f, int n)
3063 {
3064 if(n == 1 && strcmp(f[0], "hangup") == 0)
3065 return tcphangup(c);
3066 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3067 return tcpstartka(c, f, n);
3068 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3069 return tcpsetchecksum(c, f, n);
3070 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3071 return tcpporthogdefensectl(f[1]);
3072 return "unknown control request";
3073 }
3074
3075 int
tcpstats(Proto * tcp,char * buf,int len)3076 tcpstats(Proto *tcp, char *buf, int len)
3077 {
3078 Tcppriv *priv;
3079 char *p, *e;
3080 int i;
3081
3082 priv = tcp->priv;
3083 p = buf;
3084 e = p+len;
3085 for(i = 0; i < Nstats; i++)
3086 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
3087 return p - buf;
3088 }
3089
3090 /*
3091 * garbage collect any stale conversations:
3092 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3093 * - Finwait2 after 5 minutes
3094 *
3095 * this is called whenever we run out of channels. Both checks are
3096 * of questionable validity so we try to use them only when we're
3097 * up against the wall.
3098 */
3099 int
tcpgc(Proto * tcp)3100 tcpgc(Proto *tcp)
3101 {
3102 Conv *c, **pp, **ep;
3103 int n;
3104 Tcpctl *tcb;
3105
3106
3107 n = 0;
3108 ep = &tcp->conv[tcp->nc];
3109 for(pp = tcp->conv; pp < ep; pp++) {
3110 c = *pp;
3111 if(c == nil)
3112 break;
3113 if(!canqlock(c))
3114 continue;
3115 tcb = (Tcpctl*)c->ptcl;
3116 switch(tcb->state){
3117 case Syn_received:
3118 if(NOW - tcb->time > 5000){
3119 localclose(c, "timed out");
3120 n++;
3121 }
3122 break;
3123 case Finwait2:
3124 if(NOW - tcb->time > 5*60*1000){
3125 localclose(c, "timed out");
3126 n++;
3127 }
3128 break;
3129 }
3130 qunlock(c);
3131 }
3132 return n;
3133 }
3134
3135 void
tcpsettimer(Tcpctl * tcb)3136 tcpsettimer(Tcpctl *tcb)
3137 {
3138 int x;
3139
3140 /* round trip dependency */
3141 x = backoff(tcb->backoff) *
3142 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3143
3144 /* bounded twixt 1/2 and 64 seconds */
3145 if(x < 500/MSPTICK)
3146 x = 500/MSPTICK;
3147 else if(x > (64000/MSPTICK))
3148 x = 64000/MSPTICK;
3149 tcb->timer.start = x;
3150 }
3151
3152 void
tcpinit(Fs * fs)3153 tcpinit(Fs *fs)
3154 {
3155 Proto *tcp;
3156 Tcppriv *tpriv;
3157
3158 tcp = smalloc(sizeof(Proto));
3159 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3160 tcp->name = "tcp";
3161 tcp->connect = tcpconnect;
3162 tcp->announce = tcpannounce;
3163 tcp->ctl = tcpctl;
3164 tcp->state = tcpstate;
3165 tcp->create = tcpcreate;
3166 tcp->close = tcpclose;
3167 tcp->rcv = tcpiput;
3168 tcp->advise = tcpadvise;
3169 tcp->stats = tcpstats;
3170 tcp->inuse = tcpinuse;
3171 tcp->gc = tcpgc;
3172 tcp->ipproto = IP_TCPPROTO;
3173 tcp->nc = scalednconv();
3174 tcp->ptclsize = sizeof(Tcpctl);
3175 tpriv->stats[MaxConn] = tcp->nc;
3176
3177 Fsproto(fs, tcp);
3178 }
3179
3180 void
tcpsetscale(Conv * s,Tcpctl * tcb,ushort rcvscale,ushort sndscale)3181 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3182 {
3183 if(rcvscale){
3184 tcb->rcv.scale = rcvscale & 0xff;
3185 tcb->snd.scale = sndscale & 0xff;
3186 tcb->window = QMAX<<tcb->snd.scale;
3187 qsetlimit(s->rq, tcb->window);
3188 } else {
3189 tcb->rcv.scale = 0;
3190 tcb->snd.scale = 0;
3191 tcb->window = QMAX;
3192 qsetlimit(s->rq, tcb->window);
3193 }
3194 }
3195