1 #include "u.h" 2 #include "../port/lib.h" 3 #include "mem.h" 4 #include "dat.h" 5 #include "fns.h" 6 #include "../port/error.h" 7 8 #include "ip.h" 9 10 enum 11 { 12 QMAX = 64*1024-1, 13 IP_TCPPROTO = 6, 14 15 TCP4_IPLEN = 8, 16 TCP4_PHDRSIZE = 12, 17 TCP4_HDRSIZE = 20, 18 TCP4_TCBPHDRSZ = 40, 19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE, 20 21 TCP6_IPLEN = 0, 22 TCP6_PHDRSIZE = 40, 23 TCP6_HDRSIZE = 20, 24 TCP6_TCBPHDRSZ = 60, 25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE, 26 27 TcptimerOFF = 0, 28 TcptimerON = 1, 29 TcptimerDONE = 2, 30 MAX_TIME = (1<<20), /* Forever */ 31 TCP_ACK = 50, /* Timed ack sequence in ms */ 32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */ 33 34 URG = 0x20, /* Data marked urgent */ 35 ACK = 0x10, /* Acknowledge is valid */ 36 PSH = 0x08, /* Whole data pipe is pushed */ 37 RST = 0x04, /* Reset connection */ 38 SYN = 0x02, /* Pkt. is synchronise */ 39 FIN = 0x01, /* Start close down */ 40 41 EOLOPT = 0, 42 NOOPOPT = 1, 43 MSSOPT = 2, 44 MSS_LENGTH = 4, /* Maximum segment size */ 45 WSOPT = 3, 46 WS_LENGTH = 3, /* Bits to scale window size by */ 47 MSL2 = 10, 48 MSPTICK = 50, /* Milliseconds per timer tick */ 49 DEF_MSS = 1460, /* Default maximum segment */ 50 DEF_MSS6 = 1280, /* Default maximum segment (min) for v6 */ 51 DEF_RTT = 500, /* Default round trip */ 52 DEF_KAT = 120000, /* Default time (ms) between keep alives */ 53 TCP_LISTEN = 0, /* Listen connection */ 54 TCP_CONNECT = 1, /* Outgoing connection */ 55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ 56 57 TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */ 58 59 FORCE = 1, 60 CLONE = 2, 61 RETRAN = 4, 62 ACTIVE = 8, 63 SYNACK = 16, 64 65 LOGAGAIN = 3, 66 LOGDGAIN = 2, 67 68 Closed = 0, /* Connection states */ 69 Listen, 70 Syn_sent, 71 Syn_received, 72 Established, 73 Finwait1, 74 Finwait2, 75 Close_wait, 76 Closing, 77 Last_ack, 78 Time_wait, 79 80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ 81 NLHT = 256, /* hash table size, must 
be a power of 2 */ 82 LHTMASK = NLHT-1, 83 84 HaveWS = 1<<8, 85 }; 86 87 /* Must correspond to the enumeration above */ 88 char *tcpstates[] = 89 { 90 "Closed", "Listen", "Syn_sent", "Syn_received", 91 "Established", "Finwait1", "Finwait2", "Close_wait", 92 "Closing", "Last_ack", "Time_wait" 93 }; 94 95 typedef struct Tcptimer Tcptimer; 96 struct Tcptimer 97 { 98 Tcptimer *next; 99 Tcptimer *prev; 100 Tcptimer *readynext; 101 int state; 102 int start; 103 int count; 104 void (*func)(void*); 105 void *arg; 106 }; 107 108 /* 109 * v4 and v6 pseudo headers used for 110 * checksuming tcp 111 */ 112 typedef struct Tcp4hdr Tcp4hdr; 113 struct Tcp4hdr 114 { 115 uchar vihl; /* Version and header length */ 116 uchar tos; /* Type of service */ 117 uchar length[2]; /* packet length */ 118 uchar id[2]; /* Identification */ 119 uchar frag[2]; /* Fragment information */ 120 uchar Unused; 121 uchar proto; 122 uchar tcplen[2]; 123 uchar tcpsrc[4]; 124 uchar tcpdst[4]; 125 uchar tcpsport[2]; 126 uchar tcpdport[2]; 127 uchar tcpseq[4]; 128 uchar tcpack[4]; 129 uchar tcpflag[2]; 130 uchar tcpwin[2]; 131 uchar tcpcksum[2]; 132 uchar tcpurg[2]; 133 /* Options segment */ 134 uchar tcpopt[1]; 135 }; 136 137 typedef struct Tcp6hdr Tcp6hdr; 138 struct Tcp6hdr 139 { 140 uchar vcf[4]; 141 uchar ploadlen[2]; 142 uchar proto; 143 uchar ttl; 144 uchar tcpsrc[IPaddrlen]; 145 uchar tcpdst[IPaddrlen]; 146 uchar tcpsport[2]; 147 uchar tcpdport[2]; 148 uchar tcpseq[4]; 149 uchar tcpack[4]; 150 uchar tcpflag[2]; 151 uchar tcpwin[2]; 152 uchar tcpcksum[2]; 153 uchar tcpurg[2]; 154 /* Options segment */ 155 uchar tcpopt[1]; 156 }; 157 158 /* 159 * this represents the control info 160 * for a single packet. It is derived from 161 * a packet in ntohtcp{4,6}() and stuck into 162 * a packet in htontcp{4,6}(). 
163 */ 164 typedef struct Tcp Tcp; 165 struct Tcp 166 { 167 ushort source; 168 ushort dest; 169 ulong seq; 170 ulong ack; 171 uchar flags; 172 ushort ws; /* window scale option (if not zero) */ 173 ulong wnd; 174 ushort urg; 175 ushort mss; /* max segment size option (if not zero) */ 176 ushort len; /* size of data */ 177 }; 178 179 /* 180 * this header is malloc'd to thread together fragments 181 * waiting to be coalesced 182 */ 183 typedef struct Reseq Reseq; 184 struct Reseq 185 { 186 Reseq *next; 187 Tcp seg; 188 Block *bp; 189 ushort length; 190 }; 191 192 /* 193 * the qlock in the Conv locks this structure 194 */ 195 typedef struct Tcpctl Tcpctl; 196 struct Tcpctl 197 { 198 uchar state; /* Connection state */ 199 uchar type; /* Listening or active connection */ 200 uchar code; /* Icmp code */ 201 struct { 202 ulong una; /* Unacked data pointer */ 203 ulong nxt; /* Next sequence expected */ 204 ulong ptr; /* Data pointer */ 205 ulong wnd; /* Tcp send window */ 206 ulong urg; /* Urgent data pointer */ 207 ulong wl2; 208 int scale; /* how much to right shift window in xmitted packets */ 209 /* to implement tahoe and reno TCP */ 210 ulong dupacks; /* number of duplicate acks rcvd */ 211 int recovery; /* loss recovery flag */ 212 ulong rxt; /* right window marker for recovery */ 213 } snd; 214 struct { 215 ulong nxt; /* Receive pointer to next uchar slot */ 216 ulong wnd; /* Receive window incoming */ 217 ulong urg; /* Urgent pointer */ 218 int blocked; 219 int una; /* unacked data segs */ 220 int scale; /* how much to left shift window in rcved packets */ 221 } rcv; 222 ulong iss; /* Initial sequence number */ 223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ 224 ulong cwind; /* Congestion window */ 225 int scale; /* desired snd.scale */ 226 ushort ssthresh; /* Slow start threshold */ 227 int resent; /* Bytes just resent */ 228 int irs; /* Initial received squence */ 229 ushort mss; /* Maximum segment size */ 230 int rerecv; /* Overlap of data 
rerecevived */ 231 ulong window; /* Receive window */ 232 uchar backoff; /* Exponential backoff counter */ 233 int backedoff; /* ms we've backed off for rexmits */ 234 uchar flags; /* State flags */ 235 Reseq *reseq; /* Resequencing queue */ 236 Tcptimer timer; /* Activity timer */ 237 Tcptimer acktimer; /* Acknowledge timer */ 238 Tcptimer rtt_timer; /* Round trip timer */ 239 Tcptimer katimer; /* keep alive timer */ 240 ulong rttseq; /* Round trip sequence */ 241 int srtt; /* Shortened round trip */ 242 int mdev; /* Mean deviation of round trip */ 243 int kacounter; /* count down for keep alive */ 244 uint sndsyntime; /* time syn sent */ 245 ulong time; /* time Finwait2 or Syn_received was sent */ 246 int nochecksum; /* non-zero means don't send checksums */ 247 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ 248 249 union { 250 Tcp4hdr tcp4hdr; 251 Tcp6hdr tcp6hdr; 252 } protohdr; /* prototype header */ 253 }; 254 255 /* 256 * New calls are put in limbo rather than having a conversation structure 257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not 258 * any real Conv structures mucking things up. Calls in limbo rexmit their 259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. 260 * 261 * In particular they aren't on a listener's queue so that they don't figure 262 * in the input queue limit. 263 * 264 * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue 265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore 266 * there is no hashing of this list. 
267 */ 268 typedef struct Limbo Limbo; 269 struct Limbo 270 { 271 Limbo *next; 272 273 uchar laddr[IPaddrlen]; 274 uchar raddr[IPaddrlen]; 275 ushort lport; 276 ushort rport; 277 ulong irs; /* initial received sequence */ 278 ulong iss; /* initial sent sequence */ 279 ushort mss; /* mss from the other end */ 280 ushort rcvscale; /* how much to scale rcvd windows */ 281 ushort sndscale; /* how much to scale sent windows */ 282 ulong lastsend; /* last time we sent a synack */ 283 uchar version; /* v4 or v6 */ 284 uchar rexmits; /* number of retransmissions */ 285 }; 286 287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */ 288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ 289 290 enum { 291 /* MIB stats */ 292 MaxConn, 293 Mss, 294 ActiveOpens, 295 PassiveOpens, 296 EstabResets, 297 CurrEstab, 298 InSegs, 299 OutSegs, 300 RetransSegs, 301 RetransTimeouts, 302 InErrs, 303 OutRsts, 304 305 /* non-MIB stats */ 306 CsumErrs, 307 HlenErrs, 308 LenErrs, 309 OutOfOrder, 310 311 Nstats 312 }; 313 314 static char *statnames[] = 315 { 316 [MaxConn] "MaxConn", 317 [Mss] "MaxSegment", 318 [ActiveOpens] "ActiveOpens", 319 [PassiveOpens] "PassiveOpens", 320 [EstabResets] "EstabResets", 321 [CurrEstab] "CurrEstab", 322 [InSegs] "InSegs", 323 [OutSegs] "OutSegs", 324 [RetransSegs] "RetransSegs", 325 [RetransTimeouts] "RetransTimeouts", 326 [InErrs] "InErrs", 327 [OutRsts] "OutRsts", 328 [CsumErrs] "CsumErrs", 329 [HlenErrs] "HlenErrs", 330 [LenErrs] "LenErrs", 331 [OutOfOrder] "OutOfOrder", 332 }; 333 334 typedef struct Tcppriv Tcppriv; 335 struct Tcppriv 336 { 337 /* List of active timers */ 338 QLock tl; 339 Tcptimer *timers; 340 341 /* hash table for matching conversations */ 342 Ipht ht; 343 344 /* calls in limbo waiting for an ACK to our SYN ACK */ 345 int nlimbo; 346 Limbo *lht[NLHT]; 347 348 /* for keeping track of tcpackproc */ 349 QLock apl; 350 int ackprocstarted; 351 352 uvlong stats[Nstats]; 353 }; 354 355 /* 356 * Setting tcpporthogdefense 
to non-zero enables Dong Lin's 357 * solution to hijacked systems staking out port's as a form 358 * of DoS attack. 359 * 360 * To avoid stateless Conv hogs, we pick a sequence number at random. If 361 * that number gets acked by the other end, we shut down the connection. 362 * Look for tcpporthogdefense in the code. 363 */ 364 int tcpporthogdefense = 0; 365 366 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); 367 void getreseq(Tcpctl*, Tcp*, Block**, ushort*); 368 void localclose(Conv*, char*); 369 void procsyn(Conv*, Tcp*); 370 void tcpiput(Proto*, Ipifc*, Block*); 371 void tcpoutput(Conv*); 372 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); 373 void tcpstart(Conv*, int); 374 void tcptimeout(void*); 375 void tcpsndsyn(Conv*, Tcpctl*); 376 void tcprcvwin(Conv*); 377 void tcpacktimer(void*); 378 void tcpkeepalive(void*); 379 void tcpsetkacounter(Tcpctl*); 380 void tcprxmit(Conv*); 381 void tcpsettimer(Tcpctl*); 382 void tcpsynackrtt(Conv*); 383 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); 384 385 static void limborexmit(Proto*); 386 static void limbo(Conv*, uchar*, uchar*, Tcp*, int); 387 388 void 389 tcpsetstate(Conv *s, uchar newstate) 390 { 391 Tcpctl *tcb; 392 uchar oldstate; 393 Tcppriv *tpriv; 394 395 tpriv = s->p->priv; 396 397 tcb = (Tcpctl*)s->ptcl; 398 399 oldstate = tcb->state; 400 if(oldstate == newstate) 401 return; 402 403 if(oldstate == Established) 404 tpriv->stats[CurrEstab]--; 405 if(newstate == Established) 406 tpriv->stats[CurrEstab]++; 407 408 /** 409 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, 410 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); 411 **/ 412 413 switch(newstate) { 414 case Closed: 415 qclose(s->rq); 416 qclose(s->wq); 417 qclose(s->eq); 418 break; 419 420 case Close_wait: /* Remote closes */ 421 qhangup(s->rq, nil); 422 break; 423 } 424 425 tcb->state = newstate; 426 427 if(oldstate == Syn_sent && newstate != Closed) 428 Fsconnected(s, nil); 429 } 430 431 static char* 432 
tcpconnect(Conv *c, char **argv, int argc)
{
	char *err;

	/* ctl "connect": only a Closed conversation may start a call */
	if(((Tcpctl*)c->ptcl)->state != Closed)
		return Econinuse;

	err = Fsstdconnect(c, argv, argc);
	if(err == nil)
		tcpstart(c, TCP_CONNECT);
	return err;
}

/*
 *  Format a one line status report for conversation c into state
 *  (at most n bytes) and return snprint's result.
 */
static int
tcpstate(Conv *c, char *state, int n)
{
	Tcpctl *tcb;
	int nin, nout;

	tcb = (Tcpctl*)(c->ptcl);

	/* a queue that does not exist counts as empty */
	nin = c->rq ? qlen(c->rq) : 0;
	nout = c->wq ? qlen(c->wq) : 0;

	return snprint(state, n,
		"%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
		tcpstates[tcb->state],
		nin, nout,
		tcb->srtt, tcb->mdev,
		tcb->cwind, tcb->snd.wnd, tcb->rcv.scale, tcb->rcv.wnd, tcb->snd.scale,
		tcb->timer.start, tcb->timer.count, tcb->rerecv,
		tcb->katimer.start, tcb->katimer.count);
}

/* a conversation is in use unless its control block is Closed */
static int
tcpinuse(Conv *c)
{
	return ((Tcpctl*)c->ptcl)->state != Closed;
}

/*
 *  ctl "announce": start listening on a Closed conversation.
 *  Returns nil on success, otherwise an error string.
 */
static char*
tcpannounce(Conv *c, char **argv, int argc)
{
	char *err;

	if(((Tcpctl*)c->ptcl)->state != Closed)
		return Econinuse;

	err = Fsstdannounce(c, argv, argc);
	if(err != nil)
		return err;

	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, nil);
	return nil;
}

/*
 *  tcpclose is always called with the q locked
 */
static void
tcpclose(Conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl*)c->ptcl;

	qhangup(c->rq, nil);
	qhangup(c->wq, nil);
	qhangup(c->eq, nil);
	qflush(c->rq);

	switch(tcb->state) {
	case Listen:
		/*
		 *  reset any incoming calls to this listener
		 */
		Fsconnected(c, "Hangup");

		localclose(c, nil);
		break;
	case Closed:
	case Syn_sent:
		localclose(c, nil);
		break;
	case Syn_received:
	case Established:
		tcb->flgcnt++;
		tcb->snd.nxt++;
		tcpsetstate(c, Finwait1);
		tcpoutput(c);
		break;
	case
Close_wait: 531 tcb->flgcnt++; 532 tcb->snd.nxt++; 533 tcpsetstate(c, Last_ack); 534 tcpoutput(c); 535 break; 536 } 537 } 538 539 void 540 tcpkick(void *x) 541 { 542 Conv *s = x; 543 Tcpctl *tcb; 544 545 tcb = (Tcpctl*)s->ptcl; 546 547 if(waserror()){ 548 qunlock(s); 549 nexterror(); 550 } 551 qlock(s); 552 553 switch(tcb->state) { 554 case Syn_sent: 555 case Syn_received: 556 case Established: 557 case Close_wait: 558 /* 559 * Push data 560 */ 561 tcprcvwin(s); 562 tcpoutput(s); 563 break; 564 default: 565 localclose(s, "Hangup"); 566 break; 567 } 568 569 qunlock(s); 570 poperror(); 571 } 572 573 void 574 tcprcvwin(Conv *s) /* Call with tcb locked */ 575 { 576 int w; 577 Tcpctl *tcb; 578 579 tcb = (Tcpctl*)s->ptcl; 580 w = tcb->window - qlen(s->rq); 581 if(w < 0) 582 w = 0; 583 if(w == 0) 584 netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq)); 585 tcb->rcv.wnd = w; 586 if(w == 0) 587 tcb->rcv.blocked = 1; 588 } 589 590 void 591 tcpacktimer(void *v) 592 { 593 Tcpctl *tcb; 594 Conv *s; 595 596 s = v; 597 tcb = (Tcpctl*)s->ptcl; 598 599 if(waserror()){ 600 qunlock(s); 601 nexterror(); 602 } 603 qlock(s); 604 if(tcb->state != Closed){ 605 tcb->flags |= FORCE; 606 tcprcvwin(s); 607 tcpoutput(s); 608 } 609 qunlock(s); 610 poperror(); 611 } 612 613 static void 614 tcpcreate(Conv *c) 615 { 616 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); 617 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); 618 } 619 620 static void 621 timerstate(Tcppriv *priv, Tcptimer *t, int newstate) 622 { 623 if(newstate != TcptimerON){ 624 if(t->state == TcptimerON){ 625 /* unchain */ 626 if(priv->timers == t){ 627 priv->timers = t->next; 628 if(t->prev != nil) 629 panic("timerstate1"); 630 } 631 if(t->next) 632 t->next->prev = t->prev; 633 if(t->prev) 634 t->prev->next = t->next; 635 t->next = t->prev = nil; 636 } 637 } else { 638 if(t->state != TcptimerON){ 639 /* chain */ 640 if(t->prev != nil || t->next != nil) 641 panic("timerstate2"); 642 t->prev = nil; 643 
t->next = priv->timers; 644 if(t->next) 645 t->next->prev = t; 646 priv->timers = t; 647 } 648 } 649 t->state = newstate; 650 } 651 652 void 653 tcpackproc(void *a) 654 { 655 Tcptimer *t, *tp, *timeo; 656 Proto *tcp; 657 Tcppriv *priv; 658 int loop; 659 660 tcp = a; 661 priv = tcp->priv; 662 663 for(;;) { 664 tsleep(&up->sleep, return0, 0, MSPTICK); 665 666 qlock(&priv->tl); 667 timeo = nil; 668 loop = 0; 669 for(t = priv->timers; t != nil; t = tp) { 670 if(loop++ > 10000) 671 panic("tcpackproc1"); 672 tp = t->next; 673 if(t->state == TcptimerON) { 674 t->count--; 675 if(t->count == 0) { 676 timerstate(priv, t, TcptimerDONE); 677 t->readynext = timeo; 678 timeo = t; 679 } 680 } 681 } 682 qunlock(&priv->tl); 683 684 loop = 0; 685 for(t = timeo; t != nil; t = t->readynext) { 686 if(loop++ > 10000) 687 panic("tcpackproc2"); 688 if(t->state == TcptimerDONE && t->func != nil && !waserror()){ 689 (*t->func)(t->arg); 690 poperror(); 691 } 692 } 693 694 limborexmit(tcp); 695 } 696 } 697 698 void 699 tcpgo(Tcppriv *priv, Tcptimer *t) 700 { 701 if(t == nil || t->start == 0) 702 return; 703 704 qlock(&priv->tl); 705 t->count = t->start; 706 timerstate(priv, t, TcptimerON); 707 qunlock(&priv->tl); 708 } 709 710 void 711 tcphalt(Tcppriv *priv, Tcptimer *t) 712 { 713 if(t == nil) 714 return; 715 716 qlock(&priv->tl); 717 timerstate(priv, t, TcptimerOFF); 718 qunlock(&priv->tl); 719 } 720 721 int 722 backoff(int n) 723 { 724 return 1 << n; 725 } 726 727 void 728 localclose(Conv *s, char *reason) /* called with tcb locked */ 729 { 730 Tcpctl *tcb; 731 Reseq *rp,*rp1; 732 Tcppriv *tpriv; 733 734 tpriv = s->p->priv; 735 tcb = (Tcpctl*)s->ptcl; 736 737 iphtrem(&tpriv->ht, s); 738 739 tcphalt(tpriv, &tcb->timer); 740 tcphalt(tpriv, &tcb->rtt_timer); 741 tcphalt(tpriv, &tcb->acktimer); 742 tcphalt(tpriv, &tcb->katimer); 743 744 /* Flush reassembly queue; nothing more can arrive */ 745 for(rp = tcb->reseq; rp != nil; rp = rp1) { 746 rp1 = rp->next; 747 freeblist(rp->bp); 748 free(rp); 
749 } 750 tcb->reseq = nil; 751 752 if(tcb->state == Syn_sent) 753 Fsconnected(s, reason); 754 if(s->state == Announced) 755 wakeup(&s->listenr); 756 757 qhangup(s->rq, reason); 758 qhangup(s->wq, reason); 759 760 tcpsetstate(s, Closed); 761 } 762 763 /* mtu (- TCP + IP hdr len) of 1st hop */ 764 int 765 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) 766 { 767 Ipifc *ifc; 768 int mtu; 769 770 ifc = findipifc(tcp->f, addr, 0); 771 switch(version){ 772 default: 773 case V4: 774 mtu = DEF_MSS; 775 if(ifc != nil) 776 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); 777 break; 778 case V6: 779 mtu = DEF_MSS6; 780 if(ifc != nil) 781 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); 782 break; 783 } 784 if(ifc != nil){ 785 if(ifc->mbps > 1000) 786 *scale = HaveWS | 4; 787 else if(ifc->mbps > 100) 788 *scale = HaveWS | 3; 789 else if(ifc->mbps > 10) 790 *scale = HaveWS | 1; 791 else 792 *scale = HaveWS | 0; 793 } else 794 *scale = HaveWS | 0; 795 796 return mtu; 797 } 798 799 void 800 inittcpctl(Conv *s, int mode) 801 { 802 Tcpctl *tcb; 803 Tcp4hdr* h4; 804 Tcp6hdr* h6; 805 Tcppriv *tpriv; 806 int mss; 807 808 tcb = (Tcpctl*)s->ptcl; 809 810 memset(tcb, 0, sizeof(Tcpctl)); 811 812 tcb->ssthresh = 65535; 813 tcb->srtt = tcp_irtt<<LOGAGAIN; 814 tcb->mdev = 0; 815 816 /* setup timers */ 817 tcb->timer.start = tcp_irtt / MSPTICK; 818 tcb->timer.func = tcptimeout; 819 tcb->timer.arg = s; 820 tcb->rtt_timer.start = MAX_TIME; 821 tcb->acktimer.start = TCP_ACK / MSPTICK; 822 tcb->acktimer.func = tcpacktimer; 823 tcb->acktimer.arg = s; 824 tcb->katimer.start = DEF_KAT / MSPTICK; 825 tcb->katimer.func = tcpkeepalive; 826 tcb->katimer.arg = s; 827 828 mss = DEF_MSS; 829 830 /* create a prototype(pseudo) header */ 831 if(mode != TCP_LISTEN){ 832 if(ipcmp(s->laddr, IPnoaddr) == 0) 833 findlocalip(s->p->f, s->laddr, s->raddr); 834 835 switch(s->ipversion){ 836 case V4: 837 h4 = &tcb->protohdr.tcp4hdr; 838 memset(h4, 0, sizeof(*h4)); 839 h4->proto = 
IP_TCPPROTO; 840 hnputs(h4->tcpsport, s->lport); 841 hnputs(h4->tcpdport, s->rport); 842 v6tov4(h4->tcpsrc, s->laddr); 843 v6tov4(h4->tcpdst, s->raddr); 844 break; 845 case V6: 846 h6 = &tcb->protohdr.tcp6hdr; 847 memset(h6, 0, sizeof(*h6)); 848 h6->proto = IP_TCPPROTO; 849 hnputs(h6->tcpsport, s->lport); 850 hnputs(h6->tcpdport, s->rport); 851 ipmove(h6->tcpsrc, s->laddr); 852 ipmove(h6->tcpdst, s->raddr); 853 mss = DEF_MSS6; 854 break; 855 default: 856 panic("inittcpctl: version %d", s->ipversion); 857 } 858 } 859 860 tcb->mss = tcb->cwind = mss; 861 tpriv = s->p->priv; 862 tpriv->stats[Mss] = tcb->mss; 863 864 /* default is no window scaling */ 865 tcb->window = QMAX; 866 tcb->rcv.wnd = QMAX; 867 tcb->rcv.scale = 0; 868 tcb->snd.scale = 0; 869 qsetlimit(s->rq, QMAX); 870 } 871 872 /* 873 * called with s qlocked 874 */ 875 void 876 tcpstart(Conv *s, int mode) 877 { 878 Tcpctl *tcb; 879 Tcppriv *tpriv; 880 char kpname[KNAMELEN]; 881 882 tpriv = s->p->priv; 883 884 if(tpriv->ackprocstarted == 0){ 885 qlock(&tpriv->apl); 886 if(tpriv->ackprocstarted == 0){ 887 sprint(kpname, "#I%dtcpack", s->p->f->dev); 888 kproc(kpname, tcpackproc, s->p); 889 tpriv->ackprocstarted = 1; 890 } 891 qunlock(&tpriv->apl); 892 } 893 894 tcb = (Tcpctl*)s->ptcl; 895 896 inittcpctl(s, mode); 897 898 iphtadd(&tpriv->ht, s); 899 switch(mode) { 900 case TCP_LISTEN: 901 tpriv->stats[PassiveOpens]++; 902 tcb->flags |= CLONE; 903 tcpsetstate(s, Listen); 904 break; 905 906 case TCP_CONNECT: 907 tpriv->stats[ActiveOpens]++; 908 tcb->flags |= ACTIVE; 909 tcpsndsyn(s, tcb); 910 tcpsetstate(s, Syn_sent); 911 tcpoutput(s); 912 break; 913 } 914 } 915 916 static char* 917 tcpflag(ushort flag) 918 { 919 static char buf[128]; 920 921 sprint(buf, "%d", flag>>10); /* Head len */ 922 if(flag & URG) 923 strcat(buf, " URG"); 924 if(flag & ACK) 925 strcat(buf, " ACK"); 926 if(flag & PSH) 927 strcat(buf, " PSH"); 928 if(flag & RST) 929 strcat(buf, " RST"); 930 if(flag & SYN) 931 strcat(buf, " SYN"); 932 if(flag & 
FIN) 933 strcat(buf, " FIN"); 934 935 return buf; 936 } 937 938 Block * 939 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) 940 { 941 int dlen; 942 Tcp6hdr *h; 943 ushort csum; 944 ushort hdrlen, optpad = 0; 945 uchar *opt; 946 947 hdrlen = TCP6_HDRSIZE; 948 if(tcph->flags & SYN){ 949 if(tcph->mss) 950 hdrlen += MSS_LENGTH; 951 if(tcph->ws) 952 hdrlen += WS_LENGTH; 953 optpad = hdrlen & 3; 954 if(optpad) 955 optpad = 4 - optpad; 956 hdrlen += optpad; 957 } 958 959 if(data) { 960 dlen = blocklen(data); 961 data = padblock(data, hdrlen + TCP6_PKT); 962 if(data == nil) 963 return nil; 964 } 965 else { 966 dlen = 0; 967 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ 968 if(data == nil) 969 return nil; 970 data->wp += hdrlen + TCP6_PKT; 971 } 972 973 /* copy in pseudo ip header plus port numbers */ 974 h = (Tcp6hdr *)(data->rp); 975 memmove(h, ph, TCP6_TCBPHDRSZ); 976 977 /* compose pseudo tcp header, do cksum calculation */ 978 hnputl(h->vcf, hdrlen + dlen); 979 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; 980 h->ttl = ph->proto; 981 982 /* copy in variable bits */ 983 hnputl(h->tcpseq, tcph->seq); 984 hnputl(h->tcpack, tcph->ack); 985 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 986 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? 
tcb->snd.scale : 0)); 987 hnputs(h->tcpurg, tcph->urg); 988 989 if(tcph->flags & SYN){ 990 opt = h->tcpopt; 991 if(tcph->mss != 0){ 992 *opt++ = MSSOPT; 993 *opt++ = MSS_LENGTH; 994 hnputs(opt, tcph->mss); 995 // print("our outgoing mss %d\n", tcph->mss); 996 opt += 2; 997 } 998 if(tcph->ws != 0){ 999 *opt++ = WSOPT; 1000 *opt++ = WS_LENGTH; 1001 *opt++ = tcph->ws; 1002 } 1003 while(optpad-- > 0) 1004 *opt++ = NOOPOPT; 1005 } 1006 1007 if(tcb != nil && tcb->nochecksum){ 1008 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1009 } else { 1010 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); 1011 hnputs(h->tcpcksum, csum); 1012 } 1013 1014 /* move from pseudo header back to normal ip header */ 1015 memset(h->vcf, 0, 4); 1016 h->vcf[0] = IP_VER6; 1017 hnputs(h->ploadlen, hdrlen+dlen); 1018 h->proto = ph->proto; 1019 1020 return data; 1021 } 1022 1023 Block * 1024 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) 1025 { 1026 int dlen; 1027 Tcp4hdr *h; 1028 ushort csum; 1029 ushort hdrlen, optpad = 0; 1030 uchar *opt; 1031 1032 hdrlen = TCP4_HDRSIZE; 1033 if(tcph->flags & SYN){ 1034 if(tcph->mss) 1035 hdrlen += MSS_LENGTH; 1036 if(tcph->ws) 1037 hdrlen += WS_LENGTH; 1038 optpad = hdrlen & 3; 1039 if(optpad) 1040 optpad = 4 - optpad; 1041 hdrlen += optpad; 1042 } 1043 1044 if(data) { 1045 dlen = blocklen(data); 1046 data = padblock(data, hdrlen + TCP4_PKT); 1047 if(data == nil) 1048 return nil; 1049 } 1050 else { 1051 dlen = 0; 1052 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ 1053 if(data == nil) 1054 return nil; 1055 data->wp += hdrlen + TCP4_PKT; 1056 } 1057 1058 /* copy in pseudo ip header plus port numbers */ 1059 h = (Tcp4hdr *)(data->rp); 1060 memmove(h, ph, TCP4_TCBPHDRSZ); 1061 1062 /* copy in variable bits */ 1063 hnputs(h->tcplen, hdrlen + dlen); 1064 hnputl(h->tcpseq, tcph->seq); 1065 hnputl(h->tcpack, tcph->ack); 1066 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1067 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? 
tcb->snd.scale : 0)); 1068 hnputs(h->tcpurg, tcph->urg); 1069 1070 if(tcph->flags & SYN){ 1071 opt = h->tcpopt; 1072 if(tcph->mss != 0){ 1073 *opt++ = MSSOPT; 1074 *opt++ = MSS_LENGTH; 1075 hnputs(opt, tcph->mss); 1076 opt += 2; 1077 } 1078 if(tcph->ws != 0){ 1079 *opt++ = WSOPT; 1080 *opt++ = WS_LENGTH; 1081 *opt++ = tcph->ws; 1082 } 1083 while(optpad-- > 0) 1084 *opt++ = NOOPOPT; 1085 } 1086 1087 if(tcb != nil && tcb->nochecksum){ 1088 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1089 } else { 1090 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); 1091 hnputs(h->tcpcksum, csum); 1092 } 1093 1094 return data; 1095 } 1096 1097 int 1098 ntohtcp6(Tcp *tcph, Block **bpp) 1099 { 1100 Tcp6hdr *h; 1101 uchar *optr; 1102 ushort hdrlen; 1103 ushort optlen; 1104 int n; 1105 1106 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); 1107 if(*bpp == nil) 1108 return -1; 1109 1110 h = (Tcp6hdr *)((*bpp)->rp); 1111 tcph->source = nhgets(h->tcpsport); 1112 tcph->dest = nhgets(h->tcpdport); 1113 tcph->seq = nhgetl(h->tcpseq); 1114 tcph->ack = nhgetl(h->tcpack); 1115 hdrlen = (h->tcpflag[0]>>2) & ~3; 1116 if(hdrlen < TCP6_HDRSIZE) { 1117 freeblist(*bpp); 1118 return -1; 1119 } 1120 1121 tcph->flags = h->tcpflag[1]; 1122 tcph->wnd = nhgets(h->tcpwin); 1123 tcph->urg = nhgets(h->tcpurg); 1124 tcph->mss = 0; 1125 tcph->ws = 0; 1126 tcph->len = nhgets(h->ploadlen) - hdrlen; 1127 1128 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); 1129 if(*bpp == nil) 1130 return -1; 1131 1132 optr = h->tcpopt; 1133 n = hdrlen - TCP6_HDRSIZE; 1134 while(n > 0 && *optr != EOLOPT) { 1135 if(*optr == NOOPOPT) { 1136 n--; 1137 optr++; 1138 continue; 1139 } 1140 optlen = optr[1]; 1141 if(optlen < 2 || optlen > n) 1142 break; 1143 switch(*optr) { 1144 case MSSOPT: 1145 if(optlen == MSS_LENGTH) 1146 tcph->mss = nhgets(optr+2); 1147 break; 1148 case WSOPT: 1149 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1150 tcph->ws = HaveWS | *(optr+2); 1151 break; 1152 } 1153 n -= optlen; 1154 optr += optlen; 1155 } 1156 return 
hdrlen; 1157 } 1158 1159 int 1160 ntohtcp4(Tcp *tcph, Block **bpp) 1161 { 1162 Tcp4hdr *h; 1163 uchar *optr; 1164 ushort hdrlen; 1165 ushort optlen; 1166 int n; 1167 1168 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); 1169 if(*bpp == nil) 1170 return -1; 1171 1172 h = (Tcp4hdr *)((*bpp)->rp); 1173 tcph->source = nhgets(h->tcpsport); 1174 tcph->dest = nhgets(h->tcpdport); 1175 tcph->seq = nhgetl(h->tcpseq); 1176 tcph->ack = nhgetl(h->tcpack); 1177 1178 hdrlen = (h->tcpflag[0]>>2) & ~3; 1179 if(hdrlen < TCP4_HDRSIZE) { 1180 freeblist(*bpp); 1181 return -1; 1182 } 1183 1184 tcph->flags = h->tcpflag[1]; 1185 tcph->wnd = nhgets(h->tcpwin); 1186 tcph->urg = nhgets(h->tcpurg); 1187 tcph->mss = 0; 1188 tcph->ws = 0; 1189 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); 1190 1191 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); 1192 if(*bpp == nil) 1193 return -1; 1194 1195 optr = h->tcpopt; 1196 n = hdrlen - TCP4_HDRSIZE; 1197 while(n > 0 && *optr != EOLOPT) { 1198 if(*optr == NOOPOPT) { 1199 n--; 1200 optr++; 1201 continue; 1202 } 1203 optlen = optr[1]; 1204 if(optlen < 2 || optlen > n) 1205 break; 1206 switch(*optr) { 1207 case MSSOPT: 1208 if(optlen == MSS_LENGTH) { 1209 tcph->mss = nhgets(optr+2); 1210 // print("new incoming mss %d\n", tcph->mss); 1211 } 1212 break; 1213 case WSOPT: 1214 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1215 tcph->ws = HaveWS | *(optr+2); 1216 break; 1217 } 1218 n -= optlen; 1219 optr += optlen; 1220 } 1221 return hdrlen; 1222 } 1223 1224 /* 1225 * For outgiing calls, generate an initial sequence 1226 * number and put a SYN on the send queue 1227 */ 1228 void 1229 tcpsndsyn(Conv *s, Tcpctl *tcb) 1230 { 1231 Tcppriv *tpriv; 1232 1233 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1234 tcb->rttseq = tcb->iss; 1235 tcb->snd.wl2 = tcb->iss; 1236 tcb->snd.una = tcb->iss; 1237 tcb->snd.ptr = tcb->rttseq; 1238 tcb->snd.nxt = tcb->rttseq; 1239 tcb->flgcnt++; 1240 tcb->flags |= FORCE; 1241 tcb->sndsyntime = NOW; 1242 1243 /* set desired mss and scale */ 
1244 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); 1245 tpriv = s->p->priv; 1246 tpriv->stats[Mss] = tcb->mss; 1247 } 1248 1249 void 1250 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) 1251 { 1252 Block *hbp; 1253 uchar rflags; 1254 Tcppriv *tpriv; 1255 Tcp4hdr ph4; 1256 Tcp6hdr ph6; 1257 1258 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason); 1259 1260 tpriv = tcp->priv; 1261 1262 if(seg->flags & RST) 1263 return; 1264 1265 /* make pseudo header */ 1266 switch(version) { 1267 case V4: 1268 memset(&ph4, 0, sizeof(ph4)); 1269 ph4.vihl = IP_VER4; 1270 v6tov4(ph4.tcpsrc, dest); 1271 v6tov4(ph4.tcpdst, source); 1272 ph4.proto = IP_TCPPROTO; 1273 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1274 hnputs(ph4.tcpsport, seg->dest); 1275 hnputs(ph4.tcpdport, seg->source); 1276 break; 1277 case V6: 1278 memset(&ph6, 0, sizeof(ph6)); 1279 ph6.vcf[0] = IP_VER6; 1280 ipmove(ph6.tcpsrc, dest); 1281 ipmove(ph6.tcpdst, source); 1282 ph6.proto = IP_TCPPROTO; 1283 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1284 hnputs(ph6.tcpsport, seg->dest); 1285 hnputs(ph6.tcpdport, seg->source); 1286 break; 1287 default: 1288 panic("sndrst: version %d", version); 1289 } 1290 1291 tpriv->stats[OutRsts]++; 1292 rflags = RST; 1293 1294 /* convince the other end that this reset is in band */ 1295 if(seg->flags & ACK) { 1296 seg->seq = seg->ack; 1297 seg->ack = 0; 1298 } 1299 else { 1300 rflags |= ACK; 1301 seg->ack = seg->seq; 1302 seg->seq = 0; 1303 if(seg->flags & SYN) 1304 seg->ack++; 1305 seg->ack += length; 1306 if(seg->flags & FIN) 1307 seg->ack++; 1308 } 1309 seg->flags = rflags; 1310 seg->wnd = 0; 1311 seg->urg = 0; 1312 seg->mss = 0; 1313 seg->ws = 0; 1314 switch(version) { 1315 case V4: 1316 hbp = htontcp4(seg, nil, &ph4, nil); 1317 if(hbp == nil) 1318 return; 1319 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1320 break; 1321 case V6: 1322 hbp = htontcp6(seg, nil, &ph6, nil); 1323 if(hbp == nil) 1324 return; 1325 ipoput6(tcp->f, hbp, 0, 
MAXTTL, DFLTTOS, nil); 1326 break; 1327 default: 1328 panic("sndrst2: version %d", version); 1329 } 1330 } 1331 1332 /* 1333 * send a reset to the remote side and close the conversation 1334 * called with s qlocked 1335 */ 1336 char* 1337 tcphangup(Conv *s) 1338 { 1339 Tcp seg; 1340 Tcpctl *tcb; 1341 Block *hbp; 1342 1343 tcb = (Tcpctl*)s->ptcl; 1344 if(waserror()) 1345 return commonerror(); 1346 if(ipcmp(s->raddr, IPnoaddr) != 0) { 1347 if(!waserror()){ 1348 seg.flags = RST | ACK; 1349 seg.ack = tcb->rcv.nxt; 1350 tcb->rcv.una = 0; 1351 seg.seq = tcb->snd.ptr; 1352 seg.wnd = 0; 1353 seg.urg = 0; 1354 seg.mss = 0; 1355 seg.ws = 0; 1356 switch(s->ipversion) { 1357 case V4: 1358 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 1359 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); 1360 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 1361 break; 1362 case V6: 1363 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 1364 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); 1365 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 1366 break; 1367 default: 1368 panic("tcphangup: version %d", s->ipversion); 1369 } 1370 poperror(); 1371 } 1372 } 1373 localclose(s, nil); 1374 poperror(); 1375 return nil; 1376 } 1377 1378 /* 1379 * (re)send a SYN ACK 1380 */ 1381 int 1382 sndsynack(Proto *tcp, Limbo *lp) 1383 { 1384 Block *hbp; 1385 Tcp4hdr ph4; 1386 Tcp6hdr ph6; 1387 Tcp seg; 1388 int scale; 1389 1390 /* make pseudo header */ 1391 switch(lp->version) { 1392 case V4: 1393 memset(&ph4, 0, sizeof(ph4)); 1394 ph4.vihl = IP_VER4; 1395 v6tov4(ph4.tcpsrc, lp->laddr); 1396 v6tov4(ph4.tcpdst, lp->raddr); 1397 ph4.proto = IP_TCPPROTO; 1398 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1399 hnputs(ph4.tcpsport, lp->lport); 1400 hnputs(ph4.tcpdport, lp->rport); 1401 break; 1402 case V6: 1403 memset(&ph6, 0, sizeof(ph6)); 1404 ph6.vcf[0] = IP_VER6; 1405 ipmove(ph6.tcpsrc, lp->laddr); 1406 ipmove(ph6.tcpdst, lp->raddr); 1407 ph6.proto = IP_TCPPROTO; 1408 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1409 hnputs(ph6.tcpsport, 
lp->lport); 1410 hnputs(ph6.tcpdport, lp->rport); 1411 break; 1412 default: 1413 panic("sndrst: version %d", lp->version); 1414 } 1415 1416 seg.seq = lp->iss; 1417 seg.ack = lp->irs+1; 1418 seg.flags = SYN|ACK; 1419 seg.urg = 0; 1420 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); 1421 // if (seg.mss > lp->mss && lp->mss >= 512) 1422 // seg.mss = lp->mss; 1423 seg.wnd = QMAX; 1424 1425 /* if the other side set scale, we should too */ 1426 if(lp->rcvscale){ 1427 seg.ws = scale; 1428 lp->sndscale = scale; 1429 } else { 1430 seg.ws = 0; 1431 lp->sndscale = 0; 1432 } 1433 1434 switch(lp->version) { 1435 case V4: 1436 hbp = htontcp4(&seg, nil, &ph4, nil); 1437 if(hbp == nil) 1438 return -1; 1439 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1440 break; 1441 case V6: 1442 hbp = htontcp6(&seg, nil, &ph6, nil); 1443 if(hbp == nil) 1444 return -1; 1445 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1446 break; 1447 default: 1448 panic("sndsnack: version %d", lp->version); 1449 } 1450 lp->lastsend = NOW; 1451 return 0; 1452 } 1453 1454 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) 1455 1456 /* 1457 * put a call into limbo and respond with a SYN ACK 1458 * 1459 * called with proto locked 1460 */ 1461 static void 1462 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) 1463 { 1464 Limbo *lp, **l; 1465 Tcppriv *tpriv; 1466 int h; 1467 1468 tpriv = s->p->priv; 1469 h = hashipa(source, seg->source); 1470 1471 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1472 lp = *l; 1473 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) 1474 continue; 1475 if(ipcmp(lp->raddr, source) != 0) 1476 continue; 1477 if(ipcmp(lp->laddr, dest) != 0) 1478 continue; 1479 1480 /* each new SYN restarts the retransmits */ 1481 lp->irs = seg->seq; 1482 break; 1483 } 1484 lp = *l; 1485 if(lp == nil){ 1486 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ 1487 lp = tpriv->lht[h]; 1488 tpriv->lht[h] = lp->next; 1489 lp->next 
= nil; 1490 } else { 1491 lp = malloc(sizeof(*lp)); 1492 if(lp == nil) 1493 return; 1494 tpriv->nlimbo++; 1495 } 1496 *l = lp; 1497 lp->version = version; 1498 ipmove(lp->laddr, dest); 1499 ipmove(lp->raddr, source); 1500 lp->lport = seg->dest; 1501 lp->rport = seg->source; 1502 lp->mss = seg->mss; 1503 lp->rcvscale = seg->ws; 1504 lp->irs = seg->seq; 1505 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1506 } 1507 1508 if(sndsynack(s->p, lp) < 0){ 1509 *l = lp->next; 1510 tpriv->nlimbo--; 1511 free(lp); 1512 } 1513 } 1514 1515 /* 1516 * resend SYN ACK's once every SYNACK_RXTIMER ms. 1517 */ 1518 static void 1519 limborexmit(Proto *tcp) 1520 { 1521 Tcppriv *tpriv; 1522 Limbo **l, *lp; 1523 int h; 1524 int seen; 1525 ulong now; 1526 1527 tpriv = tcp->priv; 1528 1529 if(!canqlock(tcp)) 1530 return; 1531 seen = 0; 1532 now = NOW; 1533 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ 1534 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ 1535 lp = *l; 1536 seen++; 1537 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) 1538 continue; 1539 1540 /* time it out after 1 second */ 1541 if(++(lp->rexmits) > 5){ 1542 tpriv->nlimbo--; 1543 *l = lp->next; 1544 free(lp); 1545 continue; 1546 } 1547 1548 /* if we're being attacked, don't bother resending SYN ACK's */ 1549 if(tpriv->nlimbo > 100) 1550 continue; 1551 1552 if(sndsynack(tcp, lp) < 0){ 1553 tpriv->nlimbo--; 1554 *l = lp->next; 1555 free(lp); 1556 continue; 1557 } 1558 1559 l = &lp->next; 1560 } 1561 } 1562 qunlock(tcp); 1563 } 1564 1565 /* 1566 * lookup call in limbo. if found, throw it out. 
1567 * 1568 * called with proto locked 1569 */ 1570 static void 1571 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1572 { 1573 Limbo *lp, **l; 1574 int h; 1575 Tcppriv *tpriv; 1576 1577 tpriv = s->p->priv; 1578 1579 /* find a call in limbo */ 1580 h = hashipa(src, segp->source); 1581 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1582 lp = *l; 1583 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1584 continue; 1585 if(ipcmp(lp->laddr, dst) != 0) 1586 continue; 1587 if(ipcmp(lp->raddr, src) != 0) 1588 continue; 1589 1590 /* RST can only follow the SYN */ 1591 if(segp->seq == lp->irs+1){ 1592 tpriv->nlimbo--; 1593 *l = lp->next; 1594 free(lp); 1595 } 1596 break; 1597 } 1598 } 1599 1600 /* 1601 * come here when we finally get an ACK to our SYN-ACK. 1602 * lookup call in limbo. if found, create a new conversation 1603 * 1604 * called with proto locked 1605 */ 1606 static Conv* 1607 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1608 { 1609 Conv *new; 1610 Tcpctl *tcb; 1611 Tcppriv *tpriv; 1612 Tcp4hdr *h4; 1613 Tcp6hdr *h6; 1614 Limbo *lp, **l; 1615 int h; 1616 1617 /* unless it's just an ack, it can't be someone coming out of limbo */ 1618 if((segp->flags & SYN) || (segp->flags & ACK) == 0) 1619 return nil; 1620 1621 tpriv = s->p->priv; 1622 1623 /* find a call in limbo */ 1624 h = hashipa(src, segp->source); 1625 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ 1626 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n", 1627 src, segp->source, lp->raddr, lp->rport, 1628 dst, segp->dest, lp->laddr, lp->lport, 1629 version, lp->version 1630 ); 1631 1632 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1633 continue; 1634 if(ipcmp(lp->laddr, dst) != 0) 1635 continue; 1636 if(ipcmp(lp->raddr, src) != 0) 1637 continue; 1638 1639 /* we're assuming no data with the initial SYN */ 1640 if(segp->seq != lp->irs+1 || segp->ack 
!= lp->iss+1){ 1641 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n", 1642 segp->seq, lp->irs+1, segp->ack, lp->iss+1); 1643 lp = nil; 1644 } else { 1645 tpriv->nlimbo--; 1646 *l = lp->next; 1647 } 1648 break; 1649 } 1650 if(lp == nil) 1651 return nil; 1652 1653 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); 1654 if(new == nil) 1655 return nil; 1656 1657 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); 1658 tcb = (Tcpctl*)new->ptcl; 1659 tcb->flags &= ~CLONE; 1660 tcb->timer.arg = new; 1661 tcb->timer.state = TcptimerOFF; 1662 tcb->acktimer.arg = new; 1663 tcb->acktimer.state = TcptimerOFF; 1664 tcb->katimer.arg = new; 1665 tcb->katimer.state = TcptimerOFF; 1666 tcb->rtt_timer.arg = new; 1667 tcb->rtt_timer.state = TcptimerOFF; 1668 1669 tcb->irs = lp->irs; 1670 tcb->rcv.nxt = tcb->irs+1; 1671 tcb->rcv.urg = tcb->rcv.nxt; 1672 1673 tcb->iss = lp->iss; 1674 tcb->rttseq = tcb->iss; 1675 tcb->snd.wl2 = tcb->iss; 1676 tcb->snd.una = tcb->iss+1; 1677 tcb->snd.ptr = tcb->iss+1; 1678 tcb->snd.nxt = tcb->iss+1; 1679 tcb->flgcnt = 0; 1680 tcb->flags |= SYNACK; 1681 1682 /* our sending max segment size cannot be bigger than what he asked for */ 1683 if(lp->mss != 0 && lp->mss < tcb->mss) { 1684 tcb->mss = lp->mss; 1685 tpriv->stats[Mss] = tcb->mss; 1686 } 1687 1688 /* window scaling */ 1689 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); 1690 1691 /* the congestion window always starts out as a single segment */ 1692 tcb->snd.wnd = segp->wnd; 1693 tcb->cwind = tcb->mss; 1694 1695 /* set initial round trip time */ 1696 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; 1697 tcpsynackrtt(new); 1698 1699 free(lp); 1700 1701 /* set up proto header */ 1702 switch(version){ 1703 case V4: 1704 h4 = &tcb->protohdr.tcp4hdr; 1705 memset(h4, 0, sizeof(*h4)); 1706 h4->proto = IP_TCPPROTO; 1707 hnputs(h4->tcpsport, new->lport); 1708 hnputs(h4->tcpdport, new->rport); 1709 v6tov4(h4->tcpsrc, dst); 1710 v6tov4(h4->tcpdst, src); 1711 break; 1712 case 
V6: 1713 h6 = &tcb->protohdr.tcp6hdr; 1714 memset(h6, 0, sizeof(*h6)); 1715 h6->proto = IP_TCPPROTO; 1716 hnputs(h6->tcpsport, new->lport); 1717 hnputs(h6->tcpdport, new->rport); 1718 ipmove(h6->tcpsrc, dst); 1719 ipmove(h6->tcpdst, src); 1720 break; 1721 default: 1722 panic("tcpincoming: version %d", new->ipversion); 1723 } 1724 1725 tcpsetstate(new, Established); 1726 1727 iphtadd(&tpriv->ht, new); 1728 1729 return new; 1730 } 1731 1732 int 1733 seq_within(ulong x, ulong low, ulong high) 1734 { 1735 if(low <= high){ 1736 if(low <= x && x <= high) 1737 return 1; 1738 } 1739 else { 1740 if(x >= low || x <= high) 1741 return 1; 1742 } 1743 return 0; 1744 } 1745 1746 int 1747 seq_lt(ulong x, ulong y) 1748 { 1749 return (int)(x-y) < 0; 1750 } 1751 1752 int 1753 seq_le(ulong x, ulong y) 1754 { 1755 return (int)(x-y) <= 0; 1756 } 1757 1758 int 1759 seq_gt(ulong x, ulong y) 1760 { 1761 return (int)(x-y) > 0; 1762 } 1763 1764 int 1765 seq_ge(ulong x, ulong y) 1766 { 1767 return (int)(x-y) >= 0; 1768 } 1769 1770 /* 1771 * use the time between the first SYN and it's ack as the 1772 * initial round trip time 1773 */ 1774 void 1775 tcpsynackrtt(Conv *s) 1776 { 1777 Tcpctl *tcb; 1778 int delta; 1779 Tcppriv *tpriv; 1780 1781 tcb = (Tcpctl*)s->ptcl; 1782 tpriv = s->p->priv; 1783 1784 delta = NOW - tcb->sndsyntime; 1785 tcb->srtt = delta<<LOGAGAIN; 1786 tcb->mdev = delta<<LOGDGAIN; 1787 1788 /* halt round trip timer */ 1789 tcphalt(tpriv, &tcb->rtt_timer); 1790 } 1791 1792 void 1793 update(Conv *s, Tcp *seg) 1794 { 1795 int rtt, delta; 1796 Tcpctl *tcb; 1797 ulong acked; 1798 ulong expand; 1799 Tcppriv *tpriv; 1800 1801 tpriv = s->p->priv; 1802 tcb = (Tcpctl*)s->ptcl; 1803 1804 /* if everything has been acked, force output(?) 
*/ 1805 if(seq_gt(seg->ack, tcb->snd.nxt)) { 1806 tcb->flags |= FORCE; 1807 return; 1808 } 1809 1810 /* added by Dong Lin for fast retransmission */ 1811 if(seg->ack == tcb->snd.una 1812 && tcb->snd.una != tcb->snd.nxt 1813 && seg->len == 0 1814 && seg->wnd == tcb->snd.wnd) { 1815 1816 /* this is a pure ack w/o window update */ 1817 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n", 1818 tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); 1819 1820 if(++tcb->snd.dupacks == TCPREXMTTHRESH) { 1821 /* 1822 * tahoe tcp rxt the packet, half sshthresh, 1823 * and set cwnd to one packet 1824 */ 1825 tcb->snd.recovery = 1; 1826 tcb->snd.rxt = tcb->snd.nxt; 1827 netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); 1828 tcprxmit(s); 1829 } else { 1830 /* do reno tcp here. */ 1831 } 1832 } 1833 1834 /* 1835 * update window 1836 */ 1837 if(seq_gt(seg->ack, tcb->snd.wl2) 1838 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ 1839 tcb->snd.wnd = seg->wnd; 1840 tcb->snd.wl2 = seg->ack; 1841 } 1842 1843 if(!seq_gt(seg->ack, tcb->snd.una)){ 1844 /* 1845 * don't let us hangup if sending into a closed window and 1846 * we're still getting acks 1847 */ 1848 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ 1849 tcb->backedoff = MAXBACKMS/4; 1850 } 1851 return; 1852 } 1853 1854 /* 1855 * any positive ack turns off fast rxt, 1856 * (should we do new-reno on partial acks?) 
1857 */ 1858 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { 1859 tcb->snd.dupacks = 0; 1860 tcb->snd.recovery = 0; 1861 } else 1862 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind); 1863 1864 /* Compute the new send window size */ 1865 acked = seg->ack - tcb->snd.una; 1866 1867 /* avoid slow start and timers for SYN acks */ 1868 if((tcb->flags & SYNACK) == 0) { 1869 tcb->flags |= SYNACK; 1870 acked--; 1871 tcb->flgcnt--; 1872 goto done; 1873 } 1874 1875 /* slow start as long as we're not recovering from lost packets */ 1876 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { 1877 if(tcb->cwind < tcb->ssthresh) { 1878 expand = tcb->mss; 1879 if(acked < expand) 1880 expand = acked; 1881 } 1882 else 1883 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; 1884 1885 if(tcb->cwind + expand < tcb->cwind) 1886 expand = tcb->snd.wnd - tcb->cwind; 1887 if(tcb->cwind + expand > tcb->snd.wnd) 1888 expand = tcb->snd.wnd - tcb->cwind; 1889 tcb->cwind += expand; 1890 } 1891 1892 /* Adjust the timers according to the round trip time */ 1893 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { 1894 tcphalt(tpriv, &tcb->rtt_timer); 1895 if((tcb->flags&RETRAN) == 0) { 1896 tcb->backoff = 0; 1897 tcb->backedoff = 0; 1898 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; 1899 if(rtt == 0) 1900 rtt = 1; /* otherwise all close systems will rexmit in 0 time */ 1901 rtt *= MSPTICK; 1902 if(tcb->srtt == 0) { 1903 tcb->srtt = rtt << LOGAGAIN; 1904 tcb->mdev = rtt << LOGDGAIN; 1905 } else { 1906 delta = rtt - (tcb->srtt>>LOGAGAIN); 1907 tcb->srtt += delta; 1908 if(tcb->srtt <= 0) 1909 tcb->srtt = 1; 1910 1911 delta = abs(delta) - (tcb->mdev>>LOGDGAIN); 1912 tcb->mdev += delta; 1913 if(tcb->mdev <= 0) 1914 tcb->mdev = 1; 1915 } 1916 tcpsettimer(tcb); 1917 } 1918 } 1919 1920 done: 1921 if(qdiscard(s->wq, acked) < acked) 1922 tcb->flgcnt--; 1923 1924 tcb->snd.una = seg->ack; 1925 if(seq_gt(seg->ack, tcb->snd.urg)) 1926 tcb->snd.urg = 
seg->ack; 1927 1928 if(tcb->snd.una != tcb->snd.nxt) 1929 tcpgo(tpriv, &tcb->timer); 1930 else 1931 tcphalt(tpriv, &tcb->timer); 1932 1933 if(seq_lt(tcb->snd.ptr, tcb->snd.una)) 1934 tcb->snd.ptr = tcb->snd.una; 1935 1936 tcb->flags &= ~RETRAN; 1937 tcb->backoff = 0; 1938 tcb->backedoff = 0; 1939 } 1940 1941 void 1942 tcpiput(Proto *tcp, Ipifc*, Block *bp) 1943 { 1944 Tcp seg; 1945 Tcp4hdr *h4; 1946 Tcp6hdr *h6; 1947 int hdrlen; 1948 Tcpctl *tcb; 1949 ushort length, csum; 1950 uchar source[IPaddrlen], dest[IPaddrlen]; 1951 Conv *s; 1952 Fs *f; 1953 Tcppriv *tpriv; 1954 uchar version; 1955 1956 f = tcp->f; 1957 tpriv = tcp->priv; 1958 1959 tpriv->stats[InSegs]++; 1960 1961 h4 = (Tcp4hdr*)(bp->rp); 1962 h6 = (Tcp6hdr*)(bp->rp); 1963 1964 if((h4->vihl&0xF0)==IP_VER4) { 1965 version = V4; 1966 length = nhgets(h4->length); 1967 v4tov6(dest, h4->tcpdst); 1968 v4tov6(source, h4->tcpsrc); 1969 1970 h4->Unused = 0; 1971 hnputs(h4->tcplen, length-TCP4_PKT); 1972 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && 1973 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) { 1974 tpriv->stats[CsumErrs]++; 1975 tpriv->stats[InErrs]++; 1976 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1977 freeblist(bp); 1978 return; 1979 } 1980 1981 hdrlen = ntohtcp4(&seg, &bp); 1982 if(hdrlen < 0){ 1983 tpriv->stats[HlenErrs]++; 1984 tpriv->stats[InErrs]++; 1985 netlog(f, Logtcp, "bad tcp hdr len\n"); 1986 return; 1987 } 1988 1989 /* trim the packet to the size claimed by the datagram */ 1990 length -= hdrlen+TCP4_PKT; 1991 bp = trimblock(bp, hdrlen+TCP4_PKT, length); 1992 if(bp == nil){ 1993 tpriv->stats[LenErrs]++; 1994 tpriv->stats[InErrs]++; 1995 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 1996 return; 1997 } 1998 } 1999 else { 2000 int ttl = h6->ttl; 2001 int proto = h6->proto; 2002 2003 version = V6; 2004 length = nhgets(h6->ploadlen); 2005 ipmove(dest, h6->tcpdst); 2006 ipmove(source, h6->tcpsrc); 2007 2008 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; 2009 h6->ttl = proto; 
2010 hnputl(h6->vcf, length); 2011 if((h6->tcpcksum[0] || h6->tcpcksum[1]) && 2012 (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) { 2013 tpriv->stats[CsumErrs]++; 2014 tpriv->stats[InErrs]++; 2015 netlog(f, Logtcp, 2016 "bad tcpv6 proto cksum: got %#ux, computed %#ux\n", 2017 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum); 2018 freeblist(bp); 2019 return; 2020 } 2021 h6->ttl = ttl; 2022 h6->proto = proto; 2023 hnputs(h6->ploadlen, length); 2024 2025 hdrlen = ntohtcp6(&seg, &bp); 2026 if(hdrlen < 0){ 2027 tpriv->stats[HlenErrs]++; 2028 tpriv->stats[InErrs]++; 2029 netlog(f, Logtcp, "bad tcpv6 hdr len\n"); 2030 return; 2031 } 2032 2033 /* trim the packet to the size claimed by the datagram */ 2034 length -= hdrlen; 2035 bp = trimblock(bp, hdrlen+TCP6_PKT, length); 2036 if(bp == nil){ 2037 tpriv->stats[LenErrs]++; 2038 tpriv->stats[InErrs]++; 2039 netlog(f, Logtcp, "tcpv6 len < 0 after trim\n"); 2040 return; 2041 } 2042 } 2043 2044 /* lock protocol while searching for a conversation */ 2045 qlock(tcp); 2046 2047 /* Look for a matching conversation */ 2048 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); 2049 if(s == nil){ 2050 netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n", 2051 source, seg.source, dest, seg.dest); 2052 reset: 2053 qunlock(tcp); 2054 sndrst(tcp, source, dest, length, &seg, version, "no conversation"); 2055 freeblist(bp); 2056 return; 2057 } 2058 2059 /* if it's a listener, look for the right flags and get a new conv */ 2060 tcb = (Tcpctl*)s->ptcl; 2061 if(tcb->state == Listen){ 2062 if(seg.flags & RST){ 2063 limborst(s, &seg, source, dest, version); 2064 qunlock(tcp); 2065 freeblist(bp); 2066 return; 2067 } 2068 2069 /* if this is a new SYN, put the call into limbo */ 2070 if((seg.flags & SYN) && (seg.flags & ACK) == 0){ 2071 limbo(s, source, dest, &seg, version); 2072 qunlock(tcp); 2073 freeblist(bp); 2074 return; 2075 } 2076 2077 /* 2078 * if there's a matching call in limbo, tcpincoming will 2079 * return it in 
state Syn_received 2080 */ 2081 s = tcpincoming(s, &seg, source, dest, version); 2082 if(s == nil) 2083 goto reset; 2084 } 2085 2086 /* The rest of the input state machine is run with the control block 2087 * locked and implements the state machine directly out of the RFC. 2088 * Out-of-band data is ignored - it was always a bad idea. 2089 */ 2090 tcb = (Tcpctl*)s->ptcl; 2091 if(waserror()){ 2092 qunlock(s); 2093 nexterror(); 2094 } 2095 qlock(s); 2096 qunlock(tcp); 2097 2098 /* fix up window */ 2099 seg.wnd <<= tcb->rcv.scale; 2100 2101 /* every input packet in puts off the keep alive time out */ 2102 tcpsetkacounter(tcb); 2103 2104 switch(tcb->state) { 2105 case Closed: 2106 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2107 goto raise; 2108 case Syn_sent: 2109 if(seg.flags & ACK) { 2110 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) { 2111 sndrst(tcp, source, dest, length, &seg, version, 2112 "bad seq in Syn_sent"); 2113 goto raise; 2114 } 2115 } 2116 if(seg.flags & RST) { 2117 if(seg.flags & ACK) 2118 localclose(s, Econrefused); 2119 goto raise; 2120 } 2121 2122 if(seg.flags & SYN) { 2123 procsyn(s, &seg); 2124 if(seg.flags & ACK){ 2125 update(s, &seg); 2126 tcpsynackrtt(s); 2127 tcpsetstate(s, Established); 2128 tcpsetscale(s, tcb, seg.ws, tcb->scale); 2129 } 2130 else { 2131 tcb->time = NOW; 2132 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */ 2133 } 2134 2135 if(length != 0 || (seg.flags & FIN)) 2136 break; 2137 2138 freeblist(bp); 2139 goto output; 2140 } 2141 else 2142 freeblist(bp); 2143 2144 qunlock(s); 2145 poperror(); 2146 return; 2147 case Syn_received: 2148 /* doesn't matter if it's the correct ack, we're just trying to set timing */ 2149 if(seg.flags & ACK) 2150 tcpsynackrtt(s); 2151 break; 2152 } 2153 2154 /* 2155 * One DOS attack is to open connections to us and then forget about them, 2156 * thereby tying up a conv at no long term cost to the attacker. 
2157 * This is an attempt to defeat these stateless DOS attacks. See 2158 * corresponding code in tcpsendka(). 2159 */ 2160 if(tcb->state != Syn_received && (seg.flags & RST) == 0){ 2161 if(tcpporthogdefense 2162 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ 2163 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", 2164 source, seg.source, dest, seg.dest, seg.flags, 2165 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); 2166 localclose(s, "stateless hog"); 2167 } 2168 } 2169 2170 /* Cut the data to fit the receive window */ 2171 if(tcptrim(tcb, &seg, &bp, &length) == -1) { 2172 netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud\n", 2173 seg.seq, seg.seq + length - 1, 2174 tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1); 2175 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); 2176 update(s, &seg); 2177 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { 2178 tcphalt(tpriv, &tcb->rtt_timer); 2179 tcphalt(tpriv, &tcb->acktimer); 2180 tcphalt(tpriv, &tcb->katimer); 2181 tcpsetstate(s, Time_wait); 2182 tcb->timer.start = MSL2*(1000 / MSPTICK); 2183 tcpgo(tpriv, &tcb->timer); 2184 } 2185 if(!(seg.flags & RST)) { 2186 tcb->flags |= FORCE; 2187 goto output; 2188 } 2189 qunlock(s); 2190 poperror(); 2191 return; 2192 } 2193 2194 /* Cannot accept so answer with a rst */ 2195 if(length && tcb->state == Closed) { 2196 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2197 goto raise; 2198 } 2199 2200 /* The segment is beyond the current receive pointer so 2201 * queue the data in the resequence queue 2202 */ 2203 if(seg.seq != tcb->rcv.nxt) 2204 if(length != 0 || (seg.flags & (SYN|FIN))) { 2205 update(s, &seg); 2206 if(addreseq(tcb, tpriv, &seg, bp, length) < 0) 2207 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); 2208 tcb->flags |= FORCE; 2209 goto output; 2210 } 2211 2212 /* 2213 * keep looping till we've processed this packet plus any 2214 * adjacent packets in 
the resequence queue 2215 */ 2216 for(;;) { 2217 if(seg.flags & RST) { 2218 if(tcb->state == Established) { 2219 tpriv->stats[EstabResets]++; 2220 if(tcb->rcv.nxt != seg.seq) 2221 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); 2222 } 2223 localclose(s, Econrefused); 2224 goto raise; 2225 } 2226 2227 if((seg.flags&ACK) == 0) 2228 goto raise; 2229 2230 switch(tcb->state) { 2231 case Syn_received: 2232 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){ 2233 sndrst(tcp, source, dest, length, &seg, version, 2234 "bad seq in Syn_received"); 2235 goto raise; 2236 } 2237 update(s, &seg); 2238 tcpsetstate(s, Established); 2239 case Established: 2240 case Close_wait: 2241 update(s, &seg); 2242 break; 2243 case Finwait1: 2244 update(s, &seg); 2245 if(qlen(s->wq)+tcb->flgcnt == 0){ 2246 tcphalt(tpriv, &tcb->rtt_timer); 2247 tcphalt(tpriv, &tcb->acktimer); 2248 tcpsetkacounter(tcb); 2249 tcb->time = NOW; 2250 tcpsetstate(s, Finwait2); 2251 tcb->katimer.start = MSL2 * (1000 / MSPTICK); 2252 tcpgo(tpriv, &tcb->katimer); 2253 } 2254 break; 2255 case Finwait2: 2256 update(s, &seg); 2257 break; 2258 case Closing: 2259 update(s, &seg); 2260 if(qlen(s->wq)+tcb->flgcnt == 0) { 2261 tcphalt(tpriv, &tcb->rtt_timer); 2262 tcphalt(tpriv, &tcb->acktimer); 2263 tcphalt(tpriv, &tcb->katimer); 2264 tcpsetstate(s, Time_wait); 2265 tcb->timer.start = MSL2*(1000 / MSPTICK); 2266 tcpgo(tpriv, &tcb->timer); 2267 } 2268 break; 2269 case Last_ack: 2270 update(s, &seg); 2271 if(qlen(s->wq)+tcb->flgcnt == 0) { 2272 localclose(s, nil); 2273 goto raise; 2274 } 2275 case Time_wait: 2276 tcb->flags |= FORCE; 2277 if(tcb->timer.state != TcptimerON) 2278 tcpgo(tpriv, &tcb->timer); 2279 } 2280 2281 if((seg.flags&URG) && seg.urg) { 2282 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { 2283 tcb->rcv.urg = seg.urg + seg.seq; 2284 pullblock(&bp, seg.urg); 2285 } 2286 } 2287 else 2288 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) 
2289 tcb->rcv.urg = tcb->rcv.nxt; 2290 2291 if(length == 0) { 2292 if(bp != nil) 2293 freeblist(bp); 2294 } 2295 else { 2296 switch(tcb->state){ 2297 default: 2298 /* Ignore segment text */ 2299 if(bp != nil) 2300 freeblist(bp); 2301 break; 2302 2303 case Syn_received: 2304 case Established: 2305 case Finwait1: 2306 /* If we still have some data place on 2307 * receive queue 2308 */ 2309 if(bp) { 2310 bp = packblock(bp); 2311 if(bp == nil) 2312 panic("tcp packblock"); 2313 qpassnolim(s->rq, bp); 2314 bp = nil; 2315 2316 /* 2317 * Force an ack every 2 data messages. This is 2318 * a hack for rob to make his home system run 2319 * faster. 2320 * 2321 * this also keeps the standard TCP congestion 2322 * control working since it needs an ack every 2323 * 2 max segs worth. This is not quite that, 2324 * but under a real stream is equivalent since 2325 * every packet has a max seg in it. 2326 */ 2327 if(++(tcb->rcv.una) >= 2) 2328 tcb->flags |= FORCE; 2329 } 2330 tcb->rcv.nxt += length; 2331 2332 /* 2333 * update our rcv window 2334 */ 2335 tcprcvwin(s); 2336 2337 /* 2338 * turn on the acktimer if there's something 2339 * to ack 2340 */ 2341 if(tcb->acktimer.state != TcptimerON) 2342 tcpgo(tpriv, &tcb->acktimer); 2343 2344 break; 2345 case Finwait2: 2346 /* no process to read the data, send a reset */ 2347 if(bp != nil) 2348 freeblist(bp); 2349 sndrst(tcp, source, dest, length, &seg, version, 2350 "send to Finwait2"); 2351 qunlock(s); 2352 poperror(); 2353 return; 2354 } 2355 } 2356 2357 if(seg.flags & FIN) { 2358 tcb->flags |= FORCE; 2359 2360 switch(tcb->state) { 2361 case Syn_received: 2362 case Established: 2363 tcb->rcv.nxt++; 2364 tcpsetstate(s, Close_wait); 2365 break; 2366 case Finwait1: 2367 tcb->rcv.nxt++; 2368 if(qlen(s->wq)+tcb->flgcnt == 0) { 2369 tcphalt(tpriv, &tcb->rtt_timer); 2370 tcphalt(tpriv, &tcb->acktimer); 2371 tcphalt(tpriv, &tcb->katimer); 2372 tcpsetstate(s, Time_wait); 2373 tcb->timer.start = MSL2*(1000/MSPTICK); 2374 tcpgo(tpriv, &tcb->timer); 
2375 } 2376 else 2377 tcpsetstate(s, Closing); 2378 break; 2379 case Finwait2: 2380 tcb->rcv.nxt++; 2381 tcphalt(tpriv, &tcb->rtt_timer); 2382 tcphalt(tpriv, &tcb->acktimer); 2383 tcphalt(tpriv, &tcb->katimer); 2384 tcpsetstate(s, Time_wait); 2385 tcb->timer.start = MSL2 * (1000/MSPTICK); 2386 tcpgo(tpriv, &tcb->timer); 2387 break; 2388 case Close_wait: 2389 case Closing: 2390 case Last_ack: 2391 break; 2392 case Time_wait: 2393 tcpgo(tpriv, &tcb->timer); 2394 break; 2395 } 2396 } 2397 2398 /* 2399 * get next adjacent segment from the resequence queue. 2400 * dump/trim any overlapping segments 2401 */ 2402 for(;;) { 2403 if(tcb->reseq == nil) 2404 goto output; 2405 2406 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) 2407 goto output; 2408 2409 getreseq(tcb, &seg, &bp, &length); 2410 2411 if(tcptrim(tcb, &seg, &bp, &length) == 0) 2412 break; 2413 } 2414 } 2415 output: 2416 tcpoutput(s); 2417 qunlock(s); 2418 poperror(); 2419 return; 2420 raise: 2421 qunlock(s); 2422 poperror(); 2423 freeblist(bp); 2424 tcpkick(s); 2425 } 2426 2427 /* 2428 * always enters and exits with the s locked. We drop 2429 * the lock to ipoput the packet so some care has to be 2430 * taken by callers. 
*/
void
tcpoutput(Conv *s)
{
	Tcp seg;
	int msgs;
	Tcpctl *tcb;
	Block *hbp, *bp;
	int sndcnt, n;
	ulong ssize, dsize, usable, sent;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	/* send at most 100 segments per call */
	for(msgs = 0; msgs < 100; msgs++) {
		tcb = (Tcpctl*)s->ptcl;

		/* nothing is sent from these states */
		switch(tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* force an ack when a window has opened up */
		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/* sndcnt: queued bytes plus flgcnt; sent: already sent past snd.una */
		sndcnt = qlen(s->wq)+tcb->flgcnt;
		sent = tcb->snd.ptr - tcb->snd.una;

		/* Don't send anything else until our SYN has been acked */
		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* Compute usable segment based on offered window and limit
		 * window probes to one
		 */
		if(tcb->snd.wnd == 0){
			if(sent != 0) {
				if((tcb->flags&FORCE) == 0)
					break;
//				tcb->snd.ptr = tcb->snd.una;
			}
			usable = 1;
		}
		else {
			/* the smaller of the congestion and offered windows, less what is in flight */
			usable = tcb->cwind;
			if(tcb->snd.wnd < usable)
				usable = tcb->snd.wnd;
			usable -= sent;
		}
		ssize = sndcnt-sent;
		if(ssize && usable < 2)
			netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
				tcb->snd.wnd, tcb->cwind);
		if(usable < ssize)
			ssize = usable;
		if(tcb->mss < ssize)
			ssize = tcb->mss;
		dsize = ssize;
		seg.urg = 0;

		/* nothing to send and no forced ack pending: done */
		if(ssize == 0)
		if((tcb->flags&FORCE) == 0)
			break;

		tcb->flags &= ~FORCE;
		tcprcvwin(s);

		/* By default we will generate an ack */
		tcphalt(tpriv, &tcb->acktimer);
		tcb->rcv.una = 0;
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		switch(tcb->state){
		case Syn_sent:
			seg.flags = 0;
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize--;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		case Syn_received:
			/*
			 *  don't send any data with a SYN/ACK packet
			 *  because Linux rejects the packet in its
			 *  attempt to solve the SYN attack problem
			 */
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize = 0;
				ssize = 1;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		}
		seg.seq = tcb->snd.ptr;
		seg.ack = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd;

		/* Pull out data to send */
		bp = nil;
		if(dsize != 0) {
			bp = qcopy(s->wq, dsize, sent);
			/* the queue held less than dsize: send FIN in its place */
			if(BLEN(bp) != dsize) {
				seg.flags |= FIN;
				dsize--;
			}
		}

		if(sent+dsize == sndcnt)
			seg.flags |= PSH;

		/* keep track of balance of resent data */
		if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
			n = tcb->snd.nxt - tcb->snd.ptr;
			if(ssize < n)
				n = ssize;
			tcb->resent += n;
			netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n",
				s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
			tpriv->stats[RetransSegs]++;
		}

		tcb->snd.ptr += ssize;

		/* Pull up the send pointer so we can accept acks
		 * for this window
		 */
		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
			tcb->snd.nxt = tcb->snd.ptr;

		/* Build header, link data and compute cksum */
		switch(version){
		case V4:
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		case V6:
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		default:
			hbp = nil;	/* to suppress a warning */
			panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if(ssize != 0){
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);

			/*  If round trip timer isn't running, start it.
			 *  measure the longest packet only in case the
			 *  transmission time dominates RTT
			 */
			if(tcb->rtt_timer.state != TcptimerON)
			if(ssize == tcb->mss) {
				tcpgo(tpriv, &tcb->rtt_timer);
				tcb->rttseq = tcb->snd.ptr;
			}
		}

		tpriv->stats[OutSegs]++;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch(version){
		case V4:
			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		case V6:
			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		default:
			panic("tcpoutput2: version %d", version);
		}
		/* every fourth pass, drop the lock and let other processes run */
		if((msgs%4) == 1){
			qunlock(s);
			sched();
			qlock(s);
		}
	}
}

/*
 *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2648 */ 2649 void 2650 tcpsendka(Conv *s) 2651 { 2652 Tcp seg; 2653 Tcpctl *tcb; 2654 Block *hbp,*dbp; 2655 2656 tcb = (Tcpctl*)s->ptcl; 2657 2658 dbp = nil; 2659 seg.urg = 0; 2660 seg.source = s->lport; 2661 seg.dest = s->rport; 2662 seg.flags = ACK|PSH; 2663 seg.mss = 0; 2664 seg.ws = 0; 2665 if(tcpporthogdefense) 2666 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); 2667 else 2668 seg.seq = tcb->snd.una-1; 2669 seg.ack = tcb->rcv.nxt; 2670 tcb->rcv.una = 0; 2671 seg.wnd = tcb->rcv.wnd; 2672 if(tcb->state == Finwait2){ 2673 seg.flags |= FIN; 2674 } else { 2675 dbp = allocb(1); 2676 dbp->wp++; 2677 } 2678 2679 if(isv4(s->raddr)) { 2680 /* Build header, link data and compute cksum */ 2681 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2682 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); 2683 if(hbp == nil) { 2684 freeblist(dbp); 2685 return; 2686 } 2687 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 2688 } 2689 else { 2690 /* Build header, link data and compute cksum */ 2691 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2692 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); 2693 if(hbp == nil) { 2694 freeblist(dbp); 2695 return; 2696 } 2697 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 2698 } 2699 } 2700 2701 /* 2702 * set connection to time out after 12 minutes 2703 */ 2704 void 2705 tcpsetkacounter(Tcpctl *tcb) 2706 { 2707 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); 2708 if(tcb->kacounter < 3) 2709 tcb->kacounter = 3; 2710 } 2711 2712 /* 2713 * if we've timed out, close the connection 2714 * otherwise, send a keepalive and restart the timer 2715 */ 2716 void 2717 tcpkeepalive(void *v) 2718 { 2719 Tcpctl *tcb; 2720 Conv *s; 2721 2722 s = v; 2723 tcb = (Tcpctl*)s->ptcl; 2724 if(waserror()){ 2725 qunlock(s); 2726 nexterror(); 2727 } 2728 qlock(s); 2729 if(tcb->state != Closed){ 2730 if(--(tcb->kacounter) <= 0) { 2731 localclose(s, Etimedout); 2732 } else { 2733 tcpsendka(s); 2734 tcpgo(s->p->priv, &tcb->katimer); 2735 } 2736 } 2737 qunlock(s); 2738 
poperror(); 2739 } 2740 2741 /* 2742 * start keepalive timer 2743 */ 2744 char* 2745 tcpstartka(Conv *s, char **f, int n) 2746 { 2747 Tcpctl *tcb; 2748 int x; 2749 2750 tcb = (Tcpctl*)s->ptcl; 2751 if(tcb->state != Established) 2752 return "connection must be in Establised state"; 2753 if(n > 1){ 2754 x = atoi(f[1]); 2755 if(x >= MSPTICK) 2756 tcb->katimer.start = x/MSPTICK; 2757 } 2758 tcpsetkacounter(tcb); 2759 tcpgo(s->p->priv, &tcb->katimer); 2760 2761 return nil; 2762 } 2763 2764 /* 2765 * turn checksums on/off 2766 */ 2767 char* 2768 tcpsetchecksum(Conv *s, char **f, int) 2769 { 2770 Tcpctl *tcb; 2771 2772 tcb = (Tcpctl*)s->ptcl; 2773 tcb->nochecksum = !atoi(f[1]); 2774 2775 return nil; 2776 } 2777 2778 void 2779 tcprxmit(Conv *s) 2780 { 2781 Tcpctl *tcb; 2782 2783 tcb = (Tcpctl*)s->ptcl; 2784 2785 tcb->flags |= RETRAN|FORCE; 2786 tcb->snd.ptr = tcb->snd.una; 2787 2788 /* 2789 * We should be halving the slow start threshhold (down to one 2790 * mss) but leaving it at mss seems to work well enough 2791 */ 2792 tcb->ssthresh = tcb->mss; 2793 2794 /* 2795 * pull window down to a single packet 2796 */ 2797 tcb->cwind = tcb->mss; 2798 tcpoutput(s); 2799 } 2800 2801 void 2802 tcptimeout(void *arg) 2803 { 2804 Conv *s; 2805 Tcpctl *tcb; 2806 int maxback; 2807 Tcppriv *tpriv; 2808 2809 s = (Conv*)arg; 2810 tpriv = s->p->priv; 2811 tcb = (Tcpctl*)s->ptcl; 2812 2813 if(waserror()){ 2814 qunlock(s); 2815 nexterror(); 2816 } 2817 qlock(s); 2818 switch(tcb->state){ 2819 default: 2820 tcb->backoff++; 2821 if(tcb->state == Syn_sent) 2822 maxback = MAXBACKMS/2; 2823 else 2824 maxback = MAXBACKMS; 2825 tcb->backedoff += tcb->timer.start * MSPTICK; 2826 if(tcb->backedoff >= maxback) { 2827 localclose(s, Etimedout); 2828 break; 2829 } 2830 netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW); 2831 tcpsettimer(tcb); 2832 tcprxmit(s); 2833 tpriv->stats[RetransTimeouts]++; 2834 tcb->snd.dupacks = 0; 2835 break; 2836 case Time_wait: 
2837 localclose(s, nil); 2838 break; 2839 case Closed: 2840 break; 2841 } 2842 qunlock(s); 2843 poperror(); 2844 } 2845 2846 int 2847 inwindow(Tcpctl *tcb, int seq) 2848 { 2849 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); 2850 } 2851 2852 /* 2853 * set up state for a received SYN (or SYN ACK) packet 2854 */ 2855 void 2856 procsyn(Conv *s, Tcp *seg) 2857 { 2858 Tcpctl *tcb; 2859 Tcppriv *tpriv; 2860 2861 tcb = (Tcpctl*)s->ptcl; 2862 tcb->flags |= FORCE; 2863 2864 tcb->rcv.nxt = seg->seq + 1; 2865 tcb->rcv.urg = tcb->rcv.nxt; 2866 tcb->irs = seg->seq; 2867 2868 /* our sending max segment size cannot be bigger than what he asked for */ 2869 if(seg->mss != 0 && seg->mss < tcb->mss) { 2870 tcb->mss = seg->mss; 2871 tpriv = s->p->priv; 2872 tpriv->stats[Mss] = tcb->mss; 2873 } 2874 2875 /* the congestion window always starts out as a single segment */ 2876 tcb->snd.wnd = seg->wnd; 2877 tcb->cwind = tcb->mss; 2878 } 2879 2880 int 2881 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) 2882 { 2883 Reseq *rp, *rp1; 2884 int i, rqlen, qmax; 2885 2886 rp = malloc(sizeof(Reseq)); 2887 if(rp == nil){ 2888 freeblist(bp); /* bp always consumed by add_reseq */ 2889 return 0; 2890 } 2891 2892 rp->seg = *seg; 2893 rp->bp = bp; 2894 rp->length = length; 2895 2896 /* Place on reassembly list sorting by starting seq number */ 2897 rp1 = tcb->reseq; 2898 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { 2899 rp->next = rp1; 2900 tcb->reseq = rp; 2901 if(rp->next != nil) 2902 tpriv->stats[OutOfOrder]++; 2903 return 0; 2904 } 2905 2906 rqlen = 0; 2907 for(i = 0;; i++) { 2908 rqlen += rp1->length; 2909 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { 2910 rp->next = rp1->next; 2911 rp1->next = rp; 2912 if(rp->next != nil) 2913 tpriv->stats[OutOfOrder]++; 2914 break; 2915 } 2916 rp1 = rp1->next; 2917 } 2918 qmax = QMAX<<tcb->rcv.scale; 2919 if(rqlen > qmax){ 2920 print("resequence queue > window: %d > %d\n", rqlen, qmax); 2921 i = 
0; 2922 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ 2923 print("%#lux %#lux %#ux\n", rp1->seg.seq, 2924 rp1->seg.ack, rp1->seg.flags); 2925 if(i++ > 10){ 2926 print("...\n"); 2927 break; 2928 } 2929 } 2930 2931 /* 2932 * delete entire reassembly queue; wait for retransmit. 2933 * - should we be smarter and only delete the tail? 2934 */ 2935 for(rp = tcb->reseq; rp != nil; rp = rp1){ 2936 rp1 = rp->next; 2937 freeblist(rp->bp); 2938 free(rp); 2939 } 2940 tcb->reseq = nil; 2941 2942 return -1; 2943 } 2944 return 0; 2945 } 2946 2947 void 2948 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2949 { 2950 Reseq *rp; 2951 2952 rp = tcb->reseq; 2953 if(rp == nil) 2954 return; 2955 2956 tcb->reseq = rp->next; 2957 2958 *seg = rp->seg; 2959 *bp = rp->bp; 2960 *length = rp->length; 2961 2962 free(rp); 2963 } 2964 2965 int 2966 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2967 { 2968 ushort len; 2969 uchar accept; 2970 int dupcnt, excess; 2971 2972 accept = 0; 2973 len = *length; 2974 if(seg->flags & SYN) 2975 len++; 2976 if(seg->flags & FIN) 2977 len++; 2978 2979 if(tcb->rcv.wnd == 0) { 2980 if(len == 0 && seg->seq == tcb->rcv.nxt) 2981 return 0; 2982 } 2983 else { 2984 /* Some part of the segment should be in the window */ 2985 if(inwindow(tcb,seg->seq)) 2986 accept++; 2987 else 2988 if(len != 0) { 2989 if(inwindow(tcb, seg->seq+len-1) || 2990 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1)) 2991 accept++; 2992 } 2993 } 2994 if(!accept) { 2995 freeblist(*bp); 2996 return -1; 2997 } 2998 dupcnt = tcb->rcv.nxt - seg->seq; 2999 if(dupcnt > 0){ 3000 tcb->rerecv += dupcnt; 3001 if(seg->flags & SYN){ 3002 seg->flags &= ~SYN; 3003 seg->seq++; 3004 3005 if(seg->urg > 1) 3006 seg->urg--; 3007 else 3008 seg->flags &= ~URG; 3009 dupcnt--; 3010 } 3011 if(dupcnt > 0){ 3012 pullblock(bp, (ushort)dupcnt); 3013 seg->seq += dupcnt; 3014 *length -= dupcnt; 3015 3016 if(seg->urg > dupcnt) 3017 seg->urg -= dupcnt; 3018 else { 3019 seg->flags &= ~URG; 3020 
seg->urg = 0; 3021 } 3022 } 3023 } 3024 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); 3025 if(excess > 0) { 3026 tcb->rerecv += excess; 3027 *length -= excess; 3028 *bp = trimblock(*bp, 0, *length); 3029 if(*bp == nil) 3030 panic("presotto is a boofhead"); 3031 seg->flags &= ~FIN; 3032 } 3033 return 0; 3034 } 3035 3036 void 3037 tcpadvise(Proto *tcp, Block *bp, char *msg) 3038 { 3039 Tcp4hdr *h4; 3040 Tcp6hdr *h6; 3041 Tcpctl *tcb; 3042 uchar source[IPaddrlen]; 3043 uchar dest[IPaddrlen]; 3044 ushort psource, pdest; 3045 Conv *s, **p; 3046 3047 h4 = (Tcp4hdr*)(bp->rp); 3048 h6 = (Tcp6hdr*)(bp->rp); 3049 3050 if((h4->vihl&0xF0)==IP_VER4) { 3051 v4tov6(dest, h4->tcpdst); 3052 v4tov6(source, h4->tcpsrc); 3053 psource = nhgets(h4->tcpsport); 3054 pdest = nhgets(h4->tcpdport); 3055 } 3056 else { 3057 ipmove(dest, h6->tcpdst); 3058 ipmove(source, h6->tcpsrc); 3059 psource = nhgets(h6->tcpsport); 3060 pdest = nhgets(h6->tcpdport); 3061 } 3062 3063 /* Look for a connection */ 3064 qlock(tcp); 3065 for(p = tcp->conv; *p; p++) { 3066 s = *p; 3067 tcb = (Tcpctl*)s->ptcl; 3068 if(s->rport == pdest) 3069 if(s->lport == psource) 3070 if(tcb->state != Closed) 3071 if(ipcmp(s->raddr, dest) == 0) 3072 if(ipcmp(s->laddr, source) == 0){ 3073 qlock(s); 3074 qunlock(tcp); 3075 switch(tcb->state){ 3076 case Syn_sent: 3077 localclose(s, msg); 3078 break; 3079 } 3080 qunlock(s); 3081 freeblist(bp); 3082 return; 3083 } 3084 } 3085 qunlock(tcp); 3086 freeblist(bp); 3087 } 3088 3089 static char* 3090 tcpporthogdefensectl(char *val) 3091 { 3092 if(strcmp(val, "on") == 0) 3093 tcpporthogdefense = 1; 3094 else if(strcmp(val, "off") == 0) 3095 tcpporthogdefense = 0; 3096 else 3097 return "unknown value for tcpporthogdefense"; 3098 return nil; 3099 } 3100 3101 /* called with c qlocked */ 3102 char* 3103 tcpctl(Conv* c, char** f, int n) 3104 { 3105 if(n == 1 && strcmp(f[0], "hangup") == 0) 3106 return tcphangup(c); 3107 if(n >= 1 && strcmp(f[0], "keepalive") == 0) 3108 return 
tcpstartka(c, f, n); 3109 if(n >= 1 && strcmp(f[0], "checksum") == 0) 3110 return tcpsetchecksum(c, f, n); 3111 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) 3112 return tcpporthogdefensectl(f[1]); 3113 return "unknown control request"; 3114 } 3115 3116 int 3117 tcpstats(Proto *tcp, char *buf, int len) 3118 { 3119 Tcppriv *priv; 3120 char *p, *e; 3121 int i; 3122 3123 priv = tcp->priv; 3124 p = buf; 3125 e = p+len; 3126 for(i = 0; i < Nstats; i++) 3127 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]); 3128 return p - buf; 3129 } 3130 3131 /* 3132 * garbage collect any stale conversations: 3133 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) 3134 * - Finwait2 after 5 minutes 3135 * 3136 * this is called whenever we run out of channels. Both checks are 3137 * of questionable validity so we try to use them only when we're 3138 * up against the wall. 3139 */ 3140 int 3141 tcpgc(Proto *tcp) 3142 { 3143 Conv *c, **pp, **ep; 3144 int n; 3145 Tcpctl *tcb; 3146 3147 3148 n = 0; 3149 ep = &tcp->conv[tcp->nc]; 3150 for(pp = tcp->conv; pp < ep; pp++) { 3151 c = *pp; 3152 if(c == nil) 3153 break; 3154 if(!canqlock(c)) 3155 continue; 3156 tcb = (Tcpctl*)c->ptcl; 3157 switch(tcb->state){ 3158 case Syn_received: 3159 if(NOW - tcb->time > 5000){ 3160 localclose(c, "timed out"); 3161 n++; 3162 } 3163 break; 3164 case Finwait2: 3165 if(NOW - tcb->time > 5*60*1000){ 3166 localclose(c, "timed out"); 3167 n++; 3168 } 3169 break; 3170 } 3171 qunlock(c); 3172 } 3173 return n; 3174 } 3175 3176 void 3177 tcpsettimer(Tcpctl *tcb) 3178 { 3179 int x; 3180 3181 /* round trip dependency */ 3182 x = backoff(tcb->backoff) * 3183 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; 3184 3185 /* bounded twixt 1/2 and 64 seconds */ 3186 if(x < 500/MSPTICK) 3187 x = 500/MSPTICK; 3188 else if(x > (64000/MSPTICK)) 3189 x = 64000/MSPTICK; 3190 tcb->timer.start = x; 3191 } 3192 3193 void 3194 tcpinit(Fs *fs) 3195 { 3196 Proto *tcp; 3197 Tcppriv *tpriv; 
3198 3199 tcp = smalloc(sizeof(Proto)); 3200 tpriv = tcp->priv = smalloc(sizeof(Tcppriv)); 3201 tcp->name = "tcp"; 3202 tcp->connect = tcpconnect; 3203 tcp->announce = tcpannounce; 3204 tcp->ctl = tcpctl; 3205 tcp->state = tcpstate; 3206 tcp->create = tcpcreate; 3207 tcp->close = tcpclose; 3208 tcp->rcv = tcpiput; 3209 tcp->advise = tcpadvise; 3210 tcp->stats = tcpstats; 3211 tcp->inuse = tcpinuse; 3212 tcp->gc = tcpgc; 3213 tcp->ipproto = IP_TCPPROTO; 3214 tcp->nc = scalednconv(); 3215 tcp->ptclsize = sizeof(Tcpctl); 3216 tpriv->stats[MaxConn] = tcp->nc; 3217 3218 Fsproto(fs, tcp); 3219 } 3220 3221 void 3222 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) 3223 { 3224 if(rcvscale){ 3225 tcb->rcv.scale = rcvscale & 0xff; 3226 tcb->snd.scale = sndscale & 0xff; 3227 tcb->window = QMAX<<tcb->snd.scale; 3228 qsetlimit(s->rq, tcb->window); 3229 } else { 3230 tcb->rcv.scale = 0; 3231 tcb->snd.scale = 0; 3232 tcb->window = QMAX; 3233 qsetlimit(s->rq, tcb->window); 3234 } 3235 } 3236