1 #include "u.h" 2 #include "../port/lib.h" 3 #include "mem.h" 4 #include "dat.h" 5 #include "fns.h" 6 #include "../port/error.h" 7 8 #include "ip.h" 9 10 enum 11 { 12 QMAX = 64*1024-1, 13 IP_TCPPROTO = 6, 14 15 TCP4_IPLEN = 8, 16 TCP4_PHDRSIZE = 12, 17 TCP4_HDRSIZE = 20, 18 TCP4_TCBPHDRSZ = 40, 19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE, 20 21 TCP6_IPLEN = 0, 22 TCP6_PHDRSIZE = 40, 23 TCP6_HDRSIZE = 20, 24 TCP6_TCBPHDRSZ = 60, 25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE, 26 27 TcptimerOFF = 0, 28 TcptimerON = 1, 29 TcptimerDONE = 2, 30 MAX_TIME = (1<<20), /* Forever */ 31 TCP_ACK = 50, /* Timed ack sequence in ms */ 32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */ 33 34 URG = 0x20, /* Data marked urgent */ 35 ACK = 0x10, /* Acknowledge is valid */ 36 PSH = 0x08, /* Whole data pipe is pushed */ 37 RST = 0x04, /* Reset connection */ 38 SYN = 0x02, /* Pkt. is synchronise */ 39 FIN = 0x01, /* Start close down */ 40 41 EOLOPT = 0, 42 NOOPOPT = 1, 43 MSSOPT = 2, 44 MSS_LENGTH = 4, /* Mean segment size */ 45 WSOPT = 3, 46 WS_LENGTH = 3, /* Bits to scale window size by */ 47 MSL2 = 10, 48 MSPTICK = 50, /* Milliseconds per timer tick */ 49 DEF_MSS = 1460, /* Default mean segment */ 50 DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */ 51 DEF_RTT = 500, /* Default round trip */ 52 DEF_KAT = 120000, /* Default time (ms) between keep alives */ 53 TCP_LISTEN = 0, /* Listen connection */ 54 TCP_CONNECT = 1, /* Outgoing connection */ 55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ 56 57 TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */ 58 59 FORCE = 1, 60 CLONE = 2, 61 RETRAN = 4, 62 ACTIVE = 8, 63 SYNACK = 16, 64 65 LOGAGAIN = 3, 66 LOGDGAIN = 2, 67 68 Closed = 0, /* Connection states */ 69 Listen, 70 Syn_sent, 71 Syn_received, 72 Established, 73 Finwait1, 74 Finwait2, 75 Close_wait, 76 Closing, 77 Last_ack, 78 Time_wait, 79 80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ 81 NLHT = 256, /* hash table size, must be a power of 2 */ 82 LHTMASK = NLHT-1, 83 84 HaveWS = 1<<8, 85 }; 86 87 /* Must correspond to the enumeration above */ 88 char *tcpstates[] = 89 { 90 "Closed", "Listen", "Syn_sent", "Syn_received", 91 "Established", "Finwait1", "Finwait2", "Close_wait", 92 "Closing", "Last_ack", "Time_wait" 93 }; 94 95 typedef struct Tcptimer Tcptimer; 96 struct Tcptimer 97 { 98 Tcptimer *next; 99 Tcptimer *prev; 100 Tcptimer *readynext; 101 int state; 102 int start; 103 int count; 104 void (*func)(void*); 105 void *arg; 106 }; 107 108 /* 109 * v4 and v6 pseudo headers used for 110 * checksuming tcp 111 */ 112 typedef struct Tcp4hdr Tcp4hdr; 113 struct Tcp4hdr 114 { 115 uchar vihl; /* Version and header length */ 116 uchar tos; /* Type of service */ 117 uchar length[2]; /* packet length */ 118 uchar id[2]; /* Identification */ 119 uchar frag[2]; /* Fragment information */ 120 uchar Unused; 121 uchar proto; 122 uchar tcplen[2]; 123 uchar tcpsrc[4]; 124 uchar tcpdst[4]; 125 uchar tcpsport[2]; 126 uchar tcpdport[2]; 127 uchar tcpseq[4]; 128 uchar tcpack[4]; 129 uchar tcpflag[2]; 130 uchar tcpwin[2]; 131 uchar tcpcksum[2]; 132 uchar tcpurg[2]; 133 /* Options segment */ 134 uchar tcpopt[1]; 135 }; 136 137 typedef struct Tcp6hdr Tcp6hdr; 138 struct Tcp6hdr 139 { 140 uchar vcf[4]; 141 uchar ploadlen[2]; 142 uchar proto; 143 uchar ttl; 144 uchar tcpsrc[IPaddrlen]; 145 uchar tcpdst[IPaddrlen]; 146 uchar tcpsport[2]; 147 uchar tcpdport[2]; 148 uchar tcpseq[4]; 149 uchar tcpack[4]; 150 uchar tcpflag[2]; 151 uchar tcpwin[2]; 152 uchar tcpcksum[2]; 153 uchar tcpurg[2]; 154 /* Options segment */ 155 uchar tcpopt[1]; 156 }; 157 158 /* 159 * this represents the control info 160 * for a single packet. It is derived from 161 * a packet in ntohtcp{4,6}() and stuck into 162 * a packet in htontcp{4,6}(). 163 */ 164 typedef struct Tcp Tcp; 165 struct Tcp 166 { 167 ushort source; 168 ushort dest; 169 ulong seq; 170 ulong ack; 171 uchar flags; 172 ushort ws; /* window scale option (if not zero) */ 173 ulong wnd; 174 ushort urg; 175 ushort mss; /* max segment size option (if not zero) */ 176 ushort len; /* size of data */ 177 }; 178 179 /* 180 * this header is malloc'd to thread together fragments 181 * waiting to be coalesced 182 */ 183 typedef struct Reseq Reseq; 184 struct Reseq 185 { 186 Reseq *next; 187 Tcp seg; 188 Block *bp; 189 ushort length; 190 }; 191 192 /* 193 * the qlock in the Conv locks this structure 194 */ 195 typedef struct Tcpctl Tcpctl; 196 struct Tcpctl 197 { 198 uchar state; /* Connection state */ 199 uchar type; /* Listening or active connection */ 200 uchar code; /* Icmp code */ 201 struct { 202 ulong una; /* Unacked data pointer */ 203 ulong nxt; /* Next sequence expected */ 204 ulong ptr; /* Data pointer */ 205 ulong wnd; /* Tcp send window */ 206 ulong urg; /* Urgent data pointer */ 207 ulong wl2; 208 int scale; /* how much to right shift window in xmitted packets */ 209 /* to implement tahoe and reno TCP */ 210 ulong dupacks; /* number of duplicate acks rcvd */ 211 int recovery; /* loss recovery flag */ 212 ulong rxt; /* right window marker for recovery */ 213 } snd; 214 struct { 215 ulong nxt; /* Receive pointer to next uchar slot */ 216 ulong wnd; /* Receive window incoming */ 217 ulong urg; /* Urgent pointer */ 218 int blocked; 219 int una; /* unacked data segs */ 220 int scale; /* how much to left shift window in rcved packets */ 221 } rcv; 222 ulong iss; /* Initial sequence number */ 223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ 224 ulong cwind; /* Congestion window */ 225 int scale; /* desired snd.scale */ 226 ushort ssthresh; /* Slow start threshold */ 227 int resent; /* Bytes just resent */ 228 int irs; /* Initial received squence */ 229 ushort mss; /* Mean segment size */ 230 int rerecv; /* Overlap of data rerecevived */ 231 ulong window; /* Recevive window */ 232 uchar backoff; /* Exponential backoff counter */ 233 int backedoff; /* ms we've backed off for rexmits */ 234 uchar flags; /* State flags */ 235 Reseq *reseq; /* Resequencing queue */ 236 Tcptimer timer; /* Activity timer */ 237 Tcptimer acktimer; /* Acknowledge timer */ 238 Tcptimer rtt_timer; /* Round trip timer */ 239 Tcptimer katimer; /* keep alive timer */ 240 ulong rttseq; /* Round trip sequence */ 241 int srtt; /* Shortened round trip */ 242 int mdev; /* Mean deviation of round trip */ 243 int kacounter; /* count down for keep alive */ 244 uint sndsyntime; /* time syn sent */ 245 ulong time; /* time Finwait2 or Syn_received was sent */ 246 int nochecksum; /* non-zero means don't send checksums */ 247 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ 248 249 union { 250 Tcp4hdr tcp4hdr; 251 Tcp6hdr tcp6hdr; 252 } protohdr; /* prototype header */ 253 }; 254 255 /* 256 * New calls are put in limbo rather than having a conversation structure 257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not 258 * any real Conv structures mucking things up. Calls in limbo rexmit their 259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. 260 * 261 * In particular they aren't on a listener's queue so that they don't figure 262 * in the input queue limit. 263 * 264 * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue 265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore 266 * there is no hashing of this list. 267 */ 268 typedef struct Limbo Limbo; 269 struct Limbo 270 { 271 Limbo *next; 272 273 uchar laddr[IPaddrlen]; 274 uchar raddr[IPaddrlen]; 275 ushort lport; 276 ushort rport; 277 ulong irs; /* initial received sequence */ 278 ulong iss; /* initial sent sequence */ 279 ushort mss; /* mss from the other end */ 280 ushort rcvscale; /* how much to scale rcvd windows */ 281 ushort sndscale; /* how much to scale sent windows */ 282 ulong lastsend; /* last time we sent a synack */ 283 uchar version; /* v4 or v6 */ 284 uchar rexmits; /* number of retransmissions */ 285 }; 286 287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */ 288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ 289 290 enum { 291 /* MIB stats */ 292 MaxConn, 293 ActiveOpens, 294 PassiveOpens, 295 EstabResets, 296 CurrEstab, 297 InSegs, 298 OutSegs, 299 RetransSegs, 300 RetransTimeouts, 301 InErrs, 302 OutRsts, 303 304 /* non-MIB stats */ 305 CsumErrs, 306 HlenErrs, 307 LenErrs, 308 OutOfOrder, 309 310 Nstats 311 }; 312 313 static char *statnames[] = 314 { 315 [MaxConn] "MaxConn", 316 [ActiveOpens] "ActiveOpens", 317 [PassiveOpens] "PassiveOpens", 318 [EstabResets] "EstabResets", 319 [CurrEstab] "CurrEstab", 320 [InSegs] "InSegs", 321 [OutSegs] "OutSegs", 322 [RetransSegs] "RetransSegs", 323 [RetransTimeouts] "RetransTimeouts", 324 [InErrs] "InErrs", 325 [OutRsts] "OutRsts", 326 [CsumErrs] "CsumErrs", 327 [HlenErrs] "HlenErrs", 328 [LenErrs] "LenErrs", 329 [OutOfOrder] "OutOfOrder", 330 }; 331 332 typedef struct Tcppriv Tcppriv; 333 struct Tcppriv 334 { 335 /* List of active timers */ 336 QLock tl; 337 Tcptimer *timers; 338 339 /* hash table for matching conversations */ 340 Ipht ht; 341 342 /* calls in limbo waiting for an ACK to our SYN ACK */ 343 int nlimbo; 344 Limbo *lht[NLHT]; 345 346 /* for keeping track of tcpackproc */ 347 QLock apl; 348 int ackprocstarted; 349 350 ulong stats[Nstats]; 351 }; 352 353 /* 354 * Setting tcpporthogdefense to non-zero enables Dong Lin's 355 * solution to hijacked systems staking out port's as a form 356 * of DoS attack. 357 * 358 * To avoid stateless Conv hogs, we pick a sequence number at random. If 359 * it that number gets acked by the other end, we shut down the connection. 360 * Look for tcpporthogedefense in the code. 361 */ 362 int tcpporthogdefense = 0; 363 364 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); 365 void getreseq(Tcpctl*, Tcp*, Block**, ushort*); 366 void localclose(Conv*, char*); 367 void procsyn(Conv*, Tcp*); 368 void tcpiput(Proto*, Ipifc*, Block*); 369 void tcpoutput(Conv*); 370 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); 371 void tcpstart(Conv*, int); 372 void tcptimeout(void*); 373 void tcpsndsyn(Conv*, Tcpctl*); 374 void tcprcvwin(Conv*); 375 void tcpacktimer(void*); 376 void tcpkeepalive(void*); 377 void tcpsetkacounter(Tcpctl*); 378 void tcprxmit(Conv*); 379 void tcpsettimer(Tcpctl*); 380 void tcpsynackrtt(Conv*); 381 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); 382 383 static void limborexmit(Proto*); 384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int); 385 386 void 387 tcpsetstate(Conv *s, uchar newstate) 388 { 389 Tcpctl *tcb; 390 uchar oldstate; 391 Tcppriv *tpriv; 392 393 tpriv = s->p->priv; 394 395 tcb = (Tcpctl*)s->ptcl; 396 397 oldstate = tcb->state; 398 if(oldstate == newstate) 399 return; 400 401 if(oldstate == Established) 402 tpriv->stats[CurrEstab]--; 403 if(newstate == Established) 404 tpriv->stats[CurrEstab]++; 405 406 /** 407 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, 408 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); 409 **/ 410 411 switch(newstate) { 412 case Closed: 413 qclose(s->rq); 414 qclose(s->wq); 415 qclose(s->eq); 416 break; 417 418 case Close_wait: /* Remote closes */ 419 qhangup(s->rq, nil); 420 break; 421 } 422 423 tcb->state = newstate; 424 425 if(oldstate == Syn_sent && newstate != Closed) 426 Fsconnected(s, nil); 427 } 428 429 static char* 430 tcpconnect(Conv *c, char **argv, int argc) 431 { 432 char *e; 433 434 e = Fsstdconnect(c, argv, argc); 435 if(e != nil) 436 return e; 437 tcpstart(c, TCP_CONNECT); 438 439 return nil; 440 } 441 442 static int 443 tcpstate(Conv *c, char *state, int n) 444 { 445 Tcpctl *s; 446 447 s = (Tcpctl*)(c->ptcl); 448 449 return snprint(state, n, 450 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", 451 tcpstates[s->state], 452 c->rq ? qlen(c->rq) : 0, 453 c->wq ? qlen(c->wq) : 0, 454 s->srtt, s->mdev, 455 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, 456 s->timer.start, s->timer.count, s->rerecv, 457 s->katimer.start, s->katimer.count); 458 } 459 460 static int 461 tcpinuse(Conv *c) 462 { 463 Tcpctl *s; 464 465 s = (Tcpctl*)(c->ptcl); 466 return s->state != Closed; 467 } 468 469 static char* 470 tcpannounce(Conv *c, char **argv, int argc) 471 { 472 char *e; 473 474 e = Fsstdannounce(c, argv, argc); 475 if(e != nil) 476 return e; 477 tcpstart(c, TCP_LISTEN); 478 Fsconnected(c, nil); 479 480 return nil; 481 } 482 483 /* 484 * tcpclose is always called with the q locked 485 */ 486 static void 487 tcpclose(Conv *c) 488 { 489 Tcpctl *tcb; 490 491 tcb = (Tcpctl*)c->ptcl; 492 493 qhangup(c->rq, nil); 494 qhangup(c->wq, nil); 495 qhangup(c->eq, nil); 496 qflush(c->rq); 497 498 switch(tcb->state) { 499 case Listen: 500 /* 501 * reset any incoming calls to this listener 502 */ 503 Fsconnected(c, "Hangup"); 504 505 localclose(c, nil); 506 break; 507 case Closed: 508 case Syn_sent: 509 localclose(c, nil); 510 break; 511 case Syn_received: 512 case Established: 513 tcb->flgcnt++; 514 tcb->snd.nxt++; 515 tcpsetstate(c, Finwait1); 516 tcpoutput(c); 517 break; 518 case Close_wait: 519 tcb->flgcnt++; 520 tcb->snd.nxt++; 521 tcpsetstate(c, Last_ack); 522 tcpoutput(c); 523 break; 524 } 525 } 526 527 void 528 tcpkick(void *x) 529 { 530 Conv *s = x; 531 Tcpctl *tcb; 532 533 tcb = (Tcpctl*)s->ptcl; 534 535 if(waserror()){ 536 qunlock(s); 537 nexterror(); 538 } 539 qlock(s); 540 541 switch(tcb->state) { 542 case Syn_sent: 543 case Syn_received: 544 case Established: 545 case Close_wait: 546 /* 547 * Push data 548 */ 549 tcprcvwin(s); 550 tcpoutput(s); 551 break; 552 default: 553 localclose(s, "Hangup"); 554 break; 555 } 556 557 qunlock(s); 558 poperror(); 559 } 560 561 void 562 tcprcvwin(Conv *s) /* Call with tcb locked */ 563 { 564 int w; 565 Tcpctl *tcb; 566 567 tcb = (Tcpctl*)s->ptcl; 568 w = tcb->window - qlen(s->rq); 569 if(w < 0) 570 w = 0; 571 tcb->rcv.wnd = w; 572 if(w == 0) 573 tcb->rcv.blocked = 1; 574 } 575 576 void 577 tcpacktimer(void *v) 578 { 579 Tcpctl *tcb; 580 Conv *s; 581 582 s = v; 583 tcb = (Tcpctl*)s->ptcl; 584 585 if(waserror()){ 586 qunlock(s); 587 nexterror(); 588 } 589 qlock(s); 590 if(tcb->state != Closed){ 591 tcb->flags |= FORCE; 592 tcprcvwin(s); 593 tcpoutput(s); 594 } 595 qunlock(s); 596 poperror(); 597 } 598 599 static void 600 tcpcreate(Conv *c) 601 { 602 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); 603 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); 604 } 605 606 static void 607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate) 608 { 609 if(newstate != TcptimerON){ 610 if(t->state == TcptimerON){ 611 // unchain 612 if(priv->timers == t){ 613 priv->timers = t->next; 614 if(t->prev != nil) 615 panic("timerstate1"); 616 } 617 if(t->next) 618 t->next->prev = t->prev; 619 if(t->prev) 620 t->prev->next = t->next; 621 t->next = t->prev = nil; 622 } 623 } else { 624 if(t->state != TcptimerON){ 625 // chain 626 if(t->prev != nil || t->next != nil) 627 panic("timerstate2"); 628 t->prev = nil; 629 t->next = priv->timers; 630 if(t->next) 631 t->next->prev = t; 632 priv->timers = t; 633 } 634 } 635 t->state = newstate; 636 } 637 638 void 639 tcpackproc(void *a) 640 { 641 Tcptimer *t, *tp, *timeo; 642 Proto *tcp; 643 Tcppriv *priv; 644 int loop; 645 646 tcp = a; 647 priv = tcp->priv; 648 649 for(;;) { 650 tsleep(&up->sleep, return0, 0, MSPTICK); 651 652 qlock(&priv->tl); 653 timeo = nil; 654 loop = 0; 655 for(t = priv->timers; t != nil; t = tp) { 656 if(loop++ > 10000) 657 panic("tcpackproc1"); 658 tp = t->next; 659 if(t->state == TcptimerON) { 660 t->count--; 661 if(t->count == 0) { 662 timerstate(priv, t, TcptimerDONE); 663 t->readynext = timeo; 664 timeo = t; 665 } 666 } 667 } 668 qunlock(&priv->tl); 669 670 loop = 0; 671 for(t = timeo; t != nil; t = t->readynext) { 672 if(loop++ > 10000) 673 panic("tcpackproc2"); 674 if(t->state == TcptimerDONE && t->func != nil && !waserror()){ 675 (*t->func)(t->arg); 676 poperror(); 677 } 678 } 679 680 limborexmit(tcp); 681 } 682 } 683 684 void 685 tcpgo(Tcppriv *priv, Tcptimer *t) 686 { 687 if(t == nil || t->start == 0) 688 return; 689 690 qlock(&priv->tl); 691 t->count = t->start; 692 timerstate(priv, t, TcptimerON); 693 qunlock(&priv->tl); 694 } 695 696 void 697 tcphalt(Tcppriv *priv, Tcptimer *t) 698 { 699 if(t == nil) 700 return; 701 702 qlock(&priv->tl); 703 timerstate(priv, t, TcptimerOFF); 704 qunlock(&priv->tl); 705 } 706 707 int 708 backoff(int n) 709 { 710 return 1 << n; 711 } 712 713 void 714 localclose(Conv *s, char *reason) /* called with tcb locked */ 715 { 716 Tcpctl *tcb; 717 Reseq *rp,*rp1; 718 Tcppriv *tpriv; 719 720 tpriv = s->p->priv; 721 tcb = (Tcpctl*)s->ptcl; 722 723 iphtrem(&tpriv->ht, s); 724 725 tcphalt(tpriv, &tcb->timer); 726 tcphalt(tpriv, &tcb->rtt_timer); 727 tcphalt(tpriv, &tcb->acktimer); 728 tcphalt(tpriv, &tcb->katimer); 729 730 /* Flush reassembly queue; nothing more can arrive */ 731 for(rp = tcb->reseq; rp != nil; rp = rp1) { 732 rp1 = rp->next; 733 freeblist(rp->bp); 734 free(rp); 735 } 736 tcb->reseq = nil; 737 738 if(tcb->state == Syn_sent) 739 Fsconnected(s, reason); 740 if(s->state == Announced) 741 wakeup(&s->listenr); 742 743 qhangup(s->rq, reason); 744 qhangup(s->wq, reason); 745 746 tcpsetstate(s, Closed); 747 } 748 749 /* mtu (- TCP + IP hdr len) of 1st hop */ 750 int 751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) 752 { 753 Ipifc *ifc; 754 int mtu; 755 756 ifc = findipifc(tcp->f, addr, 0); 757 switch(version){ 758 default: 759 case V4: 760 mtu = DEF_MSS; 761 if(ifc != nil) 762 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); 763 break; 764 case V6: 765 mtu = DEF_MSS6; 766 if(ifc != nil) 767 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); 768 break; 769 } 770 if(ifc != nil){ 771 if(ifc->mbps > 100) 772 *scale = HaveWS | 3; 773 else if(ifc->mbps > 10) 774 *scale = HaveWS | 1; 775 else 776 *scale = HaveWS | 0; 777 } else 778 *scale = HaveWS | 0; 779 780 return mtu; 781 } 782 783 void 784 inittcpctl(Conv *s, int mode) 785 { 786 Tcpctl *tcb; 787 Tcp4hdr* h4; 788 Tcp6hdr* h6; 789 int mss; 790 791 tcb = (Tcpctl*)s->ptcl; 792 793 memset(tcb, 0, sizeof(Tcpctl)); 794 795 tcb->ssthresh = 65535; 796 tcb->srtt = tcp_irtt<<LOGAGAIN; 797 tcb->mdev = 0; 798 799 /* setup timers */ 800 tcb->timer.start = tcp_irtt / MSPTICK; 801 tcb->timer.func = tcptimeout; 802 tcb->timer.arg = s; 803 tcb->rtt_timer.start = MAX_TIME; 804 tcb->acktimer.start = TCP_ACK / MSPTICK; 805 tcb->acktimer.func = tcpacktimer; 806 tcb->acktimer.arg = s; 807 tcb->katimer.start = DEF_KAT / MSPTICK; 808 tcb->katimer.func = tcpkeepalive; 809 tcb->katimer.arg = s; 810 811 mss = DEF_MSS; 812 813 /* create a prototype(pseudo) header */ 814 if(mode != TCP_LISTEN){ 815 if(ipcmp(s->laddr, IPnoaddr) == 0) 816 findlocalip(s->p->f, s->laddr, s->raddr); 817 818 switch(s->ipversion){ 819 case V4: 820 h4 = &tcb->protohdr.tcp4hdr; 821 memset(h4, 0, sizeof(*h4)); 822 h4->proto = IP_TCPPROTO; 823 hnputs(h4->tcpsport, s->lport); 824 hnputs(h4->tcpdport, s->rport); 825 v6tov4(h4->tcpsrc, s->laddr); 826 v6tov4(h4->tcpdst, s->raddr); 827 break; 828 case V6: 829 h6 = &tcb->protohdr.tcp6hdr; 830 memset(h6, 0, sizeof(*h6)); 831 h6->proto = IP_TCPPROTO; 832 hnputs(h6->tcpsport, s->lport); 833 hnputs(h6->tcpdport, s->rport); 834 ipmove(h6->tcpsrc, s->laddr); 835 ipmove(h6->tcpdst, s->raddr); 836 mss = DEF_MSS6; 837 break; 838 default: 839 panic("inittcpctl: version %d", s->ipversion); 840 } 841 } 842 843 tcb->mss = tcb->cwind = mss; 844 845 /* default is no window scaling */ 846 tcb->window = QMAX; 847 tcb->rcv.wnd = QMAX; 848 tcb->rcv.scale = 0; 849 tcb->snd.scale = 0; 850 qsetlimit(s->rq, QMAX); 851 } 852 853 /* 854 * called with s qlocked 855 */ 856 void 857 tcpstart(Conv *s, int mode) 858 { 859 Tcpctl *tcb; 860 Tcppriv *tpriv; 861 char kpname[KNAMELEN]; 862 863 tpriv = s->p->priv; 864 865 if(tpriv->ackprocstarted == 0){ 866 qlock(&tpriv->apl); 867 if(tpriv->ackprocstarted == 0){ 868 sprint(kpname, "#I%dtcpack", s->p->f->dev); 869 kproc(kpname, tcpackproc, s->p, 0); 870 tpriv->ackprocstarted = 1; 871 } 872 qunlock(&tpriv->apl); 873 } 874 875 tcb = (Tcpctl*)s->ptcl; 876 877 inittcpctl(s, mode); 878 879 iphtadd(&tpriv->ht, s); 880 switch(mode) { 881 case TCP_LISTEN: 882 tpriv->stats[PassiveOpens]++; 883 tcb->flags |= CLONE; 884 tcpsetstate(s, Listen); 885 break; 886 887 case TCP_CONNECT: 888 tpriv->stats[ActiveOpens]++; 889 tcb->flags |= ACTIVE; 890 tcpsndsyn(s, tcb); 891 tcpsetstate(s, Syn_sent); 892 tcpoutput(s); 893 break; 894 } 895 } 896 897 static char* 898 tcpflag(ushort flag) 899 { 900 static char buf[128]; 901 902 sprint(buf, "%d", flag>>10); /* Head len */ 903 if(flag & URG) 904 strcat(buf, " URG"); 905 if(flag & ACK) 906 strcat(buf, " ACK"); 907 if(flag & PSH) 908 strcat(buf, " PSH"); 909 if(flag & RST) 910 strcat(buf, " RST"); 911 if(flag & SYN) 912 strcat(buf, " SYN"); 913 if(flag & FIN) 914 strcat(buf, " FIN"); 915 916 return buf; 917 } 918 919 Block * 920 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) 921 { 922 int dlen; 923 Tcp6hdr *h; 924 ushort csum; 925 ushort hdrlen, optpad = 0; 926 uchar *opt; 927 928 hdrlen = TCP6_HDRSIZE; 929 if(tcph->flags & SYN){ 930 if(tcph->mss) 931 hdrlen += MSS_LENGTH; 932 if(tcph->ws) 933 hdrlen += WS_LENGTH; 934 optpad = hdrlen & 3; 935 if(optpad) 936 optpad = 4 - optpad; 937 hdrlen += optpad; 938 } 939 940 if(data) { 941 dlen = blocklen(data); 942 data = padblock(data, hdrlen + TCP6_PKT); 943 if(data == nil) 944 return nil; 945 } 946 else { 947 dlen = 0; 948 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ 949 if(data == nil) 950 return nil; 951 data->wp += hdrlen + TCP6_PKT; 952 } 953 954 /* copy in pseudo ip header plus port numbers */ 955 h = (Tcp6hdr *)(data->rp); 956 memmove(h, ph, TCP6_TCBPHDRSZ); 957 958 /* compose pseudo tcp header, do cksum calculation */ 959 hnputl(h->vcf, hdrlen + dlen); 960 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; 961 h->ttl = ph->proto; 962 963 /* copy in variable bits */ 964 hnputl(h->tcpseq, tcph->seq); 965 hnputl(h->tcpack, tcph->ack); 966 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 967 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 968 hnputs(h->tcpurg, tcph->urg); 969 970 if(tcph->flags & SYN){ 971 opt = h->tcpopt; 972 if(tcph->mss != 0){ 973 *opt++ = MSSOPT; 974 *opt++ = MSS_LENGTH; 975 hnputs(opt, tcph->mss); 976 opt += 2; 977 } 978 if(tcph->ws != 0){ 979 *opt++ = WSOPT; 980 *opt++ = WS_LENGTH; 981 *opt++ = tcph->ws; 982 } 983 while(optpad-- > 0) 984 *opt++ = NOOPOPT; 985 } 986 987 if(tcb != nil && tcb->nochecksum){ 988 h->tcpcksum[0] = h->tcpcksum[1] = 0; 989 } else { 990 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); 991 hnputs(h->tcpcksum, csum); 992 } 993 994 /* move from pseudo header back to normal ip header */ 995 memset(h->vcf, 0, 4); 996 h->vcf[0] = IP_VER6; 997 hnputs(h->ploadlen, hdrlen+dlen); 998 h->proto = ph->proto; 999 1000 return data; 1001 } 1002 1003 Block * 1004 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) 1005 { 1006 int dlen; 1007 Tcp4hdr *h; 1008 ushort csum; 1009 ushort hdrlen, optpad = 0; 1010 uchar *opt; 1011 1012 hdrlen = TCP4_HDRSIZE; 1013 if(tcph->flags & SYN){ 1014 if(tcph->mss) 1015 hdrlen += MSS_LENGTH; 1016 if(tcph->ws) 1017 hdrlen += WS_LENGTH; 1018 optpad = hdrlen & 3; 1019 if(optpad) 1020 optpad = 4 - optpad; 1021 hdrlen += optpad; 1022 } 1023 1024 if(data) { 1025 dlen = blocklen(data); 1026 data = padblock(data, hdrlen + TCP4_PKT); 1027 if(data == nil) 1028 return nil; 1029 } 1030 else { 1031 dlen = 0; 1032 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ 1033 if(data == nil) 1034 return nil; 1035 data->wp += hdrlen + TCP4_PKT; 1036 } 1037 1038 /* copy in pseudo ip header plus port numbers */ 1039 h = (Tcp4hdr *)(data->rp); 1040 memmove(h, ph, TCP4_TCBPHDRSZ); 1041 1042 /* copy in variable bits */ 1043 hnputs(h->tcplen, hdrlen + dlen); 1044 hnputl(h->tcpseq, tcph->seq); 1045 hnputl(h->tcpack, tcph->ack); 1046 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1047 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 1048 hnputs(h->tcpurg, tcph->urg); 1049 1050 if(tcph->flags & SYN){ 1051 opt = h->tcpopt; 1052 if(tcph->mss != 0){ 1053 *opt++ = MSSOPT; 1054 *opt++ = MSS_LENGTH; 1055 hnputs(opt, tcph->mss); 1056 opt += 2; 1057 } 1058 if(tcph->ws != 0){ 1059 *opt++ = WSOPT; 1060 *opt++ = WS_LENGTH; 1061 *opt++ = tcph->ws; 1062 } 1063 while(optpad-- > 0) 1064 *opt++ = NOOPOPT; 1065 } 1066 1067 if(tcb != nil && tcb->nochecksum){ 1068 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1069 } else { 1070 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); 1071 hnputs(h->tcpcksum, csum); 1072 } 1073 1074 return data; 1075 } 1076 1077 int 1078 ntohtcp6(Tcp *tcph, Block **bpp) 1079 { 1080 Tcp6hdr *h; 1081 uchar *optr; 1082 ushort hdrlen; 1083 ushort optlen; 1084 int n; 1085 1086 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); 1087 if(*bpp == nil) 1088 return -1; 1089 1090 h = (Tcp6hdr *)((*bpp)->rp); 1091 tcph->source = nhgets(h->tcpsport); 1092 tcph->dest = nhgets(h->tcpdport); 1093 tcph->seq = nhgetl(h->tcpseq); 1094 tcph->ack = nhgetl(h->tcpack); 1095 hdrlen = (h->tcpflag[0]>>2) & ~3; 1096 if(hdrlen < TCP6_HDRSIZE) { 1097 freeblist(*bpp); 1098 return -1; 1099 } 1100 1101 tcph->flags = h->tcpflag[1]; 1102 tcph->wnd = nhgets(h->tcpwin); 1103 tcph->urg = nhgets(h->tcpurg); 1104 tcph->mss = 0; 1105 tcph->ws = 0; 1106 tcph->len = nhgets(h->ploadlen) - hdrlen; 1107 1108 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); 1109 if(*bpp == nil) 1110 return -1; 1111 1112 optr = h->tcpopt; 1113 n = hdrlen - TCP6_HDRSIZE; 1114 while(n > 0 && *optr != EOLOPT) { 1115 if(*optr == NOOPOPT) { 1116 n--; 1117 optr++; 1118 continue; 1119 } 1120 optlen = optr[1]; 1121 if(optlen < 2 || optlen > n) 1122 break; 1123 switch(*optr) { 1124 case MSSOPT: 1125 if(optlen == MSS_LENGTH) 1126 tcph->mss = nhgets(optr+2); 1127 break; 1128 case WSOPT: 1129 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1130 tcph->ws = HaveWS | *(optr+2); 1131 break; 1132 } 1133 n -= optlen; 1134 optr += optlen; 1135 } 1136 return hdrlen; 1137 } 1138 1139 int 1140 ntohtcp4(Tcp *tcph, Block **bpp) 1141 { 1142 Tcp4hdr *h; 1143 uchar *optr; 1144 ushort hdrlen; 1145 ushort optlen; 1146 int n; 1147 1148 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); 1149 if(*bpp == nil) 1150 return -1; 1151 1152 h = (Tcp4hdr *)((*bpp)->rp); 1153 tcph->source = nhgets(h->tcpsport); 1154 tcph->dest = nhgets(h->tcpdport); 1155 tcph->seq = nhgetl(h->tcpseq); 1156 tcph->ack = nhgetl(h->tcpack); 1157 1158 hdrlen = (h->tcpflag[0]>>2) & ~3; 1159 if(hdrlen < TCP4_HDRSIZE) { 1160 freeblist(*bpp); 1161 return -1; 1162 } 1163 1164 tcph->flags = h->tcpflag[1]; 1165 tcph->wnd = nhgets(h->tcpwin); 1166 tcph->urg = nhgets(h->tcpurg); 1167 tcph->mss = 0; 1168 tcph->ws = 0; 1169 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); 1170 1171 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); 1172 if(*bpp == nil) 1173 return -1; 1174 1175 optr = h->tcpopt; 1176 n = hdrlen - TCP4_HDRSIZE; 1177 while(n > 0 && *optr != EOLOPT) { 1178 if(*optr == NOOPOPT) { 1179 n--; 1180 optr++; 1181 continue; 1182 } 1183 optlen = optr[1]; 1184 if(optlen < 2 || optlen > n) 1185 break; 1186 switch(*optr) { 1187 case MSSOPT: 1188 if(optlen == MSS_LENGTH) 1189 tcph->mss = nhgets(optr+2); 1190 break; 1191 case WSOPT: 1192 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1193 tcph->ws = HaveWS | *(optr+2); 1194 break; 1195 } 1196 n -= optlen; 1197 optr += optlen; 1198 } 1199 return hdrlen; 1200 } 1201 1202 /* 1203 * For outgiing calls, generate an initial sequence 1204 * number and put a SYN on the send queue 1205 */ 1206 void 1207 tcpsndsyn(Conv *s, Tcpctl *tcb) 1208 { 1209 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1210 tcb->rttseq = tcb->iss; 1211 tcb->snd.wl2 = tcb->iss; 1212 tcb->snd.una = tcb->iss; 1213 tcb->snd.ptr = tcb->rttseq; 1214 tcb->snd.nxt = tcb->rttseq; 1215 tcb->flgcnt++; 1216 tcb->flags |= FORCE; 1217 tcb->sndsyntime = NOW; 1218 1219 /* set desired mss and scale */ 1220 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); 1221 } 1222 1223 void 1224 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) 1225 { 1226 Block *hbp; 1227 uchar rflags; 1228 Tcppriv *tpriv; 1229 Tcp4hdr ph4; 1230 Tcp6hdr ph6; 1231 1232 netlog(tcp->f, Logtcp, "sndrst: %s", reason); 1233 1234 tpriv = tcp->priv; 1235 1236 if(seg->flags & RST) 1237 return; 1238 1239 /* make pseudo header */ 1240 switch(version) { 1241 case V4: 1242 memset(&ph4, 0, sizeof(ph4)); 1243 ph4.vihl = IP_VER4; 1244 v6tov4(ph4.tcpsrc, dest); 1245 v6tov4(ph4.tcpdst, source); 1246 ph4.proto = IP_TCPPROTO; 1247 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1248 hnputs(ph4.tcpsport, seg->dest); 1249 hnputs(ph4.tcpdport, seg->source); 1250 break; 1251 case V6: 1252 memset(&ph6, 0, sizeof(ph6)); 1253 ph6.vcf[0] = IP_VER6; 1254 ipmove(ph6.tcpsrc, dest); 1255 ipmove(ph6.tcpdst, source); 1256 ph6.proto = IP_TCPPROTO; 1257 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1258 hnputs(ph6.tcpsport, seg->dest); 1259 hnputs(ph6.tcpdport, seg->source); 1260 break; 1261 default: 1262 panic("sndrst: version %d", version); 1263 } 1264 1265 tpriv->stats[OutRsts]++; 1266 rflags = RST; 1267 1268 /* convince the other end that this reset is in band */ 1269 if(seg->flags & ACK) { 1270 seg->seq = seg->ack; 1271 seg->ack = 0; 1272 } 1273 else { 1274 rflags |= ACK; 1275 seg->ack = seg->seq; 1276 seg->seq = 0; 1277 if(seg->flags & SYN) 1278 seg->ack++; 1279 seg->ack += length; 1280 if(seg->flags & FIN) 1281 seg->ack++; 1282 } 1283 seg->flags = rflags; 1284 seg->wnd = 0; 1285 seg->urg = 0; 1286 seg->mss = 0; 1287 seg->ws = 0; 1288 switch(version) { 1289 case V4: 1290 hbp = htontcp4(seg, nil, &ph4, nil); 1291 if(hbp == nil) 1292 return; 1293 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1294 break; 1295 case V6: 1296 hbp = htontcp6(seg, nil, &ph6, nil); 1297 if(hbp == nil) 1298 return; 1299 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1300 break; 1301 default: 1302 panic("sndrst2: version %d", version); 1303 } 1304 } 1305 1306 /* 1307 * send a reset to the remote side and close the conversation 1308 * called with s qlocked 1309 */ 1310 char* 1311 tcphangup(Conv *s) 1312 { 1313 Tcp seg; 1314 Tcpctl *tcb; 1315 Block *hbp; 1316 1317 tcb = (Tcpctl*)s->ptcl; 1318 if(waserror()) 1319 return commonerror(); 1320 if(s->raddr != 0) { 1321 if(!waserror()){ 1322 seg.flags = RST | ACK; 1323 seg.ack = tcb->rcv.nxt; 1324 tcb->rcv.una = 0; 1325 seg.seq = tcb->snd.ptr; 1326 seg.wnd = 0; 1327 seg.urg = 0; 1328 seg.mss = 0; 1329 seg.ws = 0; 1330 switch(s->ipversion) { 1331 case V4: 1332 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 1333 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); 1334 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 1335 break; 1336 case V6: 1337 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 1338 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); 1339 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 1340 break; 1341 default: 1342 panic("tcphangup: version %d", s->ipversion); 1343 } 1344 poperror(); 1345 } 1346 } 1347 localclose(s, nil); 1348 poperror(); 1349 return nil; 1350 } 1351 1352 /* 1353 * (re)send a SYN ACK 1354 */ 1355 int 1356 sndsynack(Proto *tcp, Limbo *lp) 1357 { 1358 Block *hbp; 1359 Tcp4hdr ph4; 1360 Tcp6hdr ph6; 1361 Tcp seg; 1362 int scale; 1363 1364 /* make pseudo header */ 1365 switch(lp->version) { 1366 case V4: 1367 memset(&ph4, 0, sizeof(ph4)); 1368 ph4.vihl = IP_VER4; 1369 v6tov4(ph4.tcpsrc, lp->laddr); 1370 v6tov4(ph4.tcpdst, lp->raddr); 1371 ph4.proto = IP_TCPPROTO; 1372 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1373 hnputs(ph4.tcpsport, lp->lport); 1374 hnputs(ph4.tcpdport, lp->rport); 1375 break; 1376 case V6: 1377 memset(&ph6, 0, sizeof(ph6)); 1378 ph6.vcf[0] = IP_VER6; 1379 ipmove(ph6.tcpsrc, lp->laddr); 1380 ipmove(ph6.tcpdst, lp->raddr); 1381 ph6.proto = IP_TCPPROTO; 1382 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1383 hnputs(ph6.tcpsport, lp->lport); 1384 hnputs(ph6.tcpdport, lp->rport); 1385 break; 1386 default: 1387 panic("sndrst: version %d", lp->version); 1388 } 1389 1390 seg.seq = lp->iss; 1391 seg.ack = lp->irs+1; 1392 seg.flags = SYN|ACK; 1393 seg.urg = 0; 1394 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); 1395 seg.wnd = QMAX; 1396 1397 /* if the other side set scale, we should too */ 1398 if(lp->rcvscale){ 1399 seg.ws = scale; 1400 lp->sndscale = scale; 1401 } else { 1402 seg.ws = 0; 1403 lp->sndscale = 0; 1404 } 1405 1406 switch(lp->version) { 1407 case V4: 1408 hbp = htontcp4(&seg, nil, &ph4, nil); 1409 if(hbp == nil) 1410 return -1; 1411 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1412 break; 1413 case V6: 1414 hbp = htontcp6(&seg, nil, &ph6, nil); 1415 if(hbp == nil) 1416 return -1; 1417 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1418 break; 1419 default: 1420 panic("sndsnack: version %d", lp->version); 1421 } 1422 lp->lastsend = NOW; 1423 return 0; 1424 } 1425 1426 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) 1427 1428 /* 1429 * put a call into limbo and respond with a SYN ACK 1430 * 1431 * called with proto locked 1432 */ 1433 static void 1434 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) 1435 { 1436 Limbo *lp, **l; 1437 Tcppriv *tpriv; 1438 int h; 1439 1440 tpriv = s->p->priv; 1441 h = hashipa(source, seg->source); 1442 1443 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1444 lp = *l; 1445 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) 1446 continue; 1447 if(ipcmp(lp->raddr, source) != 0) 1448 continue; 1449 if(ipcmp(lp->laddr, dest) != 0) 1450 continue; 1451 1452 /* each new SYN restarts the retransmits */ 1453 lp->irs = seg->seq; 1454 break; 1455 } 1456 lp = *l; 1457 if(lp == nil){ 1458 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ 1459 lp = tpriv->lht[h]; 1460 tpriv->lht[h] = lp->next; 1461 lp->next = nil; 1462 } else { 1463 lp = malloc(sizeof(*lp)); 1464 if(lp == nil) 1465 return; 1466 tpriv->nlimbo++; 1467 } 1468 *l = lp; 1469 lp->version = version; 1470 ipmove(lp->laddr, dest); 1471 ipmove(lp->raddr, source); 1472 lp->lport = seg->dest; 1473 lp->rport = seg->source; 1474 lp->mss = seg->mss; 1475 lp->rcvscale = seg->ws; 1476 lp->irs = seg->seq; 1477 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1478 } 1479 1480 if(sndsynack(s->p, lp) < 0){ 1481 *l = lp->next; 1482 tpriv->nlimbo--; 1483 free(lp); 1484 } 1485 } 1486 1487 /* 1488 * resend SYN ACK's once every SYNACK_RXTIMER ms. 1489 */ 1490 static void 1491 limborexmit(Proto *tcp) 1492 { 1493 Tcppriv *tpriv; 1494 Limbo **l, *lp; 1495 int h; 1496 int seen; 1497 ulong now; 1498 1499 tpriv = tcp->priv; 1500 1501 if(!canqlock(tcp)) 1502 return; 1503 seen = 0; 1504 now = NOW; 1505 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ 1506 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ 1507 lp = *l; 1508 seen++; 1509 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) 1510 continue; 1511 1512 /* time it out after 1 second */ 1513 if(++(lp->rexmits) > 5){ 1514 tpriv->nlimbo--; 1515 *l = lp->next; 1516 free(lp); 1517 continue; 1518 } 1519 1520 /* if we're being attacked, don't bother resending SYN ACK's */ 1521 if(tpriv->nlimbo > 100) 1522 continue; 1523 1524 if(sndsynack(tcp, lp) < 0){ 1525 tpriv->nlimbo--; 1526 *l = lp->next; 1527 free(lp); 1528 continue; 1529 } 1530 1531 l = &lp->next; 1532 } 1533 } 1534 qunlock(tcp); 1535 } 1536 1537 /* 1538 * lookup call in limbo. if found, throw it out. 1539 * 1540 * called with proto locked 1541 */ 1542 static void 1543 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1544 { 1545 Limbo *lp, **l; 1546 int h; 1547 Tcppriv *tpriv; 1548 1549 tpriv = s->p->priv; 1550 1551 /* find a call in limbo */ 1552 h = hashipa(src, segp->source); 1553 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1554 lp = *l; 1555 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1556 continue; 1557 if(ipcmp(lp->laddr, dst) != 0) 1558 continue; 1559 if(ipcmp(lp->raddr, src) != 0) 1560 continue; 1561 1562 /* RST can only follow the SYN */ 1563 if(segp->seq == lp->irs+1){ 1564 tpriv->nlimbo--; 1565 *l = lp->next; 1566 free(lp); 1567 } 1568 break; 1569 } 1570 } 1571 1572 /* 1573 * come here when we finally get an ACK to our SYN-ACK. 1574 * lookup call in limbo. if found, create a new conversation 1575 * 1576 * called with proto locked 1577 */ 1578 static Conv* 1579 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1580 { 1581 Conv *new; 1582 Tcpctl *tcb; 1583 Tcppriv *tpriv; 1584 Tcp4hdr *h4; 1585 Tcp6hdr *h6; 1586 Limbo *lp, **l; 1587 int h; 1588 1589 /* unless it's just an ack, it can't be someone coming out of limbo */ 1590 if((segp->flags & SYN) || (segp->flags & ACK) == 0) 1591 return nil; 1592 1593 tpriv = s->p->priv; 1594 1595 /* find a call in limbo */ 1596 h = hashipa(src, segp->source); 1597 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ 1598 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d", 1599 src, segp->source, lp->raddr, lp->rport, 1600 dst, segp->dest, lp->laddr, lp->lport, 1601 version, lp->version 1602 ); 1603 1604 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1605 continue; 1606 if(ipcmp(lp->laddr, dst) != 0) 1607 continue; 1608 if(ipcmp(lp->raddr, src) != 0) 1609 continue; 1610 1611 /* we're assuming no data with the initial SYN */ 1612 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){ 1613 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux", 1614 segp->seq, lp->irs+1, segp->ack, lp->iss+1); 1615 lp = nil; 1616 } else { 1617 tpriv->nlimbo--; 1618 *l = lp->next; 1619 } 1620 break; 1621 } 1622 if(lp == nil) 1623 return nil; 1624 1625 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); 1626 if(new == nil) 1627 return nil; 1628 1629 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); 1630 tcb = (Tcpctl*)new->ptcl; 1631 tcb->flags &= ~CLONE; 1632 tcb->timer.arg = new; 1633 tcb->timer.state = TcptimerOFF; 1634 tcb->acktimer.arg = new; 1635 tcb->acktimer.state = TcptimerOFF; 1636 tcb->katimer.arg = new; 1637 tcb->katimer.state = TcptimerOFF; 1638 tcb->rtt_timer.arg = new; 1639 tcb->rtt_timer.state = TcptimerOFF; 1640 1641 tcb->irs = lp->irs; 1642 tcb->rcv.nxt = tcb->irs+1; 1643 tcb->rcv.urg = tcb->rcv.nxt; 1644 1645 tcb->iss = lp->iss; 1646 tcb->rttseq = tcb->iss; 1647 tcb->snd.wl2 = tcb->iss; 1648 tcb->snd.una = tcb->iss+1; 1649 tcb->snd.ptr = tcb->iss+1; 1650 tcb->snd.nxt = tcb->iss+1; 1651 tcb->flgcnt = 0; 1652 tcb->flags |= SYNACK; 1653 1654 /* our sending max segment size cannot be bigger than what he asked for */ 1655 if(lp->mss != 0 && lp->mss < tcb->mss) 1656 tcb->mss = lp->mss; 1657 1658 /* window scaling */ 1659 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); 1660 1661 /* the congestion window always starts out as a single segment */ 1662 tcb->snd.wnd = segp->wnd; 1663 tcb->cwind = tcb->mss; 1664 1665 /* set initial round trip time */ 1666 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; 1667 tcpsynackrtt(new); 1668 1669 free(lp); 1670 1671 /* set up proto header */ 1672 switch(version){ 1673 case V4: 1674 h4 = &tcb->protohdr.tcp4hdr; 1675 memset(h4, 0, sizeof(*h4)); 1676 h4->proto = IP_TCPPROTO; 1677 hnputs(h4->tcpsport, new->lport); 1678 hnputs(h4->tcpdport, new->rport); 1679 v6tov4(h4->tcpsrc, dst); 1680 v6tov4(h4->tcpdst, src); 1681 break; 1682 case V6: 1683 h6 = &tcb->protohdr.tcp6hdr; 1684 memset(h6, 0, sizeof(*h6)); 1685 h6->proto = IP_TCPPROTO; 1686 hnputs(h6->tcpsport, new->lport); 1687 hnputs(h6->tcpdport, new->rport); 1688 ipmove(h6->tcpsrc, dst); 1689 ipmove(h6->tcpdst, src); 1690 break; 1691 default: 1692 panic("tcpincoming: version %d", new->ipversion); 1693 } 1694 1695 tcpsetstate(new, Established); 1696 1697 iphtadd(&tpriv->ht, new); 1698 1699 return new; 1700 } 1701 1702 int 1703 seq_within(ulong x, ulong low, ulong high) 1704 { 1705 if(low <= high){ 1706 if(low <= x && x <= high) 1707 return 1; 1708 } 1709 else { 1710 if(x >= low || x <= high) 1711 return 1; 1712 } 1713 return 0; 1714 } 1715 1716 int 1717 seq_lt(ulong x, ulong y) 1718 { 1719 return (int)(x-y) < 0; 1720 } 1721 1722 int 1723 seq_le(ulong x, ulong y) 1724 { 1725 return (int)(x-y) <= 0; 1726 } 1727 1728 int 1729 seq_gt(ulong x, ulong y) 1730 { 1731 return (int)(x-y) > 0; 1732 } 1733 1734 int 1735 seq_ge(ulong x, ulong y) 1736 { 1737 return (int)(x-y) >= 0; 1738 } 1739 1740 /* 1741 * use the time between the first SYN and it's ack as the 1742 * initial round trip time 1743 */ 1744 void 1745 tcpsynackrtt(Conv *s) 1746 { 1747 Tcpctl *tcb; 1748 int delta; 1749 Tcppriv *tpriv; 1750 1751 tcb = (Tcpctl*)s->ptcl; 1752 tpriv = s->p->priv; 1753 1754 delta = NOW - tcb->sndsyntime; 1755 tcb->srtt = delta<<LOGAGAIN; 1756 tcb->mdev = delta<<LOGDGAIN; 1757 1758 /* halt round trip timer */ 1759 tcphalt(tpriv, &tcb->rtt_timer); 1760 } 1761 1762 void 1763 update(Conv *s, Tcp *seg) 1764 { 1765 int rtt, delta; 1766 Tcpctl *tcb; 1767 ulong acked; 1768 ulong expand; 1769 Tcppriv *tpriv; 1770 1771 tpriv = s->p->priv; 1772 tcb = (Tcpctl*)s->ptcl; 1773 1774 /* if everything has been acked, force output(?) */ 1775 if(seq_gt(seg->ack, tcb->snd.nxt)) { 1776 tcb->flags |= FORCE; 1777 return; 1778 } 1779 1780 /* added by Dong Lin for fast retransmission */ 1781 if(seg->ack == tcb->snd.una 1782 && tcb->snd.una != tcb->snd.nxt 1783 && seg->len == 0 1784 && seg->wnd == tcb->snd.wnd) { 1785 1786 /* this is a pure ack w/o window update */ 1787 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n", 1788 tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); 1789 1790 if(++tcb->snd.dupacks == TCPREXMTTHRESH) { 1791 /* 1792 * tahoe tcp rxt the packet, half sshthresh, 1793 * and set cwnd to one packet 1794 */ 1795 tcb->snd.recovery = 1; 1796 tcb->snd.rxt = tcb->snd.nxt; 1797 netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); 1798 tcprxmit(s); 1799 } else { 1800 /* do reno tcp here. */ 1801 } 1802 } 1803 1804 /* 1805 * update window 1806 */ 1807 if(seq_gt(seg->ack, tcb->snd.wl2) 1808 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ 1809 tcb->snd.wnd = seg->wnd; 1810 tcb->snd.wl2 = seg->ack; 1811 } 1812 1813 if(!seq_gt(seg->ack, tcb->snd.una)){ 1814 /* 1815 * don't let us hangup if sending into a closed window and 1816 * we're still getting acks 1817 */ 1818 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ 1819 tcb->backedoff = MAXBACKMS/4; 1820 } 1821 return; 1822 } 1823 1824 /* 1825 * any positive ack turns off fast rxt, 1826 * (should we do new-reno on partial acks?) 1827 */ 1828 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { 1829 tcb->snd.dupacks = 0; 1830 tcb->snd.recovery = 0; 1831 } else 1832 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind); 1833 1834 /* Compute the new send window size */ 1835 acked = seg->ack - tcb->snd.una; 1836 1837 /* avoid slow start and timers for SYN acks */ 1838 if((tcb->flags & SYNACK) == 0) { 1839 tcb->flags |= SYNACK; 1840 acked--; 1841 tcb->flgcnt--; 1842 goto done; 1843 } 1844 1845 /* slow start as long as we're not recovering from lost packets */ 1846 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { 1847 if(tcb->cwind < tcb->ssthresh) { 1848 expand = tcb->mss; 1849 if(acked < expand) 1850 expand = acked; 1851 } 1852 else 1853 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; 1854 1855 if(tcb->cwind + expand < tcb->cwind) 1856 expand = tcb->snd.wnd - tcb->cwind; 1857 if(tcb->cwind + expand > tcb->snd.wnd) 1858 expand = tcb->snd.wnd - tcb->cwind; 1859 tcb->cwind += expand; 1860 } 1861 1862 /* Adjust the timers according to the round trip time */ 1863 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { 1864 tcphalt(tpriv, &tcb->rtt_timer); 1865 if((tcb->flags&RETRAN) == 0) { 1866 tcb->backoff = 0; 1867 tcb->backedoff = 0; 1868 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; 1869 if(rtt == 0) 1870 rtt = 1; /* otherwise all close systems will rexmit in 0 time */ 1871 rtt *= MSPTICK; 1872 if(tcb->srtt == 0) { 1873 tcb->srtt = rtt << LOGAGAIN; 1874 tcb->mdev = rtt << LOGDGAIN; 1875 } else { 1876 delta = rtt - (tcb->srtt>>LOGAGAIN); 1877 tcb->srtt += delta; 1878 if(tcb->srtt <= 0) 1879 tcb->srtt = 1; 1880 1881 delta = abs(delta) - (tcb->mdev>>LOGDGAIN); 1882 tcb->mdev += delta; 1883 if(tcb->mdev <= 0) 1884 tcb->mdev = 1; 1885 } 1886 tcpsettimer(tcb); 1887 } 1888 } 1889 1890 done: 1891 if(qdiscard(s->wq, acked) < acked) 1892 tcb->flgcnt--; 1893 1894 tcb->snd.una = seg->ack; 1895 if(seq_gt(seg->ack, tcb->snd.urg)) 1896 tcb->snd.urg = seg->ack; 1897 1898 if(tcb->snd.una != tcb->snd.nxt) 1899 tcpgo(tpriv, &tcb->timer); 1900 else 1901 tcphalt(tpriv, &tcb->timer); 1902 1903 if(seq_lt(tcb->snd.ptr, tcb->snd.una)) 1904 tcb->snd.ptr = tcb->snd.una; 1905 1906 tcb->flags &= ~RETRAN; 1907 tcb->backoff = 0; 1908 tcb->backedoff = 0; 1909 } 1910 1911 void 1912 tcpiput(Proto *tcp, Ipifc*, Block *bp) 1913 { 1914 Tcp seg; 1915 Tcp4hdr *h4; 1916 Tcp6hdr *h6; 1917 int hdrlen; 1918 Tcpctl *tcb; 1919 ushort length; 1920 uchar source[IPaddrlen], dest[IPaddrlen]; 1921 Conv *s; 1922 Fs *f; 1923 Tcppriv *tpriv; 1924 uchar version; 1925 1926 f = tcp->f; 1927 tpriv = tcp->priv; 1928 1929 tpriv->stats[InSegs]++; 1930 1931 h4 = (Tcp4hdr*)(bp->rp); 1932 h6 = (Tcp6hdr*)(bp->rp); 1933 1934 if((h4->vihl&0xF0)==IP_VER4) { 1935 version = V4; 1936 length = nhgets(h4->length); 1937 v4tov6(dest, h4->tcpdst); 1938 v4tov6(source, h4->tcpsrc); 1939 1940 h4->Unused = 0; 1941 hnputs(h4->tcplen, length-TCP4_PKT); 1942 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && 1943 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) { 1944 tpriv->stats[CsumErrs]++; 1945 tpriv->stats[InErrs]++; 1946 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1947 freeblist(bp); 1948 return; 1949 } 1950 1951 hdrlen = ntohtcp4(&seg, &bp); 1952 if(hdrlen < 0){ 1953 tpriv->stats[HlenErrs]++; 1954 tpriv->stats[InErrs]++; 1955 netlog(f, Logtcp, "bad tcp hdr len\n"); 1956 return; 1957 } 1958 1959 /* trim the packet to the size claimed by the datagram */ 1960 length -= hdrlen+TCP4_PKT; 1961 bp = trimblock(bp, hdrlen+TCP4_PKT, length); 1962 if(bp == nil){ 1963 tpriv->stats[LenErrs]++; 1964 tpriv->stats[InErrs]++; 1965 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 1966 return; 1967 } 1968 } 1969 else { 1970 int ttl = h6->ttl; 1971 int proto = h6->proto; 1972 1973 version = V6; 1974 length = nhgets(h6->ploadlen); 1975 ipmove(dest, h6->tcpdst); 1976 ipmove(source, h6->tcpsrc); 1977 1978 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; 1979 h6->ttl = proto; 1980 hnputl(h6->vcf, length); 1981 if((h6->tcpcksum[0] || h6->tcpcksum[1]) && 1982 ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) { 1983 tpriv->stats[CsumErrs]++; 1984 tpriv->stats[InErrs]++; 1985 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1986 freeblist(bp); 1987 return; 1988 } 1989 h6->ttl = ttl; 1990 h6->proto = proto; 1991 hnputs(h6->ploadlen, length); 1992 1993 hdrlen = ntohtcp6(&seg, &bp); 1994 if(hdrlen < 0){ 1995 tpriv->stats[HlenErrs]++; 1996 tpriv->stats[InErrs]++; 1997 netlog(f, Logtcp, "bad tcp hdr len\n"); 1998 return; 1999 } 2000 2001 /* trim the packet to the size claimed by the datagram */ 2002 length -= hdrlen; 2003 bp = trimblock(bp, hdrlen+TCP6_PKT, length); 2004 if(bp == nil){ 2005 tpriv->stats[LenErrs]++; 2006 tpriv->stats[InErrs]++; 2007 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 2008 return; 2009 } 2010 } 2011 2012 /* lock protocol while searching for a conversation */ 2013 qlock(tcp); 2014 2015 /* Look for a matching conversation */ 2016 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); 2017 if(s == nil){ 2018 netlog(f, Logtcp, "iphtlook failed"); 2019 reset: 2020 qunlock(tcp); 2021 sndrst(tcp, source, dest, length, &seg, version, "no conversation"); 2022 freeblist(bp); 2023 return; 2024 } 2025 2026 /* if it's a listener, look for the right flags and get a new conv */ 2027 tcb = (Tcpctl*)s->ptcl; 2028 if(tcb->state == Listen){ 2029 if(seg.flags & RST){ 2030 limborst(s, &seg, source, dest, version); 2031 qunlock(tcp); 2032 freeblist(bp); 2033 return; 2034 } 2035 2036 /* if this is a new SYN, put the call into limbo */ 2037 if((seg.flags & SYN) && (seg.flags & ACK) == 0){ 2038 limbo(s, source, dest, &seg, version); 2039 qunlock(tcp); 2040 freeblist(bp); 2041 return; 2042 } 2043 2044 /* 2045 * if there's a matching call in limbo, tcpincoming will 2046 * return it in state Syn_received 2047 */ 2048 s = tcpincoming(s, &seg, source, dest, version); 2049 if(s == nil) 2050 goto reset; 2051 } 2052 2053 /* The rest of the input state machine is run with the control block 2054 * locked and implements the state machine directly out of the RFC. 2055 * Out-of-band data is ignored - it was always a bad idea. 2056 */ 2057 tcb = (Tcpctl*)s->ptcl; 2058 if(waserror()){ 2059 qunlock(s); 2060 nexterror(); 2061 } 2062 qlock(s); 2063 qunlock(tcp); 2064 2065 /* fix up window */ 2066 seg.wnd <<= tcb->rcv.scale; 2067 2068 /* every input packet in puts off the keep alive time out */ 2069 tcpsetkacounter(tcb); 2070 2071 switch(tcb->state) { 2072 case Closed: 2073 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2074 goto raise; 2075 case Syn_sent: 2076 if(seg.flags & ACK) { 2077 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) { 2078 sndrst(tcp, source, dest, length, &seg, version, 2079 "bad seq in Syn_sent"); 2080 goto raise; 2081 } 2082 } 2083 if(seg.flags & RST) { 2084 if(seg.flags & ACK) 2085 localclose(s, Econrefused); 2086 goto raise; 2087 } 2088 2089 if(seg.flags & SYN) { 2090 procsyn(s, &seg); 2091 if(seg.flags & ACK){ 2092 update(s, &seg); 2093 tcpsynackrtt(s); 2094 tcpsetstate(s, Established); 2095 tcpsetscale(s, tcb, seg.ws, tcb->scale); 2096 } 2097 else { 2098 tcb->time = NOW; 2099 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */ 2100 } 2101 2102 if(length != 0 || (seg.flags & FIN)) 2103 break; 2104 2105 freeblist(bp); 2106 goto output; 2107 } 2108 else 2109 freeblist(bp); 2110 2111 qunlock(s); 2112 poperror(); 2113 return; 2114 case Syn_received: 2115 /* doesn't matter if it's the correct ack, we're just trying to set timing */ 2116 if(seg.flags & ACK) 2117 tcpsynackrtt(s); 2118 break; 2119 } 2120 2121 /* 2122 * One DOS attack is to open connections to us and then forget about them, 2123 * thereby tying up a conv at no long term cost to the attacker. 2124 * This is an attempt to defeat these stateless DOS attacks. See 2125 * corresponding code in tcpsendka(). 2126 */ 2127 if(tcb->state != Syn_received && (seg.flags & RST) == 0){ 2128 if(tcpporthogdefense 2129 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ 2130 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", 2131 source, seg.source, dest, seg.dest, seg.flags, 2132 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); 2133 localclose(s, "stateless hog"); 2134 } 2135 } 2136 2137 /* Cut the data to fit the receive window */ 2138 if(tcptrim(tcb, &seg, &bp, &length) == -1) { 2139 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); 2140 update(s, &seg); 2141 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { 2142 tcphalt(tpriv, &tcb->rtt_timer); 2143 tcphalt(tpriv, &tcb->acktimer); 2144 tcphalt(tpriv, &tcb->katimer); 2145 tcpsetstate(s, Time_wait); 2146 tcb->timer.start = MSL2*(1000 / MSPTICK); 2147 tcpgo(tpriv, &tcb->timer); 2148 } 2149 if(!(seg.flags & RST)) { 2150 tcb->flags |= FORCE; 2151 goto output; 2152 } 2153 qunlock(s); 2154 poperror(); 2155 return; 2156 } 2157 2158 /* Cannot accept so answer with a rst */ 2159 if(length && tcb->state == Closed) { 2160 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2161 goto raise; 2162 } 2163 2164 /* The segment is beyond the current receive pointer so 2165 * queue the data in the resequence queue 2166 */ 2167 if(seg.seq != tcb->rcv.nxt) 2168 if(length != 0 || (seg.flags & (SYN|FIN))) { 2169 update(s, &seg); 2170 if(addreseq(tcb, tpriv, &seg, bp, length) < 0) 2171 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); 2172 tcb->flags |= FORCE; 2173 goto output; 2174 } 2175 2176 /* 2177 * keep looping till we've processed this packet plus any 2178 * adjacent packets in the resequence queue 2179 */ 2180 for(;;) { 2181 if(seg.flags & RST) { 2182 if(tcb->state == Established) { 2183 tpriv->stats[EstabResets]++; 2184 if(tcb->rcv.nxt != seg.seq) 2185 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); 2186 } 2187 localclose(s, Econrefused); 2188 goto raise; 2189 } 2190 2191 if((seg.flags&ACK) == 0) 2192 goto raise; 2193 2194 switch(tcb->state) { 2195 case Syn_received: 2196 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){ 2197 sndrst(tcp, source, dest, length, &seg, version, 2198 "bad seq in Syn_received"); 2199 goto raise; 2200 } 2201 update(s, &seg); 2202 tcpsetstate(s, Established); 2203 case Established: 2204 case Close_wait: 2205 update(s, &seg); 2206 break; 2207 case Finwait1: 2208 update(s, &seg); 2209 if(qlen(s->wq)+tcb->flgcnt == 0){ 2210 tcphalt(tpriv, &tcb->rtt_timer); 2211 tcphalt(tpriv, &tcb->acktimer); 2212 tcpsetkacounter(tcb); 2213 tcb->time = NOW; 2214 tcpsetstate(s, Finwait2); 2215 tcb->katimer.start = MSL2 * (1000 / MSPTICK); 2216 tcpgo(tpriv, &tcb->katimer); 2217 } 2218 break; 2219 case Finwait2: 2220 update(s, &seg); 2221 break; 2222 case Closing: 2223 update(s, &seg); 2224 if(qlen(s->wq)+tcb->flgcnt == 0) { 2225 tcphalt(tpriv, &tcb->rtt_timer); 2226 tcphalt(tpriv, &tcb->acktimer); 2227 tcphalt(tpriv, &tcb->katimer); 2228 tcpsetstate(s, Time_wait); 2229 tcb->timer.start = MSL2*(1000 / MSPTICK); 2230 tcpgo(tpriv, &tcb->timer); 2231 } 2232 break; 2233 case Last_ack: 2234 update(s, &seg); 2235 if(qlen(s->wq)+tcb->flgcnt == 0) { 2236 localclose(s, nil); 2237 goto raise; 2238 } 2239 case Time_wait: 2240 tcb->flags |= FORCE; 2241 if(tcb->timer.state != TcptimerON) 2242 tcpgo(tpriv, &tcb->timer); 2243 } 2244 2245 if((seg.flags&URG) && seg.urg) { 2246 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { 2247 tcb->rcv.urg = seg.urg + seg.seq; 2248 pullblock(&bp, seg.urg); 2249 } 2250 } 2251 else 2252 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) 2253 tcb->rcv.urg = tcb->rcv.nxt; 2254 2255 if(length == 0) { 2256 if(bp != nil) 2257 freeblist(bp); 2258 } 2259 else { 2260 switch(tcb->state){ 2261 default: 2262 /* Ignore segment text */ 2263 if(bp != nil) 2264 freeblist(bp); 2265 break; 2266 2267 case Syn_received: 2268 case Established: 2269 case Finwait1: 2270 /* If we still have some data place on 2271 * receive queue 2272 */ 2273 if(bp) { 2274 bp = packblock(bp); 2275 if(bp == nil) 2276 panic("tcp packblock"); 2277 qpassnolim(s->rq, bp); 2278 bp = nil; 2279 2280 /* 2281 * Force an ack every 2 data messages. This is 2282 * a hack for rob to make his home system run 2283 * faster. 2284 * 2285 * this also keeps the standard TCP congestion 2286 * control working since it needs an ack every 2287 * 2 max segs worth. This is not quite that, 2288 * but under a real stream is equivalent since 2289 * every packet has a max seg in it. 2290 */ 2291 if(++(tcb->rcv.una) >= 2) 2292 tcb->flags |= FORCE; 2293 } 2294 tcb->rcv.nxt += length; 2295 2296 /* 2297 * update our rcv window 2298 */ 2299 tcprcvwin(s); 2300 2301 /* 2302 * turn on the acktimer if there's something 2303 * to ack 2304 */ 2305 if(tcb->acktimer.state != TcptimerON) 2306 tcpgo(tpriv, &tcb->acktimer); 2307 2308 break; 2309 case Finwait2: 2310 /* no process to read the data, send a reset */ 2311 if(bp != nil) 2312 freeblist(bp); 2313 sndrst(tcp, source, dest, length, &seg, version, 2314 "send to Finwait2"); 2315 qunlock(s); 2316 poperror(); 2317 return; 2318 } 2319 } 2320 2321 if(seg.flags & FIN) { 2322 tcb->flags |= FORCE; 2323 2324 switch(tcb->state) { 2325 case Syn_received: 2326 case Established: 2327 tcb->rcv.nxt++; 2328 tcpsetstate(s, Close_wait); 2329 break; 2330 case Finwait1: 2331 tcb->rcv.nxt++; 2332 if(qlen(s->wq)+tcb->flgcnt == 0) { 2333 tcphalt(tpriv, &tcb->rtt_timer); 2334 tcphalt(tpriv, &tcb->acktimer); 2335 tcphalt(tpriv, &tcb->katimer); 2336 tcpsetstate(s, Time_wait); 2337 tcb->timer.start = MSL2*(1000/MSPTICK); 2338 tcpgo(tpriv, &tcb->timer); 2339 } 2340 else 2341 tcpsetstate(s, Closing); 2342 break; 2343 case Finwait2: 2344 tcb->rcv.nxt++; 2345 tcphalt(tpriv, &tcb->rtt_timer); 2346 tcphalt(tpriv, &tcb->acktimer); 2347 tcphalt(tpriv, &tcb->katimer); 2348 tcpsetstate(s, Time_wait); 2349 tcb->timer.start = MSL2 * (1000/MSPTICK); 2350 tcpgo(tpriv, &tcb->timer); 2351 break; 2352 case Close_wait: 2353 case Closing: 2354 case Last_ack: 2355 break; 2356 case Time_wait: 2357 tcpgo(tpriv, &tcb->timer); 2358 break; 2359 } 2360 } 2361 2362 /* 2363 * get next adjacent segment from the resequence queue. 2364 * dump/trim any overlapping segments 2365 */ 2366 for(;;) { 2367 if(tcb->reseq == nil) 2368 goto output; 2369 2370 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) 2371 goto output; 2372 2373 getreseq(tcb, &seg, &bp, &length); 2374 2375 if(tcptrim(tcb, &seg, &bp, &length) == 0) 2376 break; 2377 } 2378 } 2379 output: 2380 tcpoutput(s); 2381 qunlock(s); 2382 poperror(); 2383 return; 2384 raise: 2385 qunlock(s); 2386 poperror(); 2387 freeblist(bp); 2388 tcpkick(s); 2389 } 2390 2391 /* 2392 * always enters and exits with the s locked. We drop 2393 * the lock to ipoput the packet so some care has to be 2394 * taken by callers. 2395 */ 2396 void 2397 tcpoutput(Conv *s) 2398 { 2399 Tcp seg; 2400 int msgs; 2401 Tcpctl *tcb; 2402 Block *hbp, *bp; 2403 int sndcnt, n; 2404 ulong ssize, dsize, usable, sent; 2405 Fs *f; 2406 Tcppriv *tpriv; 2407 uchar version; 2408 2409 f = s->p->f; 2410 tpriv = s->p->priv; 2411 version = s->ipversion; 2412 2413 for(msgs = 0; msgs < 100; msgs++) { 2414 tcb = (Tcpctl*)s->ptcl; 2415 2416 switch(tcb->state) { 2417 case Listen: 2418 case Closed: 2419 case Finwait2: 2420 return; 2421 } 2422 2423 /* force an ack when a window has opened up */ 2424 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ 2425 tcb->rcv.blocked = 0; 2426 tcb->flags |= FORCE; 2427 } 2428 2429 sndcnt = qlen(s->wq)+tcb->flgcnt; 2430 sent = tcb->snd.ptr - tcb->snd.una; 2431 2432 /* Don't send anything else until our SYN has been acked */ 2433 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) 2434 break; 2435 2436 /* Compute usable segment based on offered window and limit 2437 * window probes to one 2438 */ 2439 if(tcb->snd.wnd == 0){ 2440 if(sent != 0) { 2441 if((tcb->flags&FORCE) == 0) 2442 break; 2443 // tcb->snd.ptr = tcb->snd.una; 2444 } 2445 usable = 1; 2446 } 2447 else { 2448 usable = tcb->cwind; 2449 if(tcb->snd.wnd < usable) 2450 usable = tcb->snd.wnd; 2451 usable -= sent; 2452 } 2453 ssize = sndcnt-sent; 2454 if(ssize && usable < 2) 2455 netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n", 2456 tcb->snd.wnd, tcb->cwind); 2457 if(usable < ssize) 2458 ssize = usable; 2459 if(tcb->mss < ssize) 2460 ssize = tcb->mss; 2461 dsize = ssize; 2462 seg.urg = 0; 2463 2464 if(ssize == 0) 2465 if((tcb->flags&FORCE) == 0) 2466 break; 2467 2468 tcb->flags &= ~FORCE; 2469 tcprcvwin(s); 2470 2471 /* By default we will generate an ack */ 2472 tcphalt(tpriv, &tcb->acktimer); 2473 tcb->rcv.una = 0; 2474 seg.source = s->lport; 2475 seg.dest = s->rport; 2476 seg.flags = ACK; 2477 seg.mss = 0; 2478 seg.ws = 0; 2479 switch(tcb->state){ 2480 case Syn_sent: 2481 seg.flags = 0; 2482 if(tcb->snd.ptr == tcb->iss){ 2483 seg.flags |= SYN; 2484 dsize--; 2485 seg.mss = tcb->mss; 2486 seg.ws = tcb->scale; 2487 } 2488 break; 2489 case Syn_received: 2490 /* 2491 * don't send any data with a SYN/ACK packet 2492 * because Linux rejects the packet in its 2493 * attempt to solve the SYN attack problem 2494 */ 2495 if(tcb->snd.ptr == tcb->iss){ 2496 seg.flags |= SYN; 2497 dsize = 0; 2498 ssize = 1; 2499 seg.mss = tcb->mss; 2500 seg.ws = tcb->scale; 2501 } 2502 break; 2503 } 2504 seg.seq = tcb->snd.ptr; 2505 seg.ack = tcb->rcv.nxt; 2506 seg.wnd = tcb->rcv.wnd; 2507 2508 /* Pull out data to send */ 2509 bp = nil; 2510 if(dsize != 0) { 2511 bp = qcopy(s->wq, dsize, sent); 2512 if(BLEN(bp) != dsize) { 2513 seg.flags |= FIN; 2514 dsize--; 2515 } 2516 } 2517 2518 if(sent+dsize == sndcnt) 2519 seg.flags |= PSH; 2520 2521 /* keep track of balance of resent data */ 2522 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { 2523 n = tcb->snd.nxt - tcb->snd.ptr; 2524 if(ssize < n) 2525 n = ssize; 2526 tcb->resent += n; 2527 netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n", 2528 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); 2529 tpriv->stats[RetransSegs]++; 2530 } 2531 2532 tcb->snd.ptr += ssize; 2533 2534 /* Pull up the send pointer so we can accept acks 2535 * for this window 2536 */ 2537 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt)) 2538 tcb->snd.nxt = tcb->snd.ptr; 2539 2540 /* Build header, link data and compute cksum */ 2541 switch(version){ 2542 case V4: 2543 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2544 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); 2545 if(hbp == nil) { 2546 freeblist(bp); 2547 return; 2548 } 2549 break; 2550 case V6: 2551 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2552 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); 2553 if(hbp == nil) { 2554 freeblist(bp); 2555 return; 2556 } 2557 break; 2558 default: 2559 hbp = nil; /* to suppress a warning */ 2560 panic("tcpoutput: version %d", version); 2561 } 2562 2563 /* Start the transmission timers if there is new data and we 2564 * expect acknowledges 2565 */ 2566 if(ssize != 0){ 2567 if(tcb->timer.state != TcptimerON) 2568 tcpgo(tpriv, &tcb->timer); 2569 2570 /* If round trip timer isn't running, start it. 2571 * measure the longest packet only in case the 2572 * transmission time dominates RTT 2573 */ 2574 if(tcb->rtt_timer.state != TcptimerON) 2575 if(ssize == tcb->mss) { 2576 tcpgo(tpriv, &tcb->rtt_timer); 2577 tcb->rttseq = tcb->snd.ptr; 2578 } 2579 } 2580 2581 tpriv->stats[OutSegs]++; 2582 2583 /* put off the next keep alive */ 2584 tcpgo(tpriv, &tcb->katimer); 2585 2586 switch(version){ 2587 case V4: 2588 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2589 /* a negative return means no route */ 2590 localclose(s, "no route"); 2591 } 2592 break; 2593 case V6: 2594 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2595 /* a negative return means no route */ 2596 localclose(s, "no route"); 2597 } 2598 break; 2599 default: 2600 panic("tcpoutput2: version %d", version); 2601 } 2602 if((msgs%4) == 1){ 2603 qunlock(s); 2604 sched(); 2605 qlock(s); 2606 } 2607 } 2608 } 2609 2610 /* 2611 * the BSD convention (hack?) for keep alives. resend last uchar acked. 2612 */ 2613 void 2614 tcpsendka(Conv *s) 2615 { 2616 Tcp seg; 2617 Tcpctl *tcb; 2618 Block *hbp,*dbp; 2619 2620 tcb = (Tcpctl*)s->ptcl; 2621 2622 dbp = nil; 2623 seg.urg = 0; 2624 seg.source = s->lport; 2625 seg.dest = s->rport; 2626 seg.flags = ACK|PSH; 2627 seg.mss = 0; 2628 seg.ws = 0; 2629 if(tcpporthogdefense) 2630 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); 2631 else 2632 seg.seq = tcb->snd.una-1; 2633 seg.ack = tcb->rcv.nxt; 2634 tcb->rcv.una = 0; 2635 seg.wnd = tcb->rcv.wnd; 2636 if(tcb->state == Finwait2){ 2637 seg.flags |= FIN; 2638 } else { 2639 dbp = allocb(1); 2640 dbp->wp++; 2641 } 2642 2643 if(isv4(s->raddr)) { 2644 /* Build header, link data and compute cksum */ 2645 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2646 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); 2647 if(hbp == nil) { 2648 freeblist(dbp); 2649 return; 2650 } 2651 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 2652 } 2653 else { 2654 /* Build header, link data and compute cksum */ 2655 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2656 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); 2657 if(hbp == nil) { 2658 freeblist(dbp); 2659 return; 2660 } 2661 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 2662 } 2663 } 2664 2665 /* 2666 * set connection to time out after 12 minutes 2667 */ 2668 void 2669 tcpsetkacounter(Tcpctl *tcb) 2670 { 2671 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); 2672 if(tcb->kacounter < 3) 2673 tcb->kacounter = 3; 2674 } 2675 2676 /* 2677 * if we've timed out, close the connection 2678 * otherwise, send a keepalive and restart the timer 2679 */ 2680 void 2681 tcpkeepalive(void *v) 2682 { 2683 Tcpctl *tcb; 2684 Conv *s; 2685 2686 s = v; 2687 tcb = (Tcpctl*)s->ptcl; 2688 if(waserror()){ 2689 qunlock(s); 2690 nexterror(); 2691 } 2692 qlock(s); 2693 if(tcb->state != Closed){ 2694 if(--(tcb->kacounter) <= 0) { 2695 localclose(s, Etimedout); 2696 } else { 2697 tcpsendka(s); 2698 tcpgo(s->p->priv, &tcb->katimer); 2699 } 2700 } 2701 qunlock(s); 2702 poperror(); 2703 } 2704 2705 /* 2706 * start keepalive timer 2707 */ 2708 char* 2709 tcpstartka(Conv *s, char **f, int n) 2710 { 2711 Tcpctl *tcb; 2712 int x; 2713 2714 tcb = (Tcpctl*)s->ptcl; 2715 if(tcb->state != Established) 2716 return "connection must be in Establised state"; 2717 if(n > 1){ 2718 x = atoi(f[1]); 2719 if(x >= MSPTICK) 2720 tcb->katimer.start = x/MSPTICK; 2721 } 2722 tcpsetkacounter(tcb); 2723 tcpgo(s->p->priv, &tcb->katimer); 2724 2725 return nil; 2726 } 2727 2728 /* 2729 * turn checksums on/off 2730 */ 2731 char* 2732 tcpsetchecksum(Conv *s, char **f, int) 2733 { 2734 Tcpctl *tcb; 2735 2736 tcb = (Tcpctl*)s->ptcl; 2737 tcb->nochecksum = !atoi(f[1]); 2738 2739 return nil; 2740 } 2741 2742 void 2743 tcprxmit(Conv *s) 2744 { 2745 Tcpctl *tcb; 2746 2747 tcb = (Tcpctl*)s->ptcl; 2748 2749 tcb->flags |= RETRAN|FORCE; 2750 tcb->snd.ptr = tcb->snd.una; 2751 2752 /* 2753 * We should be halving the slow start threshhold (down to one 2754 * mss) but leaving it at mss seems to work well enough 2755 */ 2756 tcb->ssthresh = tcb->mss; 2757 2758 /* 2759 * pull window down to a single packet 2760 */ 2761 tcb->cwind = tcb->mss; 2762 tcpoutput(s); 2763 } 2764 2765 void 2766 tcptimeout(void *arg) 2767 { 2768 Conv *s; 2769 Tcpctl *tcb; 2770 int maxback; 2771 Tcppriv *tpriv; 2772 2773 s = (Conv*)arg; 2774 tpriv = s->p->priv; 2775 tcb = (Tcpctl*)s->ptcl; 2776 2777 if(waserror()){ 2778 qunlock(s); 2779 nexterror(); 2780 } 2781 qlock(s); 2782 switch(tcb->state){ 2783 default: 2784 tcb->backoff++; 2785 if(tcb->state == Syn_sent) 2786 maxback = MAXBACKMS/2; 2787 else 2788 maxback = MAXBACKMS; 2789 tcb->backedoff += tcb->timer.start * MSPTICK; 2790 if(tcb->backedoff >= maxback) { 2791 localclose(s, Etimedout); 2792 break; 2793 } 2794 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); 2795 tcpsettimer(tcb); 2796 tcprxmit(s); 2797 tpriv->stats[RetransTimeouts]++; 2798 tcb->snd.dupacks = 0; 2799 break; 2800 case Time_wait: 2801 localclose(s, nil); 2802 break; 2803 case Closed: 2804 break; 2805 } 2806 qunlock(s); 2807 poperror(); 2808 } 2809 2810 int 2811 inwindow(Tcpctl *tcb, int seq) 2812 { 2813 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); 2814 } 2815 2816 /* 2817 * set up state for a received SYN (or SYN ACK) packet 2818 */ 2819 void 2820 procsyn(Conv *s, Tcp *seg) 2821 { 2822 Tcpctl *tcb; 2823 2824 tcb = (Tcpctl*)s->ptcl; 2825 tcb->flags |= FORCE; 2826 2827 tcb->rcv.nxt = seg->seq + 1; 2828 tcb->rcv.urg = tcb->rcv.nxt; 2829 tcb->irs = seg->seq; 2830 2831 /* our sending max segment size cannot be bigger than what he asked for */ 2832 if(seg->mss != 0 && seg->mss < tcb->mss) 2833 tcb->mss = seg->mss; 2834 2835 /* the congestion window always starts out as a single segment */ 2836 tcb->snd.wnd = seg->wnd; 2837 tcb->cwind = tcb->mss; 2838 } 2839 2840 int 2841 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) 2842 { 2843 Reseq *rp, *rp1; 2844 int i, rqlen, qmax; 2845 2846 rp = malloc(sizeof(Reseq)); 2847 if(rp == nil){ 2848 freeblist(bp); /* bp always consumed by add_reseq */ 2849 return 0; 2850 } 2851 2852 rp->seg = *seg; 2853 rp->bp = bp; 2854 rp->length = length; 2855 2856 /* Place on reassembly list sorting by starting seq number */ 2857 rp1 = tcb->reseq; 2858 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { 2859 rp->next = rp1; 2860 tcb->reseq = rp; 2861 if(rp->next != nil) 2862 tpriv->stats[OutOfOrder]++; 2863 return 0; 2864 } 2865 2866 rqlen = 0; 2867 for(i = 0;; i++) { 2868 rqlen += rp1->length; 2869 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { 2870 rp->next = rp1->next; 2871 rp1->next = rp; 2872 if(rp->next != nil) 2873 tpriv->stats[OutOfOrder]++; 2874 break; 2875 } 2876 rp1 = rp1->next; 2877 } 2878 qmax = QMAX<<tcb->rcv.scale; 2879 if(rqlen > qmax){ 2880 print("resequence queue > window: %d > %d\n", rqlen, qmax); 2881 i = 0; 2882 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ 2883 print("%#lux %#lux %#ux\n", rp1->seg.seq, 2884 rp1->seg.ack, rp1->seg.flags); 2885 if(i++ > 10){ 2886 print("...\n"); 2887 break; 2888 } 2889 } 2890 2891 // delete entire reassembly queue; wait for retransmit. 2892 // - should we be smarter and only delete the tail? 2893 for(rp = tcb->reseq; rp != nil; rp = rp1){ 2894 rp1 = rp->next; 2895 freeblist(rp->bp); 2896 free(rp); 2897 } 2898 tcb->reseq = nil; 2899 2900 return -1; 2901 } 2902 return 0; 2903 } 2904 2905 void 2906 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2907 { 2908 Reseq *rp; 2909 2910 rp = tcb->reseq; 2911 if(rp == nil) 2912 return; 2913 2914 tcb->reseq = rp->next; 2915 2916 *seg = rp->seg; 2917 *bp = rp->bp; 2918 *length = rp->length; 2919 2920 free(rp); 2921 } 2922 2923 int 2924 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2925 { 2926 ushort len; 2927 uchar accept; 2928 int dupcnt, excess; 2929 2930 accept = 0; 2931 len = *length; 2932 if(seg->flags & SYN) 2933 len++; 2934 if(seg->flags & FIN) 2935 len++; 2936 2937 if(tcb->rcv.wnd == 0) { 2938 if(len == 0 && seg->seq == tcb->rcv.nxt) 2939 return 0; 2940 } 2941 else { 2942 /* Some part of the segment should be in the window */ 2943 if(inwindow(tcb,seg->seq)) 2944 accept++; 2945 else 2946 if(len != 0) { 2947 if(inwindow(tcb, seg->seq+len-1) || 2948 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1)) 2949 accept++; 2950 } 2951 } 2952 if(!accept) { 2953 freeblist(*bp); 2954 return -1; 2955 } 2956 dupcnt = tcb->rcv.nxt - seg->seq; 2957 if(dupcnt > 0){ 2958 tcb->rerecv += dupcnt; 2959 if(seg->flags & SYN){ 2960 seg->flags &= ~SYN; 2961 seg->seq++; 2962 2963 if(seg->urg > 1) 2964 seg->urg--; 2965 else 2966 seg->flags &= ~URG; 2967 dupcnt--; 2968 } 2969 if(dupcnt > 0){ 2970 pullblock(bp, (ushort)dupcnt); 2971 seg->seq += dupcnt; 2972 *length -= dupcnt; 2973 2974 if(seg->urg > dupcnt) 2975 seg->urg -= dupcnt; 2976 else { 2977 seg->flags &= ~URG; 2978 seg->urg = 0; 2979 } 2980 } 2981 } 2982 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); 2983 if(excess > 0) { 2984 tcb->rerecv += excess; 2985 *length -= excess; 2986 *bp = trimblock(*bp, 0, *length); 2987 if(*bp == nil) 2988 panic("presotto is a boofhead"); 2989 seg->flags &= ~FIN; 2990 } 2991 return 0; 2992 } 2993 2994 void 2995 tcpadvise(Proto *tcp, Block *bp, char *msg) 2996 { 2997 Tcp4hdr *h4; 2998 Tcp6hdr *h6; 2999 Tcpctl *tcb; 3000 uchar source[IPaddrlen]; 3001 uchar dest[IPaddrlen]; 3002 ushort psource, pdest; 3003 Conv *s, **p; 3004 3005 h4 = (Tcp4hdr*)(bp->rp); 3006 h6 = (Tcp6hdr*)(bp->rp); 3007 3008 if((h4->vihl&0xF0)==IP_VER4) { 3009 v4tov6(dest, h4->tcpdst); 3010 v4tov6(source, h4->tcpsrc); 3011 psource = nhgets(h4->tcpsport); 3012 pdest = nhgets(h4->tcpdport); 3013 } 3014 else { 3015 ipmove(dest, h6->tcpdst); 3016 ipmove(source, h6->tcpsrc); 3017 psource = nhgets(h6->tcpsport); 3018 pdest = nhgets(h6->tcpdport); 3019 } 3020 3021 /* Look for a connection */ 3022 qlock(tcp); 3023 for(p = tcp->conv; *p; p++) { 3024 s = *p; 3025 tcb = (Tcpctl*)s->ptcl; 3026 if(s->rport == pdest) 3027 if(s->lport == psource) 3028 if(tcb->state != Closed) 3029 if(ipcmp(s->raddr, dest) == 0) 3030 if(ipcmp(s->laddr, source) == 0){ 3031 qlock(s); 3032 qunlock(tcp); 3033 switch(tcb->state){ 3034 case Syn_sent: 3035 localclose(s, msg); 3036 break; 3037 } 3038 qunlock(s); 3039 freeblist(bp); 3040 return; 3041 } 3042 } 3043 qunlock(tcp); 3044 freeblist(bp); 3045 } 3046 3047 static char* 3048 tcpporthogdefensectl(char *val) 3049 { 3050 if(strcmp(val, "on") == 0) 3051 tcpporthogdefense = 1; 3052 else if(strcmp(val, "off") == 0) 3053 tcpporthogdefense = 0; 3054 else 3055 return "unknown value for tcpporthogdefense"; 3056 return nil; 3057 } 3058 3059 /* called with c qlocked */ 3060 char* 3061 tcpctl(Conv* c, char** f, int n) 3062 { 3063 if(n == 1 && strcmp(f[0], "hangup") == 0) 3064 return tcphangup(c); 3065 if(n >= 1 && strcmp(f[0], "keepalive") == 0) 3066 return tcpstartka(c, f, n); 3067 if(n >= 1 && strcmp(f[0], "checksum") == 0) 3068 return tcpsetchecksum(c, f, n); 3069 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) 3070 return tcpporthogdefensectl(f[1]); 3071 return "unknown control request"; 3072 } 3073 3074 int 3075 tcpstats(Proto *tcp, char *buf, int len) 3076 { 3077 Tcppriv *priv; 3078 char *p, *e; 3079 int i; 3080 3081 priv = tcp->priv; 3082 p = buf; 3083 e = p+len; 3084 for(i = 0; i < Nstats; i++) 3085 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); 3086 return p - buf; 3087 } 3088 3089 /* 3090 * garbage collect any stale conversations: 3091 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) 3092 * - Finwait2 after 5 minutes 3093 * 3094 * this is called whenever we run out of channels. Both checks are 3095 * of questionable validity so we try to use them only when we're 3096 * up against the wall. 3097 */ 3098 int 3099 tcpgc(Proto *tcp) 3100 { 3101 Conv *c, **pp, **ep; 3102 int n; 3103 Tcpctl *tcb; 3104 3105 3106 n = 0; 3107 ep = &tcp->conv[tcp->nc]; 3108 for(pp = tcp->conv; pp < ep; pp++) { 3109 c = *pp; 3110 if(c == nil) 3111 break; 3112 if(!canqlock(c)) 3113 continue; 3114 tcb = (Tcpctl*)c->ptcl; 3115 switch(tcb->state){ 3116 case Syn_received: 3117 if(NOW - tcb->time > 5000){ 3118 localclose(c, "timed out"); 3119 n++; 3120 } 3121 break; 3122 case Finwait2: 3123 if(NOW - tcb->time > 5*60*1000){ 3124 localclose(c, "timed out"); 3125 n++; 3126 } 3127 break; 3128 } 3129 qunlock(c); 3130 } 3131 return n; 3132 } 3133 3134 void 3135 tcpsettimer(Tcpctl *tcb) 3136 { 3137 int x; 3138 3139 /* round trip dependency */ 3140 x = backoff(tcb->backoff) * 3141 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; 3142 3143 /* bounded twixt 1/2 and 64 seconds */ 3144 if(x < 500/MSPTICK) 3145 x = 500/MSPTICK; 3146 else if(x > (64000/MSPTICK)) 3147 x = 64000/MSPTICK; 3148 tcb->timer.start = x; 3149 } 3150 3151 void 3152 tcpinit(Fs *fs) 3153 { 3154 Proto *tcp; 3155 Tcppriv *tpriv; 3156 3157 tcp = smalloc(sizeof(Proto)); 3158 tpriv = tcp->priv = smalloc(sizeof(Tcppriv)); 3159 tcp->name = "tcp"; 3160 tcp->connect = tcpconnect; 3161 tcp->announce = tcpannounce; 3162 tcp->ctl = tcpctl; 3163 tcp->state = tcpstate; 3164 tcp->create = tcpcreate; 3165 tcp->close = tcpclose; 3166 tcp->rcv = tcpiput; 3167 tcp->advise = tcpadvise; 3168 tcp->stats = tcpstats; 3169 tcp->inuse = tcpinuse; 3170 tcp->gc = tcpgc; 3171 tcp->ipproto = IP_TCPPROTO; 3172 tcp->nc = scalednconv(); 3173 tcp->ptclsize = sizeof(Tcpctl); 3174 tpriv->stats[MaxConn] = tcp->nc; 3175 3176 Fsproto(fs, tcp); 3177 } 3178 3179 void 3180 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) 3181 { 3182 if(rcvscale){ 3183 tcb->rcv.scale = rcvscale & 0xff; 3184 tcb->snd.scale = sndscale & 0xff; 3185 tcb->window = QMAX<<tcb->snd.scale; 3186 qsetlimit(s->rq, tcb->window); 3187 } else { 3188 tcb->rcv.scale = 0; 3189 tcb->snd.scale = 0; 3190 tcb->window = QMAX; 3191 qsetlimit(s->rq, tcb->window); 3192 } 3193 } 3194