1 #include "u.h" 2 #include "../port/lib.h" 3 #include "mem.h" 4 #include "dat.h" 5 #include "fns.h" 6 #include "../port/error.h" 7 8 #include "ip.h" 9 10 enum 11 { 12 QMAX = 64*1024-1, 13 IP_TCPPROTO = 6, 14 15 TCP4_IPLEN = 8, 16 TCP4_PHDRSIZE = 12, 17 TCP4_HDRSIZE = 20, 18 TCP4_TCBPHDRSZ = 40, 19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE, 20 21 TCP6_IPLEN = 0, 22 TCP6_PHDRSIZE = 40, 23 TCP6_HDRSIZE = 20, 24 TCP6_TCBPHDRSZ = 60, 25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE, 26 27 TcptimerOFF = 0, 28 TcptimerON = 1, 29 TcptimerDONE = 2, 30 MAX_TIME = (1<<20), /* Forever */ 31 TCP_ACK = 50, /* Timed ack sequence in ms */ 32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */ 33 34 URG = 0x20, /* Data marked urgent */ 35 ACK = 0x10, /* Acknowledge is valid */ 36 PSH = 0x08, /* Whole data pipe is pushed */ 37 RST = 0x04, /* Reset connection */ 38 SYN = 0x02, /* Pkt. is synchronise */ 39 FIN = 0x01, /* Start close down */ 40 41 EOLOPT = 0, 42 NOOPOPT = 1, 43 MSSOPT = 2, 44 MSS_LENGTH = 4, /* Mean segment size */ 45 WSOPT = 3, 46 WS_LENGTH = 3, /* Bits to scale window size by */ 47 MSL2 = 10, 48 MSPTICK = 50, /* Milliseconds per timer tick */ 49 DEF_MSS = 1460, /* Default mean segment */ 50 DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */ 51 DEF_RTT = 500, /* Default round trip */ 52 DEF_KAT = 120000, /* Default time (ms) between keep alives */ 53 TCP_LISTEN = 0, /* Listen connection */ 54 TCP_CONNECT = 1, /* Outgoing connection */ 55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ 56 57 TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */ 58 59 FORCE = 1, 60 CLONE = 2, 61 RETRAN = 4, 62 ACTIVE = 8, 63 SYNACK = 16, 64 65 LOGAGAIN = 3, 66 LOGDGAIN = 2, 67 68 Closed = 0, /* Connection states */ 69 Listen, 70 Syn_sent, 71 Syn_received, 72 Established, 73 Finwait1, 74 Finwait2, 75 Close_wait, 76 Closing, 77 Last_ack, 78 Time_wait, 79 80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ 81 NLHT = 256, /* hash table size, must be a power of 2 */ 82 LHTMASK = NLHT-1, 83 84 HaveWS = 1<<8, 85 }; 86 87 /* Must correspond to the enumeration above */ 88 char *tcpstates[] = 89 { 90 "Closed", "Listen", "Syn_sent", "Syn_received", 91 "Established", "Finwait1", "Finwait2", "Close_wait", 92 "Closing", "Last_ack", "Time_wait" 93 }; 94 95 typedef struct Tcptimer Tcptimer; 96 struct Tcptimer 97 { 98 Tcptimer *next; 99 Tcptimer *prev; 100 Tcptimer *readynext; 101 int state; 102 int start; 103 int count; 104 void (*func)(void*); 105 void *arg; 106 }; 107 108 /* 109 * v4 and v6 pseudo headers used for 110 * checksuming tcp 111 */ 112 typedef struct Tcp4hdr Tcp4hdr; 113 struct Tcp4hdr 114 { 115 uchar vihl; /* Version and header length */ 116 uchar tos; /* Type of service */ 117 uchar length[2]; /* packet length */ 118 uchar id[2]; /* Identification */ 119 uchar frag[2]; /* Fragment information */ 120 uchar Unused; 121 uchar proto; 122 uchar tcplen[2]; 123 uchar tcpsrc[4]; 124 uchar tcpdst[4]; 125 uchar tcpsport[2]; 126 uchar tcpdport[2]; 127 uchar tcpseq[4]; 128 uchar tcpack[4]; 129 uchar tcpflag[2]; 130 uchar tcpwin[2]; 131 uchar tcpcksum[2]; 132 uchar tcpurg[2]; 133 /* Options segment */ 134 uchar tcpopt[1]; 135 }; 136 137 typedef struct Tcp6hdr Tcp6hdr; 138 struct Tcp6hdr 139 { 140 uchar vcf[4]; 141 uchar ploadlen[2]; 142 uchar proto; 143 uchar ttl; 144 uchar tcpsrc[IPaddrlen]; 145 uchar tcpdst[IPaddrlen]; 146 uchar tcpsport[2]; 147 uchar tcpdport[2]; 148 uchar tcpseq[4]; 149 uchar tcpack[4]; 150 uchar tcpflag[2]; 151 uchar tcpwin[2]; 152 uchar tcpcksum[2]; 153 uchar tcpurg[2]; 154 /* Options segment */ 155 uchar tcpopt[1]; 156 }; 157 158 /* 159 * this represents the control info 160 * for a single packet. It is derived from 161 * a packet in ntohtcp{4,6}() and stuck into 162 * a packet in htontcp{4,6}(). 163 */ 164 typedef struct Tcp Tcp; 165 struct Tcp 166 { 167 ushort source; 168 ushort dest; 169 ulong seq; 170 ulong ack; 171 uchar flags; 172 ushort ws; /* window scale option (if not zero) */ 173 ulong wnd; 174 ushort urg; 175 ushort mss; /* max segment size option (if not zero) */ 176 ushort len; /* size of data */ 177 }; 178 179 /* 180 * this header is malloc'd to thread together fragments 181 * waiting to be coalesced 182 */ 183 typedef struct Reseq Reseq; 184 struct Reseq 185 { 186 Reseq *next; 187 Tcp seg; 188 Block *bp; 189 ushort length; 190 }; 191 192 /* 193 * the qlock in the Conv locks this structure 194 */ 195 typedef struct Tcpctl Tcpctl; 196 struct Tcpctl 197 { 198 uchar state; /* Connection state */ 199 uchar type; /* Listening or active connection */ 200 uchar code; /* Icmp code */ 201 struct { 202 ulong una; /* Unacked data pointer */ 203 ulong nxt; /* Next sequence expected */ 204 ulong ptr; /* Data pointer */ 205 ulong wnd; /* Tcp send window */ 206 ulong urg; /* Urgent data pointer */ 207 ulong wl2; 208 int scale; /* how much to right shift window in xmitted packets */ 209 /* to implement tahoe and reno TCP */ 210 ulong dupacks; /* number of duplicate acks rcvd */ 211 int recovery; /* loss recovery flag */ 212 ulong rxt; /* right window marker for recovery */ 213 } snd; 214 struct { 215 ulong nxt; /* Receive pointer to next uchar slot */ 216 ulong wnd; /* Receive window incoming */ 217 ulong urg; /* Urgent pointer */ 218 int blocked; 219 int una; /* unacked data segs */ 220 int scale; /* how much to left shift window in rcved packets */ 221 } rcv; 222 ulong iss; /* Initial sequence number */ 223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ 224 ulong cwind; /* Congestion window */ 225 int scale; /* desired snd.scale */ 226 ushort ssthresh; /* Slow start threshold */ 227 int resent; /* Bytes just resent */ 228 int irs; /* Initial received squence */ 229 ushort mss; /* Mean segment size */ 230 int rerecv; /* Overlap of data rerecevived */ 231 ulong window; /* Recevive window */ 232 uchar backoff; /* Exponential backoff counter */ 233 int backedoff; /* ms we've backed off for rexmits */ 234 uchar flags; /* State flags */ 235 Reseq *reseq; /* Resequencing queue */ 236 Tcptimer timer; /* Activity timer */ 237 Tcptimer acktimer; /* Acknowledge timer */ 238 Tcptimer rtt_timer; /* Round trip timer */ 239 Tcptimer katimer; /* keep alive timer */ 240 ulong rttseq; /* Round trip sequence */ 241 int srtt; /* Shortened round trip */ 242 int mdev; /* Mean deviation of round trip */ 243 int kacounter; /* count down for keep alive */ 244 uint sndsyntime; /* time syn sent */ 245 ulong time; /* time Finwait2 or Syn_received was sent */ 246 int nochecksum; /* non-zero means don't send checksums */ 247 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ 248 249 union { 250 Tcp4hdr tcp4hdr; 251 Tcp6hdr tcp6hdr; 252 } protohdr; /* prototype header */ 253 }; 254 255 /* 256 * New calls are put in limbo rather than having a conversation structure 257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not 258 * any real Conv structures mucking things up. Calls in limbo rexmit their 259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. 260 * 261 * In particular they aren't on a listener's queue so that they don't figure 262 * in the input queue limit. 263 * 264 * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue 265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore 266 * there is no hashing of this list. 267 */ 268 typedef struct Limbo Limbo; 269 struct Limbo 270 { 271 Limbo *next; 272 273 uchar laddr[IPaddrlen]; 274 uchar raddr[IPaddrlen]; 275 ushort lport; 276 ushort rport; 277 ulong irs; /* initial received sequence */ 278 ulong iss; /* initial sent sequence */ 279 ushort mss; /* mss from the other end */ 280 ushort rcvscale; /* how much to scale rcvd windows */ 281 ushort sndscale; /* how much to scale sent windows */ 282 ulong lastsend; /* last time we sent a synack */ 283 uchar version; /* v4 or v6 */ 284 uchar rexmits; /* number of retransmissions */ 285 }; 286 287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */ 288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ 289 290 enum { 291 /* MIB stats */ 292 MaxConn, 293 ActiveOpens, 294 PassiveOpens, 295 EstabResets, 296 CurrEstab, 297 InSegs, 298 OutSegs, 299 RetransSegs, 300 RetransTimeouts, 301 InErrs, 302 OutRsts, 303 304 /* non-MIB stats */ 305 CsumErrs, 306 HlenErrs, 307 LenErrs, 308 OutOfOrder, 309 310 Nstats 311 }; 312 313 static char *statnames[] = 314 { 315 [MaxConn] "MaxConn", 316 [ActiveOpens] "ActiveOpens", 317 [PassiveOpens] "PassiveOpens", 318 [EstabResets] "EstabResets", 319 [CurrEstab] "CurrEstab", 320 [InSegs] "InSegs", 321 [OutSegs] "OutSegs", 322 [RetransSegs] "RetransSegs", 323 [RetransTimeouts] "RetransTimeouts", 324 [InErrs] "InErrs", 325 [OutRsts] "OutRsts", 326 [CsumErrs] "CsumErrs", 327 [HlenErrs] "HlenErrs", 328 [LenErrs] "LenErrs", 329 [OutOfOrder] "OutOfOrder", 330 }; 331 332 typedef struct Tcppriv Tcppriv; 333 struct Tcppriv 334 { 335 /* List of active timers */ 336 QLock tl; 337 Tcptimer *timers; 338 339 /* hash table for matching conversations */ 340 Ipht ht; 341 342 /* calls in limbo waiting for an ACK to our SYN ACK */ 343 int nlimbo; 344 Limbo *lht[NLHT]; 345 346 /* for keeping track of tcpackproc */ 347 QLock apl; 348 int ackprocstarted; 349 350 ulong stats[Nstats]; 351 }; 352 353 /* 354 * Setting tcpporthogdefense to non-zero enables Dong Lin's 355 * solution to hijacked systems staking out port's as a form 356 * of DoS attack. 357 * 358 * To avoid stateless Conv hogs, we pick a sequence number at random. If 359 * it that number gets acked by the other end, we shut down the connection. 360 * Look for tcpporthogedefense in the code. 361 */ 362 int tcpporthogdefense = 0; 363 364 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); 365 void getreseq(Tcpctl*, Tcp*, Block**, ushort*); 366 void localclose(Conv*, char*); 367 void procsyn(Conv*, Tcp*); 368 void tcpiput(Proto*, Ipifc*, Block*); 369 void tcpoutput(Conv*); 370 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); 371 void tcpstart(Conv*, int); 372 void tcptimeout(void*); 373 void tcpsndsyn(Conv*, Tcpctl*); 374 void tcprcvwin(Conv*); 375 void tcpacktimer(void*); 376 void tcpkeepalive(void*); 377 void tcpsetkacounter(Tcpctl*); 378 void tcprxmit(Conv*); 379 void tcpsettimer(Tcpctl*); 380 void tcpsynackrtt(Conv*); 381 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); 382 383 static void limborexmit(Proto*); 384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int); 385 386 void 387 tcpsetstate(Conv *s, uchar newstate) 388 { 389 Tcpctl *tcb; 390 uchar oldstate; 391 Tcppriv *tpriv; 392 393 tpriv = s->p->priv; 394 395 tcb = (Tcpctl*)s->ptcl; 396 397 oldstate = tcb->state; 398 if(oldstate == newstate) 399 return; 400 401 if(oldstate == Established) 402 tpriv->stats[CurrEstab]--; 403 if(newstate == Established) 404 tpriv->stats[CurrEstab]++; 405 406 /** 407 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, 408 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); 409 **/ 410 411 switch(newstate) { 412 case Closed: 413 qclose(s->rq); 414 qclose(s->wq); 415 qclose(s->eq); 416 break; 417 418 case Close_wait: /* Remote closes */ 419 qhangup(s->rq, nil); 420 break; 421 } 422 423 tcb->state = newstate; 424 425 if(oldstate == Syn_sent && newstate != Closed) 426 Fsconnected(s, nil); 427 } 428 429 static char* 430 tcpconnect(Conv *c, char **argv, int argc) 431 { 432 char *e; 433 434 e = Fsstdconnect(c, argv, argc); 435 if(e != nil) 436 return e; 437 tcpstart(c, TCP_CONNECT); 438 439 return nil; 440 } 441 442 static int 443 tcpstate(Conv *c, char *state, int n) 444 { 445 Tcpctl *s; 446 447 s = (Tcpctl*)(c->ptcl); 448 449 return snprint(state, n, 450 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", 451 tcpstates[s->state], 452 c->rq ? qlen(c->rq) : 0, 453 c->wq ? qlen(c->wq) : 0, 454 s->srtt, s->mdev, 455 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, 456 s->timer.start, s->timer.count, s->rerecv, 457 s->katimer.start, s->katimer.count); 458 } 459 460 static int 461 tcpinuse(Conv *c) 462 { 463 Tcpctl *s; 464 465 s = (Tcpctl*)(c->ptcl); 466 return s->state != Closed; 467 } 468 469 static char* 470 tcpannounce(Conv *c, char **argv, int argc) 471 { 472 char *e; 473 474 e = Fsstdannounce(c, argv, argc); 475 if(e != nil) 476 return e; 477 tcpstart(c, TCP_LISTEN); 478 Fsconnected(c, nil); 479 480 return nil; 481 } 482 483 /* 484 * tcpclose is always called with the q locked 485 */ 486 static void 487 tcpclose(Conv *c) 488 { 489 Tcpctl *tcb; 490 491 tcb = (Tcpctl*)c->ptcl; 492 493 qhangup(c->rq, nil); 494 qhangup(c->wq, nil); 495 qhangup(c->eq, nil); 496 qflush(c->rq); 497 498 switch(tcb->state) { 499 case Listen: 500 /* 501 * reset any incoming calls to this listener 502 */ 503 Fsconnected(c, "Hangup"); 504 505 localclose(c, nil); 506 break; 507 case Closed: 508 case Syn_sent: 509 localclose(c, nil); 510 break; 511 case Syn_received: 512 case Established: 513 tcb->flgcnt++; 514 tcb->snd.nxt++; 515 tcpsetstate(c, Finwait1); 516 tcpoutput(c); 517 break; 518 case Close_wait: 519 tcb->flgcnt++; 520 tcb->snd.nxt++; 521 tcpsetstate(c, Last_ack); 522 tcpoutput(c); 523 break; 524 } 525 } 526 527 void 528 tcpkick(void *x) 529 { 530 Conv *s = x; 531 Tcpctl *tcb; 532 533 tcb = (Tcpctl*)s->ptcl; 534 535 if(waserror()){ 536 qunlock(s); 537 nexterror(); 538 } 539 qlock(s); 540 541 switch(tcb->state) { 542 case Syn_sent: 543 case Syn_received: 544 case Established: 545 case Close_wait: 546 /* 547 * Push data 548 */ 549 tcprcvwin(s); 550 tcpoutput(s); 551 break; 552 default: 553 localclose(s, "Hangup"); 554 break; 555 } 556 557 qunlock(s); 558 poperror(); 559 } 560 561 void 562 tcprcvwin(Conv *s) /* Call with tcb locked */ 563 { 564 int w; 565 Tcpctl *tcb; 566 567 tcb = (Tcpctl*)s->ptcl; 568 w = tcb->window - qlen(s->rq); 569 if(w < 0) 570 w = 0; 571 tcb->rcv.wnd = w; 572 if(w == 0) 573 tcb->rcv.blocked = 1; 574 } 575 576 void 577 tcpacktimer(void *v) 578 { 579 Tcpctl *tcb; 580 Conv *s; 581 582 s = v; 583 tcb = (Tcpctl*)s->ptcl; 584 585 if(waserror()){ 586 qunlock(s); 587 nexterror(); 588 } 589 qlock(s); 590 if(tcb->state != Closed){ 591 tcb->flags |= FORCE; 592 tcprcvwin(s); 593 tcpoutput(s); 594 } 595 qunlock(s); 596 poperror(); 597 } 598 599 static void 600 tcpcreate(Conv *c) 601 { 602 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); 603 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); 604 } 605 606 static void 607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate) 608 { 609 if(newstate != TcptimerON){ 610 if(t->state == TcptimerON){ 611 // unchain 612 if(priv->timers == t){ 613 priv->timers = t->next; 614 if(t->prev != nil) 615 panic("timerstate1"); 616 } 617 if(t->next) 618 t->next->prev = t->prev; 619 if(t->prev) 620 t->prev->next = t->next; 621 t->next = t->prev = nil; 622 } 623 } else { 624 if(t->state != TcptimerON){ 625 // chain 626 if(t->prev != nil || t->next != nil) 627 panic("timerstate2"); 628 t->prev = nil; 629 t->next = priv->timers; 630 if(t->next) 631 t->next->prev = t; 632 priv->timers = t; 633 } 634 } 635 t->state = newstate; 636 } 637 638 void 639 tcpackproc(void *a) 640 { 641 Tcptimer *t, *tp, *timeo; 642 Proto *tcp; 643 Tcppriv *priv; 644 int loop; 645 646 tcp = a; 647 priv = tcp->priv; 648 649 for(;;) { 650 tsleep(&up->sleep, return0, 0, MSPTICK); 651 652 qlock(&priv->tl); 653 timeo = nil; 654 loop = 0; 655 for(t = priv->timers; t != nil; t = tp) { 656 if(loop++ > 10000) 657 panic("tcpackproc1"); 658 tp = t->next; 659 if(t->state == TcptimerON) { 660 t->count--; 661 if(t->count == 0) { 662 timerstate(priv, t, TcptimerDONE); 663 t->readynext = timeo; 664 timeo = t; 665 } 666 } 667 } 668 qunlock(&priv->tl); 669 670 loop = 0; 671 for(t = timeo; t != nil; t = t->readynext) { 672 if(loop++ > 10000) 673 panic("tcpackproc2"); 674 if(t->state == TcptimerDONE && t->func != nil && !waserror()){ 675 (*t->func)(t->arg); 676 poperror(); 677 } 678 } 679 680 limborexmit(tcp); 681 } 682 } 683 684 void 685 tcpgo(Tcppriv *priv, Tcptimer *t) 686 { 687 if(t == nil || t->start == 0) 688 return; 689 690 qlock(&priv->tl); 691 t->count = t->start; 692 timerstate(priv, t, TcptimerON); 693 qunlock(&priv->tl); 694 } 695 696 void 697 tcphalt(Tcppriv *priv, Tcptimer *t) 698 { 699 if(t == nil) 700 return; 701 702 qlock(&priv->tl); 703 timerstate(priv, t, TcptimerOFF); 704 qunlock(&priv->tl); 705 } 706 707 int 708 backoff(int n) 709 { 710 return 1 << n; 711 } 712 713 void 714 localclose(Conv *s, char *reason) /* called with tcb locked */ 715 { 716 Tcpctl *tcb; 717 Reseq *rp,*rp1; 718 Tcppriv *tpriv; 719 720 tpriv = s->p->priv; 721 tcb = (Tcpctl*)s->ptcl; 722 723 iphtrem(&tpriv->ht, s); 724 725 tcphalt(tpriv, &tcb->timer); 726 tcphalt(tpriv, &tcb->rtt_timer); 727 tcphalt(tpriv, &tcb->acktimer); 728 tcphalt(tpriv, &tcb->katimer); 729 730 /* Flush reassembly queue; nothing more can arrive */ 731 for(rp = tcb->reseq; rp != nil; rp = rp1) { 732 rp1 = rp->next; 733 freeblist(rp->bp); 734 free(rp); 735 } 736 tcb->reseq = nil; 737 738 if(tcb->state == Syn_sent) 739 Fsconnected(s, reason); 740 if(s->state == Announced) 741 wakeup(&s->listenr); 742 743 qhangup(s->rq, reason); 744 qhangup(s->wq, reason); 745 746 tcpsetstate(s, Closed); 747 } 748 749 /* mtu (- TCP + IP hdr len) of 1st hop */ 750 int 751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) 752 { 753 Ipifc *ifc; 754 int mtu; 755 756 ifc = findipifc(tcp->f, addr, 0); 757 switch(version){ 758 default: 759 case V4: 760 mtu = DEF_MSS; 761 if(ifc != nil) 762 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); 763 break; 764 case V6: 765 mtu = DEF_MSS6; 766 if(ifc != nil) 767 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); 768 break; 769 } 770 if(ifc != nil){ 771 if(ifc->mbps > 100) 772 *scale = HaveWS | 3; 773 else if(ifc->mbps > 10) 774 *scale = HaveWS | 1; 775 else 776 *scale = HaveWS | 0; 777 } else 778 *scale = HaveWS | 0; 779 780 return mtu; 781 } 782 783 void 784 inittcpctl(Conv *s, int mode) 785 { 786 Tcpctl *tcb; 787 Tcp4hdr* h4; 788 Tcp6hdr* h6; 789 int mss; 790 791 tcb = (Tcpctl*)s->ptcl; 792 793 memset(tcb, 0, sizeof(Tcpctl)); 794 795 tcb->ssthresh = 65535; 796 tcb->srtt = tcp_irtt<<LOGAGAIN; 797 tcb->mdev = 0; 798 799 /* setup timers */ 800 tcb->timer.start = tcp_irtt / MSPTICK; 801 tcb->timer.func = tcptimeout; 802 tcb->timer.arg = s; 803 tcb->rtt_timer.start = MAX_TIME; 804 tcb->acktimer.start = TCP_ACK / MSPTICK; 805 tcb->acktimer.func = tcpacktimer; 806 tcb->acktimer.arg = s; 807 tcb->katimer.start = DEF_KAT / MSPTICK; 808 tcb->katimer.func = tcpkeepalive; 809 tcb->katimer.arg = s; 810 811 mss = DEF_MSS; 812 813 /* create a prototype(pseudo) header */ 814 if(mode != TCP_LISTEN){ 815 if(ipcmp(s->laddr, IPnoaddr) == 0) 816 findlocalip(s->p->f, s->laddr, s->raddr); 817 818 switch(s->ipversion){ 819 case V4: 820 h4 = &tcb->protohdr.tcp4hdr; 821 memset(h4, 0, sizeof(*h4)); 822 h4->proto = IP_TCPPROTO; 823 hnputs(h4->tcpsport, s->lport); 824 hnputs(h4->tcpdport, s->rport); 825 v6tov4(h4->tcpsrc, s->laddr); 826 v6tov4(h4->tcpdst, s->raddr); 827 break; 828 case V6: 829 h6 = &tcb->protohdr.tcp6hdr; 830 memset(h6, 0, sizeof(*h6)); 831 h6->proto = IP_TCPPROTO; 832 hnputs(h6->tcpsport, s->lport); 833 hnputs(h6->tcpdport, s->rport); 834 ipmove(h6->tcpsrc, s->laddr); 835 ipmove(h6->tcpdst, s->raddr); 836 mss = DEF_MSS6; 837 break; 838 default: 839 panic("inittcpctl: version %d", s->ipversion); 840 } 841 } 842 843 tcb->mss = tcb->cwind = mss; 844 845 /* default is no window scaling */ 846 tcb->window = QMAX; 847 tcb->rcv.wnd = QMAX; 848 tcb->rcv.scale = 0; 849 tcb->snd.scale = 0; 850 qsetlimit(s->rq, QMAX); 851 } 852 853 /* 854 * called with s qlocked 855 */ 856 void 857 tcpstart(Conv *s, int mode) 858 { 859 Tcpctl *tcb; 860 Tcppriv *tpriv; 861 char kpname[KNAMELEN]; 862 863 tpriv = s->p->priv; 864 865 if(tpriv->ackprocstarted == 0){ 866 qlock(&tpriv->apl); 867 if(tpriv->ackprocstarted == 0){ 868 sprint(kpname, "#I%dtcpack", s->p->f->dev); 869 kproc(kpname, tcpackproc, s->p, 0); 870 tpriv->ackprocstarted = 1; 871 } 872 qunlock(&tpriv->apl); 873 } 874 875 tcb = (Tcpctl*)s->ptcl; 876 877 inittcpctl(s, mode); 878 879 iphtadd(&tpriv->ht, s); 880 switch(mode) { 881 case TCP_LISTEN: 882 tpriv->stats[PassiveOpens]++; 883 tcb->flags |= CLONE; 884 tcpsetstate(s, Listen); 885 break; 886 887 case TCP_CONNECT: 888 tpriv->stats[ActiveOpens]++; 889 tcb->flags |= ACTIVE; 890 tcpsndsyn(s, tcb); 891 tcpsetstate(s, Syn_sent); 892 tcpoutput(s); 893 break; 894 } 895 } 896 897 static char* 898 tcpflag(ushort flag) 899 { 900 static char buf[128]; 901 902 sprint(buf, "%d", flag>>10); /* Head len */ 903 if(flag & URG) 904 strcat(buf, " URG"); 905 if(flag & ACK) 906 strcat(buf, " ACK"); 907 if(flag & PSH) 908 strcat(buf, " PSH"); 909 if(flag & RST) 910 strcat(buf, " RST"); 911 if(flag & SYN) 912 strcat(buf, " SYN"); 913 if(flag & FIN) 914 strcat(buf, " FIN"); 915 916 return buf; 917 } 918 919 Block * 920 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) 921 { 922 int dlen; 923 Tcp6hdr *h; 924 ushort csum; 925 ushort hdrlen, optpad = 0; 926 uchar *opt; 927 928 hdrlen = TCP6_HDRSIZE; 929 if(tcph->flags & SYN){ 930 if(tcph->mss) 931 hdrlen += MSS_LENGTH; 932 if(tcph->ws) 933 hdrlen += WS_LENGTH; 934 optpad = hdrlen & 3; 935 if(optpad) 936 optpad = 4 - optpad; 937 hdrlen += optpad; 938 } 939 940 if(data) { 941 dlen = blocklen(data); 942 data = padblock(data, hdrlen + TCP6_PKT); 943 if(data == nil) 944 return nil; 945 } 946 else { 947 dlen = 0; 948 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ 949 if(data == nil) 950 return nil; 951 data->wp += hdrlen + TCP6_PKT; 952 } 953 954 /* copy in pseudo ip header plus port numbers */ 955 h = (Tcp6hdr *)(data->rp); 956 memmove(h, ph, TCP6_TCBPHDRSZ); 957 958 /* compose pseudo tcp header, do cksum calculation */ 959 hnputl(h->vcf, hdrlen + dlen); 960 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; 961 h->ttl = ph->proto; 962 963 /* copy in variable bits */ 964 hnputl(h->tcpseq, tcph->seq); 965 hnputl(h->tcpack, tcph->ack); 966 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 967 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 968 hnputs(h->tcpurg, tcph->urg); 969 970 if(tcph->flags & SYN){ 971 opt = h->tcpopt; 972 if(tcph->mss != 0){ 973 *opt++ = MSSOPT; 974 *opt++ = MSS_LENGTH; 975 hnputs(opt, tcph->mss); 976 opt += 2; 977 } 978 if(tcph->ws != 0){ 979 *opt++ = WSOPT; 980 *opt++ = WS_LENGTH; 981 *opt++ = tcph->ws; 982 } 983 while(optpad-- > 0) 984 *opt++ = NOOPOPT; 985 } 986 987 if(tcb != nil && tcb->nochecksum){ 988 h->tcpcksum[0] = h->tcpcksum[1] = 0; 989 } else { 990 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); 991 hnputs(h->tcpcksum, csum); 992 } 993 994 /* move from pseudo header back to normal ip header */ 995 memset(h->vcf, 0, 4); 996 h->vcf[0] = IP_VER6; 997 hnputs(h->ploadlen, hdrlen+dlen); 998 h->proto = ph->proto; 999 1000 return data; 1001 } 1002 1003 Block * 1004 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) 1005 { 1006 int dlen; 1007 Tcp4hdr *h; 1008 ushort csum; 1009 ushort hdrlen, optpad = 0; 1010 uchar *opt; 1011 1012 hdrlen = TCP4_HDRSIZE; 1013 if(tcph->flags & SYN){ 1014 if(tcph->mss) 1015 hdrlen += MSS_LENGTH; 1016 if(tcph->ws) 1017 hdrlen += WS_LENGTH; 1018 optpad = hdrlen & 3; 1019 if(optpad) 1020 optpad = 4 - optpad; 1021 hdrlen += optpad; 1022 } 1023 1024 if(data) { 1025 dlen = blocklen(data); 1026 data = padblock(data, hdrlen + TCP4_PKT); 1027 if(data == nil) 1028 return nil; 1029 } 1030 else { 1031 dlen = 0; 1032 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ 1033 if(data == nil) 1034 return nil; 1035 data->wp += hdrlen + TCP4_PKT; 1036 } 1037 1038 /* copy in pseudo ip header plus port numbers */ 1039 h = (Tcp4hdr *)(data->rp); 1040 memmove(h, ph, TCP4_TCBPHDRSZ); 1041 1042 /* copy in variable bits */ 1043 hnputs(h->tcplen, hdrlen + dlen); 1044 hnputl(h->tcpseq, tcph->seq); 1045 hnputl(h->tcpack, tcph->ack); 1046 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1047 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 1048 hnputs(h->tcpurg, tcph->urg); 1049 1050 if(tcph->flags & SYN){ 1051 opt = h->tcpopt; 1052 if(tcph->mss != 0){ 1053 *opt++ = MSSOPT; 1054 *opt++ = MSS_LENGTH; 1055 hnputs(opt, tcph->mss); 1056 opt += 2; 1057 } 1058 if(tcph->ws != 0){ 1059 *opt++ = WSOPT; 1060 *opt++ = WS_LENGTH; 1061 *opt++ = tcph->ws; 1062 } 1063 while(optpad-- > 0) 1064 *opt++ = NOOPOPT; 1065 } 1066 1067 if(tcb != nil && tcb->nochecksum){ 1068 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1069 } else { 1070 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); 1071 hnputs(h->tcpcksum, csum); 1072 } 1073 1074 return data; 1075 } 1076 1077 int 1078 ntohtcp6(Tcp *tcph, Block **bpp) 1079 { 1080 Tcp6hdr *h; 1081 uchar *optr; 1082 ushort hdrlen; 1083 ushort optlen; 1084 int n; 1085 1086 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); 1087 if(*bpp == nil) 1088 return -1; 1089 1090 h = (Tcp6hdr *)((*bpp)->rp); 1091 tcph->source = nhgets(h->tcpsport); 1092 tcph->dest = nhgets(h->tcpdport); 1093 tcph->seq = nhgetl(h->tcpseq); 1094 tcph->ack = nhgetl(h->tcpack); 1095 hdrlen = (h->tcpflag[0]>>2) & ~3; 1096 if(hdrlen < TCP6_HDRSIZE) { 1097 freeblist(*bpp); 1098 return -1; 1099 } 1100 1101 tcph->flags = h->tcpflag[1]; 1102 tcph->wnd = nhgets(h->tcpwin); 1103 tcph->urg = nhgets(h->tcpurg); 1104 tcph->mss = 0; 1105 tcph->ws = 0; 1106 tcph->len = nhgets(h->ploadlen) - hdrlen; 1107 1108 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); 1109 if(*bpp == nil) 1110 return -1; 1111 1112 optr = h->tcpopt; 1113 n = hdrlen - TCP6_HDRSIZE; 1114 while(n > 0 && *optr != EOLOPT) { 1115 if(*optr == NOOPOPT) { 1116 n--; 1117 optr++; 1118 continue; 1119 } 1120 optlen = optr[1]; 1121 if(optlen < 2 || optlen > n) 1122 break; 1123 switch(*optr) { 1124 case MSSOPT: 1125 if(optlen == MSS_LENGTH) 1126 tcph->mss = nhgets(optr+2); 1127 break; 1128 case WSOPT: 1129 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1130 tcph->ws = HaveWS | *(optr+2); 1131 break; 1132 } 1133 n -= optlen; 1134 optr += optlen; 1135 } 1136 return hdrlen; 1137 } 1138 1139 int 1140 ntohtcp4(Tcp *tcph, Block **bpp) 1141 { 1142 Tcp4hdr *h; 1143 uchar *optr; 1144 ushort hdrlen; 1145 ushort optlen; 1146 int n; 1147 1148 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); 1149 if(*bpp == nil) 1150 return -1; 1151 1152 h = (Tcp4hdr *)((*bpp)->rp); 1153 tcph->source = nhgets(h->tcpsport); 1154 tcph->dest = nhgets(h->tcpdport); 1155 tcph->seq = nhgetl(h->tcpseq); 1156 tcph->ack = nhgetl(h->tcpack); 1157 1158 hdrlen = (h->tcpflag[0]>>2) & ~3; 1159 if(hdrlen < TCP4_HDRSIZE) { 1160 freeblist(*bpp); 1161 return -1; 1162 } 1163 1164 tcph->flags = h->tcpflag[1]; 1165 tcph->wnd = nhgets(h->tcpwin); 1166 tcph->urg = nhgets(h->tcpurg); 1167 tcph->mss = 0; 1168 tcph->ws = 0; 1169 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); 1170 1171 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); 1172 if(*bpp == nil) 1173 return -1; 1174 1175 optr = h->tcpopt; 1176 n = hdrlen - TCP4_HDRSIZE; 1177 while(n > 0 && *optr != EOLOPT) { 1178 if(*optr == NOOPOPT) { 1179 n--; 1180 optr++; 1181 continue; 1182 } 1183 optlen = optr[1]; 1184 if(optlen < 2 || optlen > n) 1185 break; 1186 switch(*optr) { 1187 case MSSOPT: 1188 if(optlen == MSS_LENGTH) 1189 tcph->mss = nhgets(optr+2); 1190 break; 1191 case WSOPT: 1192 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1193 tcph->ws = HaveWS | *(optr+2); 1194 break; 1195 } 1196 n -= optlen; 1197 optr += optlen; 1198 } 1199 return hdrlen; 1200 } 1201 1202 /* 1203 * For outgiing calls, generate an initial sequence 1204 * number and put a SYN on the send queue 1205 */ 1206 void 1207 tcpsndsyn(Conv *s, Tcpctl *tcb) 1208 { 1209 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1210 tcb->rttseq = tcb->iss; 1211 tcb->snd.wl2 = tcb->iss; 1212 tcb->snd.una = tcb->iss; 1213 tcb->snd.ptr = tcb->rttseq; 1214 tcb->snd.nxt = tcb->rttseq; 1215 tcb->flgcnt++; 1216 tcb->flags |= FORCE; 1217 tcb->sndsyntime = NOW; 1218 1219 /* set desired mss and scale */ 1220 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); 1221 } 1222 1223 void 1224 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) 1225 { 1226 Block *hbp; 1227 uchar rflags; 1228 Tcppriv *tpriv; 1229 Tcp4hdr ph4; 1230 Tcp6hdr ph6; 1231 1232 netlog(tcp->f, Logtcp, "sndrst: %s", reason); 1233 1234 tpriv = tcp->priv; 1235 1236 if(seg->flags & RST) 1237 return; 1238 1239 /* make pseudo header */ 1240 switch(version) { 1241 case V4: 1242 memset(&ph4, 0, sizeof(ph4)); 1243 ph4.vihl = IP_VER4; 1244 v6tov4(ph4.tcpsrc, dest); 1245 v6tov4(ph4.tcpdst, source); 1246 ph4.proto = IP_TCPPROTO; 1247 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1248 hnputs(ph4.tcpsport, seg->dest); 1249 hnputs(ph4.tcpdport, seg->source); 1250 break; 1251 case V6: 1252 memset(&ph6, 0, sizeof(ph6)); 1253 ph6.vcf[0] = IP_VER6; 1254 ipmove(ph6.tcpsrc, dest); 1255 ipmove(ph6.tcpdst, source); 1256 ph6.proto = IP_TCPPROTO; 1257 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1258 hnputs(ph6.tcpsport, seg->dest); 1259 hnputs(ph6.tcpdport, seg->source); 1260 break; 1261 default: 1262 panic("sndrst: version %d", version); 1263 } 1264 1265 tpriv->stats[OutRsts]++; 1266 rflags = RST; 1267 1268 /* convince the other end that this reset is in band */ 1269 if(seg->flags & ACK) { 1270 seg->seq = seg->ack; 1271 seg->ack = 0; 1272 } 1273 else { 1274 rflags |= ACK; 1275 seg->ack = seg->seq; 1276 seg->seq = 0; 1277 if(seg->flags & SYN) 1278 seg->ack++; 1279 seg->ack += length; 1280 if(seg->flags & FIN) 1281 seg->ack++; 1282 } 1283 seg->flags = rflags; 1284 seg->wnd = 0; 1285 seg->urg = 0; 1286 seg->mss = 0; 1287 seg->ws = 0; 1288 switch(version) { 1289 case V4: 1290 hbp = htontcp4(seg, nil, &ph4, nil); 1291 if(hbp == nil) 1292 return; 1293 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1294 break; 1295 case V6: 1296 hbp = htontcp6(seg, nil, &ph6, nil); 1297 if(hbp == nil) 1298 return; 1299 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1300 break; 1301 default: 1302 panic("sndrst2: version %d", version); 1303 } 1304 } 1305 1306 /* 1307 * send a reset to the remote side and close the conversation 1308 * called with s qlocked 1309 */ 1310 char* 1311 tcphangup(Conv *s) 1312 { 1313 Tcp seg; 1314 Tcpctl *tcb; 1315 Block *hbp; 1316 1317 tcb = (Tcpctl*)s->ptcl; 1318 if(waserror()) 1319 return commonerror(); 1320 if(ipcmp(s->raddr, IPnoaddr) != 0) { 1321 if(!waserror()){ 1322 memset(&seg, 0, sizeof seg); 1323 seg.flags = RST | ACK; 1324 seg.ack = tcb->rcv.nxt; 1325 tcb->rcv.una = 0; 1326 seg.seq = tcb->snd.ptr; 1327 seg.wnd = 0; 1328 seg.urg = 0; 1329 seg.mss = 0; 1330 seg.ws = 0; 1331 switch(s->ipversion) { 1332 case V4: 1333 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 1334 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); 1335 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 1336 break; 1337 case V6: 1338 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 1339 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); 1340 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 1341 break; 1342 default: 1343 panic("tcphangup: version %d", s->ipversion); 1344 } 1345 poperror(); 1346 } 1347 } 1348 localclose(s, nil); 1349 poperror(); 1350 return nil; 1351 } 1352 1353 /* 1354 * (re)send a SYN ACK 1355 */ 1356 int 1357 sndsynack(Proto *tcp, Limbo *lp) 1358 { 1359 Block *hbp; 1360 Tcp4hdr ph4; 1361 Tcp6hdr ph6; 1362 Tcp seg; 1363 int scale; 1364 1365 /* make pseudo header */ 1366 switch(lp->version) { 1367 case V4: 1368 memset(&ph4, 0, sizeof(ph4)); 1369 ph4.vihl = IP_VER4; 1370 v6tov4(ph4.tcpsrc, lp->laddr); 1371 v6tov4(ph4.tcpdst, lp->raddr); 1372 ph4.proto = IP_TCPPROTO; 1373 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1374 hnputs(ph4.tcpsport, lp->lport); 1375 hnputs(ph4.tcpdport, lp->rport); 1376 break; 1377 case V6: 1378 memset(&ph6, 0, sizeof(ph6)); 1379 ph6.vcf[0] = IP_VER6; 1380 ipmove(ph6.tcpsrc, lp->laddr); 1381 ipmove(ph6.tcpdst, lp->raddr); 1382 ph6.proto = IP_TCPPROTO; 1383 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1384 hnputs(ph6.tcpsport, lp->lport); 1385 hnputs(ph6.tcpdport, lp->rport); 1386 break; 1387 default: 1388 panic("sndrst: version %d", lp->version); 1389 } 1390 1391 seg.seq = lp->iss; 1392 seg.ack = lp->irs+1; 1393 seg.flags = SYN|ACK; 1394 seg.urg = 0; 1395 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); 1396 seg.wnd = QMAX; 1397 1398 /* if the other side set scale, we should too */ 1399 if(lp->rcvscale){ 1400 seg.ws = scale; 1401 lp->sndscale = scale; 1402 } else { 1403 seg.ws = 0; 1404 lp->sndscale = 0; 1405 } 1406 1407 switch(lp->version) { 1408 case V4: 1409 hbp = htontcp4(&seg, nil, &ph4, nil); 1410 if(hbp == nil) 1411 return -1; 1412 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1413 break; 1414 case V6: 1415 hbp = htontcp6(&seg, nil, &ph6, nil); 1416 if(hbp == nil) 1417 return -1; 1418 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1419 break; 1420 default: 1421 panic("sndsnack: version %d", lp->version); 1422 } 1423 lp->lastsend = NOW; 1424 return 0; 1425 } 1426 1427 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) 1428 1429 /* 1430 * put a call into limbo and respond with a SYN ACK 1431 * 1432 * called with proto locked 1433 */ 1434 static void 1435 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) 1436 { 1437 Limbo *lp, **l; 1438 Tcppriv *tpriv; 1439 int h; 1440 1441 tpriv = s->p->priv; 1442 h = hashipa(source, seg->source); 1443 1444 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1445 lp = *l; 1446 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) 1447 continue; 1448 if(ipcmp(lp->raddr, source) != 0) 1449 continue; 1450 if(ipcmp(lp->laddr, dest) != 0) 1451 continue; 1452 1453 /* each new SYN restarts the retransmits */ 1454 lp->irs = seg->seq; 1455 break; 1456 } 1457 lp = *l; 1458 if(lp == nil){ 1459 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ 1460 lp = tpriv->lht[h]; 1461 tpriv->lht[h] = lp->next; 1462 lp->next = nil; 1463 } else { 1464 lp = malloc(sizeof(*lp)); 1465 if(lp == nil) 1466 return; 1467 tpriv->nlimbo++; 1468 } 1469 *l = lp; 1470 lp->version = version; 1471 ipmove(lp->laddr, dest); 1472 ipmove(lp->raddr, source); 1473 lp->lport = seg->dest; 1474 lp->rport = seg->source; 1475 lp->mss = seg->mss; 1476 lp->rcvscale = seg->ws; 1477 lp->irs = seg->seq; 1478 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1479 } 1480 1481 if(sndsynack(s->p, lp) < 0){ 1482 *l = lp->next; 1483 tpriv->nlimbo--; 1484 free(lp); 1485 } 1486 } 1487 1488 /* 1489 * resend SYN ACK's once every SYNACK_RXTIMER ms. 1490 */ 1491 static void 1492 limborexmit(Proto *tcp) 1493 { 1494 Tcppriv *tpriv; 1495 Limbo **l, *lp; 1496 int h; 1497 int seen; 1498 ulong now; 1499 1500 tpriv = tcp->priv; 1501 1502 if(!canqlock(tcp)) 1503 return; 1504 seen = 0; 1505 now = NOW; 1506 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ 1507 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ 1508 lp = *l; 1509 seen++; 1510 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) 1511 continue; 1512 1513 /* time it out after 1 second */ 1514 if(++(lp->rexmits) > 5){ 1515 tpriv->nlimbo--; 1516 *l = lp->next; 1517 free(lp); 1518 continue; 1519 } 1520 1521 /* if we're being attacked, don't bother resending SYN ACK's */ 1522 if(tpriv->nlimbo > 100) 1523 continue; 1524 1525 if(sndsynack(tcp, lp) < 0){ 1526 tpriv->nlimbo--; 1527 *l = lp->next; 1528 free(lp); 1529 continue; 1530 } 1531 1532 l = &lp->next; 1533 } 1534 } 1535 qunlock(tcp); 1536 } 1537 1538 /* 1539 * lookup call in limbo. if found, throw it out. 1540 * 1541 * called with proto locked 1542 */ 1543 static void 1544 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1545 { 1546 Limbo *lp, **l; 1547 int h; 1548 Tcppriv *tpriv; 1549 1550 tpriv = s->p->priv; 1551 1552 /* find a call in limbo */ 1553 h = hashipa(src, segp->source); 1554 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1555 lp = *l; 1556 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1557 continue; 1558 if(ipcmp(lp->laddr, dst) != 0) 1559 continue; 1560 if(ipcmp(lp->raddr, src) != 0) 1561 continue; 1562 1563 /* RST can only follow the SYN */ 1564 if(segp->seq == lp->irs+1){ 1565 tpriv->nlimbo--; 1566 *l = lp->next; 1567 free(lp); 1568 } 1569 break; 1570 } 1571 } 1572 1573 /* 1574 * come here when we finally get an ACK to our SYN-ACK. 1575 * lookup call in limbo. if found, create a new conversation 1576 * 1577 * called with proto locked 1578 */ 1579 static Conv* 1580 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1581 { 1582 Conv *new; 1583 Tcpctl *tcb; 1584 Tcppriv *tpriv; 1585 Tcp4hdr *h4; 1586 Tcp6hdr *h6; 1587 Limbo *lp, **l; 1588 int h; 1589 1590 /* unless it's just an ack, it can't be someone coming out of limbo */ 1591 if((segp->flags & SYN) || (segp->flags & ACK) == 0) 1592 return nil; 1593 1594 tpriv = s->p->priv; 1595 1596 /* find a call in limbo */ 1597 h = hashipa(src, segp->source); 1598 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ 1599 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d", 1600 src, segp->source, lp->raddr, lp->rport, 1601 dst, segp->dest, lp->laddr, lp->lport, 1602 version, lp->version 1603 ); 1604 1605 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1606 continue; 1607 if(ipcmp(lp->laddr, dst) != 0) 1608 continue; 1609 if(ipcmp(lp->raddr, src) != 0) 1610 continue; 1611 1612 /* we're assuming no data with the initial SYN */ 1613 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){ 1614 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux", 1615 segp->seq, lp->irs+1, segp->ack, lp->iss+1); 1616 lp = nil; 1617 } else { 1618 tpriv->nlimbo--; 1619 *l = lp->next; 1620 } 1621 break; 1622 } 1623 if(lp == nil) 1624 return nil; 1625 1626 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); 1627 if(new == nil) 1628 return nil; 1629 1630 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); 1631 tcb = (Tcpctl*)new->ptcl; 1632 tcb->flags &= ~CLONE; 1633 tcb->timer.arg = new; 1634 tcb->timer.state = TcptimerOFF; 1635 tcb->acktimer.arg = new; 1636 tcb->acktimer.state = TcptimerOFF; 1637 tcb->katimer.arg = new; 1638 tcb->katimer.state = TcptimerOFF; 1639 tcb->rtt_timer.arg = new; 1640 tcb->rtt_timer.state = TcptimerOFF; 1641 1642 tcb->irs = lp->irs; 1643 tcb->rcv.nxt = tcb->irs+1; 1644 tcb->rcv.urg = tcb->rcv.nxt; 1645 1646 tcb->iss = lp->iss; 1647 tcb->rttseq = tcb->iss; 1648 tcb->snd.wl2 = tcb->iss; 1649 tcb->snd.una = tcb->iss+1; 1650 tcb->snd.ptr = tcb->iss+1; 1651 tcb->snd.nxt = tcb->iss+1; 1652 tcb->flgcnt = 0; 1653 tcb->flags |= SYNACK; 1654 1655 /* our sending max segment size cannot be bigger than what he asked for */ 1656 if(lp->mss != 0 && lp->mss < tcb->mss) 1657 tcb->mss = lp->mss; 1658 1659 /* window scaling */ 1660 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); 1661 1662 /* the congestion window always starts out as a single segment */ 1663 tcb->snd.wnd = segp->wnd; 1664 tcb->cwind = tcb->mss; 1665 1666 /* set initial round trip time */ 1667 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; 1668 tcpsynackrtt(new); 1669 1670 free(lp); 1671 1672 /* set up proto header */ 1673 switch(version){ 1674 case V4: 1675 h4 = &tcb->protohdr.tcp4hdr; 1676 memset(h4, 0, sizeof(*h4)); 1677 h4->proto = IP_TCPPROTO; 1678 hnputs(h4->tcpsport, new->lport); 1679 hnputs(h4->tcpdport, new->rport); 1680 v6tov4(h4->tcpsrc, dst); 1681 v6tov4(h4->tcpdst, src); 1682 break; 1683 case V6: 1684 h6 = &tcb->protohdr.tcp6hdr; 1685 memset(h6, 0, sizeof(*h6)); 1686 h6->proto = IP_TCPPROTO; 1687 hnputs(h6->tcpsport, new->lport); 1688 hnputs(h6->tcpdport, new->rport); 1689 ipmove(h6->tcpsrc, dst); 1690 ipmove(h6->tcpdst, src); 1691 break; 1692 default: 1693 panic("tcpincoming: version %d", new->ipversion); 1694 } 1695 1696 tcpsetstate(new, Established); 1697 1698 iphtadd(&tpriv->ht, new); 1699 1700 return new; 1701 } 1702 1703 int 1704 seq_within(ulong x, ulong low, ulong high) 1705 { 1706 if(low <= high){ 1707 if(low <= x && x <= high) 1708 return 1; 1709 } 1710 else { 1711 if(x >= low || x <= high) 1712 return 1; 1713 } 1714 return 0; 1715 } 1716 1717 int 1718 seq_lt(ulong x, ulong y) 1719 { 1720 return (int)(x-y) < 0; 1721 } 1722 1723 int 1724 seq_le(ulong x, ulong y) 1725 { 1726 return (int)(x-y) <= 0; 1727 } 1728 1729 int 1730 seq_gt(ulong x, ulong y) 1731 { 1732 return (int)(x-y) > 0; 1733 } 1734 1735 int 1736 seq_ge(ulong x, ulong y) 1737 { 1738 return (int)(x-y) >= 0; 1739 } 1740 1741 /* 1742 * use the time between the first SYN and it's ack as the 1743 * initial round trip time 1744 */ 1745 void 1746 tcpsynackrtt(Conv *s) 1747 { 1748 Tcpctl *tcb; 1749 int delta; 1750 Tcppriv *tpriv; 1751 1752 tcb = (Tcpctl*)s->ptcl; 1753 tpriv = s->p->priv; 1754 1755 delta = NOW - tcb->sndsyntime; 1756 tcb->srtt = delta<<LOGAGAIN; 1757 tcb->mdev = delta<<LOGDGAIN; 1758 1759 /* halt round trip timer */ 1760 tcphalt(tpriv, &tcb->rtt_timer); 1761 } 1762 1763 void 1764 update(Conv *s, Tcp *seg) 1765 { 1766 int rtt, delta; 1767 Tcpctl *tcb; 1768 ulong acked; 1769 ulong expand; 1770 Tcppriv *tpriv; 1771 1772 tpriv = s->p->priv; 1773 tcb = (Tcpctl*)s->ptcl; 1774 1775 /* if everything has been acked, force output(?) */ 1776 if(seq_gt(seg->ack, tcb->snd.nxt)) { 1777 tcb->flags |= FORCE; 1778 return; 1779 } 1780 1781 /* added by Dong Lin for fast retransmission */ 1782 if(seg->ack == tcb->snd.una 1783 && tcb->snd.una != tcb->snd.nxt 1784 && seg->len == 0 1785 && seg->wnd == tcb->snd.wnd) { 1786 1787 /* this is a pure ack w/o window update */ 1788 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n", 1789 tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); 1790 1791 if(++tcb->snd.dupacks == TCPREXMTTHRESH) { 1792 /* 1793 * tahoe tcp rxt the packet, half sshthresh, 1794 * and set cwnd to one packet 1795 */ 1796 tcb->snd.recovery = 1; 1797 tcb->snd.rxt = tcb->snd.nxt; 1798 netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); 1799 tcprxmit(s); 1800 } else { 1801 /* do reno tcp here. */ 1802 } 1803 } 1804 1805 /* 1806 * update window 1807 */ 1808 if(seq_gt(seg->ack, tcb->snd.wl2) 1809 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ 1810 tcb->snd.wnd = seg->wnd; 1811 tcb->snd.wl2 = seg->ack; 1812 } 1813 1814 if(!seq_gt(seg->ack, tcb->snd.una)){ 1815 /* 1816 * don't let us hangup if sending into a closed window and 1817 * we're still getting acks 1818 */ 1819 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ 1820 tcb->backedoff = MAXBACKMS/4; 1821 } 1822 return; 1823 } 1824 1825 /* 1826 * any positive ack turns off fast rxt, 1827 * (should we do new-reno on partial acks?) 1828 */ 1829 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { 1830 tcb->snd.dupacks = 0; 1831 tcb->snd.recovery = 0; 1832 } else 1833 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind); 1834 1835 /* Compute the new send window size */ 1836 acked = seg->ack - tcb->snd.una; 1837 1838 /* avoid slow start and timers for SYN acks */ 1839 if((tcb->flags & SYNACK) == 0) { 1840 tcb->flags |= SYNACK; 1841 acked--; 1842 tcb->flgcnt--; 1843 goto done; 1844 } 1845 1846 /* slow start as long as we're not recovering from lost packets */ 1847 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { 1848 if(tcb->cwind < tcb->ssthresh) { 1849 expand = tcb->mss; 1850 if(acked < expand) 1851 expand = acked; 1852 } 1853 else 1854 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; 1855 1856 if(tcb->cwind + expand < tcb->cwind) 1857 expand = tcb->snd.wnd - tcb->cwind; 1858 if(tcb->cwind + expand > tcb->snd.wnd) 1859 expand = tcb->snd.wnd - tcb->cwind; 1860 tcb->cwind += expand; 1861 } 1862 1863 /* Adjust the timers according to the round trip time */ 1864 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { 1865 tcphalt(tpriv, &tcb->rtt_timer); 1866 if((tcb->flags&RETRAN) == 0) { 1867 tcb->backoff = 0; 1868 tcb->backedoff = 0; 1869 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; 1870 if(rtt == 0) 1871 rtt = 1; /* otherwise all close systems will rexmit in 0 time */ 1872 rtt *= MSPTICK; 1873 if(tcb->srtt == 0) { 1874 tcb->srtt = rtt << LOGAGAIN; 1875 tcb->mdev = rtt << LOGDGAIN; 1876 } else { 1877 delta = rtt - (tcb->srtt>>LOGAGAIN); 1878 tcb->srtt += delta; 1879 if(tcb->srtt <= 0) 1880 tcb->srtt = 1; 1881 1882 delta = abs(delta) - (tcb->mdev>>LOGDGAIN); 1883 tcb->mdev += delta; 1884 if(tcb->mdev <= 0) 1885 tcb->mdev = 1; 1886 } 1887 tcpsettimer(tcb); 1888 } 1889 } 1890 1891 done: 1892 if(qdiscard(s->wq, acked) < acked) 1893 tcb->flgcnt--; 1894 1895 tcb->snd.una = seg->ack; 1896 if(seq_gt(seg->ack, tcb->snd.urg)) 1897 tcb->snd.urg = seg->ack; 1898 1899 if(tcb->snd.una != tcb->snd.nxt) 1900 tcpgo(tpriv, &tcb->timer); 1901 else 1902 tcphalt(tpriv, &tcb->timer); 1903 1904 if(seq_lt(tcb->snd.ptr, tcb->snd.una)) 1905 tcb->snd.ptr = tcb->snd.una; 1906 1907 tcb->flags &= ~RETRAN; 1908 tcb->backoff = 0; 1909 tcb->backedoff = 0; 1910 } 1911 1912 void 1913 tcpiput(Proto *tcp, Ipifc*, Block *bp) 1914 { 1915 Tcp seg; 1916 Tcp4hdr *h4; 1917 Tcp6hdr *h6; 1918 int hdrlen; 1919 Tcpctl *tcb; 1920 ushort length; 1921 uchar source[IPaddrlen], dest[IPaddrlen]; 1922 Conv *s; 1923 Fs *f; 1924 Tcppriv *tpriv; 1925 uchar version; 1926 1927 f = tcp->f; 1928 tpriv = tcp->priv; 1929 1930 tpriv->stats[InSegs]++; 1931 1932 h4 = (Tcp4hdr*)(bp->rp); 1933 h6 = (Tcp6hdr*)(bp->rp); 1934 1935 if((h4->vihl&0xF0)==IP_VER4) { 1936 version = V4; 1937 length = nhgets(h4->length); 1938 v4tov6(dest, h4->tcpdst); 1939 v4tov6(source, h4->tcpsrc); 1940 1941 h4->Unused = 0; 1942 hnputs(h4->tcplen, length-TCP4_PKT); 1943 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && 1944 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) { 1945 tpriv->stats[CsumErrs]++; 1946 tpriv->stats[InErrs]++; 1947 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1948 freeblist(bp); 1949 return; 1950 } 1951 1952 hdrlen = ntohtcp4(&seg, &bp); 1953 if(hdrlen < 0){ 1954 tpriv->stats[HlenErrs]++; 1955 tpriv->stats[InErrs]++; 1956 netlog(f, Logtcp, "bad tcp hdr len\n"); 1957 return; 1958 } 1959 1960 /* trim the packet to the size claimed by the datagram */ 1961 length -= hdrlen+TCP4_PKT; 1962 bp = trimblock(bp, hdrlen+TCP4_PKT, length); 1963 if(bp == nil){ 1964 tpriv->stats[LenErrs]++; 1965 tpriv->stats[InErrs]++; 1966 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 1967 return; 1968 } 1969 } 1970 else { 1971 int ttl = h6->ttl; 1972 int proto = h6->proto; 1973 1974 version = V6; 1975 length = nhgets(h6->ploadlen); 1976 ipmove(dest, h6->tcpdst); 1977 ipmove(source, h6->tcpsrc); 1978 1979 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; 1980 h6->ttl = proto; 1981 hnputl(h6->vcf, length); 1982 if((h6->tcpcksum[0] || h6->tcpcksum[1]) && 1983 ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) { 1984 tpriv->stats[CsumErrs]++; 1985 tpriv->stats[InErrs]++; 1986 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1987 freeblist(bp); 1988 return; 1989 } 1990 h6->ttl = ttl; 1991 h6->proto = proto; 1992 hnputs(h6->ploadlen, length); 1993 1994 hdrlen = ntohtcp6(&seg, &bp); 1995 if(hdrlen < 0){ 1996 tpriv->stats[HlenErrs]++; 1997 tpriv->stats[InErrs]++; 1998 netlog(f, Logtcp, "bad tcp hdr len\n"); 1999 return; 2000 } 2001 2002 /* trim the packet to the size claimed by the datagram */ 2003 length -= hdrlen; 2004 bp = trimblock(bp, hdrlen+TCP6_PKT, length); 2005 if(bp == nil){ 2006 tpriv->stats[LenErrs]++; 2007 tpriv->stats[InErrs]++; 2008 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 2009 return; 2010 } 2011 } 2012 2013 /* lock protocol while searching for a conversation */ 2014 qlock(tcp); 2015 2016 /* Look for a matching conversation */ 2017 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); 2018 if(s == nil){ 2019 netlog(f, Logtcp, "iphtlook failed"); 2020 reset: 2021 qunlock(tcp); 2022 sndrst(tcp, source, dest, length, &seg, version, "no conversation"); 2023 freeblist(bp); 2024 return; 2025 } 2026 2027 /* if it's a listener, look for the right flags and get a new conv */ 2028 tcb = (Tcpctl*)s->ptcl; 2029 if(tcb->state == Listen){ 2030 if(seg.flags & RST){ 2031 limborst(s, &seg, source, dest, version); 2032 qunlock(tcp); 2033 freeblist(bp); 2034 return; 2035 } 2036 2037 /* if this is a new SYN, put the call into limbo */ 2038 if((seg.flags & SYN) && (seg.flags & ACK) == 0){ 2039 limbo(s, source, dest, &seg, version); 2040 qunlock(tcp); 2041 freeblist(bp); 2042 return; 2043 } 2044 2045 /* 2046 * if there's a matching call in limbo, tcpincoming will 2047 * return it in state Syn_received 2048 */ 2049 s = tcpincoming(s, &seg, source, dest, version); 2050 if(s == nil) 2051 goto reset; 2052 } 2053 2054 /* The rest of the input state machine is run with the control block 2055 * locked and implements the state machine directly out of the RFC. 2056 * Out-of-band data is ignored - it was always a bad idea. 2057 */ 2058 tcb = (Tcpctl*)s->ptcl; 2059 if(waserror()){ 2060 qunlock(s); 2061 nexterror(); 2062 } 2063 qlock(s); 2064 qunlock(tcp); 2065 2066 /* fix up window */ 2067 seg.wnd <<= tcb->rcv.scale; 2068 2069 /* every input packet in puts off the keep alive time out */ 2070 tcpsetkacounter(tcb); 2071 2072 switch(tcb->state) { 2073 case Closed: 2074 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2075 goto raise; 2076 case Syn_sent: 2077 if(seg.flags & ACK) { 2078 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) { 2079 sndrst(tcp, source, dest, length, &seg, version, 2080 "bad seq in Syn_sent"); 2081 goto raise; 2082 } 2083 } 2084 if(seg.flags & RST) { 2085 if(seg.flags & ACK) 2086 localclose(s, Econrefused); 2087 goto raise; 2088 } 2089 2090 if(seg.flags & SYN) { 2091 procsyn(s, &seg); 2092 if(seg.flags & ACK){ 2093 update(s, &seg); 2094 tcpsynackrtt(s); 2095 tcpsetstate(s, Established); 2096 tcpsetscale(s, tcb, seg.ws, tcb->scale); 2097 } 2098 else { 2099 tcb->time = NOW; 2100 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */ 2101 } 2102 2103 if(length != 0 || (seg.flags & FIN)) 2104 break; 2105 2106 freeblist(bp); 2107 goto output; 2108 } 2109 else 2110 freeblist(bp); 2111 2112 qunlock(s); 2113 poperror(); 2114 return; 2115 case Syn_received: 2116 /* doesn't matter if it's the correct ack, we're just trying to set timing */ 2117 if(seg.flags & ACK) 2118 tcpsynackrtt(s); 2119 break; 2120 } 2121 2122 /* 2123 * One DOS attack is to open connections to us and then forget about them, 2124 * thereby tying up a conv at no long term cost to the attacker. 2125 * This is an attempt to defeat these stateless DOS attacks. See 2126 * corresponding code in tcpsendka(). 2127 */ 2128 if(tcb->state != Syn_received && (seg.flags & RST) == 0){ 2129 if(tcpporthogdefense 2130 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ 2131 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", 2132 source, seg.source, dest, seg.dest, seg.flags, 2133 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); 2134 localclose(s, "stateless hog"); 2135 } 2136 } 2137 2138 /* Cut the data to fit the receive window */ 2139 if(tcptrim(tcb, &seg, &bp, &length) == -1) { 2140 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); 2141 update(s, &seg); 2142 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { 2143 tcphalt(tpriv, &tcb->rtt_timer); 2144 tcphalt(tpriv, &tcb->acktimer); 2145 tcphalt(tpriv, &tcb->katimer); 2146 tcpsetstate(s, Time_wait); 2147 tcb->timer.start = MSL2*(1000 / MSPTICK); 2148 tcpgo(tpriv, &tcb->timer); 2149 } 2150 if(!(seg.flags & RST)) { 2151 tcb->flags |= FORCE; 2152 goto output; 2153 } 2154 qunlock(s); 2155 poperror(); 2156 return; 2157 } 2158 2159 /* Cannot accept so answer with a rst */ 2160 if(length && tcb->state == Closed) { 2161 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2162 goto raise; 2163 } 2164 2165 /* The segment is beyond the current receive pointer so 2166 * queue the data in the resequence queue 2167 */ 2168 if(seg.seq != tcb->rcv.nxt) 2169 if(length != 0 || (seg.flags & (SYN|FIN))) { 2170 update(s, &seg); 2171 if(addreseq(tcb, tpriv, &seg, bp, length) < 0) 2172 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); 2173 tcb->flags |= FORCE; 2174 goto output; 2175 } 2176 2177 /* 2178 * keep looping till we've processed this packet plus any 2179 * adjacent packets in the resequence queue 2180 */ 2181 for(;;) { 2182 if(seg.flags & RST) { 2183 if(tcb->state == Established) { 2184 tpriv->stats[EstabResets]++; 2185 if(tcb->rcv.nxt != seg.seq) 2186 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); 2187 } 2188 localclose(s, Econrefused); 2189 goto raise; 2190 } 2191 2192 if((seg.flags&ACK) == 0) 2193 goto raise; 2194 2195 switch(tcb->state) { 2196 case Syn_received: 2197 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){ 2198 sndrst(tcp, source, dest, length, &seg, version, 2199 "bad seq in Syn_received"); 2200 goto raise; 2201 } 2202 update(s, &seg); 2203 tcpsetstate(s, Established); 2204 case Established: 2205 case Close_wait: 2206 update(s, &seg); 2207 break; 2208 case Finwait1: 2209 update(s, &seg); 2210 if(qlen(s->wq)+tcb->flgcnt == 0){ 2211 tcphalt(tpriv, &tcb->rtt_timer); 2212 tcphalt(tpriv, &tcb->acktimer); 2213 tcpsetkacounter(tcb); 2214 tcb->time = NOW; 2215 tcpsetstate(s, Finwait2); 2216 tcb->katimer.start = MSL2 * (1000 / MSPTICK); 2217 tcpgo(tpriv, &tcb->katimer); 2218 } 2219 break; 2220 case Finwait2: 2221 update(s, &seg); 2222 break; 2223 case Closing: 2224 update(s, &seg); 2225 if(qlen(s->wq)+tcb->flgcnt == 0) { 2226 tcphalt(tpriv, &tcb->rtt_timer); 2227 tcphalt(tpriv, &tcb->acktimer); 2228 tcphalt(tpriv, &tcb->katimer); 2229 tcpsetstate(s, Time_wait); 2230 tcb->timer.start = MSL2*(1000 / MSPTICK); 2231 tcpgo(tpriv, &tcb->timer); 2232 } 2233 break; 2234 case Last_ack: 2235 update(s, &seg); 2236 if(qlen(s->wq)+tcb->flgcnt == 0) { 2237 localclose(s, nil); 2238 goto raise; 2239 } 2240 case Time_wait: 2241 tcb->flags |= FORCE; 2242 if(tcb->timer.state != TcptimerON) 2243 tcpgo(tpriv, &tcb->timer); 2244 } 2245 2246 if((seg.flags&URG) && seg.urg) { 2247 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { 2248 tcb->rcv.urg = seg.urg + seg.seq; 2249 pullblock(&bp, seg.urg); 2250 } 2251 } 2252 else 2253 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) 2254 tcb->rcv.urg = tcb->rcv.nxt; 2255 2256 if(length == 0) { 2257 if(bp != nil) 2258 freeblist(bp); 2259 } 2260 else { 2261 switch(tcb->state){ 2262 default: 2263 /* Ignore segment text */ 2264 if(bp != nil) 2265 freeblist(bp); 2266 break; 2267 2268 case Syn_received: 2269 case Established: 2270 case Finwait1: 2271 /* If we still have some data place on 2272 * receive queue 2273 */ 2274 if(bp) { 2275 bp = packblock(bp); 2276 if(bp == nil) 2277 panic("tcp packblock"); 2278 qpassnolim(s->rq, bp); 2279 bp = nil; 2280 2281 /* 2282 * Force an ack every 2 data messages. This is 2283 * a hack for rob to make his home system run 2284 * faster. 2285 * 2286 * this also keeps the standard TCP congestion 2287 * control working since it needs an ack every 2288 * 2 max segs worth. This is not quite that, 2289 * but under a real stream is equivalent since 2290 * every packet has a max seg in it. 2291 */ 2292 if(++(tcb->rcv.una) >= 2) 2293 tcb->flags |= FORCE; 2294 } 2295 tcb->rcv.nxt += length; 2296 2297 /* 2298 * update our rcv window 2299 */ 2300 tcprcvwin(s); 2301 2302 /* 2303 * turn on the acktimer if there's something 2304 * to ack 2305 */ 2306 if(tcb->acktimer.state != TcptimerON) 2307 tcpgo(tpriv, &tcb->acktimer); 2308 2309 break; 2310 case Finwait2: 2311 /* no process to read the data, send a reset */ 2312 if(bp != nil) 2313 freeblist(bp); 2314 sndrst(tcp, source, dest, length, &seg, version, 2315 "send to Finwait2"); 2316 qunlock(s); 2317 poperror(); 2318 return; 2319 } 2320 } 2321 2322 if(seg.flags & FIN) { 2323 tcb->flags |= FORCE; 2324 2325 switch(tcb->state) { 2326 case Syn_received: 2327 case Established: 2328 tcb->rcv.nxt++; 2329 tcpsetstate(s, Close_wait); 2330 break; 2331 case Finwait1: 2332 tcb->rcv.nxt++; 2333 if(qlen(s->wq)+tcb->flgcnt == 0) { 2334 tcphalt(tpriv, &tcb->rtt_timer); 2335 tcphalt(tpriv, &tcb->acktimer); 2336 tcphalt(tpriv, &tcb->katimer); 2337 tcpsetstate(s, Time_wait); 2338 tcb->timer.start = MSL2*(1000/MSPTICK); 2339 tcpgo(tpriv, &tcb->timer); 2340 } 2341 else 2342 tcpsetstate(s, Closing); 2343 break; 2344 case Finwait2: 2345 tcb->rcv.nxt++; 2346 tcphalt(tpriv, &tcb->rtt_timer); 2347 tcphalt(tpriv, &tcb->acktimer); 2348 tcphalt(tpriv, &tcb->katimer); 2349 tcpsetstate(s, Time_wait); 2350 tcb->timer.start = MSL2 * (1000/MSPTICK); 2351 tcpgo(tpriv, &tcb->timer); 2352 break; 2353 case Close_wait: 2354 case Closing: 2355 case Last_ack: 2356 break; 2357 case Time_wait: 2358 tcpgo(tpriv, &tcb->timer); 2359 break; 2360 } 2361 } 2362 2363 /* 2364 * get next adjacent segment from the resequence queue. 2365 * dump/trim any overlapping segments 2366 */ 2367 for(;;) { 2368 if(tcb->reseq == nil) 2369 goto output; 2370 2371 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) 2372 goto output; 2373 2374 getreseq(tcb, &seg, &bp, &length); 2375 2376 if(tcptrim(tcb, &seg, &bp, &length) == 0) 2377 break; 2378 } 2379 } 2380 output: 2381 tcpoutput(s); 2382 qunlock(s); 2383 poperror(); 2384 return; 2385 raise: 2386 qunlock(s); 2387 poperror(); 2388 freeblist(bp); 2389 tcpkick(s); 2390 } 2391 2392 /* 2393 * always enters and exits with the s locked. We drop 2394 * the lock to ipoput the packet so some care has to be 2395 * taken by callers. 2396 */ 2397 void 2398 tcpoutput(Conv *s) 2399 { 2400 Tcp seg; 2401 int msgs; 2402 Tcpctl *tcb; 2403 Block *hbp, *bp; 2404 int sndcnt, n; 2405 ulong ssize, dsize, usable, sent; 2406 Fs *f; 2407 Tcppriv *tpriv; 2408 uchar version; 2409 2410 f = s->p->f; 2411 tpriv = s->p->priv; 2412 version = s->ipversion; 2413 2414 for(msgs = 0; msgs < 100; msgs++) { 2415 tcb = (Tcpctl*)s->ptcl; 2416 2417 switch(tcb->state) { 2418 case Listen: 2419 case Closed: 2420 case Finwait2: 2421 return; 2422 } 2423 2424 /* force an ack when a window has opened up */ 2425 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ 2426 tcb->rcv.blocked = 0; 2427 tcb->flags |= FORCE; 2428 } 2429 2430 sndcnt = qlen(s->wq)+tcb->flgcnt; 2431 sent = tcb->snd.ptr - tcb->snd.una; 2432 2433 /* Don't send anything else until our SYN has been acked */ 2434 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) 2435 break; 2436 2437 /* Compute usable segment based on offered window and limit 2438 * window probes to one 2439 */ 2440 if(tcb->snd.wnd == 0){ 2441 if(sent != 0) { 2442 if((tcb->flags&FORCE) == 0) 2443 break; 2444 // tcb->snd.ptr = tcb->snd.una; 2445 } 2446 usable = 1; 2447 } 2448 else { 2449 usable = tcb->cwind; 2450 if(tcb->snd.wnd < usable) 2451 usable = tcb->snd.wnd; 2452 usable -= sent; 2453 } 2454 ssize = sndcnt-sent; 2455 if(ssize && usable < 2) 2456 netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n", 2457 tcb->snd.wnd, tcb->cwind); 2458 if(usable < ssize) 2459 ssize = usable; 2460 if(tcb->mss < ssize) 2461 ssize = tcb->mss; 2462 dsize = ssize; 2463 seg.urg = 0; 2464 2465 if(ssize == 0) 2466 if((tcb->flags&FORCE) == 0) 2467 break; 2468 2469 tcb->flags &= ~FORCE; 2470 tcprcvwin(s); 2471 2472 /* By default we will generate an ack */ 2473 tcphalt(tpriv, &tcb->acktimer); 2474 tcb->rcv.una = 0; 2475 seg.source = s->lport; 2476 seg.dest = s->rport; 2477 seg.flags = ACK; 2478 seg.mss = 0; 2479 seg.ws = 0; 2480 switch(tcb->state){ 2481 case Syn_sent: 2482 seg.flags = 0; 2483 if(tcb->snd.ptr == tcb->iss){ 2484 seg.flags |= SYN; 2485 dsize--; 2486 seg.mss = tcb->mss; 2487 seg.ws = tcb->scale; 2488 } 2489 break; 2490 case Syn_received: 2491 /* 2492 * don't send any data with a SYN/ACK packet 2493 * because Linux rejects the packet in its 2494 * attempt to solve the SYN attack problem 2495 */ 2496 if(tcb->snd.ptr == tcb->iss){ 2497 seg.flags |= SYN; 2498 dsize = 0; 2499 ssize = 1; 2500 seg.mss = tcb->mss; 2501 seg.ws = tcb->scale; 2502 } 2503 break; 2504 } 2505 seg.seq = tcb->snd.ptr; 2506 seg.ack = tcb->rcv.nxt; 2507 seg.wnd = tcb->rcv.wnd; 2508 2509 /* Pull out data to send */ 2510 bp = nil; 2511 if(dsize != 0) { 2512 bp = qcopy(s->wq, dsize, sent); 2513 if(BLEN(bp) != dsize) { 2514 seg.flags |= FIN; 2515 dsize--; 2516 } 2517 } 2518 2519 if(sent+dsize == sndcnt) 2520 seg.flags |= PSH; 2521 2522 /* keep track of balance of resent data */ 2523 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { 2524 n = tcb->snd.nxt - tcb->snd.ptr; 2525 if(ssize < n) 2526 n = ssize; 2527 tcb->resent += n; 2528 netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n", 2529 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); 2530 tpriv->stats[RetransSegs]++; 2531 } 2532 2533 tcb->snd.ptr += ssize; 2534 2535 /* Pull up the send pointer so we can accept acks 2536 * for this window 2537 */ 2538 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt)) 2539 tcb->snd.nxt = tcb->snd.ptr; 2540 2541 /* Build header, link data and compute cksum */ 2542 switch(version){ 2543 case V4: 2544 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2545 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); 2546 if(hbp == nil) { 2547 freeblist(bp); 2548 return; 2549 } 2550 break; 2551 case V6: 2552 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2553 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); 2554 if(hbp == nil) { 2555 freeblist(bp); 2556 return; 2557 } 2558 break; 2559 default: 2560 hbp = nil; /* to suppress a warning */ 2561 panic("tcpoutput: version %d", version); 2562 } 2563 2564 /* Start the transmission timers if there is new data and we 2565 * expect acknowledges 2566 */ 2567 if(ssize != 0){ 2568 if(tcb->timer.state != TcptimerON) 2569 tcpgo(tpriv, &tcb->timer); 2570 2571 /* If round trip timer isn't running, start it. 2572 * measure the longest packet only in case the 2573 * transmission time dominates RTT 2574 */ 2575 if(tcb->rtt_timer.state != TcptimerON) 2576 if(ssize == tcb->mss) { 2577 tcpgo(tpriv, &tcb->rtt_timer); 2578 tcb->rttseq = tcb->snd.ptr; 2579 } 2580 } 2581 2582 tpriv->stats[OutSegs]++; 2583 2584 /* put off the next keep alive */ 2585 tcpgo(tpriv, &tcb->katimer); 2586 2587 switch(version){ 2588 case V4: 2589 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2590 /* a negative return means no route */ 2591 localclose(s, "no route"); 2592 } 2593 break; 2594 case V6: 2595 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2596 /* a negative return means no route */ 2597 localclose(s, "no route"); 2598 } 2599 break; 2600 default: 2601 panic("tcpoutput2: version %d", version); 2602 } 2603 if((msgs%4) == 1){ 2604 qunlock(s); 2605 sched(); 2606 qlock(s); 2607 } 2608 } 2609 } 2610 2611 /* 2612 * the BSD convention (hack?) for keep alives. resend last uchar acked. 2613 */ 2614 void 2615 tcpsendka(Conv *s) 2616 { 2617 Tcp seg; 2618 Tcpctl *tcb; 2619 Block *hbp,*dbp; 2620 2621 tcb = (Tcpctl*)s->ptcl; 2622 2623 dbp = nil; 2624 seg.urg = 0; 2625 seg.source = s->lport; 2626 seg.dest = s->rport; 2627 seg.flags = ACK|PSH; 2628 seg.mss = 0; 2629 seg.ws = 0; 2630 if(tcpporthogdefense) 2631 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); 2632 else 2633 seg.seq = tcb->snd.una-1; 2634 seg.ack = tcb->rcv.nxt; 2635 tcb->rcv.una = 0; 2636 seg.wnd = tcb->rcv.wnd; 2637 if(tcb->state == Finwait2){ 2638 seg.flags |= FIN; 2639 } else { 2640 dbp = allocb(1); 2641 dbp->wp++; 2642 } 2643 2644 if(isv4(s->raddr)) { 2645 /* Build header, link data and compute cksum */ 2646 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2647 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); 2648 if(hbp == nil) { 2649 freeblist(dbp); 2650 return; 2651 } 2652 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 2653 } 2654 else { 2655 /* Build header, link data and compute cksum */ 2656 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2657 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); 2658 if(hbp == nil) { 2659 freeblist(dbp); 2660 return; 2661 } 2662 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 2663 } 2664 } 2665 2666 /* 2667 * set connection to time out after 12 minutes 2668 */ 2669 void 2670 tcpsetkacounter(Tcpctl *tcb) 2671 { 2672 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); 2673 if(tcb->kacounter < 3) 2674 tcb->kacounter = 3; 2675 } 2676 2677 /* 2678 * if we've timed out, close the connection 2679 * otherwise, send a keepalive and restart the timer 2680 */ 2681 void 2682 tcpkeepalive(void *v) 2683 { 2684 Tcpctl *tcb; 2685 Conv *s; 2686 2687 s = v; 2688 tcb = (Tcpctl*)s->ptcl; 2689 if(waserror()){ 2690 qunlock(s); 2691 nexterror(); 2692 } 2693 qlock(s); 2694 if(tcb->state != Closed){ 2695 if(--(tcb->kacounter) <= 0) { 2696 localclose(s, Etimedout); 2697 } else { 2698 tcpsendka(s); 2699 tcpgo(s->p->priv, &tcb->katimer); 2700 } 2701 } 2702 qunlock(s); 2703 poperror(); 2704 } 2705 2706 /* 2707 * start keepalive timer 2708 */ 2709 char* 2710 tcpstartka(Conv *s, char **f, int n) 2711 { 2712 Tcpctl *tcb; 2713 int x; 2714 2715 tcb = (Tcpctl*)s->ptcl; 2716 if(tcb->state != Established) 2717 return "connection must be in Establised state"; 2718 if(n > 1){ 2719 x = atoi(f[1]); 2720 if(x >= MSPTICK) 2721 tcb->katimer.start = x/MSPTICK; 2722 } 2723 tcpsetkacounter(tcb); 2724 tcpgo(s->p->priv, &tcb->katimer); 2725 2726 return nil; 2727 } 2728 2729 /* 2730 * turn checksums on/off 2731 */ 2732 char* 2733 tcpsetchecksum(Conv *s, char **f, int) 2734 { 2735 Tcpctl *tcb; 2736 2737 tcb = (Tcpctl*)s->ptcl; 2738 tcb->nochecksum = !atoi(f[1]); 2739 2740 return nil; 2741 } 2742 2743 void 2744 tcprxmit(Conv *s) 2745 { 2746 Tcpctl *tcb; 2747 2748 tcb = (Tcpctl*)s->ptcl; 2749 2750 tcb->flags |= RETRAN|FORCE; 2751 tcb->snd.ptr = tcb->snd.una; 2752 2753 /* 2754 * We should be halving the slow start threshhold (down to one 2755 * mss) but leaving it at mss seems to work well enough 2756 */ 2757 tcb->ssthresh = tcb->mss; 2758 2759 /* 2760 * pull window down to a single packet 2761 */ 2762 tcb->cwind = tcb->mss; 2763 tcpoutput(s); 2764 } 2765 2766 void 2767 tcptimeout(void *arg) 2768 { 2769 Conv *s; 2770 Tcpctl *tcb; 2771 int maxback; 2772 Tcppriv *tpriv; 2773 2774 s = (Conv*)arg; 2775 tpriv = s->p->priv; 2776 tcb = (Tcpctl*)s->ptcl; 2777 2778 if(waserror()){ 2779 qunlock(s); 2780 nexterror(); 2781 } 2782 qlock(s); 2783 switch(tcb->state){ 2784 default: 2785 tcb->backoff++; 2786 if(tcb->state == Syn_sent) 2787 maxback = MAXBACKMS/2; 2788 else 2789 maxback = MAXBACKMS; 2790 tcb->backedoff += tcb->timer.start * MSPTICK; 2791 if(tcb->backedoff >= maxback) { 2792 localclose(s, Etimedout); 2793 break; 2794 } 2795 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); 2796 tcpsettimer(tcb); 2797 tcprxmit(s); 2798 tpriv->stats[RetransTimeouts]++; 2799 tcb->snd.dupacks = 0; 2800 break; 2801 case Time_wait: 2802 localclose(s, nil); 2803 break; 2804 case Closed: 2805 break; 2806 } 2807 qunlock(s); 2808 poperror(); 2809 } 2810 2811 int 2812 inwindow(Tcpctl *tcb, int seq) 2813 { 2814 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); 2815 } 2816 2817 /* 2818 * set up state for a received SYN (or SYN ACK) packet 2819 */ 2820 void 2821 procsyn(Conv *s, Tcp *seg) 2822 { 2823 Tcpctl *tcb; 2824 2825 tcb = (Tcpctl*)s->ptcl; 2826 tcb->flags |= FORCE; 2827 2828 tcb->rcv.nxt = seg->seq + 1; 2829 tcb->rcv.urg = tcb->rcv.nxt; 2830 tcb->irs = seg->seq; 2831 2832 /* our sending max segment size cannot be bigger than what he asked for */ 2833 if(seg->mss != 0 && seg->mss < tcb->mss) 2834 tcb->mss = seg->mss; 2835 2836 /* the congestion window always starts out as a single segment */ 2837 tcb->snd.wnd = seg->wnd; 2838 tcb->cwind = tcb->mss; 2839 } 2840 2841 int 2842 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) 2843 { 2844 Reseq *rp, *rp1; 2845 int i, rqlen, qmax; 2846 2847 rp = malloc(sizeof(Reseq)); 2848 if(rp == nil){ 2849 freeblist(bp); /* bp always consumed by add_reseq */ 2850 return 0; 2851 } 2852 2853 rp->seg = *seg; 2854 rp->bp = bp; 2855 rp->length = length; 2856 2857 /* Place on reassembly list sorting by starting seq number */ 2858 rp1 = tcb->reseq; 2859 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { 2860 rp->next = rp1; 2861 tcb->reseq = rp; 2862 if(rp->next != nil) 2863 tpriv->stats[OutOfOrder]++; 2864 return 0; 2865 } 2866 2867 rqlen = 0; 2868 for(i = 0;; i++) { 2869 rqlen += rp1->length; 2870 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { 2871 rp->next = rp1->next; 2872 rp1->next = rp; 2873 if(rp->next != nil) 2874 tpriv->stats[OutOfOrder]++; 2875 break; 2876 } 2877 rp1 = rp1->next; 2878 } 2879 qmax = QMAX<<tcb->rcv.scale; 2880 if(rqlen > qmax){ 2881 print("resequence queue > window: %d > %d\n", rqlen, qmax); 2882 i = 0; 2883 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ 2884 print("%#lux %#lux %#ux\n", rp1->seg.seq, 2885 rp1->seg.ack, rp1->seg.flags); 2886 if(i++ > 10){ 2887 print("...\n"); 2888 break; 2889 } 2890 } 2891 2892 // delete entire reassembly queue; wait for retransmit. 2893 // - should we be smarter and only delete the tail? 2894 for(rp = tcb->reseq; rp != nil; rp = rp1){ 2895 rp1 = rp->next; 2896 freeblist(rp->bp); 2897 free(rp); 2898 } 2899 tcb->reseq = nil; 2900 2901 return -1; 2902 } 2903 return 0; 2904 } 2905 2906 void 2907 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2908 { 2909 Reseq *rp; 2910 2911 rp = tcb->reseq; 2912 if(rp == nil) 2913 return; 2914 2915 tcb->reseq = rp->next; 2916 2917 *seg = rp->seg; 2918 *bp = rp->bp; 2919 *length = rp->length; 2920 2921 free(rp); 2922 } 2923 2924 int 2925 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2926 { 2927 ushort len; 2928 uchar accept; 2929 int dupcnt, excess; 2930 2931 accept = 0; 2932 len = *length; 2933 if(seg->flags & SYN) 2934 len++; 2935 if(seg->flags & FIN) 2936 len++; 2937 2938 if(tcb->rcv.wnd == 0) { 2939 if(len == 0 && seg->seq == tcb->rcv.nxt) 2940 return 0; 2941 } 2942 else { 2943 /* Some part of the segment should be in the window */ 2944 if(inwindow(tcb,seg->seq)) 2945 accept++; 2946 else 2947 if(len != 0) { 2948 if(inwindow(tcb, seg->seq+len-1) || 2949 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1)) 2950 accept++; 2951 } 2952 } 2953 if(!accept) { 2954 freeblist(*bp); 2955 return -1; 2956 } 2957 dupcnt = tcb->rcv.nxt - seg->seq; 2958 if(dupcnt > 0){ 2959 tcb->rerecv += dupcnt; 2960 if(seg->flags & SYN){ 2961 seg->flags &= ~SYN; 2962 seg->seq++; 2963 2964 if(seg->urg > 1) 2965 seg->urg--; 2966 else 2967 seg->flags &= ~URG; 2968 dupcnt--; 2969 } 2970 if(dupcnt > 0){ 2971 pullblock(bp, (ushort)dupcnt); 2972 seg->seq += dupcnt; 2973 *length -= dupcnt; 2974 2975 if(seg->urg > dupcnt) 2976 seg->urg -= dupcnt; 2977 else { 2978 seg->flags &= ~URG; 2979 seg->urg = 0; 2980 } 2981 } 2982 } 2983 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); 2984 if(excess > 0) { 2985 tcb->rerecv += excess; 2986 *length -= excess; 2987 *bp = trimblock(*bp, 0, *length); 2988 if(*bp == nil) 2989 panic("presotto is a boofhead"); 2990 seg->flags &= ~FIN; 2991 } 2992 return 0; 2993 } 2994 2995 void 2996 tcpadvise(Proto *tcp, Block *bp, char *msg) 2997 { 2998 Tcp4hdr *h4; 2999 Tcp6hdr *h6; 3000 Tcpctl *tcb; 3001 uchar source[IPaddrlen]; 3002 uchar dest[IPaddrlen]; 3003 ushort psource, pdest; 3004 Conv *s, **p; 3005 3006 h4 = (Tcp4hdr*)(bp->rp); 3007 h6 = (Tcp6hdr*)(bp->rp); 3008 3009 if((h4->vihl&0xF0)==IP_VER4) { 3010 v4tov6(dest, h4->tcpdst); 3011 v4tov6(source, h4->tcpsrc); 3012 psource = nhgets(h4->tcpsport); 3013 pdest = nhgets(h4->tcpdport); 3014 } 3015 else { 3016 ipmove(dest, h6->tcpdst); 3017 ipmove(source, h6->tcpsrc); 3018 psource = nhgets(h6->tcpsport); 3019 pdest = nhgets(h6->tcpdport); 3020 } 3021 3022 /* Look for a connection */ 3023 qlock(tcp); 3024 for(p = tcp->conv; *p; p++) { 3025 s = *p; 3026 tcb = (Tcpctl*)s->ptcl; 3027 if(s->rport == pdest) 3028 if(s->lport == psource) 3029 if(tcb->state != Closed) 3030 if(ipcmp(s->raddr, dest) == 0) 3031 if(ipcmp(s->laddr, source) == 0){ 3032 qlock(s); 3033 qunlock(tcp); 3034 switch(tcb->state){ 3035 case Syn_sent: 3036 localclose(s, msg); 3037 break; 3038 } 3039 qunlock(s); 3040 freeblist(bp); 3041 return; 3042 } 3043 } 3044 qunlock(tcp); 3045 freeblist(bp); 3046 } 3047 3048 static char* 3049 tcpporthogdefensectl(char *val) 3050 { 3051 if(strcmp(val, "on") == 0) 3052 tcpporthogdefense = 1; 3053 else if(strcmp(val, "off") == 0) 3054 tcpporthogdefense = 0; 3055 else 3056 return "unknown value for tcpporthogdefense"; 3057 return nil; 3058 } 3059 3060 /* called with c qlocked */ 3061 char* 3062 tcpctl(Conv* c, char** f, int n) 3063 { 3064 if(n == 1 && strcmp(f[0], "hangup") == 0) 3065 return tcphangup(c); 3066 if(n >= 1 && strcmp(f[0], "keepalive") == 0) 3067 return tcpstartka(c, f, n); 3068 if(n >= 1 && strcmp(f[0], "checksum") == 0) 3069 return tcpsetchecksum(c, f, n); 3070 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) 3071 return tcpporthogdefensectl(f[1]); 3072 return "unknown control request"; 3073 } 3074 3075 int 3076 tcpstats(Proto *tcp, char *buf, int len) 3077 { 3078 Tcppriv *priv; 3079 char *p, *e; 3080 int i; 3081 3082 priv = tcp->priv; 3083 p = buf; 3084 e = p+len; 3085 for(i = 0; i < Nstats; i++) 3086 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); 3087 return p - buf; 3088 } 3089 3090 /* 3091 * garbage collect any stale conversations: 3092 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) 3093 * - Finwait2 after 5 minutes 3094 * 3095 * this is called whenever we run out of channels. Both checks are 3096 * of questionable validity so we try to use them only when we're 3097 * up against the wall. 3098 */ 3099 int 3100 tcpgc(Proto *tcp) 3101 { 3102 Conv *c, **pp, **ep; 3103 int n; 3104 Tcpctl *tcb; 3105 3106 3107 n = 0; 3108 ep = &tcp->conv[tcp->nc]; 3109 for(pp = tcp->conv; pp < ep; pp++) { 3110 c = *pp; 3111 if(c == nil) 3112 break; 3113 if(!canqlock(c)) 3114 continue; 3115 tcb = (Tcpctl*)c->ptcl; 3116 switch(tcb->state){ 3117 case Syn_received: 3118 if(NOW - tcb->time > 5000){ 3119 localclose(c, "timed out"); 3120 n++; 3121 } 3122 break; 3123 case Finwait2: 3124 if(NOW - tcb->time > 5*60*1000){ 3125 localclose(c, "timed out"); 3126 n++; 3127 } 3128 break; 3129 } 3130 qunlock(c); 3131 } 3132 return n; 3133 } 3134 3135 void 3136 tcpsettimer(Tcpctl *tcb) 3137 { 3138 int x; 3139 3140 /* round trip dependency */ 3141 x = backoff(tcb->backoff) * 3142 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; 3143 3144 /* bounded twixt 1/2 and 64 seconds */ 3145 if(x < 500/MSPTICK) 3146 x = 500/MSPTICK; 3147 else if(x > (64000/MSPTICK)) 3148 x = 64000/MSPTICK; 3149 tcb->timer.start = x; 3150 } 3151 3152 void 3153 tcpinit(Fs *fs) 3154 { 3155 Proto *tcp; 3156 Tcppriv *tpriv; 3157 3158 tcp = smalloc(sizeof(Proto)); 3159 tpriv = tcp->priv = smalloc(sizeof(Tcppriv)); 3160 tcp->name = "tcp"; 3161 tcp->connect = tcpconnect; 3162 tcp->announce = tcpannounce; 3163 tcp->ctl = tcpctl; 3164 tcp->state = tcpstate; 3165 tcp->create = tcpcreate; 3166 tcp->close = tcpclose; 3167 tcp->rcv = tcpiput; 3168 tcp->advise = tcpadvise; 3169 tcp->stats = tcpstats; 3170 tcp->inuse = tcpinuse; 3171 tcp->gc = tcpgc; 3172 tcp->ipproto = IP_TCPPROTO; 3173 tcp->nc = scalednconv(); 3174 tcp->ptclsize = sizeof(Tcpctl); 3175 tpriv->stats[MaxConn] = tcp->nc; 3176 3177 Fsproto(fs, tcp); 3178 } 3179 3180 void 3181 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) 3182 { 3183 if(rcvscale){ 3184 tcb->rcv.scale = rcvscale & 0xff; 3185 tcb->snd.scale = sndscale & 0xff; 3186 tcb->window = QMAX<<tcb->snd.scale; 3187 qsetlimit(s->rq, tcb->window); 3188 } else { 3189 tcb->rcv.scale = 0; 3190 tcb->snd.scale = 0; 3191 tcb->window = QMAX; 3192 qsetlimit(s->rq, tcb->window); 3193 } 3194 } 3195