1 #include "u.h" 2 #include "../port/lib.h" 3 #include "mem.h" 4 #include "dat.h" 5 #include "fns.h" 6 #include "../port/error.h" 7 8 #include "ip.h" 9 10 enum 11 { 12 QMAX = 64*1024-1, 13 IP_TCPPROTO = 6, 14 15 TCP4_IPLEN = 8, 16 TCP4_PHDRSIZE = 12, 17 TCP4_HDRSIZE = 20, 18 TCP4_TCBPHDRSZ = 40, 19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE, 20 21 TCP6_IPLEN = 0, 22 TCP6_PHDRSIZE = 40, 23 TCP6_HDRSIZE = 20, 24 TCP6_TCBPHDRSZ = 60, 25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE, 26 27 TcptimerOFF = 0, 28 TcptimerON = 1, 29 TcptimerDONE = 2, 30 MAX_TIME = (1<<20), /* Forever */ 31 TCP_ACK = 50, /* Timed ack sequence in ms */ 32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */ 33 34 URG = 0x20, /* Data marked urgent */ 35 ACK = 0x10, /* Acknowledge is valid */ 36 PSH = 0x08, /* Whole data pipe is pushed */ 37 RST = 0x04, /* Reset connection */ 38 SYN = 0x02, /* Pkt. is synchronise */ 39 FIN = 0x01, /* Start close down */ 40 41 EOLOPT = 0, 42 NOOPOPT = 1, 43 MSSOPT = 2, 44 MSS_LENGTH = 4, /* Mean segment size */ 45 WSOPT = 3, 46 WS_LENGTH = 3, /* Bits to scale window size by */ 47 MSL2 = 10, 48 MSPTICK = 50, /* Milliseconds per timer tick */ 49 DEF_MSS = 1460, /* Default mean segment */ 50 DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */ 51 DEF_RTT = 500, /* Default round trip */ 52 DEF_KAT = 120000, /* Default time (ms) between keep alives */ 53 TCP_LISTEN = 0, /* Listen connection */ 54 TCP_CONNECT = 1, /* Outgoing connection */ 55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ 56 57 TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */ 58 59 FORCE = 1, 60 CLONE = 2, 61 RETRAN = 4, 62 ACTIVE = 8, 63 SYNACK = 16, 64 65 LOGAGAIN = 3, 66 LOGDGAIN = 2, 67 68 Closed = 0, /* Connection states */ 69 Listen, 70 Syn_sent, 71 Syn_received, 72 Established, 73 Finwait1, 74 Finwait2, 75 Close_wait, 76 Closing, 77 Last_ack, 78 Time_wait, 79 80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ 81 NLHT = 256, /* hash table size, must be a 
power of 2 */ 82 LHTMASK = NLHT-1, 83 84 HaveWS = 1<<8, 85 }; 86 87 /* Must correspond to the enumeration above */ 88 char *tcpstates[] = 89 { 90 "Closed", "Listen", "Syn_sent", "Syn_received", 91 "Established", "Finwait1", "Finwait2", "Close_wait", 92 "Closing", "Last_ack", "Time_wait" 93 }; 94 95 typedef struct Tcptimer Tcptimer; 96 struct Tcptimer 97 { 98 Tcptimer *next; 99 Tcptimer *prev; 100 Tcptimer *readynext; 101 int state; 102 int start; 103 int count; 104 void (*func)(void*); 105 void *arg; 106 }; 107 108 /* 109 * v4 and v6 pseudo headers used for 110 * checksuming tcp 111 */ 112 typedef struct Tcp4hdr Tcp4hdr; 113 struct Tcp4hdr 114 { 115 uchar vihl; /* Version and header length */ 116 uchar tos; /* Type of service */ 117 uchar length[2]; /* packet length */ 118 uchar id[2]; /* Identification */ 119 uchar frag[2]; /* Fragment information */ 120 uchar Unused; 121 uchar proto; 122 uchar tcplen[2]; 123 uchar tcpsrc[4]; 124 uchar tcpdst[4]; 125 uchar tcpsport[2]; 126 uchar tcpdport[2]; 127 uchar tcpseq[4]; 128 uchar tcpack[4]; 129 uchar tcpflag[2]; 130 uchar tcpwin[2]; 131 uchar tcpcksum[2]; 132 uchar tcpurg[2]; 133 /* Options segment */ 134 uchar tcpopt[1]; 135 }; 136 137 typedef struct Tcp6hdr Tcp6hdr; 138 struct Tcp6hdr 139 { 140 uchar vcf[4]; 141 uchar ploadlen[2]; 142 uchar proto; 143 uchar ttl; 144 uchar tcpsrc[IPaddrlen]; 145 uchar tcpdst[IPaddrlen]; 146 uchar tcpsport[2]; 147 uchar tcpdport[2]; 148 uchar tcpseq[4]; 149 uchar tcpack[4]; 150 uchar tcpflag[2]; 151 uchar tcpwin[2]; 152 uchar tcpcksum[2]; 153 uchar tcpurg[2]; 154 /* Options segment */ 155 uchar tcpopt[1]; 156 }; 157 158 /* 159 * this represents the control info 160 * for a single packet. It is derived from 161 * a packet in ntohtcp{4,6}() and stuck into 162 * a packet in htontcp{4,6}(). 
163 */ 164 typedef struct Tcp Tcp; 165 struct Tcp 166 { 167 ushort source; 168 ushort dest; 169 ulong seq; 170 ulong ack; 171 uchar flags; 172 ushort ws; /* window scale option (if not zero) */ 173 ulong wnd; 174 ushort urg; 175 ushort mss; /* max segment size option (if not zero) */ 176 ushort len; /* size of data */ 177 }; 178 179 /* 180 * this header is malloc'd to thread together fragments 181 * waiting to be coalesced 182 */ 183 typedef struct Reseq Reseq; 184 struct Reseq 185 { 186 Reseq *next; 187 Tcp seg; 188 Block *bp; 189 ushort length; 190 }; 191 192 /* 193 * the qlock in the Conv locks this structure 194 */ 195 typedef struct Tcpctl Tcpctl; 196 struct Tcpctl 197 { 198 uchar state; /* Connection state */ 199 uchar type; /* Listening or active connection */ 200 uchar code; /* Icmp code */ 201 struct { 202 ulong una; /* Unacked data pointer */ 203 ulong nxt; /* Next sequence expected */ 204 ulong ptr; /* Data pointer */ 205 ulong wnd; /* Tcp send window */ 206 ulong urg; /* Urgent data pointer */ 207 ulong wl2; 208 int scale; /* how much to right shift window in xmitted packets */ 209 /* to implement tahoe and reno TCP */ 210 ulong dupacks; /* number of duplicate acks rcvd */ 211 int recovery; /* loss recovery flag */ 212 ulong rxt; /* right window marker for recovery */ 213 } snd; 214 struct { 215 ulong nxt; /* Receive pointer to next uchar slot */ 216 ulong wnd; /* Receive window incoming */ 217 ulong urg; /* Urgent pointer */ 218 int blocked; 219 int una; /* unacked data segs */ 220 int scale; /* how much to left shift window in rcved packets */ 221 } rcv; 222 ulong iss; /* Initial sequence number */ 223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ 224 ulong cwind; /* Congestion window */ 225 int scale; /* desired snd.scale */ 226 ushort ssthresh; /* Slow start threshold */ 227 int resent; /* Bytes just resent */ 228 int irs; /* Initial received squence */ 229 ushort mss; /* Mean segment size */ 230 int rerecv; /* Overlap of data 
rerecevived */ 231 ulong window; /* Recevive window */ 232 uchar backoff; /* Exponential backoff counter */ 233 int backedoff; /* ms we've backed off for rexmits */ 234 uchar flags; /* State flags */ 235 Reseq *reseq; /* Resequencing queue */ 236 Tcptimer timer; /* Activity timer */ 237 Tcptimer acktimer; /* Acknowledge timer */ 238 Tcptimer rtt_timer; /* Round trip timer */ 239 Tcptimer katimer; /* keep alive timer */ 240 ulong rttseq; /* Round trip sequence */ 241 int srtt; /* Shortened round trip */ 242 int mdev; /* Mean deviation of round trip */ 243 int kacounter; /* count down for keep alive */ 244 uint sndsyntime; /* time syn sent */ 245 ulong time; /* time Finwait2 or Syn_received was sent */ 246 int nochecksum; /* non-zero means don't send checksums */ 247 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ 248 249 union { 250 Tcp4hdr tcp4hdr; 251 Tcp6hdr tcp6hdr; 252 } protohdr; /* prototype header */ 253 }; 254 255 /* 256 * New calls are put in limbo rather than having a conversation structure 257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not 258 * any real Conv structures mucking things up. Calls in limbo rexmit their 259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. 260 * 261 * In particular they aren't on a listener's queue so that they don't figure 262 * in the input queue limit. 263 * 264 * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue 265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore 266 * there is no hashing of this list. 
267 */ 268 typedef struct Limbo Limbo; 269 struct Limbo 270 { 271 Limbo *next; 272 273 uchar laddr[IPaddrlen]; 274 uchar raddr[IPaddrlen]; 275 ushort lport; 276 ushort rport; 277 ulong irs; /* initial received sequence */ 278 ulong iss; /* initial sent sequence */ 279 ushort mss; /* mss from the other end */ 280 ushort rcvscale; /* how much to scale rcvd windows */ 281 ushort sndscale; /* how much to scale sent windows */ 282 ulong lastsend; /* last time we sent a synack */ 283 uchar version; /* v4 or v6 */ 284 uchar rexmits; /* number of retransmissions */ 285 }; 286 287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */ 288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ 289 290 enum { 291 /* MIB stats */ 292 MaxConn, 293 ActiveOpens, 294 PassiveOpens, 295 EstabResets, 296 CurrEstab, 297 InSegs, 298 OutSegs, 299 RetransSegs, 300 RetransTimeouts, 301 InErrs, 302 OutRsts, 303 304 /* non-MIB stats */ 305 CsumErrs, 306 HlenErrs, 307 LenErrs, 308 OutOfOrder, 309 310 Nstats 311 }; 312 313 static char *statnames[] = 314 { 315 [MaxConn] "MaxConn", 316 [ActiveOpens] "ActiveOpens", 317 [PassiveOpens] "PassiveOpens", 318 [EstabResets] "EstabResets", 319 [CurrEstab] "CurrEstab", 320 [InSegs] "InSegs", 321 [OutSegs] "OutSegs", 322 [RetransSegs] "RetransSegs", 323 [RetransTimeouts] "RetransTimeouts", 324 [InErrs] "InErrs", 325 [OutRsts] "OutRsts", 326 [CsumErrs] "CsumErrs", 327 [HlenErrs] "HlenErrs", 328 [LenErrs] "LenErrs", 329 [OutOfOrder] "OutOfOrder", 330 }; 331 332 typedef struct Tcppriv Tcppriv; 333 struct Tcppriv 334 { 335 /* List of active timers */ 336 QLock tl; 337 Tcptimer *timers; 338 339 /* hash table for matching conversations */ 340 Ipht ht; 341 342 /* calls in limbo waiting for an ACK to our SYN ACK */ 343 int nlimbo; 344 Limbo *lht[NLHT]; 345 346 /* for keeping track of tcpackproc */ 347 QLock apl; 348 int ackprocstarted; 349 350 ulong stats[Nstats]; 351 }; 352 353 /* 354 * Setting tcpporthogdefense to non-zero enables Dong Lin's 
355 * solution to hijacked systems staking out port's as a form 356 * of DoS attack. 357 * 358 * To avoid stateless Conv hogs, we pick a sequence number at random. If 359 * it that number gets acked by the other end, we shut down the connection. 360 * Look for tcpporthogedefense in the code. 361 */ 362 int tcpporthogdefense = 0; 363 364 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); 365 void getreseq(Tcpctl*, Tcp*, Block**, ushort*); 366 void localclose(Conv*, char*); 367 void procsyn(Conv*, Tcp*); 368 void tcpiput(Proto*, Ipifc*, Block*); 369 void tcpoutput(Conv*); 370 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); 371 void tcpstart(Conv*, int); 372 void tcptimeout(void*); 373 void tcpsndsyn(Conv*, Tcpctl*); 374 void tcprcvwin(Conv*); 375 void tcpacktimer(void*); 376 void tcpkeepalive(void*); 377 void tcpsetkacounter(Tcpctl*); 378 void tcprxmit(Conv*); 379 void tcpsettimer(Tcpctl*); 380 void tcpsynackrtt(Conv*); 381 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); 382 383 static void limborexmit(Proto*); 384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int); 385 386 void 387 tcpsetstate(Conv *s, uchar newstate) 388 { 389 Tcpctl *tcb; 390 uchar oldstate; 391 Tcppriv *tpriv; 392 393 tpriv = s->p->priv; 394 395 tcb = (Tcpctl*)s->ptcl; 396 397 oldstate = tcb->state; 398 if(oldstate == newstate) 399 return; 400 401 if(oldstate == Established) 402 tpriv->stats[CurrEstab]--; 403 if(newstate == Established) 404 tpriv->stats[CurrEstab]++; 405 406 /** 407 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, 408 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); 409 **/ 410 411 switch(newstate) { 412 case Closed: 413 qclose(s->rq); 414 qclose(s->wq); 415 qclose(s->eq); 416 break; 417 418 case Close_wait: /* Remote closes */ 419 qhangup(s->rq, nil); 420 break; 421 } 422 423 tcb->state = newstate; 424 425 if(oldstate == Syn_sent && newstate != Closed) 426 Fsconnected(s, nil); 427 } 428 429 static char* 430 tcpconnect(Conv *c, char **argv, 
int argc) 431 { 432 char *e; 433 434 e = Fsstdconnect(c, argv, argc); 435 if(e != nil) 436 return e; 437 tcpstart(c, TCP_CONNECT); 438 439 return nil; 440 } 441 442 static int 443 tcpstate(Conv *c, char *state, int n) 444 { 445 Tcpctl *s; 446 447 s = (Tcpctl*)(c->ptcl); 448 449 return snprint(state, n, 450 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", 451 tcpstates[s->state], 452 c->rq ? qlen(c->rq) : 0, 453 c->wq ? qlen(c->wq) : 0, 454 s->srtt, s->mdev, 455 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, 456 s->timer.start, s->timer.count, s->rerecv, 457 s->katimer.start, s->katimer.count); 458 } 459 460 static int 461 tcpinuse(Conv *c) 462 { 463 Tcpctl *s; 464 465 s = (Tcpctl*)(c->ptcl); 466 return s->state != Closed; 467 } 468 469 static char* 470 tcpannounce(Conv *c, char **argv, int argc) 471 { 472 char *e; 473 474 e = Fsstdannounce(c, argv, argc); 475 if(e != nil) 476 return e; 477 tcpstart(c, TCP_LISTEN); 478 Fsconnected(c, nil); 479 480 return nil; 481 } 482 483 /* 484 * tcpclose is always called with the q locked 485 */ 486 static void 487 tcpclose(Conv *c) 488 { 489 Tcpctl *tcb; 490 491 tcb = (Tcpctl*)c->ptcl; 492 493 qhangup(c->rq, nil); 494 qhangup(c->wq, nil); 495 qhangup(c->eq, nil); 496 qflush(c->rq); 497 498 switch(tcb->state) { 499 case Listen: 500 /* 501 * reset any incoming calls to this listener 502 */ 503 Fsconnected(c, "Hangup"); 504 505 localclose(c, nil); 506 break; 507 case Closed: 508 case Syn_sent: 509 localclose(c, nil); 510 break; 511 case Syn_received: 512 case Established: 513 tcb->flgcnt++; 514 tcb->snd.nxt++; 515 tcpsetstate(c, Finwait1); 516 tcpoutput(c); 517 break; 518 case Close_wait: 519 tcb->flgcnt++; 520 tcb->snd.nxt++; 521 tcpsetstate(c, Last_ack); 522 tcpoutput(c); 523 break; 524 } 525 } 526 527 void 528 tcpkick(void *x) 529 { 530 Conv *s = x; 531 Tcpctl *tcb; 532 533 tcb = (Tcpctl*)s->ptcl; 534 535 
if(waserror()){ 536 qunlock(s); 537 nexterror(); 538 } 539 qlock(s); 540 541 switch(tcb->state) { 542 case Syn_sent: 543 case Syn_received: 544 case Established: 545 case Close_wait: 546 /* 547 * Push data 548 */ 549 tcprcvwin(s); 550 tcpoutput(s); 551 break; 552 default: 553 localclose(s, "Hangup"); 554 break; 555 } 556 557 qunlock(s); 558 poperror(); 559 } 560 561 void 562 tcprcvwin(Conv *s) /* Call with tcb locked */ 563 { 564 int w; 565 Tcpctl *tcb; 566 567 tcb = (Tcpctl*)s->ptcl; 568 w = tcb->window - qlen(s->rq); 569 if(w < 0) 570 w = 0; 571 tcb->rcv.wnd = w; 572 if(w == 0) 573 tcb->rcv.blocked = 1; 574 } 575 576 void 577 tcpacktimer(void *v) 578 { 579 Tcpctl *tcb; 580 Conv *s; 581 582 s = v; 583 tcb = (Tcpctl*)s->ptcl; 584 585 if(waserror()){ 586 qunlock(s); 587 nexterror(); 588 } 589 qlock(s); 590 if(tcb->state != Closed){ 591 tcb->flags |= FORCE; 592 tcprcvwin(s); 593 tcpoutput(s); 594 } 595 qunlock(s); 596 poperror(); 597 } 598 599 static void 600 tcpcreate(Conv *c) 601 { 602 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); 603 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); 604 } 605 606 static void 607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate) 608 { 609 if(newstate != TcptimerON){ 610 if(t->state == TcptimerON){ 611 /* unchain */ 612 if(priv->timers == t){ 613 priv->timers = t->next; 614 if(t->prev != nil) 615 panic("timerstate1"); 616 } 617 if(t->next) 618 t->next->prev = t->prev; 619 if(t->prev) 620 t->prev->next = t->next; 621 t->next = t->prev = nil; 622 } 623 } else { 624 if(t->state != TcptimerON){ 625 /* chain */ 626 if(t->prev != nil || t->next != nil) 627 panic("timerstate2"); 628 t->prev = nil; 629 t->next = priv->timers; 630 if(t->next) 631 t->next->prev = t; 632 priv->timers = t; 633 } 634 } 635 t->state = newstate; 636 } 637 638 void 639 tcpackproc(void *a) 640 { 641 Tcptimer *t, *tp, *timeo; 642 Proto *tcp; 643 Tcppriv *priv; 644 int loop; 645 646 tcp = a; 647 priv = tcp->priv; 648 649 for(;;) { 650 tsleep(&up->sleep, return0, 0, 
MSPTICK); 651 652 qlock(&priv->tl); 653 timeo = nil; 654 loop = 0; 655 for(t = priv->timers; t != nil; t = tp) { 656 if(loop++ > 10000) 657 panic("tcpackproc1"); 658 tp = t->next; 659 if(t->state == TcptimerON) { 660 t->count--; 661 if(t->count == 0) { 662 timerstate(priv, t, TcptimerDONE); 663 t->readynext = timeo; 664 timeo = t; 665 } 666 } 667 } 668 qunlock(&priv->tl); 669 670 loop = 0; 671 for(t = timeo; t != nil; t = t->readynext) { 672 if(loop++ > 10000) 673 panic("tcpackproc2"); 674 if(t->state == TcptimerDONE && t->func != nil && !waserror()){ 675 (*t->func)(t->arg); 676 poperror(); 677 } 678 } 679 680 limborexmit(tcp); 681 } 682 } 683 684 void 685 tcpgo(Tcppriv *priv, Tcptimer *t) 686 { 687 if(t == nil || t->start == 0) 688 return; 689 690 qlock(&priv->tl); 691 t->count = t->start; 692 timerstate(priv, t, TcptimerON); 693 qunlock(&priv->tl); 694 } 695 696 void 697 tcphalt(Tcppriv *priv, Tcptimer *t) 698 { 699 if(t == nil) 700 return; 701 702 qlock(&priv->tl); 703 timerstate(priv, t, TcptimerOFF); 704 qunlock(&priv->tl); 705 } 706 707 int 708 backoff(int n) 709 { 710 return 1 << n; 711 } 712 713 void 714 localclose(Conv *s, char *reason) /* called with tcb locked */ 715 { 716 Tcpctl *tcb; 717 Reseq *rp,*rp1; 718 Tcppriv *tpriv; 719 720 tpriv = s->p->priv; 721 tcb = (Tcpctl*)s->ptcl; 722 723 iphtrem(&tpriv->ht, s); 724 725 tcphalt(tpriv, &tcb->timer); 726 tcphalt(tpriv, &tcb->rtt_timer); 727 tcphalt(tpriv, &tcb->acktimer); 728 tcphalt(tpriv, &tcb->katimer); 729 730 /* Flush reassembly queue; nothing more can arrive */ 731 for(rp = tcb->reseq; rp != nil; rp = rp1) { 732 rp1 = rp->next; 733 freeblist(rp->bp); 734 free(rp); 735 } 736 tcb->reseq = nil; 737 738 if(tcb->state == Syn_sent) 739 Fsconnected(s, reason); 740 if(s->state == Announced) 741 wakeup(&s->listenr); 742 743 qhangup(s->rq, reason); 744 qhangup(s->wq, reason); 745 746 tcpsetstate(s, Closed); 747 } 748 749 /* mtu (- TCP + IP hdr len) of 1st hop */ 750 int 751 tcpmtu(Proto *tcp, uchar *addr, int 
version, int *scale) 752 { 753 Ipifc *ifc; 754 int mtu; 755 756 ifc = findipifc(tcp->f, addr, 0); 757 switch(version){ 758 default: 759 case V4: 760 mtu = DEF_MSS; 761 if(ifc != nil) 762 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); 763 break; 764 case V6: 765 mtu = DEF_MSS6; 766 if(ifc != nil) 767 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); 768 break; 769 } 770 if(ifc != nil){ 771 if(ifc->mbps > 1000) 772 *scale = HaveWS | 4; 773 else if(ifc->mbps > 100) 774 *scale = HaveWS | 3; 775 else if(ifc->mbps > 10) 776 *scale = HaveWS | 1; 777 else 778 *scale = HaveWS | 0; 779 } else 780 *scale = HaveWS | 0; 781 782 return mtu; 783 } 784 785 void 786 inittcpctl(Conv *s, int mode) 787 { 788 Tcpctl *tcb; 789 Tcp4hdr* h4; 790 Tcp6hdr* h6; 791 int mss; 792 793 tcb = (Tcpctl*)s->ptcl; 794 795 memset(tcb, 0, sizeof(Tcpctl)); 796 797 tcb->ssthresh = 65535; 798 tcb->srtt = tcp_irtt<<LOGAGAIN; 799 tcb->mdev = 0; 800 801 /* setup timers */ 802 tcb->timer.start = tcp_irtt / MSPTICK; 803 tcb->timer.func = tcptimeout; 804 tcb->timer.arg = s; 805 tcb->rtt_timer.start = MAX_TIME; 806 tcb->acktimer.start = TCP_ACK / MSPTICK; 807 tcb->acktimer.func = tcpacktimer; 808 tcb->acktimer.arg = s; 809 tcb->katimer.start = DEF_KAT / MSPTICK; 810 tcb->katimer.func = tcpkeepalive; 811 tcb->katimer.arg = s; 812 813 mss = DEF_MSS; 814 815 /* create a prototype(pseudo) header */ 816 if(mode != TCP_LISTEN){ 817 if(ipcmp(s->laddr, IPnoaddr) == 0) 818 findlocalip(s->p->f, s->laddr, s->raddr); 819 820 switch(s->ipversion){ 821 case V4: 822 h4 = &tcb->protohdr.tcp4hdr; 823 memset(h4, 0, sizeof(*h4)); 824 h4->proto = IP_TCPPROTO; 825 hnputs(h4->tcpsport, s->lport); 826 hnputs(h4->tcpdport, s->rport); 827 v6tov4(h4->tcpsrc, s->laddr); 828 v6tov4(h4->tcpdst, s->raddr); 829 break; 830 case V6: 831 h6 = &tcb->protohdr.tcp6hdr; 832 memset(h6, 0, sizeof(*h6)); 833 h6->proto = IP_TCPPROTO; 834 hnputs(h6->tcpsport, s->lport); 835 hnputs(h6->tcpdport, s->rport); 836 ipmove(h6->tcpsrc, 
s->laddr); 837 ipmove(h6->tcpdst, s->raddr); 838 mss = DEF_MSS6; 839 break; 840 default: 841 panic("inittcpctl: version %d", s->ipversion); 842 } 843 } 844 845 tcb->mss = tcb->cwind = mss; 846 847 /* default is no window scaling */ 848 tcb->window = QMAX; 849 tcb->rcv.wnd = QMAX; 850 tcb->rcv.scale = 0; 851 tcb->snd.scale = 0; 852 qsetlimit(s->rq, QMAX); 853 } 854 855 /* 856 * called with s qlocked 857 */ 858 void 859 tcpstart(Conv *s, int mode) 860 { 861 Tcpctl *tcb; 862 Tcppriv *tpriv; 863 char kpname[KNAMELEN]; 864 865 tpriv = s->p->priv; 866 867 if(tpriv->ackprocstarted == 0){ 868 qlock(&tpriv->apl); 869 if(tpriv->ackprocstarted == 0){ 870 sprint(kpname, "#I%dtcpack", s->p->f->dev); 871 kproc(kpname, tcpackproc, s->p); 872 tpriv->ackprocstarted = 1; 873 } 874 qunlock(&tpriv->apl); 875 } 876 877 tcb = (Tcpctl*)s->ptcl; 878 879 inittcpctl(s, mode); 880 881 iphtadd(&tpriv->ht, s); 882 switch(mode) { 883 case TCP_LISTEN: 884 tpriv->stats[PassiveOpens]++; 885 tcb->flags |= CLONE; 886 tcpsetstate(s, Listen); 887 break; 888 889 case TCP_CONNECT: 890 tpriv->stats[ActiveOpens]++; 891 tcb->flags |= ACTIVE; 892 tcpsndsyn(s, tcb); 893 tcpsetstate(s, Syn_sent); 894 tcpoutput(s); 895 break; 896 } 897 } 898 899 static char* 900 tcpflag(ushort flag) 901 { 902 static char buf[128]; 903 904 sprint(buf, "%d", flag>>10); /* Head len */ 905 if(flag & URG) 906 strcat(buf, " URG"); 907 if(flag & ACK) 908 strcat(buf, " ACK"); 909 if(flag & PSH) 910 strcat(buf, " PSH"); 911 if(flag & RST) 912 strcat(buf, " RST"); 913 if(flag & SYN) 914 strcat(buf, " SYN"); 915 if(flag & FIN) 916 strcat(buf, " FIN"); 917 918 return buf; 919 } 920 921 Block * 922 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) 923 { 924 int dlen; 925 Tcp6hdr *h; 926 ushort csum; 927 ushort hdrlen, optpad = 0; 928 uchar *opt; 929 930 hdrlen = TCP6_HDRSIZE; 931 if(tcph->flags & SYN){ 932 if(tcph->mss) 933 hdrlen += MSS_LENGTH; 934 if(tcph->ws) 935 hdrlen += WS_LENGTH; 936 optpad = hdrlen & 3; 937 if(optpad) 938 
optpad = 4 - optpad; 939 hdrlen += optpad; 940 } 941 942 if(data) { 943 dlen = blocklen(data); 944 data = padblock(data, hdrlen + TCP6_PKT); 945 if(data == nil) 946 return nil; 947 } 948 else { 949 dlen = 0; 950 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ 951 if(data == nil) 952 return nil; 953 data->wp += hdrlen + TCP6_PKT; 954 } 955 956 /* copy in pseudo ip header plus port numbers */ 957 h = (Tcp6hdr *)(data->rp); 958 memmove(h, ph, TCP6_TCBPHDRSZ); 959 960 /* compose pseudo tcp header, do cksum calculation */ 961 hnputl(h->vcf, hdrlen + dlen); 962 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; 963 h->ttl = ph->proto; 964 965 /* copy in variable bits */ 966 hnputl(h->tcpseq, tcph->seq); 967 hnputl(h->tcpack, tcph->ack); 968 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 969 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 970 hnputs(h->tcpurg, tcph->urg); 971 972 if(tcph->flags & SYN){ 973 opt = h->tcpopt; 974 if(tcph->mss != 0){ 975 *opt++ = MSSOPT; 976 *opt++ = MSS_LENGTH; 977 hnputs(opt, tcph->mss); 978 opt += 2; 979 } 980 if(tcph->ws != 0){ 981 *opt++ = WSOPT; 982 *opt++ = WS_LENGTH; 983 *opt++ = tcph->ws; 984 } 985 while(optpad-- > 0) 986 *opt++ = NOOPOPT; 987 } 988 989 if(tcb != nil && tcb->nochecksum){ 990 h->tcpcksum[0] = h->tcpcksum[1] = 0; 991 } else { 992 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); 993 hnputs(h->tcpcksum, csum); 994 } 995 996 /* move from pseudo header back to normal ip header */ 997 memset(h->vcf, 0, 4); 998 h->vcf[0] = IP_VER6; 999 hnputs(h->ploadlen, hdrlen+dlen); 1000 h->proto = ph->proto; 1001 1002 return data; 1003 } 1004 1005 Block * 1006 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) 1007 { 1008 int dlen; 1009 Tcp4hdr *h; 1010 ushort csum; 1011 ushort hdrlen, optpad = 0; 1012 uchar *opt; 1013 1014 hdrlen = TCP4_HDRSIZE; 1015 if(tcph->flags & SYN){ 1016 if(tcph->mss) 1017 hdrlen += MSS_LENGTH; 1018 if(tcph->ws) 1019 hdrlen += WS_LENGTH; 1020 optpad = 
hdrlen & 3; 1021 if(optpad) 1022 optpad = 4 - optpad; 1023 hdrlen += optpad; 1024 } 1025 1026 if(data) { 1027 dlen = blocklen(data); 1028 data = padblock(data, hdrlen + TCP4_PKT); 1029 if(data == nil) 1030 return nil; 1031 } 1032 else { 1033 dlen = 0; 1034 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ 1035 if(data == nil) 1036 return nil; 1037 data->wp += hdrlen + TCP4_PKT; 1038 } 1039 1040 /* copy in pseudo ip header plus port numbers */ 1041 h = (Tcp4hdr *)(data->rp); 1042 memmove(h, ph, TCP4_TCBPHDRSZ); 1043 1044 /* copy in variable bits */ 1045 hnputs(h->tcplen, hdrlen + dlen); 1046 hnputl(h->tcpseq, tcph->seq); 1047 hnputl(h->tcpack, tcph->ack); 1048 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1049 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 1050 hnputs(h->tcpurg, tcph->urg); 1051 1052 if(tcph->flags & SYN){ 1053 opt = h->tcpopt; 1054 if(tcph->mss != 0){ 1055 *opt++ = MSSOPT; 1056 *opt++ = MSS_LENGTH; 1057 hnputs(opt, tcph->mss); 1058 opt += 2; 1059 } 1060 if(tcph->ws != 0){ 1061 *opt++ = WSOPT; 1062 *opt++ = WS_LENGTH; 1063 *opt++ = tcph->ws; 1064 } 1065 while(optpad-- > 0) 1066 *opt++ = NOOPOPT; 1067 } 1068 1069 if(tcb != nil && tcb->nochecksum){ 1070 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1071 } else { 1072 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); 1073 hnputs(h->tcpcksum, csum); 1074 } 1075 1076 return data; 1077 } 1078 1079 int 1080 ntohtcp6(Tcp *tcph, Block **bpp) 1081 { 1082 Tcp6hdr *h; 1083 uchar *optr; 1084 ushort hdrlen; 1085 ushort optlen; 1086 int n; 1087 1088 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); 1089 if(*bpp == nil) 1090 return -1; 1091 1092 h = (Tcp6hdr *)((*bpp)->rp); 1093 tcph->source = nhgets(h->tcpsport); 1094 tcph->dest = nhgets(h->tcpdport); 1095 tcph->seq = nhgetl(h->tcpseq); 1096 tcph->ack = nhgetl(h->tcpack); 1097 hdrlen = (h->tcpflag[0]>>2) & ~3; 1098 if(hdrlen < TCP6_HDRSIZE) { 1099 freeblist(*bpp); 1100 return -1; 1101 } 1102 1103 tcph->flags = 
h->tcpflag[1]; 1104 tcph->wnd = nhgets(h->tcpwin); 1105 tcph->urg = nhgets(h->tcpurg); 1106 tcph->mss = 0; 1107 tcph->ws = 0; 1108 tcph->len = nhgets(h->ploadlen) - hdrlen; 1109 1110 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); 1111 if(*bpp == nil) 1112 return -1; 1113 1114 optr = h->tcpopt; 1115 n = hdrlen - TCP6_HDRSIZE; 1116 while(n > 0 && *optr != EOLOPT) { 1117 if(*optr == NOOPOPT) { 1118 n--; 1119 optr++; 1120 continue; 1121 } 1122 optlen = optr[1]; 1123 if(optlen < 2 || optlen > n) 1124 break; 1125 switch(*optr) { 1126 case MSSOPT: 1127 if(optlen == MSS_LENGTH) 1128 tcph->mss = nhgets(optr+2); 1129 break; 1130 case WSOPT: 1131 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1132 tcph->ws = HaveWS | *(optr+2); 1133 break; 1134 } 1135 n -= optlen; 1136 optr += optlen; 1137 } 1138 return hdrlen; 1139 } 1140 1141 int 1142 ntohtcp4(Tcp *tcph, Block **bpp) 1143 { 1144 Tcp4hdr *h; 1145 uchar *optr; 1146 ushort hdrlen; 1147 ushort optlen; 1148 int n; 1149 1150 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); 1151 if(*bpp == nil) 1152 return -1; 1153 1154 h = (Tcp4hdr *)((*bpp)->rp); 1155 tcph->source = nhgets(h->tcpsport); 1156 tcph->dest = nhgets(h->tcpdport); 1157 tcph->seq = nhgetl(h->tcpseq); 1158 tcph->ack = nhgetl(h->tcpack); 1159 1160 hdrlen = (h->tcpflag[0]>>2) & ~3; 1161 if(hdrlen < TCP4_HDRSIZE) { 1162 freeblist(*bpp); 1163 return -1; 1164 } 1165 1166 tcph->flags = h->tcpflag[1]; 1167 tcph->wnd = nhgets(h->tcpwin); 1168 tcph->urg = nhgets(h->tcpurg); 1169 tcph->mss = 0; 1170 tcph->ws = 0; 1171 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); 1172 1173 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); 1174 if(*bpp == nil) 1175 return -1; 1176 1177 optr = h->tcpopt; 1178 n = hdrlen - TCP4_HDRSIZE; 1179 while(n > 0 && *optr != EOLOPT) { 1180 if(*optr == NOOPOPT) { 1181 n--; 1182 optr++; 1183 continue; 1184 } 1185 optlen = optr[1]; 1186 if(optlen < 2 || optlen > n) 1187 break; 1188 switch(*optr) { 1189 case MSSOPT: 1190 if(optlen == MSS_LENGTH) 1191 tcph->mss = 
nhgets(optr+2); 1192 break; 1193 case WSOPT: 1194 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1195 tcph->ws = HaveWS | *(optr+2); 1196 break; 1197 } 1198 n -= optlen; 1199 optr += optlen; 1200 } 1201 return hdrlen; 1202 } 1203 1204 /* 1205 * For outgiing calls, generate an initial sequence 1206 * number and put a SYN on the send queue 1207 */ 1208 void 1209 tcpsndsyn(Conv *s, Tcpctl *tcb) 1210 { 1211 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1212 tcb->rttseq = tcb->iss; 1213 tcb->snd.wl2 = tcb->iss; 1214 tcb->snd.una = tcb->iss; 1215 tcb->snd.ptr = tcb->rttseq; 1216 tcb->snd.nxt = tcb->rttseq; 1217 tcb->flgcnt++; 1218 tcb->flags |= FORCE; 1219 tcb->sndsyntime = NOW; 1220 1221 /* set desired mss and scale */ 1222 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); 1223 } 1224 1225 void 1226 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) 1227 { 1228 Block *hbp; 1229 uchar rflags; 1230 Tcppriv *tpriv; 1231 Tcp4hdr ph4; 1232 Tcp6hdr ph6; 1233 1234 netlog(tcp->f, Logtcp, "sndrst: %s", reason); 1235 1236 tpriv = tcp->priv; 1237 1238 if(seg->flags & RST) 1239 return; 1240 1241 /* make pseudo header */ 1242 switch(version) { 1243 case V4: 1244 memset(&ph4, 0, sizeof(ph4)); 1245 ph4.vihl = IP_VER4; 1246 v6tov4(ph4.tcpsrc, dest); 1247 v6tov4(ph4.tcpdst, source); 1248 ph4.proto = IP_TCPPROTO; 1249 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1250 hnputs(ph4.tcpsport, seg->dest); 1251 hnputs(ph4.tcpdport, seg->source); 1252 break; 1253 case V6: 1254 memset(&ph6, 0, sizeof(ph6)); 1255 ph6.vcf[0] = IP_VER6; 1256 ipmove(ph6.tcpsrc, dest); 1257 ipmove(ph6.tcpdst, source); 1258 ph6.proto = IP_TCPPROTO; 1259 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1260 hnputs(ph6.tcpsport, seg->dest); 1261 hnputs(ph6.tcpdport, seg->source); 1262 break; 1263 default: 1264 panic("sndrst: version %d", version); 1265 } 1266 1267 tpriv->stats[OutRsts]++; 1268 rflags = RST; 1269 1270 /* convince the other end that this reset is in band */ 1271 
if(seg->flags & ACK) { 1272 seg->seq = seg->ack; 1273 seg->ack = 0; 1274 } 1275 else { 1276 rflags |= ACK; 1277 seg->ack = seg->seq; 1278 seg->seq = 0; 1279 if(seg->flags & SYN) 1280 seg->ack++; 1281 seg->ack += length; 1282 if(seg->flags & FIN) 1283 seg->ack++; 1284 } 1285 seg->flags = rflags; 1286 seg->wnd = 0; 1287 seg->urg = 0; 1288 seg->mss = 0; 1289 seg->ws = 0; 1290 switch(version) { 1291 case V4: 1292 hbp = htontcp4(seg, nil, &ph4, nil); 1293 if(hbp == nil) 1294 return; 1295 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1296 break; 1297 case V6: 1298 hbp = htontcp6(seg, nil, &ph6, nil); 1299 if(hbp == nil) 1300 return; 1301 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1302 break; 1303 default: 1304 panic("sndrst2: version %d", version); 1305 } 1306 } 1307 1308 /* 1309 * send a reset to the remote side and close the conversation 1310 * called with s qlocked 1311 */ 1312 char* 1313 tcphangup(Conv *s) 1314 { 1315 Tcp seg; 1316 Tcpctl *tcb; 1317 Block *hbp; 1318 1319 tcb = (Tcpctl*)s->ptcl; 1320 if(waserror()) 1321 return commonerror(); 1322 if(ipcmp(s->raddr, IPnoaddr) != 0) { 1323 if(!waserror()){ 1324 seg.flags = RST | ACK; 1325 seg.ack = tcb->rcv.nxt; 1326 tcb->rcv.una = 0; 1327 seg.seq = tcb->snd.ptr; 1328 seg.wnd = 0; 1329 seg.urg = 0; 1330 seg.mss = 0; 1331 seg.ws = 0; 1332 switch(s->ipversion) { 1333 case V4: 1334 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 1335 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); 1336 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 1337 break; 1338 case V6: 1339 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 1340 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); 1341 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 1342 break; 1343 default: 1344 panic("tcphangup: version %d", s->ipversion); 1345 } 1346 poperror(); 1347 } 1348 } 1349 localclose(s, nil); 1350 poperror(); 1351 return nil; 1352 } 1353 1354 /* 1355 * (re)send a SYN ACK 1356 */ 1357 int 1358 sndsynack(Proto *tcp, Limbo *lp) 1359 { 1360 Block *hbp; 1361 Tcp4hdr ph4; 
1362 Tcp6hdr ph6; 1363 Tcp seg; 1364 int scale; 1365 1366 /* make pseudo header */ 1367 switch(lp->version) { 1368 case V4: 1369 memset(&ph4, 0, sizeof(ph4)); 1370 ph4.vihl = IP_VER4; 1371 v6tov4(ph4.tcpsrc, lp->laddr); 1372 v6tov4(ph4.tcpdst, lp->raddr); 1373 ph4.proto = IP_TCPPROTO; 1374 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1375 hnputs(ph4.tcpsport, lp->lport); 1376 hnputs(ph4.tcpdport, lp->rport); 1377 break; 1378 case V6: 1379 memset(&ph6, 0, sizeof(ph6)); 1380 ph6.vcf[0] = IP_VER6; 1381 ipmove(ph6.tcpsrc, lp->laddr); 1382 ipmove(ph6.tcpdst, lp->raddr); 1383 ph6.proto = IP_TCPPROTO; 1384 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1385 hnputs(ph6.tcpsport, lp->lport); 1386 hnputs(ph6.tcpdport, lp->rport); 1387 break; 1388 default: 1389 panic("sndrst: version %d", lp->version); 1390 } 1391 1392 seg.seq = lp->iss; 1393 seg.ack = lp->irs+1; 1394 seg.flags = SYN|ACK; 1395 seg.urg = 0; 1396 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); 1397 seg.wnd = QMAX; 1398 1399 /* if the other side set scale, we should too */ 1400 if(lp->rcvscale){ 1401 seg.ws = scale; 1402 lp->sndscale = scale; 1403 } else { 1404 seg.ws = 0; 1405 lp->sndscale = 0; 1406 } 1407 1408 switch(lp->version) { 1409 case V4: 1410 hbp = htontcp4(&seg, nil, &ph4, nil); 1411 if(hbp == nil) 1412 return -1; 1413 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1414 break; 1415 case V6: 1416 hbp = htontcp6(&seg, nil, &ph6, nil); 1417 if(hbp == nil) 1418 return -1; 1419 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1420 break; 1421 default: 1422 panic("sndsnack: version %d", lp->version); 1423 } 1424 lp->lastsend = NOW; 1425 return 0; 1426 } 1427 1428 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) 1429 1430 /* 1431 * put a call into limbo and respond with a SYN ACK 1432 * 1433 * called with proto locked 1434 */ 1435 static void 1436 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) 1437 { 1438 Limbo *lp, **l; 1439 Tcppriv *tpriv; 1440 int h; 1441 1442 tpriv = 
s->p->priv; 1443 h = hashipa(source, seg->source); 1444 1445 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1446 lp = *l; 1447 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) 1448 continue; 1449 if(ipcmp(lp->raddr, source) != 0) 1450 continue; 1451 if(ipcmp(lp->laddr, dest) != 0) 1452 continue; 1453 1454 /* each new SYN restarts the retransmits */ 1455 lp->irs = seg->seq; 1456 break; 1457 } 1458 lp = *l; 1459 if(lp == nil){ 1460 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ 1461 lp = tpriv->lht[h]; 1462 tpriv->lht[h] = lp->next; 1463 lp->next = nil; 1464 } else { 1465 lp = malloc(sizeof(*lp)); 1466 if(lp == nil) 1467 return; 1468 tpriv->nlimbo++; 1469 } 1470 *l = lp; 1471 lp->version = version; 1472 ipmove(lp->laddr, dest); 1473 ipmove(lp->raddr, source); 1474 lp->lport = seg->dest; 1475 lp->rport = seg->source; 1476 lp->mss = seg->mss; 1477 lp->rcvscale = seg->ws; 1478 lp->irs = seg->seq; 1479 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1480 } 1481 1482 if(sndsynack(s->p, lp) < 0){ 1483 *l = lp->next; 1484 tpriv->nlimbo--; 1485 free(lp); 1486 } 1487 } 1488 1489 /* 1490 * resend SYN ACK's once every SYNACK_RXTIMER ms. 
1491 */ 1492 static void 1493 limborexmit(Proto *tcp) 1494 { 1495 Tcppriv *tpriv; 1496 Limbo **l, *lp; 1497 int h; 1498 int seen; 1499 ulong now; 1500 1501 tpriv = tcp->priv; 1502 1503 if(!canqlock(tcp)) 1504 return; 1505 seen = 0; 1506 now = NOW; 1507 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ 1508 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ 1509 lp = *l; 1510 seen++; 1511 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) 1512 continue; 1513 1514 /* time it out after 1 second */ 1515 if(++(lp->rexmits) > 5){ 1516 tpriv->nlimbo--; 1517 *l = lp->next; 1518 free(lp); 1519 continue; 1520 } 1521 1522 /* if we're being attacked, don't bother resending SYN ACK's */ 1523 if(tpriv->nlimbo > 100) 1524 continue; 1525 1526 if(sndsynack(tcp, lp) < 0){ 1527 tpriv->nlimbo--; 1528 *l = lp->next; 1529 free(lp); 1530 continue; 1531 } 1532 1533 l = &lp->next; 1534 } 1535 } 1536 qunlock(tcp); 1537 } 1538 1539 /* 1540 * lookup call in limbo. if found, throw it out. 1541 * 1542 * called with proto locked 1543 */ 1544 static void 1545 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1546 { 1547 Limbo *lp, **l; 1548 int h; 1549 Tcppriv *tpriv; 1550 1551 tpriv = s->p->priv; 1552 1553 /* find a call in limbo */ 1554 h = hashipa(src, segp->source); 1555 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1556 lp = *l; 1557 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1558 continue; 1559 if(ipcmp(lp->laddr, dst) != 0) 1560 continue; 1561 if(ipcmp(lp->raddr, src) != 0) 1562 continue; 1563 1564 /* RST can only follow the SYN */ 1565 if(segp->seq == lp->irs+1){ 1566 tpriv->nlimbo--; 1567 *l = lp->next; 1568 free(lp); 1569 } 1570 break; 1571 } 1572 } 1573 1574 /* 1575 * come here when we finally get an ACK to our SYN-ACK. 1576 * lookup call in limbo. 
if found, create a new conversation 1577 * 1578 * called with proto locked 1579 */ 1580 static Conv* 1581 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1582 { 1583 Conv *new; 1584 Tcpctl *tcb; 1585 Tcppriv *tpriv; 1586 Tcp4hdr *h4; 1587 Tcp6hdr *h6; 1588 Limbo *lp, **l; 1589 int h; 1590 1591 /* unless it's just an ack, it can't be someone coming out of limbo */ 1592 if((segp->flags & SYN) || (segp->flags & ACK) == 0) 1593 return nil; 1594 1595 tpriv = s->p->priv; 1596 1597 /* find a call in limbo */ 1598 h = hashipa(src, segp->source); 1599 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ 1600 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d", 1601 src, segp->source, lp->raddr, lp->rport, 1602 dst, segp->dest, lp->laddr, lp->lport, 1603 version, lp->version 1604 ); 1605 1606 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1607 continue; 1608 if(ipcmp(lp->laddr, dst) != 0) 1609 continue; 1610 if(ipcmp(lp->raddr, src) != 0) 1611 continue; 1612 1613 /* we're assuming no data with the initial SYN */ 1614 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){ 1615 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux", 1616 segp->seq, lp->irs+1, segp->ack, lp->iss+1); 1617 lp = nil; 1618 } else { 1619 tpriv->nlimbo--; 1620 *l = lp->next; 1621 } 1622 break; 1623 } 1624 if(lp == nil) 1625 return nil; 1626 1627 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); 1628 if(new == nil) 1629 return nil; 1630 1631 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); 1632 tcb = (Tcpctl*)new->ptcl; 1633 tcb->flags &= ~CLONE; 1634 tcb->timer.arg = new; 1635 tcb->timer.state = TcptimerOFF; 1636 tcb->acktimer.arg = new; 1637 tcb->acktimer.state = TcptimerOFF; 1638 tcb->katimer.arg = new; 1639 tcb->katimer.state = TcptimerOFF; 1640 tcb->rtt_timer.arg = new; 1641 tcb->rtt_timer.state = TcptimerOFF; 1642 1643 tcb->irs = lp->irs; 1644 tcb->rcv.nxt = tcb->irs+1; 1645 tcb->rcv.urg = 
/*
 *  Sequence number comparisons.  Sequence space wraps, so ordering is
 *  decided from the sign of the difference taken as an int.
 */

/* 1 if x lies in the closed range [low, high], where the range may wrap */
int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return x >= low || x <= high;	/* range wraps past zero */
	return low <= x && x <= high;
}

/* 1 if x is before y */
int
seq_lt(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d < 0;
}

/* 1 if x is before or equal to y */
int
seq_le(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d <= 0;
}

/* 1 if x is after y */
int
seq_gt(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d > 0;
}

/* 1 if x is after or equal to y */
int
seq_ge(ulong x, ulong y)
{
	int d;

	d = (int)(x-y);
	return d >= 0;
}
*/ 1803 } 1804 } 1805 1806 /* 1807 * update window 1808 */ 1809 if(seq_gt(seg->ack, tcb->snd.wl2) 1810 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ 1811 tcb->snd.wnd = seg->wnd; 1812 tcb->snd.wl2 = seg->ack; 1813 } 1814 1815 if(!seq_gt(seg->ack, tcb->snd.una)){ 1816 /* 1817 * don't let us hangup if sending into a closed window and 1818 * we're still getting acks 1819 */ 1820 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ 1821 tcb->backedoff = MAXBACKMS/4; 1822 } 1823 return; 1824 } 1825 1826 /* 1827 * any positive ack turns off fast rxt, 1828 * (should we do new-reno on partial acks?) 1829 */ 1830 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { 1831 tcb->snd.dupacks = 0; 1832 tcb->snd.recovery = 0; 1833 } else 1834 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind); 1835 1836 /* Compute the new send window size */ 1837 acked = seg->ack - tcb->snd.una; 1838 1839 /* avoid slow start and timers for SYN acks */ 1840 if((tcb->flags & SYNACK) == 0) { 1841 tcb->flags |= SYNACK; 1842 acked--; 1843 tcb->flgcnt--; 1844 goto done; 1845 } 1846 1847 /* slow start as long as we're not recovering from lost packets */ 1848 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { 1849 if(tcb->cwind < tcb->ssthresh) { 1850 expand = tcb->mss; 1851 if(acked < expand) 1852 expand = acked; 1853 } 1854 else 1855 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; 1856 1857 if(tcb->cwind + expand < tcb->cwind) 1858 expand = tcb->snd.wnd - tcb->cwind; 1859 if(tcb->cwind + expand > tcb->snd.wnd) 1860 expand = tcb->snd.wnd - tcb->cwind; 1861 tcb->cwind += expand; 1862 } 1863 1864 /* Adjust the timers according to the round trip time */ 1865 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { 1866 tcphalt(tpriv, &tcb->rtt_timer); 1867 if((tcb->flags&RETRAN) == 0) { 1868 tcb->backoff = 0; 1869 tcb->backedoff = 0; 1870 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; 1871 if(rtt == 0) 1872 rtt = 1; /* otherwise all close systems 
will rexmit in 0 time */ 1873 rtt *= MSPTICK; 1874 if(tcb->srtt == 0) { 1875 tcb->srtt = rtt << LOGAGAIN; 1876 tcb->mdev = rtt << LOGDGAIN; 1877 } else { 1878 delta = rtt - (tcb->srtt>>LOGAGAIN); 1879 tcb->srtt += delta; 1880 if(tcb->srtt <= 0) 1881 tcb->srtt = 1; 1882 1883 delta = abs(delta) - (tcb->mdev>>LOGDGAIN); 1884 tcb->mdev += delta; 1885 if(tcb->mdev <= 0) 1886 tcb->mdev = 1; 1887 } 1888 tcpsettimer(tcb); 1889 } 1890 } 1891 1892 done: 1893 if(qdiscard(s->wq, acked) < acked) 1894 tcb->flgcnt--; 1895 1896 tcb->snd.una = seg->ack; 1897 if(seq_gt(seg->ack, tcb->snd.urg)) 1898 tcb->snd.urg = seg->ack; 1899 1900 if(tcb->snd.una != tcb->snd.nxt) 1901 tcpgo(tpriv, &tcb->timer); 1902 else 1903 tcphalt(tpriv, &tcb->timer); 1904 1905 if(seq_lt(tcb->snd.ptr, tcb->snd.una)) 1906 tcb->snd.ptr = tcb->snd.una; 1907 1908 tcb->flags &= ~RETRAN; 1909 tcb->backoff = 0; 1910 tcb->backedoff = 0; 1911 } 1912 1913 void 1914 tcpiput(Proto *tcp, Ipifc*, Block *bp) 1915 { 1916 Tcp seg; 1917 Tcp4hdr *h4; 1918 Tcp6hdr *h6; 1919 int hdrlen; 1920 Tcpctl *tcb; 1921 ushort length; 1922 uchar source[IPaddrlen], dest[IPaddrlen]; 1923 Conv *s; 1924 Fs *f; 1925 Tcppriv *tpriv; 1926 uchar version; 1927 1928 f = tcp->f; 1929 tpriv = tcp->priv; 1930 1931 tpriv->stats[InSegs]++; 1932 1933 h4 = (Tcp4hdr*)(bp->rp); 1934 h6 = (Tcp6hdr*)(bp->rp); 1935 1936 if((h4->vihl&0xF0)==IP_VER4) { 1937 version = V4; 1938 length = nhgets(h4->length); 1939 v4tov6(dest, h4->tcpdst); 1940 v4tov6(source, h4->tcpsrc); 1941 1942 h4->Unused = 0; 1943 hnputs(h4->tcplen, length-TCP4_PKT); 1944 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && 1945 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) { 1946 tpriv->stats[CsumErrs]++; 1947 tpriv->stats[InErrs]++; 1948 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1949 freeblist(bp); 1950 return; 1951 } 1952 1953 hdrlen = ntohtcp4(&seg, &bp); 1954 if(hdrlen < 0){ 1955 tpriv->stats[HlenErrs]++; 1956 tpriv->stats[InErrs]++; 1957 netlog(f, Logtcp, "bad tcp hdr len\n"); 
1958 return; 1959 } 1960 1961 /* trim the packet to the size claimed by the datagram */ 1962 length -= hdrlen+TCP4_PKT; 1963 bp = trimblock(bp, hdrlen+TCP4_PKT, length); 1964 if(bp == nil){ 1965 tpriv->stats[LenErrs]++; 1966 tpriv->stats[InErrs]++; 1967 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 1968 return; 1969 } 1970 } 1971 else { 1972 int ttl = h6->ttl; 1973 int proto = h6->proto; 1974 1975 version = V6; 1976 length = nhgets(h6->ploadlen); 1977 ipmove(dest, h6->tcpdst); 1978 ipmove(source, h6->tcpsrc); 1979 1980 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; 1981 h6->ttl = proto; 1982 hnputl(h6->vcf, length); 1983 if((h6->tcpcksum[0] || h6->tcpcksum[1]) && 1984 ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) { 1985 tpriv->stats[CsumErrs]++; 1986 tpriv->stats[InErrs]++; 1987 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1988 freeblist(bp); 1989 return; 1990 } 1991 h6->ttl = ttl; 1992 h6->proto = proto; 1993 hnputs(h6->ploadlen, length); 1994 1995 hdrlen = ntohtcp6(&seg, &bp); 1996 if(hdrlen < 0){ 1997 tpriv->stats[HlenErrs]++; 1998 tpriv->stats[InErrs]++; 1999 netlog(f, Logtcp, "bad tcp hdr len\n"); 2000 return; 2001 } 2002 2003 /* trim the packet to the size claimed by the datagram */ 2004 length -= hdrlen; 2005 bp = trimblock(bp, hdrlen+TCP6_PKT, length); 2006 if(bp == nil){ 2007 tpriv->stats[LenErrs]++; 2008 tpriv->stats[InErrs]++; 2009 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 2010 return; 2011 } 2012 } 2013 2014 /* lock protocol while searching for a conversation */ 2015 qlock(tcp); 2016 2017 /* Look for a matching conversation */ 2018 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); 2019 if(s == nil){ 2020 netlog(f, Logtcp, "iphtlook failed"); 2021 reset: 2022 qunlock(tcp); 2023 sndrst(tcp, source, dest, length, &seg, version, "no conversation"); 2024 freeblist(bp); 2025 return; 2026 } 2027 2028 /* if it's a listener, look for the right flags and get a new conv */ 2029 tcb = (Tcpctl*)s->ptcl; 2030 if(tcb->state == Listen){ 2031 
if(seg.flags & RST){ 2032 limborst(s, &seg, source, dest, version); 2033 qunlock(tcp); 2034 freeblist(bp); 2035 return; 2036 } 2037 2038 /* if this is a new SYN, put the call into limbo */ 2039 if((seg.flags & SYN) && (seg.flags & ACK) == 0){ 2040 limbo(s, source, dest, &seg, version); 2041 qunlock(tcp); 2042 freeblist(bp); 2043 return; 2044 } 2045 2046 /* 2047 * if there's a matching call in limbo, tcpincoming will 2048 * return it in state Syn_received 2049 */ 2050 s = tcpincoming(s, &seg, source, dest, version); 2051 if(s == nil) 2052 goto reset; 2053 } 2054 2055 /* The rest of the input state machine is run with the control block 2056 * locked and implements the state machine directly out of the RFC. 2057 * Out-of-band data is ignored - it was always a bad idea. 2058 */ 2059 tcb = (Tcpctl*)s->ptcl; 2060 if(waserror()){ 2061 qunlock(s); 2062 nexterror(); 2063 } 2064 qlock(s); 2065 qunlock(tcp); 2066 2067 /* fix up window */ 2068 seg.wnd <<= tcb->rcv.scale; 2069 2070 /* every input packet in puts off the keep alive time out */ 2071 tcpsetkacounter(tcb); 2072 2073 switch(tcb->state) { 2074 case Closed: 2075 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2076 goto raise; 2077 case Syn_sent: 2078 if(seg.flags & ACK) { 2079 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) { 2080 sndrst(tcp, source, dest, length, &seg, version, 2081 "bad seq in Syn_sent"); 2082 goto raise; 2083 } 2084 } 2085 if(seg.flags & RST) { 2086 if(seg.flags & ACK) 2087 localclose(s, Econrefused); 2088 goto raise; 2089 } 2090 2091 if(seg.flags & SYN) { 2092 procsyn(s, &seg); 2093 if(seg.flags & ACK){ 2094 update(s, &seg); 2095 tcpsynackrtt(s); 2096 tcpsetstate(s, Established); 2097 tcpsetscale(s, tcb, seg.ws, tcb->scale); 2098 } 2099 else { 2100 tcb->time = NOW; 2101 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? 
*/ 2102 } 2103 2104 if(length != 0 || (seg.flags & FIN)) 2105 break; 2106 2107 freeblist(bp); 2108 goto output; 2109 } 2110 else 2111 freeblist(bp); 2112 2113 qunlock(s); 2114 poperror(); 2115 return; 2116 case Syn_received: 2117 /* doesn't matter if it's the correct ack, we're just trying to set timing */ 2118 if(seg.flags & ACK) 2119 tcpsynackrtt(s); 2120 break; 2121 } 2122 2123 /* 2124 * One DOS attack is to open connections to us and then forget about them, 2125 * thereby tying up a conv at no long term cost to the attacker. 2126 * This is an attempt to defeat these stateless DOS attacks. See 2127 * corresponding code in tcpsendka(). 2128 */ 2129 if(tcb->state != Syn_received && (seg.flags & RST) == 0){ 2130 if(tcpporthogdefense 2131 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ 2132 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", 2133 source, seg.source, dest, seg.dest, seg.flags, 2134 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); 2135 localclose(s, "stateless hog"); 2136 } 2137 } 2138 2139 /* Cut the data to fit the receive window */ 2140 if(tcptrim(tcb, &seg, &bp, &length) == -1) { 2141 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); 2142 update(s, &seg); 2143 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { 2144 tcphalt(tpriv, &tcb->rtt_timer); 2145 tcphalt(tpriv, &tcb->acktimer); 2146 tcphalt(tpriv, &tcb->katimer); 2147 tcpsetstate(s, Time_wait); 2148 tcb->timer.start = MSL2*(1000 / MSPTICK); 2149 tcpgo(tpriv, &tcb->timer); 2150 } 2151 if(!(seg.flags & RST)) { 2152 tcb->flags |= FORCE; 2153 goto output; 2154 } 2155 qunlock(s); 2156 poperror(); 2157 return; 2158 } 2159 2160 /* Cannot accept so answer with a rst */ 2161 if(length && tcb->state == Closed) { 2162 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2163 goto raise; 2164 } 2165 2166 /* The segment is beyond the current receive pointer so 2167 * queue the data in the resequence queue 2168 */ 2169 if(seg.seq 
!= tcb->rcv.nxt) 2170 if(length != 0 || (seg.flags & (SYN|FIN))) { 2171 update(s, &seg); 2172 if(addreseq(tcb, tpriv, &seg, bp, length) < 0) 2173 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); 2174 tcb->flags |= FORCE; 2175 goto output; 2176 } 2177 2178 /* 2179 * keep looping till we've processed this packet plus any 2180 * adjacent packets in the resequence queue 2181 */ 2182 for(;;) { 2183 if(seg.flags & RST) { 2184 if(tcb->state == Established) { 2185 tpriv->stats[EstabResets]++; 2186 if(tcb->rcv.nxt != seg.seq) 2187 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); 2188 } 2189 localclose(s, Econrefused); 2190 goto raise; 2191 } 2192 2193 if((seg.flags&ACK) == 0) 2194 goto raise; 2195 2196 switch(tcb->state) { 2197 case Syn_received: 2198 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){ 2199 sndrst(tcp, source, dest, length, &seg, version, 2200 "bad seq in Syn_received"); 2201 goto raise; 2202 } 2203 update(s, &seg); 2204 tcpsetstate(s, Established); 2205 case Established: 2206 case Close_wait: 2207 update(s, &seg); 2208 break; 2209 case Finwait1: 2210 update(s, &seg); 2211 if(qlen(s->wq)+tcb->flgcnt == 0){ 2212 tcphalt(tpriv, &tcb->rtt_timer); 2213 tcphalt(tpriv, &tcb->acktimer); 2214 tcpsetkacounter(tcb); 2215 tcb->time = NOW; 2216 tcpsetstate(s, Finwait2); 2217 tcb->katimer.start = MSL2 * (1000 / MSPTICK); 2218 tcpgo(tpriv, &tcb->katimer); 2219 } 2220 break; 2221 case Finwait2: 2222 update(s, &seg); 2223 break; 2224 case Closing: 2225 update(s, &seg); 2226 if(qlen(s->wq)+tcb->flgcnt == 0) { 2227 tcphalt(tpriv, &tcb->rtt_timer); 2228 tcphalt(tpriv, &tcb->acktimer); 2229 tcphalt(tpriv, &tcb->katimer); 2230 tcpsetstate(s, Time_wait); 2231 tcb->timer.start = MSL2*(1000 / MSPTICK); 2232 tcpgo(tpriv, &tcb->timer); 2233 } 2234 break; 2235 case Last_ack: 2236 update(s, &seg); 2237 if(qlen(s->wq)+tcb->flgcnt == 0) { 2238 localclose(s, nil); 2239 goto 
raise; 2240 } 2241 case Time_wait: 2242 tcb->flags |= FORCE; 2243 if(tcb->timer.state != TcptimerON) 2244 tcpgo(tpriv, &tcb->timer); 2245 } 2246 2247 if((seg.flags&URG) && seg.urg) { 2248 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { 2249 tcb->rcv.urg = seg.urg + seg.seq; 2250 pullblock(&bp, seg.urg); 2251 } 2252 } 2253 else 2254 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) 2255 tcb->rcv.urg = tcb->rcv.nxt; 2256 2257 if(length == 0) { 2258 if(bp != nil) 2259 freeblist(bp); 2260 } 2261 else { 2262 switch(tcb->state){ 2263 default: 2264 /* Ignore segment text */ 2265 if(bp != nil) 2266 freeblist(bp); 2267 break; 2268 2269 case Syn_received: 2270 case Established: 2271 case Finwait1: 2272 /* If we still have some data place on 2273 * receive queue 2274 */ 2275 if(bp) { 2276 bp = packblock(bp); 2277 if(bp == nil) 2278 panic("tcp packblock"); 2279 qpassnolim(s->rq, bp); 2280 bp = nil; 2281 2282 /* 2283 * Force an ack every 2 data messages. This is 2284 * a hack for rob to make his home system run 2285 * faster. 2286 * 2287 * this also keeps the standard TCP congestion 2288 * control working since it needs an ack every 2289 * 2 max segs worth. This is not quite that, 2290 * but under a real stream is equivalent since 2291 * every packet has a max seg in it. 
2292 */ 2293 if(++(tcb->rcv.una) >= 2) 2294 tcb->flags |= FORCE; 2295 } 2296 tcb->rcv.nxt += length; 2297 2298 /* 2299 * update our rcv window 2300 */ 2301 tcprcvwin(s); 2302 2303 /* 2304 * turn on the acktimer if there's something 2305 * to ack 2306 */ 2307 if(tcb->acktimer.state != TcptimerON) 2308 tcpgo(tpriv, &tcb->acktimer); 2309 2310 break; 2311 case Finwait2: 2312 /* no process to read the data, send a reset */ 2313 if(bp != nil) 2314 freeblist(bp); 2315 sndrst(tcp, source, dest, length, &seg, version, 2316 "send to Finwait2"); 2317 qunlock(s); 2318 poperror(); 2319 return; 2320 } 2321 } 2322 2323 if(seg.flags & FIN) { 2324 tcb->flags |= FORCE; 2325 2326 switch(tcb->state) { 2327 case Syn_received: 2328 case Established: 2329 tcb->rcv.nxt++; 2330 tcpsetstate(s, Close_wait); 2331 break; 2332 case Finwait1: 2333 tcb->rcv.nxt++; 2334 if(qlen(s->wq)+tcb->flgcnt == 0) { 2335 tcphalt(tpriv, &tcb->rtt_timer); 2336 tcphalt(tpriv, &tcb->acktimer); 2337 tcphalt(tpriv, &tcb->katimer); 2338 tcpsetstate(s, Time_wait); 2339 tcb->timer.start = MSL2*(1000/MSPTICK); 2340 tcpgo(tpriv, &tcb->timer); 2341 } 2342 else 2343 tcpsetstate(s, Closing); 2344 break; 2345 case Finwait2: 2346 tcb->rcv.nxt++; 2347 tcphalt(tpriv, &tcb->rtt_timer); 2348 tcphalt(tpriv, &tcb->acktimer); 2349 tcphalt(tpriv, &tcb->katimer); 2350 tcpsetstate(s, Time_wait); 2351 tcb->timer.start = MSL2 * (1000/MSPTICK); 2352 tcpgo(tpriv, &tcb->timer); 2353 break; 2354 case Close_wait: 2355 case Closing: 2356 case Last_ack: 2357 break; 2358 case Time_wait: 2359 tcpgo(tpriv, &tcb->timer); 2360 break; 2361 } 2362 } 2363 2364 /* 2365 * get next adjacent segment from the resequence queue. 
2366 * dump/trim any overlapping segments 2367 */ 2368 for(;;) { 2369 if(tcb->reseq == nil) 2370 goto output; 2371 2372 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) 2373 goto output; 2374 2375 getreseq(tcb, &seg, &bp, &length); 2376 2377 if(tcptrim(tcb, &seg, &bp, &length) == 0) 2378 break; 2379 } 2380 } 2381 output: 2382 tcpoutput(s); 2383 qunlock(s); 2384 poperror(); 2385 return; 2386 raise: 2387 qunlock(s); 2388 poperror(); 2389 freeblist(bp); 2390 tcpkick(s); 2391 } 2392 2393 /* 2394 * always enters and exits with the s locked. We drop 2395 * the lock to ipoput the packet so some care has to be 2396 * taken by callers. 2397 */ 2398 void 2399 tcpoutput(Conv *s) 2400 { 2401 Tcp seg; 2402 int msgs; 2403 Tcpctl *tcb; 2404 Block *hbp, *bp; 2405 int sndcnt, n; 2406 ulong ssize, dsize, usable, sent; 2407 Fs *f; 2408 Tcppriv *tpriv; 2409 uchar version; 2410 2411 f = s->p->f; 2412 tpriv = s->p->priv; 2413 version = s->ipversion; 2414 2415 for(msgs = 0; msgs < 100; msgs++) { 2416 tcb = (Tcpctl*)s->ptcl; 2417 2418 switch(tcb->state) { 2419 case Listen: 2420 case Closed: 2421 case Finwait2: 2422 return; 2423 } 2424 2425 /* force an ack when a window has opened up */ 2426 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ 2427 tcb->rcv.blocked = 0; 2428 tcb->flags |= FORCE; 2429 } 2430 2431 sndcnt = qlen(s->wq)+tcb->flgcnt; 2432 sent = tcb->snd.ptr - tcb->snd.una; 2433 2434 /* Don't send anything else until our SYN has been acked */ 2435 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) 2436 break; 2437 2438 /* Compute usable segment based on offered window and limit 2439 * window probes to one 2440 */ 2441 if(tcb->snd.wnd == 0){ 2442 if(sent != 0) { 2443 if((tcb->flags&FORCE) == 0) 2444 break; 2445 // tcb->snd.ptr = tcb->snd.una; 2446 } 2447 usable = 1; 2448 } 2449 else { 2450 usable = tcb->cwind; 2451 if(tcb->snd.wnd < usable) 2452 usable = tcb->snd.wnd; 2453 usable -= sent; 2454 } 2455 ssize = sndcnt-sent; 2456 if(ssize && usable < 2) 2457 netlog(s->p->f, Logtcp, 
"throttled snd.wnd %lud cwind %lud\n", 2458 tcb->snd.wnd, tcb->cwind); 2459 if(usable < ssize) 2460 ssize = usable; 2461 if(tcb->mss < ssize) 2462 ssize = tcb->mss; 2463 dsize = ssize; 2464 seg.urg = 0; 2465 2466 if(ssize == 0) 2467 if((tcb->flags&FORCE) == 0) 2468 break; 2469 2470 tcb->flags &= ~FORCE; 2471 tcprcvwin(s); 2472 2473 /* By default we will generate an ack */ 2474 tcphalt(tpriv, &tcb->acktimer); 2475 tcb->rcv.una = 0; 2476 seg.source = s->lport; 2477 seg.dest = s->rport; 2478 seg.flags = ACK; 2479 seg.mss = 0; 2480 seg.ws = 0; 2481 switch(tcb->state){ 2482 case Syn_sent: 2483 seg.flags = 0; 2484 if(tcb->snd.ptr == tcb->iss){ 2485 seg.flags |= SYN; 2486 dsize--; 2487 seg.mss = tcb->mss; 2488 seg.ws = tcb->scale; 2489 } 2490 break; 2491 case Syn_received: 2492 /* 2493 * don't send any data with a SYN/ACK packet 2494 * because Linux rejects the packet in its 2495 * attempt to solve the SYN attack problem 2496 */ 2497 if(tcb->snd.ptr == tcb->iss){ 2498 seg.flags |= SYN; 2499 dsize = 0; 2500 ssize = 1; 2501 seg.mss = tcb->mss; 2502 seg.ws = tcb->scale; 2503 } 2504 break; 2505 } 2506 seg.seq = tcb->snd.ptr; 2507 seg.ack = tcb->rcv.nxt; 2508 seg.wnd = tcb->rcv.wnd; 2509 2510 /* Pull out data to send */ 2511 bp = nil; 2512 if(dsize != 0) { 2513 bp = qcopy(s->wq, dsize, sent); 2514 if(BLEN(bp) != dsize) { 2515 seg.flags |= FIN; 2516 dsize--; 2517 } 2518 } 2519 2520 if(sent+dsize == sndcnt) 2521 seg.flags |= PSH; 2522 2523 /* keep track of balance of resent data */ 2524 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { 2525 n = tcb->snd.nxt - tcb->snd.ptr; 2526 if(ssize < n) 2527 n = ssize; 2528 tcb->resent += n; 2529 netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n", 2530 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); 2531 tpriv->stats[RetransSegs]++; 2532 } 2533 2534 tcb->snd.ptr += ssize; 2535 2536 /* Pull up the send pointer so we can accept acks 2537 * for this window 2538 */ 2539 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt)) 2540 
tcb->snd.nxt = tcb->snd.ptr; 2541 2542 /* Build header, link data and compute cksum */ 2543 switch(version){ 2544 case V4: 2545 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2546 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); 2547 if(hbp == nil) { 2548 freeblist(bp); 2549 return; 2550 } 2551 break; 2552 case V6: 2553 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2554 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); 2555 if(hbp == nil) { 2556 freeblist(bp); 2557 return; 2558 } 2559 break; 2560 default: 2561 hbp = nil; /* to suppress a warning */ 2562 panic("tcpoutput: version %d", version); 2563 } 2564 2565 /* Start the transmission timers if there is new data and we 2566 * expect acknowledges 2567 */ 2568 if(ssize != 0){ 2569 if(tcb->timer.state != TcptimerON) 2570 tcpgo(tpriv, &tcb->timer); 2571 2572 /* If round trip timer isn't running, start it. 2573 * measure the longest packet only in case the 2574 * transmission time dominates RTT 2575 */ 2576 if(tcb->rtt_timer.state != TcptimerON) 2577 if(ssize == tcb->mss) { 2578 tcpgo(tpriv, &tcb->rtt_timer); 2579 tcb->rttseq = tcb->snd.ptr; 2580 } 2581 } 2582 2583 tpriv->stats[OutSegs]++; 2584 2585 /* put off the next keep alive */ 2586 tcpgo(tpriv, &tcb->katimer); 2587 2588 switch(version){ 2589 case V4: 2590 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2591 /* a negative return means no route */ 2592 localclose(s, "no route"); 2593 } 2594 break; 2595 case V6: 2596 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2597 /* a negative return means no route */ 2598 localclose(s, "no route"); 2599 } 2600 break; 2601 default: 2602 panic("tcpoutput2: version %d", version); 2603 } 2604 if((msgs%4) == 1){ 2605 qunlock(s); 2606 sched(); 2607 qlock(s); 2608 } 2609 } 2610 } 2611 2612 /* 2613 * the BSD convention (hack?) for keep alives. resend last uchar acked. 
2614 */ 2615 void 2616 tcpsendka(Conv *s) 2617 { 2618 Tcp seg; 2619 Tcpctl *tcb; 2620 Block *hbp,*dbp; 2621 2622 tcb = (Tcpctl*)s->ptcl; 2623 2624 dbp = nil; 2625 seg.urg = 0; 2626 seg.source = s->lport; 2627 seg.dest = s->rport; 2628 seg.flags = ACK|PSH; 2629 seg.mss = 0; 2630 seg.ws = 0; 2631 if(tcpporthogdefense) 2632 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); 2633 else 2634 seg.seq = tcb->snd.una-1; 2635 seg.ack = tcb->rcv.nxt; 2636 tcb->rcv.una = 0; 2637 seg.wnd = tcb->rcv.wnd; 2638 if(tcb->state == Finwait2){ 2639 seg.flags |= FIN; 2640 } else { 2641 dbp = allocb(1); 2642 dbp->wp++; 2643 } 2644 2645 if(isv4(s->raddr)) { 2646 /* Build header, link data and compute cksum */ 2647 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2648 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); 2649 if(hbp == nil) { 2650 freeblist(dbp); 2651 return; 2652 } 2653 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 2654 } 2655 else { 2656 /* Build header, link data and compute cksum */ 2657 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2658 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); 2659 if(hbp == nil) { 2660 freeblist(dbp); 2661 return; 2662 } 2663 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 2664 } 2665 } 2666 2667 /* 2668 * set connection to time out after 12 minutes 2669 */ 2670 void 2671 tcpsetkacounter(Tcpctl *tcb) 2672 { 2673 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); 2674 if(tcb->kacounter < 3) 2675 tcb->kacounter = 3; 2676 } 2677 2678 /* 2679 * if we've timed out, close the connection 2680 * otherwise, send a keepalive and restart the timer 2681 */ 2682 void 2683 tcpkeepalive(void *v) 2684 { 2685 Tcpctl *tcb; 2686 Conv *s; 2687 2688 s = v; 2689 tcb = (Tcpctl*)s->ptcl; 2690 if(waserror()){ 2691 qunlock(s); 2692 nexterror(); 2693 } 2694 qlock(s); 2695 if(tcb->state != Closed){ 2696 if(--(tcb->kacounter) <= 0) { 2697 localclose(s, Etimedout); 2698 } else { 2699 tcpsendka(s); 2700 tcpgo(s->p->priv, &tcb->katimer); 2701 } 2702 } 2703 qunlock(s); 2704 
poperror(); 2705 } 2706 2707 /* 2708 * start keepalive timer 2709 */ 2710 char* 2711 tcpstartka(Conv *s, char **f, int n) 2712 { 2713 Tcpctl *tcb; 2714 int x; 2715 2716 tcb = (Tcpctl*)s->ptcl; 2717 if(tcb->state != Established) 2718 return "connection must be in Establised state"; 2719 if(n > 1){ 2720 x = atoi(f[1]); 2721 if(x >= MSPTICK) 2722 tcb->katimer.start = x/MSPTICK; 2723 } 2724 tcpsetkacounter(tcb); 2725 tcpgo(s->p->priv, &tcb->katimer); 2726 2727 return nil; 2728 } 2729 2730 /* 2731 * turn checksums on/off 2732 */ 2733 char* 2734 tcpsetchecksum(Conv *s, char **f, int) 2735 { 2736 Tcpctl *tcb; 2737 2738 tcb = (Tcpctl*)s->ptcl; 2739 tcb->nochecksum = !atoi(f[1]); 2740 2741 return nil; 2742 } 2743 2744 void 2745 tcprxmit(Conv *s) 2746 { 2747 Tcpctl *tcb; 2748 2749 tcb = (Tcpctl*)s->ptcl; 2750 2751 tcb->flags |= RETRAN|FORCE; 2752 tcb->snd.ptr = tcb->snd.una; 2753 2754 /* 2755 * We should be halving the slow start threshhold (down to one 2756 * mss) but leaving it at mss seems to work well enough 2757 */ 2758 tcb->ssthresh = tcb->mss; 2759 2760 /* 2761 * pull window down to a single packet 2762 */ 2763 tcb->cwind = tcb->mss; 2764 tcpoutput(s); 2765 } 2766 2767 void 2768 tcptimeout(void *arg) 2769 { 2770 Conv *s; 2771 Tcpctl *tcb; 2772 int maxback; 2773 Tcppriv *tpriv; 2774 2775 s = (Conv*)arg; 2776 tpriv = s->p->priv; 2777 tcb = (Tcpctl*)s->ptcl; 2778 2779 if(waserror()){ 2780 qunlock(s); 2781 nexterror(); 2782 } 2783 qlock(s); 2784 switch(tcb->state){ 2785 default: 2786 tcb->backoff++; 2787 if(tcb->state == Syn_sent) 2788 maxback = MAXBACKMS/2; 2789 else 2790 maxback = MAXBACKMS; 2791 tcb->backedoff += tcb->timer.start * MSPTICK; 2792 if(tcb->backedoff >= maxback) { 2793 localclose(s, Etimedout); 2794 break; 2795 } 2796 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); 2797 tcpsettimer(tcb); 2798 tcprxmit(s); 2799 tpriv->stats[RetransTimeouts]++; 2800 tcb->snd.dupacks = 0; 2801 break; 2802 case Time_wait: 2803 
localclose(s, nil); 2804 break; 2805 case Closed: 2806 break; 2807 } 2808 qunlock(s); 2809 poperror(); 2810 } 2811 2812 int 2813 inwindow(Tcpctl *tcb, int seq) 2814 { 2815 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); 2816 } 2817 2818 /* 2819 * set up state for a received SYN (or SYN ACK) packet 2820 */ 2821 void 2822 procsyn(Conv *s, Tcp *seg) 2823 { 2824 Tcpctl *tcb; 2825 2826 tcb = (Tcpctl*)s->ptcl; 2827 tcb->flags |= FORCE; 2828 2829 tcb->rcv.nxt = seg->seq + 1; 2830 tcb->rcv.urg = tcb->rcv.nxt; 2831 tcb->irs = seg->seq; 2832 2833 /* our sending max segment size cannot be bigger than what he asked for */ 2834 if(seg->mss != 0 && seg->mss < tcb->mss) 2835 tcb->mss = seg->mss; 2836 2837 /* the congestion window always starts out as a single segment */ 2838 tcb->snd.wnd = seg->wnd; 2839 tcb->cwind = tcb->mss; 2840 } 2841 2842 int 2843 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) 2844 { 2845 Reseq *rp, *rp1; 2846 int i, rqlen, qmax; 2847 2848 rp = malloc(sizeof(Reseq)); 2849 if(rp == nil){ 2850 freeblist(bp); /* bp always consumed by add_reseq */ 2851 return 0; 2852 } 2853 2854 rp->seg = *seg; 2855 rp->bp = bp; 2856 rp->length = length; 2857 2858 /* Place on reassembly list sorting by starting seq number */ 2859 rp1 = tcb->reseq; 2860 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { 2861 rp->next = rp1; 2862 tcb->reseq = rp; 2863 if(rp->next != nil) 2864 tpriv->stats[OutOfOrder]++; 2865 return 0; 2866 } 2867 2868 rqlen = 0; 2869 for(i = 0;; i++) { 2870 rqlen += rp1->length; 2871 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { 2872 rp->next = rp1->next; 2873 rp1->next = rp; 2874 if(rp->next != nil) 2875 tpriv->stats[OutOfOrder]++; 2876 break; 2877 } 2878 rp1 = rp1->next; 2879 } 2880 qmax = QMAX<<tcb->rcv.scale; 2881 if(rqlen > qmax){ 2882 print("resequence queue > window: %d > %d\n", rqlen, qmax); 2883 i = 0; 2884 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ 2885 print("%#lux %#lux %#ux\n", 
rp1->seg.seq, 2886 rp1->seg.ack, rp1->seg.flags); 2887 if(i++ > 10){ 2888 print("...\n"); 2889 break; 2890 } 2891 } 2892 2893 /* 2894 * delete entire reassembly queue; wait for retransmit. 2895 * - should we be smarter and only delete the tail? 2896 */ 2897 for(rp = tcb->reseq; rp != nil; rp = rp1){ 2898 rp1 = rp->next; 2899 freeblist(rp->bp); 2900 free(rp); 2901 } 2902 tcb->reseq = nil; 2903 2904 return -1; 2905 } 2906 return 0; 2907 } 2908 2909 void 2910 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2911 { 2912 Reseq *rp; 2913 2914 rp = tcb->reseq; 2915 if(rp == nil) 2916 return; 2917 2918 tcb->reseq = rp->next; 2919 2920 *seg = rp->seg; 2921 *bp = rp->bp; 2922 *length = rp->length; 2923 2924 free(rp); 2925 } 2926 2927 int 2928 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2929 { 2930 ushort len; 2931 uchar accept; 2932 int dupcnt, excess; 2933 2934 accept = 0; 2935 len = *length; 2936 if(seg->flags & SYN) 2937 len++; 2938 if(seg->flags & FIN) 2939 len++; 2940 2941 if(tcb->rcv.wnd == 0) { 2942 if(len == 0 && seg->seq == tcb->rcv.nxt) 2943 return 0; 2944 } 2945 else { 2946 /* Some part of the segment should be in the window */ 2947 if(inwindow(tcb,seg->seq)) 2948 accept++; 2949 else 2950 if(len != 0) { 2951 if(inwindow(tcb, seg->seq+len-1) || 2952 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1)) 2953 accept++; 2954 } 2955 } 2956 if(!accept) { 2957 freeblist(*bp); 2958 return -1; 2959 } 2960 dupcnt = tcb->rcv.nxt - seg->seq; 2961 if(dupcnt > 0){ 2962 tcb->rerecv += dupcnt; 2963 if(seg->flags & SYN){ 2964 seg->flags &= ~SYN; 2965 seg->seq++; 2966 2967 if(seg->urg > 1) 2968 seg->urg--; 2969 else 2970 seg->flags &= ~URG; 2971 dupcnt--; 2972 } 2973 if(dupcnt > 0){ 2974 pullblock(bp, (ushort)dupcnt); 2975 seg->seq += dupcnt; 2976 *length -= dupcnt; 2977 2978 if(seg->urg > dupcnt) 2979 seg->urg -= dupcnt; 2980 else { 2981 seg->flags &= ~URG; 2982 seg->urg = 0; 2983 } 2984 } 2985 } 2986 excess = seg->seq + *length - (tcb->rcv.nxt + 
tcb->rcv.wnd); 2987 if(excess > 0) { 2988 tcb->rerecv += excess; 2989 *length -= excess; 2990 *bp = trimblock(*bp, 0, *length); 2991 if(*bp == nil) 2992 panic("presotto is a boofhead"); 2993 seg->flags &= ~FIN; 2994 } 2995 return 0; 2996 } 2997 2998 void 2999 tcpadvise(Proto *tcp, Block *bp, char *msg) 3000 { 3001 Tcp4hdr *h4; 3002 Tcp6hdr *h6; 3003 Tcpctl *tcb; 3004 uchar source[IPaddrlen]; 3005 uchar dest[IPaddrlen]; 3006 ushort psource, pdest; 3007 Conv *s, **p; 3008 3009 h4 = (Tcp4hdr*)(bp->rp); 3010 h6 = (Tcp6hdr*)(bp->rp); 3011 3012 if((h4->vihl&0xF0)==IP_VER4) { 3013 v4tov6(dest, h4->tcpdst); 3014 v4tov6(source, h4->tcpsrc); 3015 psource = nhgets(h4->tcpsport); 3016 pdest = nhgets(h4->tcpdport); 3017 } 3018 else { 3019 ipmove(dest, h6->tcpdst); 3020 ipmove(source, h6->tcpsrc); 3021 psource = nhgets(h6->tcpsport); 3022 pdest = nhgets(h6->tcpdport); 3023 } 3024 3025 /* Look for a connection */ 3026 qlock(tcp); 3027 for(p = tcp->conv; *p; p++) { 3028 s = *p; 3029 tcb = (Tcpctl*)s->ptcl; 3030 if(s->rport == pdest) 3031 if(s->lport == psource) 3032 if(tcb->state != Closed) 3033 if(ipcmp(s->raddr, dest) == 0) 3034 if(ipcmp(s->laddr, source) == 0){ 3035 qlock(s); 3036 qunlock(tcp); 3037 switch(tcb->state){ 3038 case Syn_sent: 3039 localclose(s, msg); 3040 break; 3041 } 3042 qunlock(s); 3043 freeblist(bp); 3044 return; 3045 } 3046 } 3047 qunlock(tcp); 3048 freeblist(bp); 3049 } 3050 3051 static char* 3052 tcpporthogdefensectl(char *val) 3053 { 3054 if(strcmp(val, "on") == 0) 3055 tcpporthogdefense = 1; 3056 else if(strcmp(val, "off") == 0) 3057 tcpporthogdefense = 0; 3058 else 3059 return "unknown value for tcpporthogdefense"; 3060 return nil; 3061 } 3062 3063 /* called with c qlocked */ 3064 char* 3065 tcpctl(Conv* c, char** f, int n) 3066 { 3067 if(n == 1 && strcmp(f[0], "hangup") == 0) 3068 return tcphangup(c); 3069 if(n >= 1 && strcmp(f[0], "keepalive") == 0) 3070 return tcpstartka(c, f, n); 3071 if(n >= 1 && strcmp(f[0], "checksum") == 0) 3072 return 
tcpsetchecksum(c, f, n); 3073 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) 3074 return tcpporthogdefensectl(f[1]); 3075 return "unknown control request"; 3076 } 3077 3078 int 3079 tcpstats(Proto *tcp, char *buf, int len) 3080 { 3081 Tcppriv *priv; 3082 char *p, *e; 3083 int i; 3084 3085 priv = tcp->priv; 3086 p = buf; 3087 e = p+len; 3088 for(i = 0; i < Nstats; i++) 3089 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); 3090 return p - buf; 3091 } 3092 3093 /* 3094 * garbage collect any stale conversations: 3095 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) 3096 * - Finwait2 after 5 minutes 3097 * 3098 * this is called whenever we run out of channels. Both checks are 3099 * of questionable validity so we try to use them only when we're 3100 * up against the wall. 3101 */ 3102 int 3103 tcpgc(Proto *tcp) 3104 { 3105 Conv *c, **pp, **ep; 3106 int n; 3107 Tcpctl *tcb; 3108 3109 3110 n = 0; 3111 ep = &tcp->conv[tcp->nc]; 3112 for(pp = tcp->conv; pp < ep; pp++) { 3113 c = *pp; 3114 if(c == nil) 3115 break; 3116 if(!canqlock(c)) 3117 continue; 3118 tcb = (Tcpctl*)c->ptcl; 3119 switch(tcb->state){ 3120 case Syn_received: 3121 if(NOW - tcb->time > 5000){ 3122 localclose(c, "timed out"); 3123 n++; 3124 } 3125 break; 3126 case Finwait2: 3127 if(NOW - tcb->time > 5*60*1000){ 3128 localclose(c, "timed out"); 3129 n++; 3130 } 3131 break; 3132 } 3133 qunlock(c); 3134 } 3135 return n; 3136 } 3137 3138 void 3139 tcpsettimer(Tcpctl *tcb) 3140 { 3141 int x; 3142 3143 /* round trip dependency */ 3144 x = backoff(tcb->backoff) * 3145 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; 3146 3147 /* bounded twixt 1/2 and 64 seconds */ 3148 if(x < 500/MSPTICK) 3149 x = 500/MSPTICK; 3150 else if(x > (64000/MSPTICK)) 3151 x = 64000/MSPTICK; 3152 tcb->timer.start = x; 3153 } 3154 3155 void 3156 tcpinit(Fs *fs) 3157 { 3158 Proto *tcp; 3159 Tcppriv *tpriv; 3160 3161 tcp = smalloc(sizeof(Proto)); 3162 tpriv = tcp->priv = 
smalloc(sizeof(Tcppriv)); 3163 tcp->name = "tcp"; 3164 tcp->connect = tcpconnect; 3165 tcp->announce = tcpannounce; 3166 tcp->ctl = tcpctl; 3167 tcp->state = tcpstate; 3168 tcp->create = tcpcreate; 3169 tcp->close = tcpclose; 3170 tcp->rcv = tcpiput; 3171 tcp->advise = tcpadvise; 3172 tcp->stats = tcpstats; 3173 tcp->inuse = tcpinuse; 3174 tcp->gc = tcpgc; 3175 tcp->ipproto = IP_TCPPROTO; 3176 tcp->nc = scalednconv(); 3177 tcp->ptclsize = sizeof(Tcpctl); 3178 tpriv->stats[MaxConn] = tcp->nc; 3179 3180 Fsproto(fs, tcp); 3181 } 3182 3183 void 3184 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) 3185 { 3186 if(rcvscale){ 3187 tcb->rcv.scale = rcvscale & 0xff; 3188 tcb->snd.scale = sndscale & 0xff; 3189 tcb->window = QMAX<<tcb->snd.scale; 3190 qsetlimit(s->rq, tcb->window); 3191 } else { 3192 tcb->rcv.scale = 0; 3193 tcb->snd.scale = 0; 3194 tcb->window = QMAX; 3195 qsetlimit(s->rq, tcb->window); 3196 } 3197 } 3198