#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "../port/error.h"

#include "ip.h"

/*
 * Protocol constants, timer parameters, header flag bits, option codes,
 * per-connection flag bits and the connection state numbers.
 */
enum
{
	QMAX		= 64*1024-1,
	IP_TCPPROTO	= 6,

	/*
	 * Layout of Tcp4hdr: TCP4_IPLEN bytes of ip header fields, then
	 * TCP4_PHDRSIZE bytes of pseudo header, then the tcp header proper.
	 */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,	/* bytes of prototype header copied by htontcp4 */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* same for Tcp6hdr; the whole ip6 header doubles as pseudo header */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,	/* bytes of prototype header copied by htontcp6 */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* tcp option kinds and total option lengths in bytes */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits in Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb * 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * 64kb * 2⁵ = 2mb, or 2× overkill for 100mbps * 70ms.
	 *
	 * NOTE(review): both scales below are 4, which would give
	 * 64kb * 2⁴ = 1mb; the 2⁵ figure looks stale -- confirm.
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};

/* Must correspond to the enumeration above */
char *tcpstates[] =
{
	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
	"Closing", 	"Last_ack", 	"Time_wait"
};

/*
 * A countdown timer.  Running timers are chained through next/prev on
 * Tcppriv.timers (see timerstate); tcpackproc decrements count once per
 * tick and calls func(arg) for timers that reach zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of expired timers built by tcpackproc */
	int	state;		/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;		/* value count is loaded with by tcpgo; 0 disables */
	int	count;		/* ticks left */
	void	(*func)(void*);
	void	*arg;
};

/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;
	uchar	proto;
	uchar	tcplen[2];
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length (high bits) and flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};

typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];
	uchar	ploadlen[2];
	uchar	proto;
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* header length (high bits) and flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};

/*
 *  this represents the control info
 *  for a single packet.
It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct Tcp
{
	ushort	source;
	ushort	dest;
	ulong	seq;
	ulong	ack;
	uchar	flags;
	uchar	update;
	ushort	ws;	/* window scale option */
	ulong	wnd;	/* prescaled window*/
	ushort	urg;
	ushort	mss;	/* max segment size option (if not zero) */
	ushort	len;	/* size of data */
};

/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;
	Tcp	seg;
	Block	*bp;
	ushort	length;
};

/*
 *  the qlock in the Conv locks this structure
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		uint	scale;		/* how much to right shift window */
					/* in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		ulong	partialack;
		int	recovery;	/* loss recovery flag */
		int	retransmit;	/* retransmit 1 packet @ una flag */
		int	rto;
		ulong	rxt;		/* right window marker for recovery */
					/* "recover" rfc3782 */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	wsnt;		/* Last wptr sent.  important to */
					/* track for large bdp */
		ulong	wptr;
		ulong	urg;		/* Urgent pointer */
		ulong	ackptr;		/* last acked sequence */
		int	blocked;
		uint	scale;		/* how much to left shift window in */
					/* rcv'd packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	ulong	cwind;			/* Congestion window */
	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
	uint	scale;			/* desired snd.scale */
	ulong	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data rereceived */
	ulong	window;			/* Our receive window (queue) */
	uint	qscale;			/* Log2 of our receive window (queue) */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags */
	Reseq	*reseq;			/* Resequencing queue */
	int	nreseq;
	int	reseqlen;
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	ulong	timeuna;		/* snd.una when time was set */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (FIN,SYN) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};

/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.
Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};

int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */

/* indices into Tcppriv.stats and statnames */
enum {
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats
};

/*
 * printable names of the counters; entries are placed by index,
 * so their order here need not match the enumeration.
 */
static char *statnames[Nstats] =
{
[MaxConn]	"MaxConn",
[Mss]		"MaxSegment",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransSegsSent]	"RetransSegsSent",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
[Resequenced]	"Resequenced",
[ReseqBytelim]	"ReseqBytelim",
[ReseqPktlim]	"ReseqPktlim",
[Delayack]	"Delayack",
[Wopenack]	"Wopenack",

[Recovery]	"Recovery",
[RecoveryDone]	"RecoveryDone",
[RecoveryRTO]	"RecoveryRTO",

[RecoveryNoSeq]	"RecoveryNoSeq",
[RecoveryCwind]	"RecoveryCwind",
[RecoveryPA]	"RecoveryPA",
};

/* per-protocol-instance state, reached through Proto.priv */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;
	int	ackprocstarted;

	uvlong	stats[Nstats];
};

/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
409 */ 410 int tcpporthogdefense = 0; 411 412 static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); 413 static int dumpreseq(Tcpctl*); 414 static void getreseq(Tcpctl*, Tcp*, Block**, ushort*); 415 static void limbo(Conv*, uchar*, uchar*, Tcp*, int); 416 static void limborexmit(Proto*); 417 static void localclose(Conv*, char*); 418 static void procsyn(Conv*, Tcp*); 419 static void tcpacktimer(void*); 420 static void tcpiput(Proto*, Ipifc*, Block*); 421 static void tcpkeepalive(void*); 422 static void tcpoutput(Conv*); 423 static void tcprcvwin(Conv*); 424 static void tcprxmit(Conv*); 425 static void tcpsetkacounter(Tcpctl*); 426 static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); 427 static void tcpsettimer(Tcpctl*); 428 static void tcpsndsyn(Conv*, Tcpctl*); 429 static void tcpstart(Conv*, int); 430 static void tcpsynackrtt(Conv*); 431 static void tcptimeout(void*); 432 static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); 433 434 static void 435 tcpsetstate(Conv *s, uchar newstate) 436 { 437 Tcpctl *tcb; 438 uchar oldstate; 439 Tcppriv *tpriv; 440 441 tpriv = s->p->priv; 442 443 tcb = (Tcpctl*)s->ptcl; 444 445 oldstate = tcb->state; 446 if(oldstate == newstate) 447 return; 448 449 if(oldstate == Established) 450 tpriv->stats[CurrEstab]--; 451 if(newstate == Established) 452 tpriv->stats[CurrEstab]++; 453 454 switch(newstate) { 455 case Closed: 456 qclose(s->rq); 457 qclose(s->wq); 458 qclose(s->eq); 459 break; 460 461 case Close_wait: /* Remote closes */ 462 qhangup(s->rq, nil); 463 break; 464 } 465 466 tcb->state = newstate; 467 468 if(oldstate == Syn_sent && newstate != Closed) 469 Fsconnected(s, nil); 470 } 471 472 static char* 473 tcpconnect(Conv *c, char **argv, int argc) 474 { 475 char *e; 476 Tcpctl *tcb; 477 478 tcb = (Tcpctl*)(c->ptcl); 479 if(tcb->state != Closed) 480 return Econinuse; 481 482 e = Fsstdconnect(c, argv, argc); 483 if(e != nil) 484 return e; 485 tcpstart(c, TCP_CONNECT); 486 487 return nil; 488 } 489 490 static int 491 
tcpstate(Conv *c, char *state, int n) 492 { 493 Tcpctl *s; 494 495 s = (Tcpctl*)(c->ptcl); 496 497 return snprint(state, n, 498 "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud " 499 "swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d " 500 "timer.count %d rerecv %d katimer.start %d katimer.count %d\n", 501 tcpstates[s->state], 502 c->rq ? qlen(c->rq) : 0, 503 c->wq ? qlen(c->wq) : 0, 504 s->nreseq, s->reseqlen, 505 s->srtt, s->mdev, s->ssthresh, 506 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, 507 s->qscale, 508 s->timer.start, s->timer.count, s->rerecv, 509 s->katimer.start, s->katimer.count); 510 } 511 512 static int 513 tcpinuse(Conv *c) 514 { 515 Tcpctl *s; 516 517 s = (Tcpctl*)(c->ptcl); 518 return s->state != Closed; 519 } 520 521 static char* 522 tcpannounce(Conv *c, char **argv, int argc) 523 { 524 char *e; 525 Tcpctl *tcb; 526 527 tcb = (Tcpctl*)(c->ptcl); 528 if(tcb->state != Closed) 529 return Econinuse; 530 531 e = Fsstdannounce(c, argv, argc); 532 if(e != nil) 533 return e; 534 tcpstart(c, TCP_LISTEN); 535 Fsconnected(c, nil); 536 537 return nil; 538 } 539 540 /* 541 * tcpclose is always called with the q locked 542 */ 543 static void 544 tcpclose(Conv *c) 545 { 546 Tcpctl *tcb; 547 548 tcb = (Tcpctl*)c->ptcl; 549 550 qhangup(c->rq, nil); 551 qhangup(c->wq, nil); 552 qhangup(c->eq, nil); 553 qflush(c->rq); 554 555 switch(tcb->state) { 556 case Listen: 557 /* 558 * reset any incoming calls to this listener 559 */ 560 Fsconnected(c, "Hangup"); 561 562 localclose(c, nil); 563 break; 564 case Closed: 565 case Syn_sent: 566 localclose(c, nil); 567 break; 568 case Syn_received: 569 case Established: 570 tcb->flgcnt++; 571 tcb->snd.nxt++; 572 tcpsetstate(c, Finwait1); 573 tcpoutput(c); 574 break; 575 case Close_wait: 576 tcb->flgcnt++; 577 tcb->snd.nxt++; 578 tcpsetstate(c, Last_ack); 579 tcpoutput(c); 580 break; 581 } 582 } 583 584 static void 585 tcpkick(void *x) 586 { 587 Conv *s = x; 588 Tcpctl *tcb; 589 590 tcb = 
(Tcpctl*)s->ptcl; 591 592 if(waserror()){ 593 qunlock(s); 594 nexterror(); 595 } 596 qlock(s); 597 598 switch(tcb->state) { 599 case Syn_sent: 600 case Syn_received: 601 case Established: 602 case Close_wait: 603 /* 604 * Push data 605 */ 606 tcpoutput(s); 607 break; 608 default: 609 localclose(s, "Hangup"); 610 break; 611 } 612 613 qunlock(s); 614 poperror(); 615 } 616 617 static int seq_lt(ulong, ulong); 618 619 static void 620 tcprcvwin(Conv *s) /* Call with tcb locked */ 621 { 622 int w; 623 Tcpctl *tcb; 624 625 tcb = (Tcpctl*)s->ptcl; 626 w = tcb->window - qlen(s->rq); 627 if(w < 0) 628 w = 0; 629 /* RFC 1122 § 4.2.2.17 do not move right edge of window left */ 630 if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr)) 631 w = tcb->rcv.wptr - tcb->rcv.nxt; 632 if(w != tcb->rcv.wnd) 633 if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ 634 tcb->rcv.blocked = 1; 635 netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n", 636 tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport); 637 } 638 tcb->rcv.wnd = w; 639 tcb->rcv.wptr = tcb->rcv.nxt + w; 640 } 641 642 static void 643 tcpacktimer(void *v) 644 { 645 Tcpctl *tcb; 646 Conv *s; 647 648 s = v; 649 tcb = (Tcpctl*)s->ptcl; 650 651 if(waserror()){ 652 qunlock(s); 653 nexterror(); 654 } 655 qlock(s); 656 if(tcb->state != Closed){ 657 tcb->flags |= FORCE; 658 tcpoutput(s); 659 } 660 qunlock(s); 661 poperror(); 662 } 663 664 static void 665 tcpcongestion(Tcpctl *tcb) 666 { 667 ulong inflight; 668 669 inflight = tcb->snd.nxt - tcb->snd.una; 670 if(inflight > tcb->cwind) 671 inflight = tcb->cwind; 672 tcb->ssthresh = inflight / 2; 673 if(tcb->ssthresh < 2*tcb->mss) 674 tcb->ssthresh = 2*tcb->mss; 675 } 676 677 enum { 678 L = 2, /* aggressive slow start; legal values ∈ (1.0, 2.0) */ 679 }; 680 681 static void 682 tcpabcincr(Tcpctl *tcb, uint acked) 683 { 684 uint limit; 685 686 tcb->abcbytes += acked; 687 if(tcb->cwind < tcb->ssthresh){ 688 /* slow start */ 689 if(tcb->snd.rto) 690 limit = 
tcb->mss; 691 else 692 limit = L*tcb->mss; 693 tcb->cwind += MIN(tcb->abcbytes, limit); 694 tcb->abcbytes = 0; 695 } else { 696 tcb->snd.rto = 0; 697 /* avoidance */ 698 if(tcb->abcbytes >= tcb->cwind){ 699 tcb->abcbytes -= tcb->cwind; 700 tcb->cwind += tcb->mss; 701 } 702 } 703 } 704 705 static void 706 tcpcreate(Conv *c) 707 { 708 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); 709 c->wq = qopen(QMAX, Qkick, tcpkick, c); 710 } 711 712 static void 713 timerstate(Tcppriv *priv, Tcptimer *t, int newstate) 714 { 715 if(newstate != TcptimerON){ 716 if(t->state == TcptimerON){ 717 /* unchain */ 718 if(priv->timers == t){ 719 priv->timers = t->next; 720 if(t->prev != nil) 721 panic("timerstate1"); 722 } 723 if(t->next) 724 t->next->prev = t->prev; 725 if(t->prev) 726 t->prev->next = t->next; 727 t->next = t->prev = nil; 728 } 729 } else { 730 if(t->state != TcptimerON){ 731 /* chain */ 732 if(t->prev != nil || t->next != nil) 733 panic("timerstate2"); 734 t->prev = nil; 735 t->next = priv->timers; 736 if(t->next) 737 t->next->prev = t; 738 priv->timers = t; 739 } 740 } 741 t->state = newstate; 742 } 743 744 static void 745 tcpackproc(void *a) 746 { 747 Tcptimer *t, *tp, *timeo; 748 Proto *tcp; 749 Tcppriv *priv; 750 int loop; 751 752 tcp = a; 753 priv = tcp->priv; 754 755 for(;;) { 756 tsleep(&up->sleep, return0, 0, MSPTICK); 757 758 qlock(&priv->tl); 759 timeo = nil; 760 loop = 0; 761 for(t = priv->timers; t != nil; t = tp) { 762 if(loop++ > 10000) 763 panic("tcpackproc1"); 764 tp = t->next; 765 if(t->state == TcptimerON) { 766 t->count--; 767 if(t->count == 0) { 768 timerstate(priv, t, TcptimerDONE); 769 t->readynext = timeo; 770 timeo = t; 771 } 772 } 773 } 774 qunlock(&priv->tl); 775 776 loop = 0; 777 for(t = timeo; t != nil; t = t->readynext) { 778 if(loop++ > 10000) 779 panic("tcpackproc2"); 780 if(t->state == TcptimerDONE && t->func != nil && !waserror()){ 781 (*t->func)(t->arg); 782 poperror(); 783 } 784 } 785 786 limborexmit(tcp); 787 } 788 } 789 790 static 
void 791 tcpgo(Tcppriv *priv, Tcptimer *t) 792 { 793 if(t == nil || t->start == 0) 794 return; 795 796 qlock(&priv->tl); 797 t->count = t->start; 798 timerstate(priv, t, TcptimerON); 799 qunlock(&priv->tl); 800 } 801 802 static void 803 tcphalt(Tcppriv *priv, Tcptimer *t) 804 { 805 if(t == nil) 806 return; 807 808 qlock(&priv->tl); 809 timerstate(priv, t, TcptimerOFF); 810 qunlock(&priv->tl); 811 } 812 813 static int 814 backoff(int n) 815 { 816 return 1 << n; 817 } 818 819 static void 820 localclose(Conv *s, char *reason) /* called with tcb locked */ 821 { 822 Tcpctl *tcb; 823 Tcppriv *tpriv; 824 825 tpriv = s->p->priv; 826 tcb = (Tcpctl*)s->ptcl; 827 828 iphtrem(&tpriv->ht, s); 829 830 tcphalt(tpriv, &tcb->timer); 831 tcphalt(tpriv, &tcb->rtt_timer); 832 tcphalt(tpriv, &tcb->acktimer); 833 tcphalt(tpriv, &tcb->katimer); 834 835 /* Flush reassembly queue; nothing more can arrive */ 836 dumpreseq(tcb); 837 838 if(tcb->state == Syn_sent) 839 Fsconnected(s, reason); 840 if(s->state == Announced) 841 wakeup(&s->listenr); 842 843 qhangup(s->rq, reason); 844 qhangup(s->wq, reason); 845 846 tcpsetstate(s, Closed); 847 } 848 849 /* mtu (- TCP + IP hdr len) of 1st hop */ 850 static int 851 tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale) 852 { 853 Ipifc *ifc; 854 int mtu; 855 856 ifc = findipifc(tcp->f, addr, 0); 857 switch(version){ 858 default: 859 case V4: 860 mtu = DEF_MSS; 861 if(ifc != nil) 862 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); 863 break; 864 case V6: 865 mtu = DEF_MSS6; 866 if(ifc != nil) 867 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); 868 break; 869 } 870 /* 871 * set the ws. it doesn't commit us to anything. 872 * ws is the ultimate limit to the bandwidth-delay product. 
873 */ 874 *scale = Defadvscale; 875 876 return mtu; 877 } 878 879 static void 880 inittcpctl(Conv *s, int mode) 881 { 882 Tcpctl *tcb; 883 Tcp4hdr* h4; 884 Tcp6hdr* h6; 885 Tcppriv *tpriv; 886 int mss; 887 888 tcb = (Tcpctl*)s->ptcl; 889 890 memset(tcb, 0, sizeof(Tcpctl)); 891 892 tcb->ssthresh = QMAX; /* reset by tcpsetscale() */ 893 tcb->srtt = tcp_irtt<<LOGAGAIN; 894 tcb->mdev = 0; 895 896 /* setup timers */ 897 tcb->timer.start = tcp_irtt / MSPTICK; 898 tcb->timer.func = tcptimeout; 899 tcb->timer.arg = s; 900 tcb->rtt_timer.start = MAX_TIME; 901 tcb->acktimer.start = TCP_ACK / MSPTICK; 902 tcb->acktimer.func = tcpacktimer; 903 tcb->acktimer.arg = s; 904 tcb->katimer.start = DEF_KAT / MSPTICK; 905 tcb->katimer.func = tcpkeepalive; 906 tcb->katimer.arg = s; 907 908 mss = DEF_MSS; 909 910 /* create a prototype(pseudo) header */ 911 if(mode != TCP_LISTEN){ 912 if(ipcmp(s->laddr, IPnoaddr) == 0) 913 findlocalip(s->p->f, s->laddr, s->raddr); 914 915 switch(s->ipversion){ 916 case V4: 917 h4 = &tcb->protohdr.tcp4hdr; 918 memset(h4, 0, sizeof(*h4)); 919 h4->proto = IP_TCPPROTO; 920 hnputs(h4->tcpsport, s->lport); 921 hnputs(h4->tcpdport, s->rport); 922 v6tov4(h4->tcpsrc, s->laddr); 923 v6tov4(h4->tcpdst, s->raddr); 924 break; 925 case V6: 926 h6 = &tcb->protohdr.tcp6hdr; 927 memset(h6, 0, sizeof(*h6)); 928 h6->proto = IP_TCPPROTO; 929 hnputs(h6->tcpsport, s->lport); 930 hnputs(h6->tcpdport, s->rport); 931 ipmove(h6->tcpsrc, s->laddr); 932 ipmove(h6->tcpdst, s->raddr); 933 mss = DEF_MSS6; 934 break; 935 default: 936 panic("inittcpctl: version %d", s->ipversion); 937 } 938 } 939 940 tcb->mss = tcb->cwind = mss; 941 tcb->abcbytes = 0; 942 tpriv = s->p->priv; 943 tpriv->stats[Mss] = tcb->mss; 944 945 /* default is no window scaling */ 946 tcpsetscale(s, tcb, 0, 0); 947 } 948 949 /* 950 * called with s qlocked 951 */ 952 static void 953 tcpstart(Conv *s, int mode) 954 { 955 Tcpctl *tcb; 956 Tcppriv *tpriv; 957 char kpname[KNAMELEN]; 958 959 tpriv = s->p->priv; 960 961 
if(tpriv->ackprocstarted == 0){ 962 qlock(&tpriv->apl); 963 if(tpriv->ackprocstarted == 0){ 964 snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev); 965 kproc(kpname, tcpackproc, s->p); 966 tpriv->ackprocstarted = 1; 967 } 968 qunlock(&tpriv->apl); 969 } 970 971 tcb = (Tcpctl*)s->ptcl; 972 973 inittcpctl(s, mode); 974 975 iphtadd(&tpriv->ht, s); 976 switch(mode) { 977 case TCP_LISTEN: 978 tpriv->stats[PassiveOpens]++; 979 tcb->flags |= CLONE; 980 tcpsetstate(s, Listen); 981 break; 982 983 case TCP_CONNECT: 984 tpriv->stats[ActiveOpens]++; 985 tcb->flags |= ACTIVE; 986 tcpsndsyn(s, tcb); 987 tcpsetstate(s, Syn_sent); 988 tcpoutput(s); 989 break; 990 } 991 } 992 993 static char* 994 tcpflag(char *buf, char *e, ushort flag) 995 { 996 char *p; 997 998 p = seprint(buf, e, "%d", flag>>10); /* Head len */ 999 if(flag & URG) 1000 p = seprint(p, e, " URG"); 1001 if(flag & ACK) 1002 p = seprint(p, e, " ACK"); 1003 if(flag & PSH) 1004 p = seprint(p, e, " PSH"); 1005 if(flag & RST) 1006 p = seprint(p, e, " RST"); 1007 if(flag & SYN) 1008 p = seprint(p, e, " SYN"); 1009 if(flag & FIN) 1010 p = seprint(p, e, " FIN"); 1011 USED(p); 1012 return buf; 1013 } 1014 1015 static Block* 1016 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) 1017 { 1018 int dlen; 1019 Tcp6hdr *h; 1020 ushort csum; 1021 ushort hdrlen, optpad = 0; 1022 uchar *opt; 1023 1024 hdrlen = TCP6_HDRSIZE; 1025 if(tcph->flags & SYN){ 1026 if(tcph->mss) 1027 hdrlen += MSS_LENGTH; 1028 if(tcph->ws) 1029 hdrlen += WS_LENGTH; 1030 optpad = hdrlen & 3; 1031 if(optpad) 1032 optpad = 4 - optpad; 1033 hdrlen += optpad; 1034 } 1035 1036 if(data) { 1037 dlen = blocklen(data); 1038 data = padblock(data, hdrlen + TCP6_PKT); 1039 if(data == nil) 1040 return nil; 1041 } 1042 else { 1043 dlen = 0; 1044 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ 1045 if(data == nil) 1046 return nil; 1047 data->wp += hdrlen + TCP6_PKT; 1048 } 1049 1050 /* copy in pseudo ip header plus port numbers */ 
1051 h = (Tcp6hdr *)(data->rp); 1052 memmove(h, ph, TCP6_TCBPHDRSZ); 1053 1054 /* compose pseudo tcp header, do cksum calculation */ 1055 hnputl(h->vcf, hdrlen + dlen); 1056 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; 1057 h->ttl = ph->proto; 1058 1059 /* copy in variable bits */ 1060 hnputl(h->tcpseq, tcph->seq); 1061 hnputl(h->tcpack, tcph->ack); 1062 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1063 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 1064 hnputs(h->tcpurg, tcph->urg); 1065 1066 if(tcph->flags & SYN){ 1067 opt = h->tcpopt; 1068 if(tcph->mss != 0){ 1069 *opt++ = MSSOPT; 1070 *opt++ = MSS_LENGTH; 1071 hnputs(opt, tcph->mss); 1072 opt += 2; 1073 } 1074 if(tcph->ws != 0){ 1075 *opt++ = WSOPT; 1076 *opt++ = WS_LENGTH; 1077 *opt++ = tcph->ws; 1078 } 1079 while(optpad-- > 0) 1080 *opt++ = NOOPOPT; 1081 } 1082 1083 if(tcb != nil && tcb->nochecksum){ 1084 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1085 } else { 1086 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); 1087 hnputs(h->tcpcksum, csum); 1088 } 1089 1090 /* move from pseudo header back to normal ip header */ 1091 memset(h->vcf, 0, 4); 1092 h->vcf[0] = IP_VER6; 1093 hnputs(h->ploadlen, hdrlen+dlen); 1094 h->proto = ph->proto; 1095 1096 return data; 1097 } 1098 1099 static Block* 1100 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) 1101 { 1102 int dlen; 1103 Tcp4hdr *h; 1104 ushort csum; 1105 ushort hdrlen, optpad = 0; 1106 uchar *opt; 1107 1108 hdrlen = TCP4_HDRSIZE; 1109 if(tcph->flags & SYN){ 1110 if(tcph->mss) 1111 hdrlen += MSS_LENGTH; 1112 if(1) 1113 hdrlen += WS_LENGTH; 1114 optpad = hdrlen & 3; 1115 if(optpad) 1116 optpad = 4 - optpad; 1117 hdrlen += optpad; 1118 } 1119 1120 if(data) { 1121 dlen = blocklen(data); 1122 data = padblock(data, hdrlen + TCP4_PKT); 1123 if(data == nil) 1124 return nil; 1125 } 1126 else { 1127 dlen = 0; 1128 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ 1129 if(data == nil) 1130 return nil; 1131 
data->wp += hdrlen + TCP4_PKT; 1132 } 1133 1134 /* copy in pseudo ip header plus port numbers */ 1135 h = (Tcp4hdr *)(data->rp); 1136 memmove(h, ph, TCP4_TCBPHDRSZ); 1137 1138 /* copy in variable bits */ 1139 hnputs(h->tcplen, hdrlen + dlen); 1140 hnputl(h->tcpseq, tcph->seq); 1141 hnputl(h->tcpack, tcph->ack); 1142 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1143 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0)); 1144 hnputs(h->tcpurg, tcph->urg); 1145 1146 if(tcph->flags & SYN){ 1147 opt = h->tcpopt; 1148 if(tcph->mss != 0){ 1149 *opt++ = MSSOPT; 1150 *opt++ = MSS_LENGTH; 1151 hnputs(opt, tcph->mss); 1152 opt += 2; 1153 } 1154 /* always offer. rfc1323 §2.2 */ 1155 if(1){ 1156 *opt++ = WSOPT; 1157 *opt++ = WS_LENGTH; 1158 *opt++ = tcph->ws; 1159 } 1160 while(optpad-- > 0) 1161 *opt++ = NOOPOPT; 1162 } 1163 1164 if(tcb != nil && tcb->nochecksum){ 1165 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1166 } else { 1167 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); 1168 hnputs(h->tcpcksum, csum); 1169 } 1170 1171 return data; 1172 } 1173 1174 static int 1175 ntohtcp6(Tcp *tcph, Block **bpp) 1176 { 1177 Tcp6hdr *h; 1178 uchar *optr; 1179 ushort hdrlen; 1180 ushort optlen; 1181 int n; 1182 1183 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); 1184 if(*bpp == nil) 1185 return -1; 1186 1187 h = (Tcp6hdr *)((*bpp)->rp); 1188 tcph->source = nhgets(h->tcpsport); 1189 tcph->dest = nhgets(h->tcpdport); 1190 tcph->seq = nhgetl(h->tcpseq); 1191 tcph->ack = nhgetl(h->tcpack); 1192 hdrlen = (h->tcpflag[0]>>2) & ~3; 1193 if(hdrlen < TCP6_HDRSIZE) { 1194 freeblist(*bpp); 1195 return -1; 1196 } 1197 1198 tcph->flags = h->tcpflag[1]; 1199 tcph->wnd = nhgets(h->tcpwin); 1200 tcph->urg = nhgets(h->tcpurg); 1201 tcph->mss = 0; 1202 tcph->ws = 0; 1203 tcph->update = 0; 1204 tcph->len = nhgets(h->ploadlen) - hdrlen; 1205 1206 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); 1207 if(*bpp == nil) 1208 return -1; 1209 1210 optr = h->tcpopt; 1211 n = hdrlen - TCP6_HDRSIZE; 
1212 while(n > 0 && *optr != EOLOPT) { 1213 if(*optr == NOOPOPT) { 1214 n--; 1215 optr++; 1216 continue; 1217 } 1218 optlen = optr[1]; 1219 if(optlen < 2 || optlen > n) 1220 break; 1221 switch(*optr) { 1222 case MSSOPT: 1223 if(optlen == MSS_LENGTH) 1224 tcph->mss = nhgets(optr+2); 1225 break; 1226 case WSOPT: 1227 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1228 tcph->ws = *(optr+2); 1229 break; 1230 } 1231 n -= optlen; 1232 optr += optlen; 1233 } 1234 return hdrlen; 1235 } 1236 1237 static int 1238 ntohtcp4(Tcp *tcph, Block **bpp) 1239 { 1240 Tcp4hdr *h; 1241 uchar *optr; 1242 ushort hdrlen; 1243 ushort optlen; 1244 int n; 1245 1246 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); 1247 if(*bpp == nil) 1248 return -1; 1249 1250 h = (Tcp4hdr *)((*bpp)->rp); 1251 tcph->source = nhgets(h->tcpsport); 1252 tcph->dest = nhgets(h->tcpdport); 1253 tcph->seq = nhgetl(h->tcpseq); 1254 tcph->ack = nhgetl(h->tcpack); 1255 1256 hdrlen = (h->tcpflag[0]>>2) & ~3; 1257 if(hdrlen < TCP4_HDRSIZE) { 1258 freeblist(*bpp); 1259 return -1; 1260 } 1261 1262 tcph->flags = h->tcpflag[1]; 1263 tcph->wnd = nhgets(h->tcpwin); 1264 tcph->urg = nhgets(h->tcpurg); 1265 tcph->mss = 0; 1266 tcph->ws = 0; 1267 tcph->update = 0; 1268 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); 1269 1270 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); 1271 if(*bpp == nil) 1272 return -1; 1273 1274 optr = h->tcpopt; 1275 n = hdrlen - TCP4_HDRSIZE; 1276 while(n > 0 && *optr != EOLOPT) { 1277 if(*optr == NOOPOPT) { 1278 n--; 1279 optr++; 1280 continue; 1281 } 1282 optlen = optr[1]; 1283 if(optlen < 2 || optlen > n) 1284 break; 1285 switch(*optr) { 1286 case MSSOPT: 1287 if(optlen == MSS_LENGTH) 1288 tcph->mss = nhgets(optr+2); 1289 break; 1290 case WSOPT: 1291 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1292 tcph->ws = *(optr+2); 1293 break; 1294 } 1295 n -= optlen; 1296 optr += optlen; 1297 } 1298 return hdrlen; 1299 } 1300 1301 /* 1302 * For outgoing calls, generate an initial sequence 1303 * number and put a SYN 
on the send queue 1304 */ 1305 static void 1306 tcpsndsyn(Conv *s, Tcpctl *tcb) 1307 { 1308 Tcppriv *tpriv; 1309 1310 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1311 tcb->rttseq = tcb->iss; 1312 tcb->snd.wl2 = tcb->iss; 1313 tcb->snd.una = tcb->iss; 1314 tcb->snd.rxt = tcb->iss; 1315 tcb->snd.ptr = tcb->rttseq; 1316 tcb->snd.nxt = tcb->rttseq; 1317 tcb->flgcnt++; 1318 tcb->flags |= FORCE; 1319 tcb->sndsyntime = NOW; 1320 1321 /* set desired mss and scale */ 1322 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); 1323 tpriv = s->p->priv; 1324 tpriv->stats[Mss] = tcb->mss; 1325 } 1326 1327 void 1328 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) 1329 { 1330 Block *hbp; 1331 uchar rflags; 1332 Tcppriv *tpriv; 1333 Tcp4hdr ph4; 1334 Tcp6hdr ph6; 1335 1336 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason); 1337 1338 tpriv = tcp->priv; 1339 1340 if(seg->flags & RST) 1341 return; 1342 1343 /* make pseudo header */ 1344 switch(version) { 1345 case V4: 1346 memset(&ph4, 0, sizeof(ph4)); 1347 ph4.vihl = IP_VER4; 1348 v6tov4(ph4.tcpsrc, dest); 1349 v6tov4(ph4.tcpdst, source); 1350 ph4.proto = IP_TCPPROTO; 1351 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1352 hnputs(ph4.tcpsport, seg->dest); 1353 hnputs(ph4.tcpdport, seg->source); 1354 break; 1355 case V6: 1356 memset(&ph6, 0, sizeof(ph6)); 1357 ph6.vcf[0] = IP_VER6; 1358 ipmove(ph6.tcpsrc, dest); 1359 ipmove(ph6.tcpdst, source); 1360 ph6.proto = IP_TCPPROTO; 1361 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1362 hnputs(ph6.tcpsport, seg->dest); 1363 hnputs(ph6.tcpdport, seg->source); 1364 break; 1365 default: 1366 panic("sndrst: version %d", version); 1367 } 1368 1369 tpriv->stats[OutRsts]++; 1370 rflags = RST; 1371 1372 /* convince the other end that this reset is in band */ 1373 if(seg->flags & ACK) { 1374 seg->seq = seg->ack; 1375 seg->ack = 0; 1376 } 1377 else { 1378 rflags |= ACK; 1379 seg->ack = seg->seq; 1380 seg->seq = 0; 1381 if(seg->flags & SYN) 1382 seg->ack++; 1383 
seg->ack += length; 1384 if(seg->flags & FIN) 1385 seg->ack++; 1386 } 1387 seg->flags = rflags; 1388 seg->wnd = 0; 1389 seg->urg = 0; 1390 seg->mss = 0; 1391 seg->ws = 0; 1392 switch(version) { 1393 case V4: 1394 hbp = htontcp4(seg, nil, &ph4, nil); 1395 if(hbp == nil) 1396 return; 1397 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1398 break; 1399 case V6: 1400 hbp = htontcp6(seg, nil, &ph6, nil); 1401 if(hbp == nil) 1402 return; 1403 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1404 break; 1405 default: 1406 panic("sndrst2: version %d", version); 1407 } 1408 } 1409 1410 /* 1411 * send a reset to the remote side and close the conversation 1412 * called with s qlocked 1413 */ 1414 static char* 1415 tcphangup(Conv *s) 1416 { 1417 Tcp seg; 1418 Tcpctl *tcb; 1419 Block *hbp; 1420 1421 tcb = (Tcpctl*)s->ptcl; 1422 if(waserror()) 1423 return commonerror(); 1424 if(ipcmp(s->raddr, IPnoaddr) != 0) { 1425 if(!waserror()){ 1426 memset(&seg, 0, sizeof seg); 1427 seg.flags = RST | ACK; 1428 seg.ack = tcb->rcv.nxt; 1429 tcb->rcv.ackptr = seg.ack; 1430 seg.seq = tcb->snd.ptr; 1431 seg.wnd = 0; 1432 seg.urg = 0; 1433 seg.mss = 0; 1434 seg.ws = 0; 1435 switch(s->ipversion) { 1436 case V4: 1437 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 1438 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); 1439 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 1440 break; 1441 case V6: 1442 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 1443 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); 1444 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 1445 break; 1446 default: 1447 panic("tcphangup: version %d", s->ipversion); 1448 } 1449 poperror(); 1450 } 1451 } 1452 localclose(s, nil); 1453 poperror(); 1454 return nil; 1455 } 1456 1457 /* 1458 * (re)send a SYN ACK 1459 */ 1460 static int 1461 sndsynack(Proto *tcp, Limbo *lp) 1462 { 1463 Block *hbp; 1464 Tcp4hdr ph4; 1465 Tcp6hdr ph6; 1466 Tcp seg; 1467 uint scale; 1468 1469 /* make pseudo header */ 1470 switch(lp->version) { 1471 case V4: 1472 memset(&ph4, 
0, sizeof(ph4)); 1473 ph4.vihl = IP_VER4; 1474 v6tov4(ph4.tcpsrc, lp->laddr); 1475 v6tov4(ph4.tcpdst, lp->raddr); 1476 ph4.proto = IP_TCPPROTO; 1477 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1478 hnputs(ph4.tcpsport, lp->lport); 1479 hnputs(ph4.tcpdport, lp->rport); 1480 break; 1481 case V6: 1482 memset(&ph6, 0, sizeof(ph6)); 1483 ph6.vcf[0] = IP_VER6; 1484 ipmove(ph6.tcpsrc, lp->laddr); 1485 ipmove(ph6.tcpdst, lp->raddr); 1486 ph6.proto = IP_TCPPROTO; 1487 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1488 hnputs(ph6.tcpsport, lp->lport); 1489 hnputs(ph6.tcpdport, lp->rport); 1490 break; 1491 default: 1492 panic("sndrst: version %d", lp->version); 1493 } 1494 1495 memset(&seg, 0, sizeof seg); 1496 seg.seq = lp->iss; 1497 seg.ack = lp->irs+1; 1498 seg.flags = SYN|ACK; 1499 seg.urg = 0; 1500 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); 1501 seg.wnd = QMAX; 1502 1503 /* if the other side set scale, we should too */ 1504 if(lp->rcvscale){ 1505 seg.ws = scale; 1506 lp->sndscale = scale; 1507 } else { 1508 seg.ws = 0; 1509 lp->sndscale = 0; 1510 } 1511 1512 switch(lp->version) { 1513 case V4: 1514 hbp = htontcp4(&seg, nil, &ph4, nil); 1515 if(hbp == nil) 1516 return -1; 1517 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1518 break; 1519 case V6: 1520 hbp = htontcp6(&seg, nil, &ph6, nil); 1521 if(hbp == nil) 1522 return -1; 1523 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1524 break; 1525 default: 1526 panic("sndsnack: version %d", lp->version); 1527 } 1528 lp->lastsend = NOW; 1529 return 0; 1530 } 1531 1532 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) 1533 1534 /* 1535 * put a call into limbo and respond with a SYN ACK 1536 * 1537 * called with proto locked 1538 */ 1539 static void 1540 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) 1541 { 1542 Limbo *lp, **l; 1543 Tcppriv *tpriv; 1544 int h; 1545 1546 tpriv = s->p->priv; 1547 h = hashipa(source, seg->source); 1548 1549 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 
1550 lp = *l; 1551 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) 1552 continue; 1553 if(ipcmp(lp->raddr, source) != 0) 1554 continue; 1555 if(ipcmp(lp->laddr, dest) != 0) 1556 continue; 1557 1558 /* each new SYN restarts the retransmits */ 1559 lp->irs = seg->seq; 1560 break; 1561 } 1562 lp = *l; 1563 if(lp == nil){ 1564 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ 1565 lp = tpriv->lht[h]; 1566 tpriv->lht[h] = lp->next; 1567 lp->next = nil; 1568 } else { 1569 lp = malloc(sizeof(*lp)); 1570 if(lp == nil) 1571 return; 1572 tpriv->nlimbo++; 1573 } 1574 *l = lp; 1575 lp->version = version; 1576 ipmove(lp->laddr, dest); 1577 ipmove(lp->raddr, source); 1578 lp->lport = seg->dest; 1579 lp->rport = seg->source; 1580 lp->mss = seg->mss; 1581 lp->rcvscale = seg->ws; 1582 lp->irs = seg->seq; 1583 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1584 } 1585 1586 if(sndsynack(s->p, lp) < 0){ 1587 *l = lp->next; 1588 tpriv->nlimbo--; 1589 free(lp); 1590 } 1591 } 1592 1593 /* 1594 * resend SYN ACK's once every SYNACK_RXTIMER ms. 
1595 */ 1596 static void 1597 limborexmit(Proto *tcp) 1598 { 1599 Tcppriv *tpriv; 1600 Limbo **l, *lp; 1601 int h; 1602 int seen; 1603 ulong now; 1604 1605 tpriv = tcp->priv; 1606 1607 if(!canqlock(tcp)) 1608 return; 1609 seen = 0; 1610 now = NOW; 1611 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ 1612 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ 1613 lp = *l; 1614 seen++; 1615 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) 1616 continue; 1617 1618 /* time it out after 1 second */ 1619 if(++(lp->rexmits) > 5){ 1620 tpriv->nlimbo--; 1621 *l = lp->next; 1622 free(lp); 1623 continue; 1624 } 1625 1626 /* if we're being attacked, don't bother resending SYN ACK's */ 1627 if(tpriv->nlimbo > 100) 1628 continue; 1629 1630 if(sndsynack(tcp, lp) < 0){ 1631 tpriv->nlimbo--; 1632 *l = lp->next; 1633 free(lp); 1634 continue; 1635 } 1636 1637 l = &lp->next; 1638 } 1639 } 1640 qunlock(tcp); 1641 } 1642 1643 /* 1644 * lookup call in limbo. if found, throw it out. 1645 * 1646 * called with proto locked 1647 */ 1648 static void 1649 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1650 { 1651 Limbo *lp, **l; 1652 int h; 1653 Tcppriv *tpriv; 1654 1655 tpriv = s->p->priv; 1656 1657 /* find a call in limbo */ 1658 h = hashipa(src, segp->source); 1659 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1660 lp = *l; 1661 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1662 continue; 1663 if(ipcmp(lp->laddr, dst) != 0) 1664 continue; 1665 if(ipcmp(lp->raddr, src) != 0) 1666 continue; 1667 1668 /* RST can only follow the SYN */ 1669 if(segp->seq == lp->irs+1){ 1670 tpriv->nlimbo--; 1671 *l = lp->next; 1672 free(lp); 1673 } 1674 break; 1675 } 1676 } 1677 1678 static void 1679 initialwindow(Tcpctl *tcb) 1680 { 1681 /* RFC 3390 initial window */ 1682 if(tcb->mss < 1095) 1683 tcb->cwind = 4*tcb->mss; 1684 else if(tcb->mss < 2190) 1685 tcb->cwind = 2*2190; 1686 else 1687 tcb->cwind = 2*tcb->mss; 1688 } 1689 1690 
/*
 *  come here when we finally get an ACK to our SYN-ACK.
 *  lookup call in limbo.  if found, create a new conversation
 *
 *  called with proto locked
 *
 *  Returns the new conversation, already moved to Established and
 *  entered in the conversation hash table, or nil if the segment is
 *  not a bare ACK, matches nothing in limbo, carries the wrong
 *  seq/ack, or no new conversation could be created.
 */
static Conv*
tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
{
	Conv *new;
	Tcpctl *tcb;
	Tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return nil;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
			src, segp->source, lp->raddr, lp->rport,
			dst, segp->dest, lp->laddr, lp->lport,
			version, lp->version
		);

		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
			continue;
		if(ipcmp(lp->laddr, dst) != 0)
			continue;
		if(ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
			/* wrong numbers: leave the entry in limbo and report no match */
			lp = nil;
		} else {
			/* unlink the entry; it is freed below once copied into the tcb */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if(lp == nil)
		return nil;

	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if(new == nil)
		return nil;

	/* start from a copy of the listener's control block */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl*)new->ptcl;
	tcb->flags &= ~CLONE;
	/* the copied timers still refer to the listener; retarget and stop them */
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs+1;
	tcb->rcv.wptr = tcb->rcv.nxt;
	tcb->rcv.wsnt = 0;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* our SYN has been acked, so the send pointers start at iss+1 */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss+1;
	tcb->snd.ptr = tcb->iss+1;
	tcb->snd.nxt = tcb->iss+1;
	tcb->snd.rxt = tcb->iss+1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if(lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tpriv->stats[Mss] = tcb->mss;
	}

	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);

	/* congestion window */
	tcb->snd.wnd = segp->wnd;
	initialwindow(tcb);

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
	tcpsynackrtt(new);

	free(lp);

	/* set up proto header */
	switch(version){
	case V4:
		h4 = &tcb->protohdr.tcp4hdr;
		memset(h4, 0, sizeof(*h4));
		h4->proto = IP_TCPPROTO;
		hnputs(h4->tcpsport, new->lport);
		hnputs(h4->tcpdport, new->rport);
		v6tov4(h4->tcpsrc, dst);
		v6tov4(h4->tcpdst, src);
		break;
	case V6:
		h6 = &tcb->protohdr.tcp6hdr;
		memset(h6, 0, sizeof(*h6));
		h6->proto = IP_TCPPROTO;
		hnputs(h6->tcpsport, new->lport);
		hnputs(h6->tcpdport, new->rport);
		ipmove(h6->tcpsrc, dst);
		ipmove(h6->tcpdst, src);
		break;
	default:
		panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	iphtadd(&tpriv->ht, new);

	return new;
}

/*
 *  Return 1 if x lies in the closed interval [low, high], where the
 *  interval may wrap around the end of the sequence space
 *  (low > high); 0 otherwise.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low <= high){
		if(low <= x && x <= high)
			return 1;
	}
	else {
		if(x >= low || x <= high)
			return 1;
	}
	return 0;
}

/*
 *  Sequence number comparisons: the difference x-y is taken in
 *  unsigned arithmetic and its sign, viewed as an int, gives the
 *  ordering.
 */
static int
seq_lt(ulong x, ulong y)
{
	return (int)(x-y) < 0;
}

static int
seq_le(ulong x, ulong y)
{
	return (int)(x-y) <= 0;
}

static int
seq_gt(ulong x, ulong y)
{
	return (int)(x-y) > 0;
}

static int
seq_ge(ulong x, ulong y)
{
	return (int)(x-y) >= 0;
}

/*
 *  use the time between the first SYN and it's ack as the
 *  initial round trip time
 *
 *  srtt and mdev are stored scaled by 1<<LOGAGAIN and 1<<LOGDGAIN.
 */
static void
tcpsynackrtt(Conv *s)
{
	Tcpctl *tcb;
	int delta;
	Tcppriv *tpriv;

	tcb = (Tcpctl*)s->ptcl;
	tpriv = s->p->priv;

	delta = NOW - tcb->sndsyntime;
	tcb->srtt = delta<<LOGAGAIN;
	tcb->mdev = delta<<LOGDGAIN;

	/* halt round trip timer */
	tcphalt(tpriv, &tcb->rtt_timer);
}

/*
 *  Process the acknowledgement and window fields of an incoming
 *  segment: fast retransmit/recovery, send window update, congestion
 *  window growth, round trip estimation, discarding acked data from
 *  the write queue and restarting or halting the retransmit timer.
 *
 *  seg->update is set on the first call so that calling this again
 *  for the same segment does nothing.
 */
static void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	Tcppriv *tpriv;

	if(seg->update)
		return;
	seg->update = 1;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/* catch zero-window updates, update window & recover */
	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
	   seq_lt(seg->ack, tcb->snd.ptr)){
		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
			seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
		tcb->snd.wnd = seg->wnd;
		goto recovery;
	}

	/* newreno fast retransmit */
	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
	   ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
recovery:
		if(tcb->snd.recovery){
			/* already recovering: inflate the window by one segment */
			tpriv->stats[RecoveryCwind]++;
			tcb->cwind += tcb->mss;
		}else if(seq_le(tcb->snd.rxt, seg->ack)){
			/* enter recovery; snd.rxt marks where recovery will end */
			tpriv->stats[Recovery]++;
			tcb->abcbytes = 0;
			tcb->snd.recovery = 1;
			tcb->snd.partialack = 0;
			tcb->snd.rxt = tcb->snd.nxt;
			tcpcongestion(tcb);
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
			tcprxmit(s);
		}else{
			tpriv->stats[RecoveryNoSeq]++;
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
			/* don't enter fast retransmit, don't change ssthresh */
		}
	}else if(tcb->snd.recovery){
		tpriv->stats[RecoveryCwind]++;
		tcb->cwind += tcb->mss;
	}

	/*
	 *  update window
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		/* clear dupack if we advance wl2 */
		if(tcb->snd.wl2 != seg->ack)
			tcb->snd.dupacks = 0;
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing newly acknowledged */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
			tcb->backedoff = MAXBACKMS/4;
		return;
	}

	/* Compute the new send window size */
	acked = seg->ack - tcb->snd.una;

	/* avoid slow start and timers for SYN acks */
	if((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* the SYN used one sequence number but no queued data */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/*
	 *  congestion control
	 */
	if(tcb->snd.recovery){
		if(seq_ge(seg->ack, tcb->snd.rxt)){
			/* recovery finished; deflate window */
			tpriv->stats[RecoveryDone]++;
			tcb->snd.dupacks = 0;
			tcb->snd.recovery = 0;
			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
			if(tcb->ssthresh < tcb->cwind)
				tcb->cwind = tcb->ssthresh;
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
				tcb->cwind, tcb->ssthresh);
		} else {
			/* partial ack; we lost more than one segment */
			tpriv->stats[RecoveryPA]++;
			if(tcb->cwind > acked)
				tcb->cwind -= acked;
			else{
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
				tcb->cwind = tcb->mss;
			}
			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);

			if(acked >= tcb->mss)
				tcb->cwind += tcb->mss;
			tcb->snd.partialack++;
		}
	} else
		tcpabcincr(tcb, acked);

	/* Adjust the timers according to the round trip time */
	/* TODO: fix sloppy treatment of overflow cases here. */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* only take a sample when the segment was not retransmitted */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* elapsed timer ticks, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1;	/* else all close sys's will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/*
	 * fewer bytes queued than acked: the remainder is charged
	 * against flgcnt (presumably an acked FIN -- confirm)
	 */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;
	tcb->snd.una = seg->ack;

	/* newreno fast recovery */
	if(tcb->snd.recovery)
		tcprxmit(s);

	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	if(tcb->snd.una != tcb->snd.nxt){
		/* `impatient' variant */
		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
			tcb->time = NOW;
			tcb->timeuna = tcb->snd.una;
			tcpgo(tpriv, &tcb->timer);
		}
	} else
		tcphalt(tpriv, &tcb->timer);	/* everything sent has been acked */

	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	if(!tcb->snd.recovery)
		tcb->flags &= ~RETRAN;
	tcb->backoff = 0;
	tcb->backedoff = 0;
}

/*
 *  Receive one TCP segment from IP.
 *
 *  bp starts at the IP (v4 or v6) header.  The checksum is verified,
 *  the header converted with ntohtcp4/ntohtcp6, the block trimmed to
 *  the length the datagram claims, and the segment is then run
 *  through the connection state machine.  The proto lock is held
 *  while looking up the conversation and handling listeners; the
 *  rest runs with the conversation locked.
 */
static void
tcpiput(Proto *tcp, Ipifc*, Block *bp)
{
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	ushort length, csum;
	uchar source[IPaddrlen], dest[IPaddrlen];
	Conv *s;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0)==IP_VER4) {
		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* turn the IP header into the pseudo header for the checksum */
		h4->Unused = 0;
		hnputs(h4->tcplen, length-TCP4_PKT);
		/* skipped when Btcpck is set on the block or the checksum field is zero */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}

		hdrlen = ntohtcp4(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen+TCP4_PKT;
		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}
	else {
		/* saved so the header can be restored after the checksum */
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/* rearrange the IPv6 header into pseudo header form */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp,
			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
			return;
		}
	}

	/* lock protocol while searching for a conversation */
	qlock(tcp);

	/* Look for a matching conversation */
	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
	if(s == nil){
		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
			source, seg.source, dest, seg.dest);
reset:
		qunlock(tcp);
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state == Listen){
		if(seg.flags & RST){
			limborst(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
			limbo(s, source, dest, &seg, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/*
		 *  if there's a matching call in limbo, tcpincoming will
		 *  return a new conversation for it, already in state
		 *  Established
		 */
		s = tcpincoming(s, &seg, source, dest, version);
		if(s == nil)
			goto reset;
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl*)s->ptcl;
	if(waserror()){
		qunlock(s);
		nexterror();
	}
	/* take the conversation lock before letting go of the proto lock */
	qlock(s);
	qunlock(tcp);

	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch(tcb->state) {
	case Closed:
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	case Syn_sent:
		if(seg.flags & ACK) {
			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
				sndrst(tcp, source, dest, length, &seg, version,
					 "bad seq in Syn_sent");
				goto raise;
			}
		}
		if(seg.flags & RST) {
			if(seg.flags & ACK)
				localclose(s, Econrefused);
			goto raise;
		}

		if(seg.flags & SYN) {
			procsyn(s, &seg);
			if(seg.flags & ACK){
				update(s, &seg);
				tcpsynackrtt(s);
				tcpsetstate(s, Established);
				tcpsetscale(s, tcb, seg.ws, tcb->scale);
			}
			else {
				/* SYN without ACK */
				tcb->time = NOW;
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
			}

			/* data or FIN came with the SYN: process it below */
			if(length != 0 || (seg.flags & FIN))
				break;

			freeblist(bp);
			goto output;
		}
		else
			freeblist(bp);

		qunlock(s);
		poperror();
		return;
	case Syn_received:
		/* doesn't matter if it's the correct ack, we're just trying to set timing */
		if(seg.flags & ACK)
			tcpsynackrtt(s);
		break;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
		if(tcpporthogdefense
		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
				source, seg.source, dest, seg.dest, seg.flags,
				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	tcprcvwin(s);
	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* the one case not logged: a single byte just before rcv.nxt */
		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
			netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
				"%lud-%lud l %d from %I\n", seg.seq,
				seg.seq + length - 1, tcb->rcv.nxt,
				tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
		/* the ack field is still processed */
		update(s, &seg);
		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2*(1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		if(!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(s);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if(length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if(seg.seq != tcb->rcv.nxt)
	if(length != 0 || (seg.flags & (SYN|FIN))) {
		update(s, &seg);
		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
				s->laddr, s->lport);
		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
		goto output;
	}

	if(tcb->nreseq > 0)
		tcb->flags |= FORCE;	/* filled hole in seq. space; RFC 5681 §3.2 */

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for(;;) {
		if(seg.flags & RST) {
			if(tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if(tcb->rcv.nxt != seg.seq)
					print("out of order RST rcvd: %I.%d -> "
						"%I.%d, rcv.nxt %lux seq %lux\n",
						s->raddr, s->rport, s->laddr,
						s->lport, tcb->rcv.nxt, seg.seq);
			}
			localclose(s, Econrefused);
			goto raise;
		}

		/* from here on a segment without ACK is dropped */
		if((seg.flags&ACK) == 0)
			goto raise;

		switch(tcb->state) {
		case Syn_received:
			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
				sndrst(tcp, source, dest, length, &seg, version,
					"bad seq in Syn_received");
				goto raise;
			}
			update(s, &seg);
			tcpsetstate(s, Established);
			/* falls through; the second update() returns at once (seg.update is set) */
		case Established:
		case Close_wait:
			update(s, &seg);
			break;
		case Finwait1:
			update(s, &seg);
			/* nothing left unacked, so our FIN was acked */
			if(qlen(s->wq)+tcb->flgcnt == 0){
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcpsetkacounter(tcb);
				tcb->time = NOW;
				tcpsetstate(s, Finwait2);
				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
				tcpgo(tpriv, &tcb->katimer);
			}
			break;
		case Finwait2:
			update(s, &seg);
			break;
		case Closing:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->timer);
			}
			break;
		case Last_ack:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				localclose(s, nil);
				goto raise;
			}
			/* falls through */
		case Time_wait:
			tcb->flags |= FORCE;
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);
		}

		if((seg.flags&URG) && seg.urg) {
			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		}
		else
			if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
				tcb->rcv.urg = tcb->rcv.nxt;

		if(length == 0) {
			if(bp != nil)
				freeblist(bp);
		}
		else {
			switch(tcb->state){
			default:
				/* Ignore segment text */
				if(bp != nil)
					freeblist(bp);
				break;

			case Syn_received:
			case Established:
			case Finwait1:
				/* If we still have some data place on
				 * receive queue
				 */
				if(bp) {
					bp = packblock(bp);
					if(bp == nil)
						panic("tcp packblock");
					qpassnolim(s->rq, bp);
					bp = nil;
				}
				tcb->rcv.nxt += length;

				/*
				 *  turn on the acktimer if there's something
				 *  to ack
				 */
				if(tcb->acktimer.state != TcptimerON)
					tcpgo(tpriv, &tcb->acktimer);

				break;
			case Finwait2:
				/* no process to read the data, send a reset */
				if(bp != nil)
					freeblist(bp);
				sndrst(tcp, source, dest, length, &seg, version,
					"send to Finwait2");
				qunlock(s);
				poperror();
				return;
			}
		}

		if(seg.flags & FIN) {
			tcb->flags |= FORCE;

			switch(tcb->state) {
			case Syn_received:
			case Established:
				/* the FIN takes up one sequence number */
				tcb->rcv.nxt++;
				tcpsetstate(s, Close_wait);
				break;
			case Finwait1:
				tcb->rcv.nxt++;
				if(qlen(s->wq)+tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2*(1000/MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				else
					tcpsetstate(s, Closing);
				break;
			case Finwait2:
				tcb->rcv.nxt++;
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2 * (1000/MSPTICK);
				tcpgo(tpriv, &tcb->timer);
				break;
			case Close_wait:
			case Closing:
			case Last_ack:
				break;
			case Time_wait:
				tcpgo(tpriv, &tcb->timer);
				break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for(;;) {
			if(tcb->reseq == nil)
				goto output;

			/* next queued segment starts beyond rcv.nxt: still a hole */
			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			tcprcvwin(s);
			if(tcptrim(tcb, &seg, &bp, &length) == 0){
				tcb->flags |= FORCE;
				break;
			}
		}
	}
output:
	tcpoutput(s);
	qunlock(s);
	poperror();
	return;
raise:
	qunlock(s);
	poperror();
	freeblist(bp);
	tcpkick(s);
}

/*
 *  always enters and exits with the s locked.  We drop
 *  the lock to ipoput the packet so some care has to be
 *  taken by callers.
 *
 *  Sends as many segments as the windows allow, at most 100 per call.
 */
static void
tcpoutput(Conv *s)
{
	Tcp seg;
	uint msgs;
	Tcpctl *tcb;
	Block *hbp, *bp;
	int sndcnt;
	ulong ssize, dsize, sent;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	tcb = (Tcpctl*)s->ptcl;

	/* force ack every 2*mss */
	if((tcb->flags & FORCE) == 0 &&
	   tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
		tpriv->stats[Delayack]++;
		tcb->flags |= FORCE;
	}

	/* force ack if window opening */
	if((tcb->flags & FORCE) == 0){
		tcprcvwin(s);
		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
			tpriv->stats[Wopenack]++;
			tcb->flags |= FORCE;
		}
	}

	for(msgs = 0; msgs < 100; msgs++) {
		switch(tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* Don't send anything else until our SYN has been acked */
		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* force an ack when a window has opened up */
		tcprcvwin(s);
		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/*
		 * sndcnt: bytes queued plus flag count;
		 * sent: sequence space already sent but not yet acked
		 */
		sndcnt = qlen(s->wq)+tcb->flgcnt;
		sent = tcb->snd.ptr - tcb->snd.una;
		ssize = sndcnt;
		if(tcb->snd.wnd == 0){
			/* zero window probe */
			if(sent > 0 && !(tcb->flags & FORCE))
				break;	/* already probing, rto re-probes */
			if(ssize < sent)
				ssize = 0;
			else{
				ssize -= sent;
				/* a probe carries at most one byte */
				if(ssize > 0)
					ssize = 1;
			}
		} else {
			/* calculate usable segment size */
			if(ssize > tcb->cwind)
				ssize = tcb->cwind;
			if(ssize > tcb->snd.wnd)
				ssize = tcb->snd.wnd;

			if(ssize < sent)
				ssize = 0;
			else {
				ssize -= sent;
				if(ssize > tcb->mss)
					ssize = tcb->mss;
			}
		}

		dsize = ssize;
		seg.urg = 0;

		/*
		 * unless forced, stop when there is nothing to send, or
		 * when only a short segment is available while more than
		 * TCPREXMTTHRESH segments are still unacked
		 */
		if(!(tcb->flags & FORCE))
			if(ssize == 0 ||
			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
			    sent > TCPREXMTTHRESH * tcb->mss)
				break;

		tcb->flags &= ~FORCE;

		/* By default we will generate an ack */
		tcphalt(tpriv, &tcb->acktimer);
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		seg.update = 0;
		switch(tcb->state){
		case Syn_sent:
			seg.flags = 0;
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				/* the SYN accounts for one of the sequence numbers */
				dsize--;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		case Syn_received:
			/*
			 *  don't send any data with a SYN/ACK packet
			 *  because Linux rejects the packet in its
			 *  attempt to solve the SYN attack problem
			 */
			if(tcb->snd.ptr == tcb->iss){
				seg.flags |= SYN;
				dsize = 0;
				ssize = 1;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			}
			break;
		}
		seg.seq = tcb->snd.ptr;
		seg.ack = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd;

		/* Pull out data to send */
		bp = nil;
		if(dsize != 0) {
			bp = qcopy(s->wq, dsize, sent);
			/*
			 * the queue gave fewer bytes than asked for;
			 * presumably the shortfall is the FIN counted in
			 * flgcnt -- confirm
			 */
			if(BLEN(bp) != dsize) {
				seg.flags |= FIN;
				dsize--;
			}
		}

		/* this segment empties the queue: push it */
		if(sent+dsize == sndcnt && dsize)
			seg.flags |= PSH;

		tcb->snd.ptr += ssize;

		/* Pull up the send pointer so we can accept acks
		 * for this window
		 */
		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
			tcb->snd.nxt = tcb->snd.ptr;

		/* Build header, link data and compute cksum */
		switch(version){
		case V4:
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		case V6:
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
			if(hbp == nil) {
				freeblist(bp);
				return;
			}
			break;
		default:
			hbp = nil;	/* to suppress a warning */
			panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if(ssize != 0){
			if(tcb->timer.state != TcptimerON){
				tcb->time = NOW;
				tcb->timeuna = tcb->snd.una;
				tcpgo(tpriv, &tcb->timer);
			}

			/*  If round trip timer isn't running, start it.
			 *  measure the longest packet only in case the
			 *  transmission time dominates RTT
			 */
			if(tcb->snd.retransmit == 0)
			if(tcb->rtt_timer.state != TcptimerON)
			if(ssize == tcb->mss) {
				tcpgo(tpriv, &tcb->rtt_timer);
				tcb->rttseq = tcb->snd.ptr;
			}
		}

		tpriv->stats[OutSegs]++;
		if(tcb->snd.retransmit)
			tpriv->stats[RetransSegsSent]++;
		/* remember what was acked and advertised in this segment */
		tcb->rcv.ackptr = seg.ack;
		tcb->rcv.wsnt = tcb->rcv.wptr;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch(version){
		case V4:
			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		case V6:
			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		default:
			panic("tcpoutput2: version %d", version);
		}
		/* every fourth segment, drop and retake the lock so waiters can run */
		if((msgs%4) == 3){
			qunlock(s);
			qlock(s);
		}
	}
}

/*
 *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2780 */ 2781 static void 2782 tcpsendka(Conv *s) 2783 { 2784 Tcp seg; 2785 Tcpctl *tcb; 2786 Block *hbp,*dbp; 2787 2788 tcb = (Tcpctl*)s->ptcl; 2789 2790 dbp = nil; 2791 memset(&seg, 0, sizeof seg); 2792 seg.urg = 0; 2793 seg.source = s->lport; 2794 seg.dest = s->rport; 2795 seg.flags = ACK|PSH; 2796 seg.mss = 0; 2797 seg.ws = 0; 2798 if(tcpporthogdefense) 2799 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); 2800 else 2801 seg.seq = tcb->snd.una-1; 2802 seg.ack = tcb->rcv.nxt; 2803 tcb->rcv.ackptr = seg.ack; 2804 tcprcvwin(s); 2805 seg.wnd = tcb->rcv.wnd; 2806 if(tcb->state == Finwait2){ 2807 seg.flags |= FIN; 2808 } else { 2809 dbp = allocb(1); 2810 dbp->wp++; 2811 } 2812 2813 if(isv4(s->raddr)) { 2814 /* Build header, link data and compute cksum */ 2815 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2816 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); 2817 if(hbp == nil) { 2818 freeblist(dbp); 2819 return; 2820 } 2821 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 2822 } 2823 else { 2824 /* Build header, link data and compute cksum */ 2825 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2826 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); 2827 if(hbp == nil) { 2828 freeblist(dbp); 2829 return; 2830 } 2831 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 2832 } 2833 } 2834 2835 /* 2836 * set connection to time out after 12 minutes 2837 */ 2838 static void 2839 tcpsetkacounter(Tcpctl *tcb) 2840 { 2841 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); 2842 if(tcb->kacounter < 3) 2843 tcb->kacounter = 3; 2844 } 2845 2846 /* 2847 * if we've timed out, close the connection 2848 * otherwise, send a keepalive and restart the timer 2849 */ 2850 static void 2851 tcpkeepalive(void *v) 2852 { 2853 Tcpctl *tcb; 2854 Conv *s; 2855 2856 s = v; 2857 tcb = (Tcpctl*)s->ptcl; 2858 if(waserror()){ 2859 qunlock(s); 2860 nexterror(); 2861 } 2862 qlock(s); 2863 if(tcb->state != Closed){ 2864 if(--(tcb->kacounter) <= 0) { 2865 localclose(s, Etimedout); 2866 } else { 2867 
tcpsendka(s); 2868 tcpgo(s->p->priv, &tcb->katimer); 2869 } 2870 } 2871 qunlock(s); 2872 poperror(); 2873 } 2874 2875 /* 2876 * start keepalive timer 2877 */ 2878 static char* 2879 tcpstartka(Conv *s, char **f, int n) 2880 { 2881 Tcpctl *tcb; 2882 int x; 2883 2884 tcb = (Tcpctl*)s->ptcl; 2885 if(tcb->state != Established) 2886 return "connection must be in Establised state"; 2887 if(n > 1){ 2888 x = atoi(f[1]); 2889 if(x >= MSPTICK) 2890 tcb->katimer.start = x/MSPTICK; 2891 } 2892 tcpsetkacounter(tcb); 2893 tcpgo(s->p->priv, &tcb->katimer); 2894 2895 return nil; 2896 } 2897 2898 /* 2899 * turn checksums on/off 2900 */ 2901 static char* 2902 tcpsetchecksum(Conv *s, char **f, int) 2903 { 2904 Tcpctl *tcb; 2905 2906 tcb = (Tcpctl*)s->ptcl; 2907 tcb->nochecksum = !atoi(f[1]); 2908 2909 return nil; 2910 } 2911 2912 /* 2913 * retransmit (at most) one segment at snd.una. 2914 * preserve cwind & snd.ptr 2915 */ 2916 static void 2917 tcprxmit(Conv *s) 2918 { 2919 Tcpctl *tcb; 2920 Tcppriv *tpriv; 2921 ulong tcwind, tptr; 2922 2923 tcb = (Tcpctl*)s->ptcl; 2924 tcb->flags |= RETRAN|FORCE; 2925 2926 tptr = tcb->snd.ptr; 2927 tcwind = tcb->cwind; 2928 tcb->snd.ptr = tcb->snd.una; 2929 tcb->cwind = tcb->mss; 2930 tcb->snd.retransmit = 1; 2931 tcpoutput(s); 2932 tcb->snd.retransmit = 0; 2933 tcb->cwind = tcwind; 2934 tcb->snd.ptr = tptr; 2935 2936 tpriv = s->p->priv; 2937 tpriv->stats[RetransSegs]++; 2938 } 2939 2940 /* 2941 * TODO: RFC 4138 F-RTO 2942 */ 2943 static void 2944 tcptimeout(void *arg) 2945 { 2946 Conv *s; 2947 Tcpctl *tcb; 2948 int maxback; 2949 Tcppriv *tpriv; 2950 2951 s = (Conv*)arg; 2952 tpriv = s->p->priv; 2953 tcb = (Tcpctl*)s->ptcl; 2954 2955 if(waserror()){ 2956 qunlock(s); 2957 nexterror(); 2958 } 2959 qlock(s); 2960 switch(tcb->state){ 2961 default: 2962 tcb->backoff++; 2963 if(tcb->state == Syn_sent) 2964 maxback = MAXBACKMS/2; 2965 else 2966 maxback = MAXBACKMS; 2967 tcb->backedoff += tcb->timer.start * MSPTICK; 2968 if(tcb->backedoff >= maxback) { 2969 
localclose(s, Etimedout); 2970 break; 2971 } 2972 netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n", 2973 tcb->srtt, tcb->mdev, NOW - tcb->time, 2974 tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr, 2975 tcpstates[s->state]); 2976 tcpsettimer(tcb); 2977 if(tcb->snd.rto == 0) 2978 tcpcongestion(tcb); 2979 tcprxmit(s); 2980 tcb->snd.ptr = tcb->snd.una; 2981 tcb->cwind = tcb->mss; 2982 tcb->snd.rto = 1; 2983 tpriv->stats[RetransTimeouts]++; 2984 2985 if(tcb->snd.recovery){ 2986 tcb->snd.dupacks = 0; /* reno rto */ 2987 tcb->snd.recovery = 0; 2988 tpriv->stats[RecoveryRTO]++; 2989 tcb->snd.rxt = tcb->snd.nxt; 2990 netlog(s->p->f, Logtcpwin, 2991 "rto recovery rxt @%lud\n", tcb->snd.nxt); 2992 } 2993 2994 tcb->abcbytes = 0; 2995 break; 2996 case Time_wait: 2997 localclose(s, nil); 2998 break; 2999 case Closed: 3000 break; 3001 } 3002 qunlock(s); 3003 poperror(); 3004 } 3005 3006 static int 3007 inwindow(Tcpctl *tcb, int seq) 3008 { 3009 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); 3010 } 3011 3012 /* 3013 * set up state for a received SYN (or SYN ACK) packet 3014 */ 3015 static void 3016 procsyn(Conv *s, Tcp *seg) 3017 { 3018 Tcpctl *tcb; 3019 Tcppriv *tpriv; 3020 3021 tcb = (Tcpctl*)s->ptcl; 3022 tcb->flags |= FORCE; 3023 3024 tcb->rcv.nxt = seg->seq + 1; 3025 tcb->rcv.wptr = tcb->rcv.nxt; 3026 tcb->rcv.wsnt = 0; 3027 tcb->rcv.urg = tcb->rcv.nxt; 3028 tcb->irs = seg->seq; 3029 3030 /* our sending max segment size cannot be bigger than what he asked for */ 3031 if(seg->mss != 0 && seg->mss < tcb->mss) { 3032 tcb->mss = seg->mss; 3033 tpriv = s->p->priv; 3034 tpriv->stats[Mss] = tcb->mss; 3035 } 3036 3037 tcb->snd.wnd = seg->wnd; 3038 initialwindow(tcb); 3039 } 3040 3041 static int 3042 dumpreseq(Tcpctl *tcb) 3043 { 3044 Reseq *r, *next; 3045 3046 for(r = tcb->reseq; r != nil; r = next){ 3047 next = r->next; 3048 freeblist(r->bp); 3049 free(r); 3050 } 3051 tcb->reseq = nil; 3052 tcb->nreseq = 0; 3053 tcb->reseqlen = 0; 3054 
return -1; 3055 } 3056 3057 static void 3058 logreseq(Fs *f, Reseq *r, ulong n) 3059 { 3060 char *s; 3061 3062 for(; r != nil; r = r->next){ 3063 s = nil; 3064 if(r->next == nil && r->seg.seq != n) 3065 s = "hole/end"; 3066 else if(r->next == nil) 3067 s = "end"; 3068 else if(r->seg.seq != n) 3069 s = "hole"; 3070 if(s != nil) 3071 netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s, 3072 n, r->seg.seq, r->seg.seq - n, r->seg.flags); 3073 n = r->seg.seq + r->seg.len; 3074 } 3075 } 3076 3077 static int 3078 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) 3079 { 3080 Reseq *rp, **rr; 3081 int qmax; 3082 3083 rp = malloc(sizeof *rp); 3084 if(rp == nil){ 3085 freeblist(bp); /* bp always consumed by addreseq */ 3086 return 0; 3087 } 3088 3089 rp->seg = *seg; 3090 rp->bp = bp; 3091 rp->length = length; 3092 3093 tcb->reseqlen += length; 3094 tcb->nreseq++; 3095 3096 /* Place on reassembly list sorting by starting seq number */ 3097 for(rr = &tcb->reseq; ; rr = &(*rr)->next) 3098 if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){ 3099 rp->next = *rr; 3100 *rr = rp; 3101 tpriv->stats[Resequenced]++; 3102 if(rp->next != nil) 3103 tpriv->stats[OutOfOrder]++; 3104 break; 3105 } 3106 3107 qmax = tcb->window; 3108 if(tcb->reseqlen > qmax){ 3109 netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", 3110 tcb->reseqlen, qmax, tcb->nreseq); 3111 logreseq(f, tcb->reseq, tcb->rcv.nxt); 3112 tpriv->stats[ReseqBytelim]++; 3113 return dumpreseq(tcb); 3114 } 3115 qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */ 3116 if(tcb->nreseq > qmax){ 3117 netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", 3118 tcb->nreseq, qmax, tcb->reseqlen); 3119 logreseq(f, tcb->reseq, tcb->rcv.nxt); 3120 tpriv->stats[ReseqPktlim]++; 3121 return dumpreseq(tcb); 3122 } 3123 return 0; 3124 } 3125 3126 static void 3127 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 3128 { 3129 Reseq *rp; 3130 3131 rp = tcb->reseq; 
if(rp == nil)
		return;

	tcb->reseq = rp->next;

	*seg = rp->seg;
	*bp = rp->bp;
	*length = rp->length;

	/* keep the queue counters in step with the entry just removed */
	tcb->nreseq--;
	tcb->reseqlen -= rp->length;

	free(rp);
}

/*
 *  decide whether a received segment overlaps the receive window and,
 *  if it does, cut it down to the part inside the window.
 *
 *  length (in/out) counts data bytes; SYN and FIN each add one to the
 *  sequence space considered for the window test.
 *
 *  returns 0 when the segment is usable: bp, length and seg have been
 *  adjusted, except for an empty segment at rcv.nxt against a zero
 *  window, which is returned untouched.
 *  returns -1 when no part is acceptable; the data has been freed.
 */
static int
tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
	ushort len;
	uchar accept;
	int dupcnt, excess;

	accept = 0;
	len = *length;
	if(seg->flags & SYN)
		len++;
	if(seg->flags & FIN)
		len++;

	if(tcb->rcv.wnd == 0) {
		/* closed window: only an empty segment exactly at rcv.nxt gets through */
		if(len == 0 && seg->seq == tcb->rcv.nxt)
			return 0;
	}
	else {
		/* Some part of the segment should be in the window */
		if(inwindow(tcb,seg->seq))
			accept++;
		else
		if(len != 0) {
			/* the last unit is in the window, or the segment spans rcv.nxt */
			if(inwindow(tcb, seg->seq+len-1) ||
			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
				accept++;
		}
	}
	if(!accept) {
		freeblist(*bp);
		return -1;
	}

	/* drop whatever lies before rcv.nxt: it has been received already */
	dupcnt = tcb->rcv.nxt - seg->seq;
	if(dupcnt > 0){
		tcb->rerecv += dupcnt;
		if(seg->flags & SYN){
			/* the SYN takes the first duplicated sequence number */
			seg->flags &= ~SYN;
			seg->seq++;

			if(seg->urg > 1)
				seg->urg--;
			else
				seg->flags &= ~URG;
			dupcnt--;
		}
		if(dupcnt > 0){
			pullblock(bp, (ushort)dupcnt);
			seg->seq += dupcnt;
			*length -= dupcnt;

			if(seg->urg > dupcnt)
				seg->urg -= dupcnt;
			else {
				seg->flags &= ~URG;
				seg->urg = 0;
			}
		}
	}

	/* drop whatever extends past the right edge of the window, and any FIN with it */
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
	if(excess > 0) {
		tcb->rerecv += excess;
		*length -= excess;
		*bp = trimblock(*bp, 0, *length);
		if(*bp == nil)
			panic("presotto is a boofhead");
		seg->flags &= ~FIN;
	}
	return 0;
}

/*
 *  bp starts with a TCP/IP header, v4 or v6 as told by its version
 *  nibble.  the header's source address/port are matched against a
 *  conversation's local side and its destination against the remote
 *  side; if the first open conversation that matches is in Syn_sent
 *  it is closed with msg.  bp is always freed.
 */
static void
tcpadvise(Proto *tcp, Block *bp, char *msg)
{
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Tcpctl *tcb;
	uchar source[IPaddrlen];
	uchar dest[IPaddrlen];
	ushort psource, pdest;
	Conv *s, **p;

	h4 = (Tcp4hdr*)(bp->rp);
h6 = (Tcp6hdr*)(bp->rp); 3231 3232 if((h4->vihl&0xF0)==IP_VER4) { 3233 v4tov6(dest, h4->tcpdst); 3234 v4tov6(source, h4->tcpsrc); 3235 psource = nhgets(h4->tcpsport); 3236 pdest = nhgets(h4->tcpdport); 3237 } 3238 else { 3239 ipmove(dest, h6->tcpdst); 3240 ipmove(source, h6->tcpsrc); 3241 psource = nhgets(h6->tcpsport); 3242 pdest = nhgets(h6->tcpdport); 3243 } 3244 3245 /* Look for a connection */ 3246 qlock(tcp); 3247 for(p = tcp->conv; *p; p++) { 3248 s = *p; 3249 tcb = (Tcpctl*)s->ptcl; 3250 if(s->rport == pdest) 3251 if(s->lport == psource) 3252 if(tcb->state != Closed) 3253 if(ipcmp(s->raddr, dest) == 0) 3254 if(ipcmp(s->laddr, source) == 0){ 3255 qlock(s); 3256 qunlock(tcp); 3257 switch(tcb->state){ 3258 case Syn_sent: 3259 localclose(s, msg); 3260 break; 3261 } 3262 qunlock(s); 3263 freeblist(bp); 3264 return; 3265 } 3266 } 3267 qunlock(tcp); 3268 freeblist(bp); 3269 } 3270 3271 static char* 3272 tcpporthogdefensectl(char *val) 3273 { 3274 if(strcmp(val, "on") == 0) 3275 tcpporthogdefense = 1; 3276 else if(strcmp(val, "off") == 0) 3277 tcpporthogdefense = 0; 3278 else 3279 return "unknown value for tcpporthogdefense"; 3280 return nil; 3281 } 3282 3283 /* called with c qlocked */ 3284 static char* 3285 tcpctl(Conv* c, char** f, int n) 3286 { 3287 if(n == 1 && strcmp(f[0], "hangup") == 0) 3288 return tcphangup(c); 3289 if(n >= 1 && strcmp(f[0], "keepalive") == 0) 3290 return tcpstartka(c, f, n); 3291 if(n >= 1 && strcmp(f[0], "checksum") == 0) 3292 return tcpsetchecksum(c, f, n); 3293 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) 3294 return tcpporthogdefensectl(f[1]); 3295 return "unknown control request"; 3296 } 3297 3298 static int 3299 tcpstats(Proto *tcp, char *buf, int len) 3300 { 3301 Tcppriv *priv; 3302 char *p, *e; 3303 int i; 3304 3305 priv = tcp->priv; 3306 p = buf; 3307 e = p+len; 3308 for(i = 0; i < Nstats; i++) 3309 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]); 3310 return p - buf; 3311 } 3312 3313 /* 3314 * garbage 
collect any stale conversations: 3315 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) 3316 * - Finwait2 after 5 minutes 3317 * 3318 * this is called whenever we run out of channels. Both checks are 3319 * of questionable validity so we try to use them only when we're 3320 * up against the wall. 3321 */ 3322 static int 3323 tcpgc(Proto *tcp) 3324 { 3325 Conv *c, **pp, **ep; 3326 int n; 3327 Tcpctl *tcb; 3328 3329 3330 n = 0; 3331 ep = &tcp->conv[tcp->nc]; 3332 for(pp = tcp->conv; pp < ep; pp++) { 3333 c = *pp; 3334 if(c == nil) 3335 break; 3336 if(!canqlock(c)) 3337 continue; 3338 tcb = (Tcpctl*)c->ptcl; 3339 switch(tcb->state){ 3340 case Syn_received: 3341 if(NOW - tcb->time > 5000){ 3342 localclose(c, Etimedout); 3343 n++; 3344 } 3345 break; 3346 case Finwait2: 3347 if(NOW - tcb->time > 5*60*1000){ 3348 localclose(c, Etimedout); 3349 n++; 3350 } 3351 break; 3352 } 3353 qunlock(c); 3354 } 3355 return n; 3356 } 3357 3358 static void 3359 tcpsettimer(Tcpctl *tcb) 3360 { 3361 int x; 3362 3363 /* round trip dependency */ 3364 x = backoff(tcb->backoff) * 3365 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; 3366 3367 /* bounded twixt 0.3 and 64 seconds */ 3368 if(x < 300/MSPTICK) 3369 x = 300/MSPTICK; 3370 else if(x > (64000/MSPTICK)) 3371 x = 64000/MSPTICK; 3372 tcb->timer.start = x; 3373 } 3374 3375 void 3376 tcpinit(Fs *fs) 3377 { 3378 Proto *tcp; 3379 Tcppriv *tpriv; 3380 3381 tcp = smalloc(sizeof(Proto)); 3382 tpriv = tcp->priv = smalloc(sizeof(Tcppriv)); 3383 tcp->name = "tcp"; 3384 tcp->connect = tcpconnect; 3385 tcp->announce = tcpannounce; 3386 tcp->ctl = tcpctl; 3387 tcp->state = tcpstate; 3388 tcp->create = tcpcreate; 3389 tcp->close = tcpclose; 3390 tcp->rcv = tcpiput; 3391 tcp->advise = tcpadvise; 3392 tcp->stats = tcpstats; 3393 tcp->inuse = tcpinuse; 3394 tcp->gc = tcpgc; 3395 tcp->ipproto = IP_TCPPROTO; 3396 tcp->nc = scalednconv(); 3397 tcp->ptclsize = sizeof(Tcpctl); 3398 tpriv->stats[MaxConn] = tcp->nc; 3399 3400 
Fsproto(fs, tcp); 3401 } 3402 3403 static void 3404 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) 3405 { 3406 /* 3407 * guess at reasonable queue sizes. there's no current way 3408 * to know how many nic receive buffers we can safely tie up in the 3409 * tcp stack, and we don't adjust our queues to maximize throughput 3410 * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be 3411 * respected, but we still control our own buffer commitment by 3412 * keeping a seperate qscale. 3413 */ 3414 tcb->rcv.scale = rcvscale & 0xff; 3415 tcb->snd.scale = sndscale & 0xff; 3416 tcb->qscale = rcvscale & 0xff; 3417 if(rcvscale > Maxqscale) 3418 tcb->qscale = Maxqscale; 3419 3420 if(rcvscale != tcb->rcv.scale) 3421 netlog(s->p->f, Logtcp, "tcpsetscale: window %lud " 3422 "qlen %d >> window %ud lport %d\n", 3423 tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport); 3424 tcb->window = QMAX << tcb->qscale; 3425 tcb->ssthresh = tcb->window; 3426 3427 /* 3428 * it's important to set wq large enough to cover the full 3429 * bandwidth-delay product. it's possible to be in loss 3430 * recovery with a big window, and we need to keep sending 3431 * into the inflated window. the difference can be huge 3432 * for even modest (70ms) ping times. 3433 */ 3434 qsetlimit(s->rq, tcb->window); 3435 qsetlimit(s->wq, tcb->window); 3436 tcprcvwin(s); 3437 } 3438