/*	$NetBSD: tcp_input.c,v 1.33 1997/10/10 01:51:07 explorer Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.5 (Berkeley) 4/10/94
 */

/*
 * TODO list for SYN cache stuff:
 *
 * (a) The definition of "struct syn_cache" says:
 *
 *		This structure should not exceed 32 bytes.
 *
 *     but it's 40 bytes on the Alpha.  We can reduce memory use in one
 *     of two ways:
 *
 *	(1) Use a dynamically-sized hash table, and handle
 *	    collisions by rehashing.  Then sc_next is unnecessary.
 *
 *	(2) Allocate syn_cache structures in pages (or some other
 *	    large chunk).  This would probably be desirable for
 *	    maintaining locality of reference anyway.
 *
 *	    If you do this, you can change sc_next to a page/index
 *	    value, and make it a 32-bit (or maybe even 16-bit)
 *	    integer, thus partly obviating the need for the previous
 *	    hack.
 *
 *     It's also worth noting that this is necessary for IPv6 as well,
 *     where we use 32 bytes just for the IP addresses, so eliminating
 *     wastage is going to become more important.  (BTW, has anyone
 *     integrated these changes with one of the IPv6 stacks that are
 *     available?)
 *
 * (b) Find room for a "state" field, which is needed to keep a
 *     compressed state for TIME_WAIT TCBs.  It's been noted already
 *     that this is fairly important for very high-volume web and
 *     mail servers, which use a large number of short-lived
 *     connections.
 */

#ifndef TUBA_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#include <machine/stdarg.h>

int	tcprexmtthresh = 3;
struct	tcpiphdr tcp_saveti;

extern u_long sb_max;

#endif /* TUBA_INCLUDE */
#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
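/*
 * TCP_PAWS_IDLE is 24 days expressed in PR_SLOWHZ (2/sec) ticks:
 * 24 days * 24 hours * 60 minutes * 60 seconds * 2 ticks.
 *
 * The casts to int make the timestamp comparisons work across 32-bit
 * wraparound.  For example, TSTMP_GEQ(0x00000002, 0xfffffffe) is true,
 * because (int)(0x00000002 - 0xfffffffe) == 4, even though the plain
 * unsigned comparison 0x00000002 >= 0xfffffffe is false.
 */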

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */
#define	TCP_REASS(tp, ti, m, so, flags) { \
	if ((ti)->ti_seq == (tp)->rcv_nxt && \
	    (tp)->segq.lh_first == NULL && \
	    (tp)->t_state == TCPS_ESTABLISHED) { \
		if ((ti)->ti_flags & TH_PUSH) \
			tp->t_flags |= TF_ACKNOW; \
		else \
			tp->t_flags |= TF_DELACK; \
		(tp)->rcv_nxt += (ti)->ti_len; \
		flags = (ti)->ti_flags & TH_FIN; \
		tcpstat.tcps_rcvpack++; \
		tcpstat.tcps_rcvbyte += (ti)->ti_len; \
		sbappend(&(so)->so_rcv, (m)); \
		sorwakeup(so); \
	} else { \
		(flags) = tcp_reass((tp), (ti), (m)); \
		tp->t_flags |= TF_ACKNOW; \
	} \
}
#ifndef TUBA_INCLUDE

int
tcp_reass(tp, ti, m)
	register struct tcpcb *tp;
	register struct tcpiphdr *ti;
	struct mbuf *m;
{
	register struct ipqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with ti==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (ti == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	MALLOC(tiqe, struct ipqent *, sizeof (struct ipqent), M_IPQ, M_NOWAIT);
	if (tiqe == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = tp->segq.lh_first; q != NULL;
	    p = q, q = q->ipqe_q.le_next)
		if (SEQ_GT(q->ipqe_tcp->ti_seq, ti->ti_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register struct tcpiphdr *phdr = p->ipqe_tcp;
		register int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->ti_seq + phdr->ti_len - ti->ti_seq;
		if (i > 0) {
			if (i >= ti->ti_len) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += ti->ti_len;
				m_freem(m);
				FREE(tiqe, M_IPQ);
				return (0);
			}
			m_adj(m, i);
			ti->ti_len -= i;
			ti->ti_seq += i;
		}
	}
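	/*
	 * Overlap arithmetic, by example: if the preceding segment
	 * covers [100, 150) (ti_seq 100, ti_len 50) and we arrive
	 * with ti_seq 120, then i = 100 + 50 - 120 = 30, so the
	 * first 30 bytes of this segment are duplicates: trim them
	 * and advance to ti_seq 150.  Had our ti_len been <= 30,
	 * the whole segment would have been a duplicate, counted
	 * and dropped above.
	 */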
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += ti->ti_len;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		register struct tcpiphdr *qhdr = q->ipqe_tcp;
		register int i = (ti->ti_seq + ti->ti_len) - qhdr->ti_seq;

		if (i <= 0)
			break;
		if (i < qhdr->ti_len) {
			qhdr->ti_seq += i;
			qhdr->ti_len -= i;
			m_adj(q->ipqe_m, i);
			break;
		}
		nq = q->ipqe_q.le_next;
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		FREE(q, M_IPQ);
	}

	/* Insert the new fragment queue entry into place. */
	tiqe->ipqe_m = m;
	tiqe->ipqe_tcp = ti;
	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = tp->segq.lh_first;
	if (q == NULL || q->ipqe_tcp->ti_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->ti_len)
		return (0);
	do {
		tp->rcv_nxt += q->ipqe_tcp->ti_len;
		flags = q->ipqe_tcp->ti_flags & TH_FIN;

		nq = q->ipqe_q.le_next;
		LIST_REMOVE(q, ipqe_q);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->ipqe_m);
		else
			sbappend(&so->so_rcv, q->ipqe_m);
		FREE(q, M_IPQ);
		q = nq;
	} while (q != NULL && q->ipqe_tcp->ti_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
#if __STDC__
tcp_input(struct mbuf *m, ...)
#else
tcp_input(m, va_alist)
	register struct mbuf *m;
#endif
{
	register struct tcpiphdr *ti;
	register struct inpcb *inp;
	caddr_t optp = NULL;
	int optlen = 0;
	int len, tlen, off, hdroptlen;
	register struct tcpcb *tp = 0;
	register int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	short ostate = 0;
	int iss = 0;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	ti = mtod(m, struct tcpiphdr *);
	if (iphlen > sizeof (struct ip))
		ip_stripoptions(m, (struct mbuf *)0);
	if (m->m_len < sizeof (struct tcpiphdr)) {
		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		ti = mtod(m, struct tcpiphdr *);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	tlen = ((struct ip *)ti)->ip_len;
	len = sizeof (struct ip) + tlen;
	bzero(ti->ti_x1, sizeof ti->ti_x1);
	ti->ti_len = (u_int16_t)tlen;
	HTONS(ti->ti_len);
	if ((ti->ti_sum = in_cksum(m, len)) != 0) {
		tcpstat.tcps_rcvbadsum++;
		goto drop;
	}
#endif /* TUBA_INCLUDE */

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = ti->ti_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	ti->ti_len = tlen;
	if (off > sizeof (struct tcphdr)) {
		if (m->m_len < sizeof(struct ip) + off) {
			if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
				tcpstat.tcps_rcvshort++;
				return;
			}
			ti = mtod(m, struct tcpiphdr *);
		}
		optlen = off - sizeof (struct tcphdr);
		optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		     *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		     (ti->ti_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
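	/*
	 * The fast path above matches the RFC 1323 appendix A layout:
	 * TCPOPT_TSTAMP_HDR is the 4-byte pattern <NOP,NOP,TIMESTAMP,10>
	 * (0x0101080a on the wire), followed by the 4-byte ts_val and
	 * the 4-byte ts_ecr; that is why they are read at optp + 4 and
	 * optp + 8, TCPOLEN_TSTAMP_APPA (12) bytes in all.
	 */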
	tiflags = ti->ti_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(ti->ti_seq);
	NTOHL(ti->ti_ack);
	NTOHS(ti->ti_win);
	NTOHS(ti->ti_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	inp = in_pcblookup_connect(&tcbtable, ti->ti_src, ti->ti_sport,
	    ti->ti_dst, ti->ti_dport);
	if (inp == 0) {
		++tcpstat.tcps_pcbhashmiss;
		inp = in_pcblookup_bind(&tcbtable, ti->ti_dst, ti->ti_dport);
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset;
		}
	}

	/*
	 * If the state is CLOSED (i.e., TCB does not exist) then
	 * all data in the incoming segment is discarded.
	 * If the TCB exists but is in CLOSED state, it is embryonic,
	 * but should either do a listen or a connect soon.
	 */
	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = ti->ti_win << tp->snd_scale;
	else
		tiwin = ti->ti_win;
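	/*
	 * Per RFC 1323, the window field of a SYN is never scaled.
	 * For example, with snd_scale 3 an advertised window of 8192
	 * in an ordinary segment means 8192 << 3 = 65536 bytes, while
	 * the same value in a SYN means exactly 8192 bytes.
	 */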

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			tcp_saveti = *ti;
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST)
					syn_cache_reset(ti);
				else if (tiflags & TH_ACK) {
					so = syn_cache_get(so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						tcpstat.tcps_badsyn++;
						tp = NULL;
						goto dropwithreset;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						inp = sotoinpcb(so);
						tp = intotcpcb(inp);
						tiwin <<= tp->snd_scale;
						goto after_listen;
					}
				}
			} else {
				/*
				 * Received a SYN; create compressed
				 * TCP state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(so, m, optp, optlen, &opti))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_idle = 0;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tp->t_timer[TCPT_KEEP] = tcp_keepidle;

	/*
	 * Process options.
	 */
	if (optp)
		tcp_dooptions(tp, optp, optlen, ti, &opti);

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    ti->ti_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 */
		if (opti.ts_present &&
		    SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
		    SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}

		if (ti->ti_len == 0) {
			if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
			    SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks < tcprexmtthresh) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present)
					tcp_xmit_timer(tp,
					    tcp_now - opti.ts_ecr + 1);
				else if (tp->t_rtt &&
				    SEQ_GT(ti->ti_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtt);
				acked = ti->ti_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				sbdrop(&so->so_snd, acked);
				tp->snd_una = ti->ti_ack;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					tp->t_timer[TCPT_REXMT] = 0;
				else if (tp->t_timer[TCPT_PERSIST] == 0)
					tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

				if (sb_notify(&so->so_snd))
					sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (ti->ti_ack == tp->snd_una &&
		    tp->segq.lh_first == NULL &&
		    ti->ti_len <= sbspace(&so->so_rcv)) {
			/*
			 * this is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += ti->ti_len;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += ti->ti_len;
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
			m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
			sbappend(&so->so_rcv, m);
			sorwakeup(so);
			if (ti->ti_flags & TH_PUSH)
				tp->t_flags |= TF_ACKNOW;
			else
				tp->t_flags |= TF_DELACK;
			return;
		}
	}

	/*
	 * Drop TCP, IP headers and TCP options.
	 */
	hdroptlen = sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);
	m->m_data += hdroptlen;
	m->m_len -= hdroptlen;
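	/*
	 * hdroptlen by example: struct tcpiphdr is the 20-byte IP
	 * overlay plus the 20-byte TCP header, so with off = 32
	 * (20-byte TCP header + 12 bytes of options) we advance
	 * 40 + 32 - 20 = 52 bytes, i.e. past the IP header, the
	 * TCP header, and the options, leaving m_data at the payload.
	 */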

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 *	Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
		     SEQ_GT(ti->ti_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = ti->ti_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		tp->t_timer[TCPT_REXMT] = 0;
		tp->irs = ti->ti_seq;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		tcp_mss_from_peer(tp, opti.maxseg);
		tcp_rmx_rtt(tp);
		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tcp_established(tp);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			(void) tcp_reass(tp, (struct tcpiphdr *)0,
			    (struct mbuf *)0);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtt)
				tcp_xmit_timer(tp, tp->t_rtt);
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

		/*
		 * Advance ti->ti_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		ti->ti_seq++;
		if (ti->ti_len > tp->rcv_wnd) {
			todrop = ti->ti_len - tp->rcv_wnd;
			m_adj(m, -todrop);
			ti->ti_len = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = ti->ti_seq - 1;
		tp->rcv_up = ti->ti_seq;
		goto step6;

	/*
	 * If the state is SYN_RECEIVED:
	 *	If seg contains an ACK, but not for our SYN, drop the input
	 *	and generate an RST.  See page 36, rfc793
	 */
	case TCPS_SYN_RECEIVED:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
		     SEQ_GT(ti->ti_ack, tp->snd_max)))
			goto dropwithreset;
		break;
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += ti->ti_len;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - ti->ti_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			ti->ti_seq++;
			if (ti->ti_urp > 1)
				ti->ti_urp--;
			else {
				tiflags &= ~TH_URG;
				ti->ti_urp = 0;
			}
			todrop--;
		}
		if (todrop >= ti->ti_len) {
			/*
			 * Any valid FIN must be to the left of the
			 * window.  At this point, FIN must be a
			 * duplicate or out-of-sequence, so drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send ACK to resynchronize, and drop any data,
			 * but keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_rcvdupbyte += todrop = ti->ti_len;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		m_adj(m, todrop);
		ti->ti_seq += todrop;
		ti->ti_len -= todrop;
		if (ti->ti_urp > todrop)
			ti->ti_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			ti->ti_urp = 0;
		}
	}
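	/*
	 * Leading-trim example: with rcv_nxt 1000 and an arriving
	 * segment of seq 950, len 100, todrop is 50; the first 50
	 * bytes are already-received duplicates, so m_adj() drops
	 * them and the segment becomes seq 1000, len 50.  If len
	 * had been <= 50, the entire segment would be a duplicate
	 * and only its RST/ACK bits would still matter.
	 */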

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= ti->ti_len) {
			tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
				iss = tcp_new_iss(tp, sizeof(struct tcpcb),
				    tp->rcv_nxt);
				tp = tcp_close(tp);
				/*
				 * We have already advanced the mbuf
				 * pointers past the IP+TCP headers and
				 * options.  Restore those pointers before
				 * attempting to use the TCP header again.
				 */
				m->m_data -= hdroptlen;
				m->m_len += hdroptlen;
				goto findpcb;
			}
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		ti->ti_len -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}
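	/*
	 * Trailing-trim example: with rcv_nxt 1000, rcv_wnd 100, and
	 * an arriving segment of seq 1000, len 200, todrop is
	 * (1000 + 200) - (1000 + 100) = 100; the negative m_adj()
	 * trims the last 100 bytes, and PUSH/FIN are cleared since
	 * anything that followed the trimmed data no longer applies.
	 */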

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp.
	 */
	if (opti.ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
	    SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len +
	    ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags&TH_RST) switch (tp->t_state) {

	case TCPS_SYN_RECEIVED:
		so->so_error = ECONNREFUSED;
		goto close;

	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
		so->so_error = ECONNRESET;
	close:
		tp->t_state = TCPS_CLOSED;
		tcpstat.tcps_drops++;
		tp = tcp_close(tp);
		goto drop;

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		tp = tcp_close(tp);
		goto drop;
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we send an RST and drop the connection.
	 */
	if (tiflags & TH_SYN) {
		tp = tcp_drop(tp, ECONNRESET);
		goto dropwithreset;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0)
		goto drop;

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
	 * ESTABLISHED state and continue processing, otherwise
	 * send an RST.
	 */
	case TCPS_SYN_RECEIVED:
		if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
		    SEQ_GT(ti->ti_ack, tp->snd_max))
			goto dropwithreset;
		tcpstat.tcps_connects++;
		soisconnected(so);
		tcp_established(tp);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
		}
		(void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
		tp->snd_wl1 = ti->ti_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < ti->ti_ack <= tp->snd_max
	 * then advance tp->snd_una to ti->ti_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:

		if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
			if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (i.e., window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (tp->t_timer[TCPT_REXMT] == 0 ||
				    ti->ti_ack != tp->snd_una)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_int win =
					    min(tp->snd_wnd, tp->snd_cwnd) / 2 /
					    tp->t_maxseg;

					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->t_timer[TCPT_REXMT] = 0;
					tp->t_rtt = 0;
					tp->snd_nxt = ti->ti_ack;
					tp->snd_cwnd = tp->t_maxseg;
					(void) tcp_output(tp);
					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else
				tp->t_dupacks = 0;
			break;
		}
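		/*
		 * Fast retransmit, by example: with t_maxseg 512 and
		 * snd_wnd and snd_cwnd both 8192, the third duplicate
		 * ACK gives win = 8192 / 2 / 512 = 8 segments, so
		 * ssthresh becomes 4096.  The lost segment is resent
		 * with cwnd pinned to one segment, then cwnd is set to
		 * ssthresh + 3 * 512 = 5632 (fast recovery inflation),
		 * and each further duplicate ACK grows it by 512.
		 */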
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks >= tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
		if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack;
		}
		acked = ti->ti_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1);
		else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tp->t_rtt);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (ti->ti_ack == tp->snd_max) {
			tp->t_timer[TCPT_REXMT] = 0;
			needoutput = 1;
		} else if (tp->t_timer[TCPT_PERSIST] == 0)
			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		register u_int cw = tp->snd_cwnd;
		register u_int incr = tp->t_maxseg;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
		tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
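		/*
		 * Growth rates, by example: in slow start (cwnd <=
		 * ssthresh) each ACK adds a full maxseg, roughly
		 * doubling cwnd once per RTT.  In congestion avoidance
		 * with maxseg 512 and cwnd 4096, each ACK adds
		 * 512 * 512 / 4096 = 64 bytes; the ~8 ACKs per window
		 * add about one maxseg per RTT.
		 */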
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		if (sb_notify(&so->so_snd))
			sowwakeup(so);
		tp->snd_una = ti->ti_ack;
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					tp->t_timer[TCPT_2MSL] = tcp_maxidle;
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if (((tiflags & TH_ACK) && SEQ_LT(tp->snd_wl1, ti->ti_seq)) ||
	    (tp->snd_wl1 == ti->ti_seq && SEQ_LT(tp->snd_wl2, ti->ti_ack)) ||
	    (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)) {
		/* keep track of pure window updates */
		if (ti->ti_len == 0 &&
		    tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = ti->ti_seq;
		tp->snd_wl2 = ti->ti_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && ti->ti_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) {
			ti->ti_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
			tp->rcv_up = ti->ti_seq + ti->ti_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
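		/*
		 * Out-of-band mark, by example: with 100 bytes already
		 * queued in so_rcv and the urgent pointer 10 bytes
		 * ahead of rcv_nxt, so_oobmark becomes 100 + 10 - 1 =
		 * 109, the number of bytes the application will read
		 * before reaching the out-of-band byte; a mark of 0
		 * means the socket is already at the mark
		 * (SS_RCVATMARK).
		 */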
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
		 * This can happen independent of advancing the URG pointer,
		 * but if two URG's are pending at once, some out-of-band
		 * data may creep in... ick.
		 */
		if (ti->ti_urp <= (u_int16_t) ti->ti_len
#ifdef SO_OOBINLINE
		    && (so->so_options & SO_OOBINLINE) == 0
#endif
		    )
			tcp_pulloutofband(so, ti, m);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((ti->ti_len || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		TCP_REASS(tp, ti, m, so, tiflags);
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
	 * the connection is fully established.
	 */
	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

		/*
		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
		 */
		case TCPS_ESTABLISHED:
			tp->t_state = TCPS_CLOSE_WAIT;
			break;

		/*
		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
		 * enter the CLOSING state.
		 */
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			break;

		/*
		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
		 * starting the time-wait timer, turning off the other
		 * standard timers.
		 */
		case TCPS_FIN_WAIT_2:
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			soisdisconnected(so);
			break;

		/*
		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
		 */
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			break;
		}
	}
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0);

	/*
	 * Return any desired output.
	 */
	if (needoutput || (tp->t_flags & TF_ACKNOW))
		(void) tcp_output(tp);
	return;

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return;

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 * Don't bother to respond if destination was broadcast/multicast.
	 */
	if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) ||
	    IN_MULTICAST(ti->ti_dst.s_addr))
		goto drop;
	if (tiflags & TH_ACK)
		(void)tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
	else {
		if (tiflags & TH_SYN)
			ti->ti_len++;
		(void)tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
		    TH_RST|TH_ACK);
	}
	return;
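	/*
	 * Per RFC 793 (page 36), a RST answering an ACK-bearing
	 * segment carries seq = the offending segment's ack and no
	 * ACK bit, so the originator will accept it; otherwise we
	 * send seq 0 with RST|ACK acking seq + len (counting a SYN
	 * as one octet, hence the ti_len++ above).
	 */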

drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
	m_freem(m);
	return;
#ifndef TUBA_INCLUDE
}

void
tcp_dooptions(tp, cp, cnt, ti, oi)
	struct tcpcb *tp;
	u_char *cp;
	int cnt;
	struct tcpiphdr *ti;
	struct tcp_opt_info *oi;
{
	u_int16_t mss;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			optlen = cp[1];
			if (optlen <= 0)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(ti->ti_flags & TH_SYN))
				continue;
			bcopy(cp + 2, &mss, sizeof(mss));
			oi->maxseg = ntohs(mss);
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(ti->ti_flags & TH_SYN))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			oi->ts_present = 1;
			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			if (ti->ti_flags & TH_SYN) {
				tp->t_flags |= TF_RCVD_TSTMP;
				tp->ts_recent = oi->ts_val;
				tp->ts_recent_age = tcp_now;
			}
			break;
		}
	}
}
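/*
 * Option parsing, by example: the byte sequence 02 04 05 b4 is a
 * maximum segment size option (kind TCPOPT_MAXSEG = 2, length 4)
 * advertising an MSS of 0x05b4 = 1460.  bcopy() is used for the
 * 16- and 32-bit loads because the option data has no alignment
 * guarantee within the mbuf.
 */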

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 */
void
tcp_pulloutofband(so, ti, m)
	struct socket *so;
	struct tcpiphdr *ti;
	register struct mbuf *m;
{
	int cnt = ti->ti_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
void
tcp_xmit_timer(tp, rtt)
	register struct tcpcb *tp;
	short rtt;
{
	register short delta;

	tcpstat.tcps_rttupdated++;
	--rtt;
	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 3 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << 2;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 2 bits after the
		 * binary point (scaled by 4).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << 2;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
	}
	tp->t_rtt = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	    rtt + 2, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}
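/*
 * Smoothing gains, in plain terms: each sample moves srtt 1/8 of
 * the way toward the new rtt (srtt += (rtt - srtt)/8) and rttvar
 * 1/4 of the way toward |rtt - srtt|, all kept in fixed point so
 * no floating point is needed.  The retransmit timeout is then
 * srtt + 4 * rttvar (TCP_REXMTVAL), clamped between rtt + 2 ticks
 * and TCPTV_REXMTMAX by TCPT_RANGESET() above.
 */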

/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

u_long	syn_cache_count;
u_int32_t syn_hash1, syn_hash2;

#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
	((u_int32_t)(sp)))^syn_hash2)) \
	& 0x7fffffff)

#define eptosp(ep, e, s) ((struct s *)((char *)(ep) - \
	((char *)(&((struct s *)0)->e) - (char *)0)))

#define SYN_CACHE_RM(sc, p, scp) { \
	*(p) = (sc)->sc_next; \
	if ((sc)->sc_next) \
		(sc)->sc_next->sc_timer += (sc)->sc_timer; \
	else { \
		(scp)->sch_timer_sum -= (sc)->sc_timer; \
		if ((scp)->sch_timer_sum <= 0) \
			(scp)->sch_timer_sum = -1; \
		/* If need be, fix up the last pointer */ \
		if ((scp)->sch_first) \
			(scp)->sch_last = eptosp(p, sc_next, syn_cache); \
	} \
	(scp)->sch_length--; \
	syn_cache_count--; \
}
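/*
 * SYN_HASH mixes the peer address and both ports with the random
 * secrets syn_hash1/syn_hash2 (seeded in syn_cache_insert() when
 * the first entry is added), so an attacker cannot predict which
 * bucket a forged SYN will land in.  eptosp() recovers the
 * enclosing structure from a pointer to its member `e', the same
 * offsetof() trick as the modern container_of() idiom.
 */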

void
syn_cache_insert(sc, prevp, headp)
	struct syn_cache *sc;
	struct syn_cache ***prevp;
	struct syn_cache_head **headp;
{
	struct syn_cache_head *scp, *scp2, *sce;
	struct syn_cache *sc2;
	static u_int timeo_val;
	int s;

	/* Initialize the hash secrets when adding the first entry */
	if (syn_cache_count == 0) {
		struct timeval tv;
		microtime(&tv);
		syn_hash1 = random() ^ (u_long)&sc;
		syn_hash2 = random() ^ tv.tv_usec;
	}

	sc->sc_hash = SYN_HASH(&sc->sc_src, sc->sc_sport, sc->sc_dport);
	sc->sc_next = NULL;
	scp = &tcp_syn_cache[sc->sc_hash % tcp_syn_cache_size];
	*headp = scp;

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		tcpstat.tcps_sc_bucketoverflow++;
		sc2 = scp->sch_first;
		scp->sch_first = sc2->sc_next;
		FREE(sc2, M_PCB);
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		tcpstat.tcps_sc_overflowed++;
		/*
		 * The cache is full.  Toss the first (i.e., oldest)
		 * element in this bucket, or, if this bucket is
		 * empty, in the next non-empty bucket.
		 */
		scp2 = scp;
		if (scp2->sch_first == NULL) {
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (scp2->sch_first)
					break;
			}
		}
		sc2 = scp2->sch_first;
		if (sc2 == NULL) {
			FREE(sc, M_PCB);
			return;
		}
		if ((scp2->sch_first = sc2->sc_next) == NULL)
			scp2->sch_last = NULL;
		else
			sc2->sc_next->sc_timer += sc2->sc_timer;
		FREE(sc2, M_PCB);
	} else {
		scp->sch_length++;
		syn_cache_count++;
	}
	tcpstat.tcps_sc_added++;

	/*
	 * Put it into the bucket.
	 */
	if (scp->sch_first == NULL)
		*prevp = &scp->sch_first;
	else {
		*prevp = &scp->sch_last->sc_next;
		tcpstat.tcps_sc_collisions++;
	}
	**prevp = sc;
	scp->sch_last = sc;

	/*
	 * If the timeout value has changed
	 * 1) force it to fit in a u_char
	 * 2) Run the timer routine to truncate all
	 *    existing entries to the new timeout value.
	 */
	if (timeo_val != tcp_syn_cache_timeo) {
		tcp_syn_cache_timeo = min(tcp_syn_cache_timeo, UCHAR_MAX);
		if (timeo_val > tcp_syn_cache_timeo)
			syn_cache_timer(timeo_val - tcp_syn_cache_timeo);
		timeo_val = tcp_syn_cache_timeo;
	}
	if (scp->sch_timer_sum > 0)
		sc->sc_timer = tcp_syn_cache_timeo - scp->sch_timer_sum;
	else if (scp->sch_timer_sum == 0) {
		/* When the bucket timer is 0, it is not in the cache queue.  */
		scp->sch_headq = tcp_syn_cache_first;
		tcp_syn_cache_first = scp;
		sc->sc_timer = tcp_syn_cache_timeo;
	}
	scp->sch_timer_sum = tcp_syn_cache_timeo;
	splx(s);
}

/*
 * Walk down the cache list, decrementing the timer of
 * the first element on each entry.  If the timer goes
 * to zero, remove it and all successive entries with
 * a zero timer.
 */
void
syn_cache_timer(interval)
	int interval;
{
	struct syn_cache_head *scp, **pscp;
	struct syn_cache *sc, *scn;
	int n, s;

	pscp = &tcp_syn_cache_first;
	scp = tcp_syn_cache_first;
	s = splsoftnet();
	while (scp) {
		/*
		 * Remove any empty hash buckets
		 * from the cache queue.
		 */
		if ((sc = scp->sch_first) == NULL) {
			*pscp = scp->sch_headq;
			scp->sch_headq = NULL;
			scp->sch_timer_sum = 0;
			scp->sch_first = scp->sch_last = NULL;
			scp->sch_length = 0;
			scp = *pscp;
			continue;
		}

		scp->sch_timer_sum -= interval;
		if (scp->sch_timer_sum <= 0)
			scp->sch_timer_sum = -1;
		n = interval;
		while (sc->sc_timer <= n) {
			n -= sc->sc_timer;
			scn = sc->sc_next;
			tcpstat.tcps_sc_timed_out++;
			syn_cache_count--;
			FREE(sc, M_PCB);
			scp->sch_length--;
			if ((sc = scn) == NULL)
				break;
		}
		if ((scp->sch_first = sc) != NULL) {
			sc->sc_timer -= n;
			pscp = &scp->sch_headq;
			scp = scp->sch_headq;
		}
	}
	splx(s);
}

/*
 * Find an entry in the syn cache.
 */
struct syn_cache *
syn_cache_lookup(ti, prevp, headp)
	struct tcpiphdr *ti;
	struct syn_cache ***prevp;
	struct syn_cache_head **headp;
{
	struct syn_cache *sc, **prev;
	struct syn_cache_head *head;
	u_int32_t hash;
	int s;

	hash = SYN_HASH(&ti->ti_src, ti->ti_sport, ti->ti_dport);

	head = &tcp_syn_cache[hash % tcp_syn_cache_size];
	*headp = head;
	prev = &head->sch_first;
	s = splsoftnet();
	for (sc = head->sch_first; sc; prev = &sc->sc_next, sc = sc->sc_next) {
		if (sc->sc_hash != hash)
			continue;
		if (sc->sc_src.s_addr == ti->ti_src.s_addr &&
		    sc->sc_sport == ti->ti_sport &&
		    sc->sc_dport == ti->ti_dport &&
		    sc->sc_dst.s_addr == ti->ti_dst.s_addr) {
			*prevp = prev;
			splx(s);
			return (sc);
		}
	}
	splx(s);
	return (NULL);
}

/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(so, m)
	struct socket *so;
	struct mbuf *m;
{
	struct syn_cache *sc, **sc_prev;
	struct syn_cache_head *head;
	register struct inpcb *inp;
	register struct tcpcb *tp = 0;
	register struct tcpiphdr *ti;
	struct sockaddr_in *sin;
	struct mbuf *am;
	long win;
	int s;

	ti = mtod(m, struct tcpiphdr *);
	s = splsoftnet();
	if ((sc = syn_cache_lookup(ti, &sc_prev, &head)) == NULL) {
		splx(s);
		return (NULL);
	}

	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	/*
	 * Verify the sequence and ack numbers.
	 */
	if ((ti->ti_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(ti->ti_seq, sc->sc_irs) ||
	    SEQ_GT(ti->ti_seq, sc->sc_irs + 1 + win)) {
		(void) syn_cache_respond(sc, m, ti, win, 0);
		splx(s);
		return ((struct socket *)(-1));
	}
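	/*
	 * For example, if our SYN,ACK carried iss I and acknowledged
	 * the peer's irs R, the handshake-completing ACK must have
	 * ack == I + 1 and seq in (R, R + 1 + win]; anything else is
	 * answered with a retransmitted SYN,ACK and reported to the
	 * caller via the (struct socket *)(-1) sentinel.
	 */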

	/* Remove this cache entry */
	SYN_CACHE_RM(sc, sc_prev, head);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	so = sonewconn(so, SS_ISCONNECTED);
	if (so == NULL)
		goto resetandabort;

	inp = sotoinpcb(so);
	inp->inp_laddr = sc->sc_dst;
	inp->inp_lport = sc->sc_dport;
	in_pcbstate(inp, INP_BOUND);
#if BSD>=43
	inp->inp_options = ip_srcroute();
#endif

	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL) {
		m_freem(m);
		goto resetandabort;
	}
	am->m_len = sizeof(struct sockaddr_in);
	sin = mtod(am, struct sockaddr_in *);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = sc->sc_src;
	sin->sin_port = sc->sc_sport;
	bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
	if (in_pcbconnect(inp, am)) {
		(void) m_free(am);
		m_freem(m);
		goto resetandabort;
	}
	(void) m_free(am);

	tp = intotcpcb(inp);
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_RCVD_SCALE;
	}
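	/*
	 * A request_r_scale of 15 is the "window scaling was not
	 * negotiated" sentinel (see syn_cache_add(); valid negotiated
	 * shifts are at most TCP_MAX_WINSHIFT), so scaling state is
	 * copied over only when the option was actually agreed upon.
	 */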
	if (sc->sc_tstmp)
		tp->t_flags |= TF_RCVD_TSTMP;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
	tcpstat.tcps_accepts++;

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);
	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + win;
	tp->last_ack_sent = tp->rcv_nxt;

	tcpstat.tcps_sc_completed++;
	FREE(sc, M_PCB);
	return (so);

resetandabort:
	(void) tcp_respond(NULL, ti, m, ti->ti_seq+ti->ti_len,
	    (tcp_seq)0, TH_RST|TH_ACK);
abort:
	if (so != NULL)
		(void) soabort(so);
	FREE(sc, M_PCB);
	tcpstat.tcps_sc_aborted++;
	return ((struct socket *)(-1));
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */

void
syn_cache_reset(ti)
	register struct tcpiphdr *ti;
{
	struct syn_cache *sc, **sc_prev;
	struct syn_cache_head *head;
	int s = splsoftnet();

	if ((sc = syn_cache_lookup(ti, &sc_prev, &head)) == NULL) {
		splx(s);
		return;
	}
	if (SEQ_LT(ti->ti_seq, sc->sc_irs) ||
	    SEQ_GT(ti->ti_seq, sc->sc_irs+1)) {
		splx(s);
		return;
	}
	SYN_CACHE_RM(sc, sc_prev, head);
	splx(s);
	tcpstat.tcps_sc_reset++;
	FREE(sc, M_PCB);
}

void
syn_cache_unreach(ip, th)
	struct ip *ip;
	struct tcphdr *th;
{
	struct syn_cache *sc, **sc_prev;
	struct syn_cache_head *head;
	struct tcpiphdr ti2;
	int s;

	ti2.ti_src.s_addr = ip->ip_dst.s_addr;
	ti2.ti_dst.s_addr = ip->ip_src.s_addr;
	ti2.ti_sport = th->th_dport;
	ti2.ti_dport = th->th_sport;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(&ti2, &sc_prev, &head)) == NULL) {
		splx(s);
		return;
	}
	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss) {
		splx(s);
		return;
	}
	SYN_CACHE_RM(sc, sc_prev, head);
	splx(s);
	tcpstat.tcps_sc_unreach++;
	FREE(sc, M_PCB);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * XXX We don't properly handle SYN-with-data!
 */

int
syn_cache_add(so, m, optp, optlen, oi)
	struct socket *so;
	struct mbuf *m;
	u_char *optp;
	int optlen;
	struct tcp_opt_info *oi;
{
	register struct tcpiphdr *ti;
	struct tcpcb tb, *tp;
	long win;
	struct syn_cache *sc, **sc_prev;
	struct syn_cache_head *scp;
	extern int tcp_do_rfc1323;

	tp = sototcpcb(so);
	ti = mtod(m, struct tcpiphdr *);

	/*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	 * in_broadcast() should never return true on a received
	 * packet with M_BCAST not set.
	 */
	if (m->m_flags & (M_BCAST|M_MCAST) ||
	    IN_MULTICAST(ti->ti_src.s_addr) ||
	    IN_MULTICAST(ti->ti_dst.s_addr))
		return (0);

	/*
	 * Initialize some local state.
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	if (optp) {
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
		tcp_dooptions(&tb, optp, optlen, ti, oi);
	} else
		tb.t_flags = 0;

	/*
	 * See if we already have an entry for this connection.
	 */
	if ((sc = syn_cache_lookup(ti, &sc_prev, &scp)) != NULL) {
		tcpstat.tcps_sc_dupesyn++;
		if (syn_cache_respond(sc, m, ti, win, tb.ts_recent) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		return (1);
	}

	MALLOC(sc, struct syn_cache *, sizeof(*sc), M_PCB, M_NOWAIT);
	if (sc == NULL)
		return (0);
	/*
	 * Fill in the cache, and put the necessary TCP
	 * options into the reply.
	 */
	sc->sc_src.s_addr = ti->ti_src.s_addr;
	sc->sc_dst.s_addr = ti->ti_dst.s_addr;
	sc->sc_sport = ti->ti_sport;
	sc->sc_dport = ti->ti_dport;
	sc->sc_irs = ti->ti_seq;
	sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0);
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_to_advertise(tp);
	sc->sc_tstmp = (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) ? 1 : 0;
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    TCP_MAXWIN << sc->sc_request_r_scale <
		    so->so_rcv.sb_hiwat)
			sc->sc_request_r_scale++;
	} else {
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
	if (syn_cache_respond(sc, m, ti, win, tb.ts_recent) == 0) {
		syn_cache_insert(sc, &sc_prev, &scp);
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		FREE(sc, M_PCB);
		tcpstat.tcps_sc_dropped++;
	}
	return (1);
}
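/*
 * Window-scale selection, by example: the loop above picks the
 * smallest shift such that TCP_MAXWIN (65535) << shift covers the
 * receive buffer, so with sb_hiwat = 131072 it chooses
 * sc_request_r_scale = 2 (65535 << 1 = 131070 is still short).
 * The sentinel value 15 marks the no-window-scaling case.
 */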

int
syn_cache_respond(sc, m, ti, win, ts)
	struct syn_cache *sc;
	struct mbuf *m;
	register struct tcpiphdr *ti;
	long win;
	u_long ts;
{
	u_int8_t *optp;
	int optlen;

	/*
	 * Tack on the TCP options.  If there isn't enough trailing
	 * space for them, move up the fixed header to make space.
	 */
	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
	    (sc->sc_tstmp ? TCPOLEN_TSTAMP_APPA : 0);
	if (optlen > M_TRAILINGSPACE(m)) {
		if (M_LEADINGSPACE(m) >= optlen) {
			m->m_data -= optlen;
			m->m_len += optlen;
		} else {
			struct mbuf *m0 = m;
			if ((m = m_gethdr(M_DONTWAIT, MT_HEADER)) == NULL) {
				m_freem(m0);
				return (ENOBUFS);
			}
			MH_ALIGN(m, sizeof(*ti) + optlen);
			m->m_next = m0;	/* this gets freed below */
		}
		ovbcopy((caddr_t)ti, mtod(m, caddr_t), sizeof(*ti));
		ti = mtod(m, struct tcpiphdr *);
	}

	optp = (u_int8_t *)(ti + 1);
	optp[0] = TCPOPT_MAXSEG;
	optp[1] = 4;
	optp[2] = (sc->sc_ourmaxseg >> 8) & 0xff;
	optp[3] = sc->sc_ourmaxseg & 0xff;
	optlen = 4;

	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)(optp + optlen)) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optlen += 4;
	}

	if (sc->sc_tstmp) {
		u_int32_t *lp = (u_int32_t *)(optp + optlen);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(tcp_now);
		*lp   = htonl(ts);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Toss any trailing mbufs.  No need to worry about
	 * m_len and m_pkthdr.len, since tcp_respond() will
	 * unconditionally set them.
	 */
	if (m->m_next) {
		m_freem(m->m_next);
		m->m_next = NULL;
	}

	/*
	 * Fill in the fields that tcp_respond() will not touch, and
	 * then send the response.
	 */
	ti->ti_off = (sizeof(struct tcphdr) + optlen) >> 2;
	ti->ti_win = htons(win);
	return (tcp_respond(NULL, ti, m, sc->sc_irs + 1, sc->sc_iss,
	    TH_SYN|TH_ACK));
}
#endif /* TUBA_INCLUDE */