1 /* $NetBSD: tcp_input.c,v 1.32 1997/09/22 21:49:55 thorpej Exp $ */ 2 3 /* 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94 36 */ 37 38 /* 39 * TODO list for SYN cache stuff: 40 * 41 * (a) The definition of "struct syn_cache" says: 42 * 43 * This structure should not exceed 32 bytes. 44 * 45 * but it's 40 bytes on the Alpha. Can reduce memory use in one 46 * of two ways: 47 * 48 * (1) Use a dynamically-sized hash table, and handle 49 * collisions by rehashing. Then sc_next is unnecessary. 50 * 51 * (2) Allocate syn_cache structures in pages (or some other 52 * large chunk). This would probably be desirable for 53 * maintaining locality of reference anyway. 54 * 55 * If you do this, you can change sc_next to a page/index 56 * value, and make it a 32-bit (or maybe even 16-bit) 57 * integer, thus partly obviating the need for the previous 58 * hack. 59 * 60 * It's also worth noting that this is necessary for IPv6, as well, 61 * where we use 32 bytes just for the IP addresses, so eliminating 62 * wastage is going to become more important. (BTW, has anyone 63 * integrated these changes with one of the IPv6 stacks that are 64 * available?) 65 * 66 * (b) Find room for a "state" field, which is needed to keep a 67 * compressed state for TIME_WAIT TCBs. It's been noted already 68 * that this is fairly important for very high-volume web and 69 * mail servers, which use a large number of short-lived 70 * connections. 
71 */ 72 73 #ifndef TUBA_INCLUDE 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/malloc.h> 77 #include <sys/mbuf.h> 78 #include <sys/protosw.h> 79 #include <sys/socket.h> 80 #include <sys/socketvar.h> 81 #include <sys/errno.h> 82 83 #include <net/if.h> 84 #include <net/route.h> 85 86 #include <netinet/in.h> 87 #include <netinet/in_systm.h> 88 #include <netinet/ip.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/ip_var.h> 91 #include <netinet/tcp.h> 92 #include <netinet/tcp_fsm.h> 93 #include <netinet/tcp_seq.h> 94 #include <netinet/tcp_timer.h> 95 #include <netinet/tcp_var.h> 96 #include <netinet/tcpip.h> 97 #include <netinet/tcp_debug.h> 98 99 #include <machine/stdarg.h> 100 101 int tcprexmtthresh = 3; 102 struct tcpiphdr tcp_saveti; 103 104 extern u_long sb_max; 105 106 #endif /* TUBA_INCLUDE */ 107 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 108 109 /* for modulo comparisons of timestamps */ 110 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 111 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 112 113 /* 114 * Insert segment ti into reassembly queue of tcp with 115 * control block tp. Return TH_FIN if reassembly now includes 116 * a segment with FIN. The macro form does the common case inline 117 * (segment is the next to be received on an established connection, 118 * and the queue is empty), avoiding linkage into and removal 119 * from the queue and repetition of various conversions. 120 * Set DELACK for segments received in order, but ack immediately 121 * when segments are out of order (so fast retransmit can work). 
122 */ 123 #define TCP_REASS(tp, ti, m, so, flags) { \ 124 if ((ti)->ti_seq == (tp)->rcv_nxt && \ 125 (tp)->segq.lh_first == NULL && \ 126 (tp)->t_state == TCPS_ESTABLISHED) { \ 127 if ((ti)->ti_flags & TH_PUSH) \ 128 tp->t_flags |= TF_ACKNOW; \ 129 else \ 130 tp->t_flags |= TF_DELACK; \ 131 (tp)->rcv_nxt += (ti)->ti_len; \ 132 flags = (ti)->ti_flags & TH_FIN; \ 133 tcpstat.tcps_rcvpack++;\ 134 tcpstat.tcps_rcvbyte += (ti)->ti_len;\ 135 sbappend(&(so)->so_rcv, (m)); \ 136 sorwakeup(so); \ 137 } else { \ 138 (flags) = tcp_reass((tp), (ti), (m)); \ 139 tp->t_flags |= TF_ACKNOW; \ 140 } \ 141 } 142 #ifndef TUBA_INCLUDE 143 144 int 145 tcp_reass(tp, ti, m) 146 register struct tcpcb *tp; 147 register struct tcpiphdr *ti; 148 struct mbuf *m; 149 { 150 register struct ipqent *p, *q, *nq, *tiqe; 151 struct socket *so = tp->t_inpcb->inp_socket; 152 int flags; 153 154 /* 155 * Call with ti==0 after become established to 156 * force pre-ESTABLISHED data up to user socket. 157 */ 158 if (ti == 0) 159 goto present; 160 161 /* 162 * Allocate a new queue entry, before we throw away any data. 163 * If we can't, just drop the packet. XXX 164 */ 165 MALLOC(tiqe, struct ipqent *, sizeof (struct ipqent), M_IPQ, M_NOWAIT); 166 if (tiqe == NULL) { 167 tcpstat.tcps_rcvmemdrop++; 168 m_freem(m); 169 return (0); 170 } 171 172 /* 173 * Find a segment which begins after this one does. 174 */ 175 for (p = NULL, q = tp->segq.lh_first; q != NULL; 176 p = q, q = q->ipqe_q.le_next) 177 if (SEQ_GT(q->ipqe_tcp->ti_seq, ti->ti_seq)) 178 break; 179 180 /* 181 * If there is a preceding segment, it may provide some of 182 * our data already. If so, drop the data from the incoming 183 * segment. If it provides all of our data, drop us. 
184 */ 185 if (p != NULL) { 186 register struct tcpiphdr *phdr = p->ipqe_tcp; 187 register int i; 188 189 /* conversion to int (in i) handles seq wraparound */ 190 i = phdr->ti_seq + phdr->ti_len - ti->ti_seq; 191 if (i > 0) { 192 if (i >= ti->ti_len) { 193 tcpstat.tcps_rcvduppack++; 194 tcpstat.tcps_rcvdupbyte += ti->ti_len; 195 m_freem(m); 196 FREE(tiqe, M_IPQ); 197 return (0); 198 } 199 m_adj(m, i); 200 ti->ti_len -= i; 201 ti->ti_seq += i; 202 } 203 } 204 tcpstat.tcps_rcvoopack++; 205 tcpstat.tcps_rcvoobyte += ti->ti_len; 206 207 /* 208 * While we overlap succeeding segments trim them or, 209 * if they are completely covered, dequeue them. 210 */ 211 for (; q != NULL; q = nq) { 212 register struct tcpiphdr *qhdr = q->ipqe_tcp; 213 register int i = (ti->ti_seq + ti->ti_len) - qhdr->ti_seq; 214 215 if (i <= 0) 216 break; 217 if (i < qhdr->ti_len) { 218 qhdr->ti_seq += i; 219 qhdr->ti_len -= i; 220 m_adj(q->ipqe_m, i); 221 break; 222 } 223 nq = q->ipqe_q.le_next; 224 m_freem(q->ipqe_m); 225 LIST_REMOVE(q, ipqe_q); 226 FREE(q, M_IPQ); 227 } 228 229 /* Insert the new fragment queue entry into place. */ 230 tiqe->ipqe_m = m; 231 tiqe->ipqe_tcp = ti; 232 if (p == NULL) { 233 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 234 } else { 235 LIST_INSERT_AFTER(p, tiqe, ipqe_q); 236 } 237 238 present: 239 /* 240 * Present data to user, advancing rcv_nxt through 241 * completed sequence space. 
242 */ 243 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 244 return (0); 245 q = tp->segq.lh_first; 246 if (q == NULL || q->ipqe_tcp->ti_seq != tp->rcv_nxt) 247 return (0); 248 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->ti_len) 249 return (0); 250 do { 251 tp->rcv_nxt += q->ipqe_tcp->ti_len; 252 flags = q->ipqe_tcp->ti_flags & TH_FIN; 253 254 nq = q->ipqe_q.le_next; 255 LIST_REMOVE(q, ipqe_q); 256 if (so->so_state & SS_CANTRCVMORE) 257 m_freem(q->ipqe_m); 258 else 259 sbappend(&so->so_rcv, q->ipqe_m); 260 FREE(q, M_IPQ); 261 q = nq; 262 } while (q != NULL && q->ipqe_tcp->ti_seq == tp->rcv_nxt); 263 sorwakeup(so); 264 return (flags); 265 } 266 267 /* 268 * TCP input routine, follows pages 65-76 of the 269 * protocol specification dated September, 1981 very closely. 270 */ 271 void 272 #if __STDC__ 273 tcp_input(struct mbuf *m, ...) 274 #else 275 tcp_input(m, va_alist) 276 register struct mbuf *m; 277 #endif 278 { 279 register struct tcpiphdr *ti; 280 register struct inpcb *inp; 281 caddr_t optp = NULL; 282 int optlen = 0; 283 int len, tlen, off, hdroptlen; 284 register struct tcpcb *tp = 0; 285 register int tiflags; 286 struct socket *so = NULL; 287 int todrop, acked, ourfinisacked, needoutput = 0; 288 short ostate = 0; 289 int iss = 0; 290 u_long tiwin; 291 struct tcp_opt_info opti; 292 int iphlen; 293 va_list ap; 294 295 va_start(ap, m); 296 iphlen = va_arg(ap, int); 297 va_end(ap); 298 299 tcpstat.tcps_rcvtotal++; 300 301 opti.ts_present = 0; 302 opti.maxseg = 0; 303 304 /* 305 * Get IP and TCP header together in first mbuf. 306 * Note: IP leaves IP header in first mbuf. 307 */ 308 ti = mtod(m, struct tcpiphdr *); 309 if (iphlen > sizeof (struct ip)) 310 ip_stripoptions(m, (struct mbuf *)0); 311 if (m->m_len < sizeof (struct tcpiphdr)) { 312 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { 313 tcpstat.tcps_rcvshort++; 314 return; 315 } 316 ti = mtod(m, struct tcpiphdr *); 317 } 318 319 /* 320 * Checksum extended TCP header and data. 
321 */ 322 tlen = ((struct ip *)ti)->ip_len; 323 len = sizeof (struct ip) + tlen; 324 bzero(ti->ti_x1, sizeof ti->ti_x1); 325 ti->ti_len = (u_int16_t)tlen; 326 HTONS(ti->ti_len); 327 if ((ti->ti_sum = in_cksum(m, len)) != 0) { 328 tcpstat.tcps_rcvbadsum++; 329 goto drop; 330 } 331 #endif /* TUBA_INCLUDE */ 332 333 /* 334 * Check that TCP offset makes sense, 335 * pull out TCP options and adjust length. XXX 336 */ 337 off = ti->ti_off << 2; 338 if (off < sizeof (struct tcphdr) || off > tlen) { 339 tcpstat.tcps_rcvbadoff++; 340 goto drop; 341 } 342 tlen -= off; 343 ti->ti_len = tlen; 344 if (off > sizeof (struct tcphdr)) { 345 if (m->m_len < sizeof(struct ip) + off) { 346 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { 347 tcpstat.tcps_rcvshort++; 348 return; 349 } 350 ti = mtod(m, struct tcpiphdr *); 351 } 352 optlen = off - sizeof (struct tcphdr); 353 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr); 354 /* 355 * Do quick retrieval of timestamp options ("options 356 * prediction?"). If timestamp is the only option and it's 357 * formatted as recommended in RFC 1323 appendix A, we 358 * quickly get the values now and not bother calling 359 * tcp_dooptions(), etc. 360 */ 361 if ((optlen == TCPOLEN_TSTAMP_APPA || 362 (optlen > TCPOLEN_TSTAMP_APPA && 363 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 364 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 365 (ti->ti_flags & TH_SYN) == 0) { 366 opti.ts_present = 1; 367 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 368 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 369 optp = NULL; /* we've parsed the options */ 370 } 371 } 372 tiflags = ti->ti_flags; 373 374 /* 375 * Convert TCP protocol specific fields to host format. 376 */ 377 NTOHL(ti->ti_seq); 378 NTOHL(ti->ti_ack); 379 NTOHS(ti->ti_win); 380 NTOHS(ti->ti_urp); 381 382 /* 383 * Locate pcb for segment. 
384 */ 385 findpcb: 386 inp = in_pcblookup_connect(&tcbtable, ti->ti_src, ti->ti_sport, 387 ti->ti_dst, ti->ti_dport); 388 if (inp == 0) { 389 ++tcpstat.tcps_pcbhashmiss; 390 inp = in_pcblookup_bind(&tcbtable, ti->ti_dst, ti->ti_dport); 391 if (inp == 0) { 392 ++tcpstat.tcps_noport; 393 goto dropwithreset; 394 } 395 } 396 397 /* 398 * If the state is CLOSED (i.e., TCB does not exist) then 399 * all data in the incoming segment is discarded. 400 * If the TCB exists but is in CLOSED state, it is embryonic, 401 * but should either do a listen or a connect soon. 402 */ 403 tp = intotcpcb(inp); 404 if (tp == 0) 405 goto dropwithreset; 406 if (tp->t_state == TCPS_CLOSED) 407 goto drop; 408 409 /* Unscale the window into a 32-bit value. */ 410 if ((tiflags & TH_SYN) == 0) 411 tiwin = ti->ti_win << tp->snd_scale; 412 else 413 tiwin = ti->ti_win; 414 415 so = inp->inp_socket; 416 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 417 if (so->so_options & SO_DEBUG) { 418 ostate = tp->t_state; 419 tcp_saveti = *ti; 420 } 421 if (so->so_options & SO_ACCEPTCONN) { 422 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 423 if (tiflags & TH_RST) 424 syn_cache_reset(ti); 425 else if (tiflags & TH_ACK) { 426 so = syn_cache_get(so, m); 427 if (so == NULL) { 428 /* 429 * We don't have a SYN for 430 * this ACK; send an RST. 431 */ 432 tcpstat.tcps_badsyn++; 433 tp = NULL; 434 goto dropwithreset; 435 } else if (so == 436 (struct socket *)(-1)) { 437 /* 438 * We were unable to create 439 * the connection. If the 440 * 3-way handshake was 441 * completeed, and RST has 442 * been sent to the peer. 443 * Since the mbuf might be 444 * in use for the reply, 445 * do not free it. 446 */ 447 m = NULL; 448 } else { 449 /* 450 * We have created a 451 * full-blown connection. 452 */ 453 inp = sotoinpcb(so); 454 tp = intotcpcb(inp); 455 tiwin <<= tp->snd_scale; 456 goto after_listen; 457 } 458 } 459 } else { 460 /* 461 * Received a SYN; create compressed 462 * TCP state for it. 
463 */ 464 if (so->so_qlen <= so->so_qlimit && 465 syn_cache_add(so, m, optp, optlen, &opti)) 466 m = NULL; 467 } 468 goto drop; 469 } 470 } 471 472 after_listen: 473 #ifdef DIAGNOSTIC 474 /* 475 * Should not happen now that all embryonic connections 476 * are handled with compressed state. 477 */ 478 if (tp->t_state == TCPS_LISTEN) 479 panic("tcp_input: TCPS_LISTEN"); 480 #endif 481 482 /* 483 * Segment received on connection. 484 * Reset idle time and keep-alive timer. 485 */ 486 tp->t_idle = 0; 487 if (TCPS_HAVEESTABLISHED(tp->t_state)) 488 tp->t_timer[TCPT_KEEP] = tcp_keepidle; 489 490 /* 491 * Process options. 492 */ 493 if (optp) 494 tcp_dooptions(tp, optp, optlen, ti, &opti); 495 496 /* 497 * Header prediction: check for the two common cases 498 * of a uni-directional data xfer. If the packet has 499 * no control flags, is in-sequence, the window didn't 500 * change and we're not retransmitting, it's a 501 * candidate. If the length is zero and the ack moved 502 * forward, we're the sender side of the xfer. Just 503 * free the data acked & wake any higher level process 504 * that was blocked waiting for space. If the length 505 * is non-zero and the ack didn't move, we're the 506 * receiver side. If we're getting packets in-order 507 * (the reassembly queue is empty), add the data to 508 * the socket buffer and note that we need a delayed ack. 509 */ 510 if (tp->t_state == TCPS_ESTABLISHED && 511 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 512 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 513 ti->ti_seq == tp->rcv_nxt && 514 tiwin && tiwin == tp->snd_wnd && 515 tp->snd_nxt == tp->snd_max) { 516 517 /* 518 * If last ACK falls within this segment's sequence numbers, 519 * record the timestamp. 
520 */ 521 if (opti.ts_present && 522 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && 523 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) { 524 tp->ts_recent_age = tcp_now; 525 tp->ts_recent = opti.ts_val; 526 } 527 528 if (ti->ti_len == 0) { 529 if (SEQ_GT(ti->ti_ack, tp->snd_una) && 530 SEQ_LEQ(ti->ti_ack, tp->snd_max) && 531 tp->snd_cwnd >= tp->snd_wnd && 532 tp->t_dupacks < tcprexmtthresh) { 533 /* 534 * this is a pure ack for outstanding data. 535 */ 536 ++tcpstat.tcps_predack; 537 if (opti.ts_present) 538 tcp_xmit_timer(tp, 539 tcp_now-opti.ts_ecr+1); 540 else if (tp->t_rtt && 541 SEQ_GT(ti->ti_ack, tp->t_rtseq)) 542 tcp_xmit_timer(tp, tp->t_rtt); 543 acked = ti->ti_ack - tp->snd_una; 544 tcpstat.tcps_rcvackpack++; 545 tcpstat.tcps_rcvackbyte += acked; 546 sbdrop(&so->so_snd, acked); 547 tp->snd_una = ti->ti_ack; 548 m_freem(m); 549 550 /* 551 * If all outstanding data are acked, stop 552 * retransmit timer, otherwise restart timer 553 * using current (possibly backed-off) value. 554 * If process is waiting for space, 555 * wakeup/selwakeup/signal. If data 556 * are ready to send, let tcp_output 557 * decide between more output or persist. 558 */ 559 if (tp->snd_una == tp->snd_max) 560 tp->t_timer[TCPT_REXMT] = 0; 561 else if (tp->t_timer[TCPT_PERSIST] == 0) 562 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 563 564 if (sb_notify(&so->so_snd)) 565 sowwakeup(so); 566 if (so->so_snd.sb_cc) 567 (void) tcp_output(tp); 568 return; 569 } 570 } else if (ti->ti_ack == tp->snd_una && 571 tp->segq.lh_first == NULL && 572 ti->ti_len <= sbspace(&so->so_rcv)) { 573 /* 574 * this is a pure, in-sequence data packet 575 * with nothing on the reassembly queue and 576 * we have enough buffer space to take it. 577 */ 578 ++tcpstat.tcps_preddat; 579 tp->rcv_nxt += ti->ti_len; 580 tcpstat.tcps_rcvpack++; 581 tcpstat.tcps_rcvbyte += ti->ti_len; 582 /* 583 * Drop TCP, IP headers and TCP options then add data 584 * to socket buffer. 
585 */ 586 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 587 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 588 sbappend(&so->so_rcv, m); 589 sorwakeup(so); 590 if (ti->ti_flags & TH_PUSH) 591 tp->t_flags |= TF_ACKNOW; 592 else 593 tp->t_flags |= TF_DELACK; 594 return; 595 } 596 } 597 598 /* 599 * Drop TCP, IP headers and TCP options. 600 */ 601 hdroptlen = sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr); 602 m->m_data += hdroptlen; 603 m->m_len -= hdroptlen; 604 605 /* 606 * Calculate amount of space in receive window, 607 * and then do TCP input processing. 608 * Receive window is amount of space in rcv queue, 609 * but not less than advertised window. 610 */ 611 { int win; 612 613 win = sbspace(&so->so_rcv); 614 if (win < 0) 615 win = 0; 616 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 617 } 618 619 switch (tp->t_state) { 620 621 /* 622 * If the state is SYN_SENT: 623 * if seg contains an ACK, but not for our SYN, drop the input. 624 * if seg contains a RST, then drop the connection. 625 * if seg does not contain SYN, then drop it. 
626 * Otherwise this is an acceptable SYN segment 627 * initialize tp->rcv_nxt and tp->irs 628 * if seg contains ack then advance tp->snd_una 629 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 630 * arrange for segment to be acked (eventually) 631 * continue processing rest of data/controls, beginning with URG 632 */ 633 case TCPS_SYN_SENT: 634 if ((tiflags & TH_ACK) && 635 (SEQ_LEQ(ti->ti_ack, tp->iss) || 636 SEQ_GT(ti->ti_ack, tp->snd_max))) 637 goto dropwithreset; 638 if (tiflags & TH_RST) { 639 if (tiflags & TH_ACK) 640 tp = tcp_drop(tp, ECONNREFUSED); 641 goto drop; 642 } 643 if ((tiflags & TH_SYN) == 0) 644 goto drop; 645 if (tiflags & TH_ACK) { 646 tp->snd_una = ti->ti_ack; 647 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 648 tp->snd_nxt = tp->snd_una; 649 } 650 tp->t_timer[TCPT_REXMT] = 0; 651 tp->irs = ti->ti_seq; 652 tcp_rcvseqinit(tp); 653 tp->t_flags |= TF_ACKNOW; 654 tcp_mss_from_peer(tp, opti.maxseg); 655 tcp_rmx_rtt(tp); 656 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 657 tcpstat.tcps_connects++; 658 soisconnected(so); 659 tcp_established(tp); 660 /* Do window scaling on this connection? */ 661 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 662 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 663 tp->snd_scale = tp->requested_s_scale; 664 tp->rcv_scale = tp->request_r_scale; 665 } 666 (void) tcp_reass(tp, (struct tcpiphdr *)0, 667 (struct mbuf *)0); 668 /* 669 * if we didn't have to retransmit the SYN, 670 * use its rtt as our initial srtt & rtt var. 671 */ 672 if (tp->t_rtt) 673 tcp_xmit_timer(tp, tp->t_rtt); 674 } else 675 tp->t_state = TCPS_SYN_RECEIVED; 676 677 /* 678 * Advance ti->ti_seq to correspond to first data byte. 679 * If data, trim to stay within window, 680 * dropping FIN if necessary. 
681 */ 682 ti->ti_seq++; 683 if (ti->ti_len > tp->rcv_wnd) { 684 todrop = ti->ti_len - tp->rcv_wnd; 685 m_adj(m, -todrop); 686 ti->ti_len = tp->rcv_wnd; 687 tiflags &= ~TH_FIN; 688 tcpstat.tcps_rcvpackafterwin++; 689 tcpstat.tcps_rcvbyteafterwin += todrop; 690 } 691 tp->snd_wl1 = ti->ti_seq - 1; 692 tp->rcv_up = ti->ti_seq; 693 goto step6; 694 695 /* 696 * If the state is SYN_RECEIVED: 697 * If seg contains an ACK, but not for our SYN, drop the input 698 * and generate an RST. See page 36, rfc793 699 */ 700 case TCPS_SYN_RECEIVED: 701 if ((tiflags & TH_ACK) && 702 (SEQ_LEQ(ti->ti_ack, tp->iss) || 703 SEQ_GT(ti->ti_ack, tp->snd_max))) 704 goto dropwithreset; 705 break; 706 } 707 708 /* 709 * States other than LISTEN or SYN_SENT. 710 * First check timestamp, if present. 711 * Then check that at least some bytes of segment are within 712 * receive window. If segment begins before rcv_nxt, 713 * drop leading data (and SYN); if nothing left, just ack. 714 * 715 * RFC 1323 PAWS: If we have a timestamp reply on this segment 716 * and it's less than ts_recent, drop it. 717 */ 718 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 719 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 720 721 /* Check to see if ts_recent is over 24 days old. */ 722 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 723 /* 724 * Invalidate ts_recent. If this segment updates 725 * ts_recent, the age will be reset later and ts_recent 726 * will get a valid value. If it does not, setting 727 * ts_recent to zero will at least satisfy the 728 * requirement that zero be placed in the timestamp 729 * echo reply when ts_recent isn't valid. The 730 * age isn't reset until we get a valid ts_recent 731 * because we don't want out-of-order segments to be 732 * dropped when ts_recent is old. 
733 */ 734 tp->ts_recent = 0; 735 } else { 736 tcpstat.tcps_rcvduppack++; 737 tcpstat.tcps_rcvdupbyte += ti->ti_len; 738 tcpstat.tcps_pawsdrop++; 739 goto dropafterack; 740 } 741 } 742 743 todrop = tp->rcv_nxt - ti->ti_seq; 744 if (todrop > 0) { 745 if (tiflags & TH_SYN) { 746 tiflags &= ~TH_SYN; 747 ti->ti_seq++; 748 if (ti->ti_urp > 1) 749 ti->ti_urp--; 750 else { 751 tiflags &= ~TH_URG; 752 ti->ti_urp = 0; 753 } 754 todrop--; 755 } 756 if (todrop >= ti->ti_len) { 757 /* 758 * Any valid FIN must be to the left of the 759 * window. At this point, FIN must be a 760 * duplicate or out-of-sequence, so drop it. 761 */ 762 tiflags &= ~TH_FIN; 763 /* 764 * Send ACK to resynchronize, and drop any data, 765 * but keep on processing for RST or ACK. 766 */ 767 tp->t_flags |= TF_ACKNOW; 768 tcpstat.tcps_rcvdupbyte += todrop = ti->ti_len; 769 tcpstat.tcps_rcvduppack++; 770 } else { 771 tcpstat.tcps_rcvpartduppack++; 772 tcpstat.tcps_rcvpartdupbyte += todrop; 773 } 774 m_adj(m, todrop); 775 ti->ti_seq += todrop; 776 ti->ti_len -= todrop; 777 if (ti->ti_urp > todrop) 778 ti->ti_urp -= todrop; 779 else { 780 tiflags &= ~TH_URG; 781 ti->ti_urp = 0; 782 } 783 } 784 785 /* 786 * If new data are received on a connection after the 787 * user processes are gone, then RST the other end. 788 */ 789 if ((so->so_state & SS_NOFDREF) && 790 tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { 791 tp = tcp_close(tp); 792 tcpstat.tcps_rcvafterclose++; 793 goto dropwithreset; 794 } 795 796 /* 797 * If segment ends after window, drop trailing data 798 * (and PUSH and FIN); if nothing left, just ACK. 799 */ 800 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); 801 if (todrop > 0) { 802 tcpstat.tcps_rcvpackafterwin++; 803 if (todrop >= ti->ti_len) { 804 tcpstat.tcps_rcvbyteafterwin += ti->ti_len; 805 /* 806 * If a new connection request is received 807 * while in TIME_WAIT, drop the old connection 808 * and start over if the sequence numbers 809 * are above the previous ones. 
810 */ 811 if (tiflags & TH_SYN && 812 tp->t_state == TCPS_TIME_WAIT && 813 SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { 814 iss = tp->rcv_nxt + TCP_ISSINCR; 815 tp = tcp_close(tp); 816 /* 817 * We have already advanced the mbuf 818 * pointers past the IP+TCP headers and 819 * options. Restore those pointers before 820 * attempting to use the TCP header again. 821 */ 822 m->m_data -= hdroptlen; 823 m->m_len += hdroptlen; 824 goto findpcb; 825 } 826 /* 827 * If window is closed can only take segments at 828 * window edge, and have to drop data and PUSH from 829 * incoming segments. Continue processing, but 830 * remember to ack. Otherwise, drop segment 831 * and ack. 832 */ 833 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { 834 tp->t_flags |= TF_ACKNOW; 835 tcpstat.tcps_rcvwinprobe++; 836 } else 837 goto dropafterack; 838 } else 839 tcpstat.tcps_rcvbyteafterwin += todrop; 840 m_adj(m, -todrop); 841 ti->ti_len -= todrop; 842 tiflags &= ~(TH_PUSH|TH_FIN); 843 } 844 845 /* 846 * If last ACK falls within this segment's sequence numbers, 847 * record its timestamp. 848 */ 849 if (opti.ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && 850 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + 851 ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 852 tp->ts_recent_age = tcp_now; 853 tp->ts_recent = opti.ts_val; 854 } 855 856 /* 857 * If the RST bit is set examine the state: 858 * SYN_RECEIVED STATE: 859 * If passive open, return to LISTEN state. 860 * If active open, inform user that connection was refused. 861 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 862 * Inform user that connection was reset, and close tcb. 863 * CLOSING, LAST_ACK, TIME_WAIT STATES 864 * Close the tcb. 
865 */ 866 if (tiflags&TH_RST) switch (tp->t_state) { 867 868 case TCPS_SYN_RECEIVED: 869 so->so_error = ECONNREFUSED; 870 goto close; 871 872 case TCPS_ESTABLISHED: 873 case TCPS_FIN_WAIT_1: 874 case TCPS_FIN_WAIT_2: 875 case TCPS_CLOSE_WAIT: 876 so->so_error = ECONNRESET; 877 close: 878 tp->t_state = TCPS_CLOSED; 879 tcpstat.tcps_drops++; 880 tp = tcp_close(tp); 881 goto drop; 882 883 case TCPS_CLOSING: 884 case TCPS_LAST_ACK: 885 case TCPS_TIME_WAIT: 886 tp = tcp_close(tp); 887 goto drop; 888 } 889 890 /* 891 * If a SYN is in the window, then this is an 892 * error and we send an RST and drop the connection. 893 */ 894 if (tiflags & TH_SYN) { 895 tp = tcp_drop(tp, ECONNRESET); 896 goto dropwithreset; 897 } 898 899 /* 900 * If the ACK bit is off we drop the segment and return. 901 */ 902 if ((tiflags & TH_ACK) == 0) 903 goto drop; 904 905 /* 906 * Ack processing. 907 */ 908 switch (tp->t_state) { 909 910 /* 911 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 912 * ESTABLISHED state and continue processing, otherwise 913 * send an RST. 914 */ 915 case TCPS_SYN_RECEIVED: 916 if (SEQ_GT(tp->snd_una, ti->ti_ack) || 917 SEQ_GT(ti->ti_ack, tp->snd_max)) 918 goto dropwithreset; 919 tcpstat.tcps_connects++; 920 soisconnected(so); 921 tcp_established(tp); 922 /* Do window scaling? */ 923 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 924 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 925 tp->snd_scale = tp->requested_s_scale; 926 tp->rcv_scale = tp->request_r_scale; 927 } 928 (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0); 929 tp->snd_wl1 = ti->ti_seq - 1; 930 /* fall into ... */ 931 932 /* 933 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 934 * ACKs. If the ack is in the range 935 * tp->snd_una < ti->ti_ack <= tp->snd_max 936 * then advance tp->snd_una to ti->ti_ack and drop 937 * data from the retransmission queue. If this ACK reflects 938 * more up to date window information we update our window information. 
939 */ 940 case TCPS_ESTABLISHED: 941 case TCPS_FIN_WAIT_1: 942 case TCPS_FIN_WAIT_2: 943 case TCPS_CLOSE_WAIT: 944 case TCPS_CLOSING: 945 case TCPS_LAST_ACK: 946 case TCPS_TIME_WAIT: 947 948 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { 949 if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { 950 tcpstat.tcps_rcvdupack++; 951 /* 952 * If we have outstanding data (other than 953 * a window probe), this is a completely 954 * duplicate ack (ie, window info didn't 955 * change), the ack is the biggest we've 956 * seen and we've seen exactly our rexmt 957 * threshhold of them, assume a packet 958 * has been dropped and retransmit it. 959 * Kludge snd_nxt & the congestion 960 * window so we send only this one 961 * packet. 962 * 963 * We know we're losing at the current 964 * window size so do congestion avoidance 965 * (set ssthresh to half the current window 966 * and pull our congestion window back to 967 * the new ssthresh). 968 * 969 * Dup acks mean that packets have left the 970 * network (they're now cached at the receiver) 971 * so bump cwnd by the amount in the receiver 972 * to keep a constant cwnd packets in the 973 * network. 
974 */ 975 if (tp->t_timer[TCPT_REXMT] == 0 || 976 ti->ti_ack != tp->snd_una) 977 tp->t_dupacks = 0; 978 else if (++tp->t_dupacks == tcprexmtthresh) { 979 tcp_seq onxt = tp->snd_nxt; 980 u_int win = 981 min(tp->snd_wnd, tp->snd_cwnd) / 2 / 982 tp->t_maxseg; 983 984 if (win < 2) 985 win = 2; 986 tp->snd_ssthresh = win * tp->t_maxseg; 987 tp->t_timer[TCPT_REXMT] = 0; 988 tp->t_rtt = 0; 989 tp->snd_nxt = ti->ti_ack; 990 tp->snd_cwnd = tp->t_maxseg; 991 (void) tcp_output(tp); 992 tp->snd_cwnd = tp->snd_ssthresh + 993 tp->t_maxseg * tp->t_dupacks; 994 if (SEQ_GT(onxt, tp->snd_nxt)) 995 tp->snd_nxt = onxt; 996 goto drop; 997 } else if (tp->t_dupacks > tcprexmtthresh) { 998 tp->snd_cwnd += tp->t_maxseg; 999 (void) tcp_output(tp); 1000 goto drop; 1001 } 1002 } else 1003 tp->t_dupacks = 0; 1004 break; 1005 } 1006 /* 1007 * If the congestion window was inflated to account 1008 * for the other side's cached packets, retract it. 1009 */ 1010 if (tp->t_dupacks >= tcprexmtthresh && 1011 tp->snd_cwnd > tp->snd_ssthresh) 1012 tp->snd_cwnd = tp->snd_ssthresh; 1013 tp->t_dupacks = 0; 1014 if (SEQ_GT(ti->ti_ack, tp->snd_max)) { 1015 tcpstat.tcps_rcvacktoomuch++; 1016 goto dropafterack; 1017 } 1018 acked = ti->ti_ack - tp->snd_una; 1019 tcpstat.tcps_rcvackpack++; 1020 tcpstat.tcps_rcvackbyte += acked; 1021 1022 /* 1023 * If we have a timestamp reply, update smoothed 1024 * round trip time. If no timestamp is present but 1025 * transmit timer is running and timed sequence 1026 * number was acked, update smoothed round trip time. 1027 * Since we now have an rtt measurement, cancel the 1028 * timer backoff (cf., Phil Karn's retransmit alg.). 1029 * Recompute the initial retransmit timer. 
1030 */ 1031 if (opti.ts_present) 1032 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1); 1033 else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) 1034 tcp_xmit_timer(tp,tp->t_rtt); 1035 1036 /* 1037 * If all outstanding data is acked, stop retransmit 1038 * timer and remember to restart (more output or persist). 1039 * If there is more data to be acked, restart retransmit 1040 * timer, using current (possibly backed-off) value. 1041 */ 1042 if (ti->ti_ack == tp->snd_max) { 1043 tp->t_timer[TCPT_REXMT] = 0; 1044 needoutput = 1; 1045 } else if (tp->t_timer[TCPT_PERSIST] == 0) 1046 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 1047 /* 1048 * When new data is acked, open the congestion window. 1049 * If the window gives us less than ssthresh packets 1050 * in flight, open exponentially (maxseg per packet). 1051 * Otherwise open linearly: maxseg per window 1052 * (maxseg^2 / cwnd per packet), plus a constant 1053 * fraction of a packet (maxseg/8) to help larger windows 1054 * open quickly enough. 1055 */ 1056 { 1057 register u_int cw = tp->snd_cwnd; 1058 register u_int incr = tp->t_maxseg; 1059 1060 if (cw > tp->snd_ssthresh) 1061 incr = incr * incr / cw; 1062 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1063 } 1064 if (acked > so->so_snd.sb_cc) { 1065 tp->snd_wnd -= so->so_snd.sb_cc; 1066 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1067 ourfinisacked = 1; 1068 } else { 1069 sbdrop(&so->so_snd, acked); 1070 tp->snd_wnd -= acked; 1071 ourfinisacked = 0; 1072 } 1073 if (sb_notify(&so->so_snd)) 1074 sowwakeup(so); 1075 tp->snd_una = ti->ti_ack; 1076 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1077 tp->snd_nxt = tp->snd_una; 1078 1079 switch (tp->t_state) { 1080 1081 /* 1082 * In FIN_WAIT_1 STATE in addition to the processing 1083 * for the ESTABLISHED state if our FIN is now acknowledged 1084 * then enter FIN_WAIT_2. 1085 */ 1086 case TCPS_FIN_WAIT_1: 1087 if (ourfinisacked) { 1088 /* 1089 * If we can't receive any more 1090 * data, then closing user can proceed. 
1091 * Starting the timer is contrary to the 1092 * specification, but if we don't get a FIN 1093 * we'll hang forever. 1094 */ 1095 if (so->so_state & SS_CANTRCVMORE) { 1096 soisdisconnected(so); 1097 tp->t_timer[TCPT_2MSL] = tcp_maxidle; 1098 } 1099 tp->t_state = TCPS_FIN_WAIT_2; 1100 } 1101 break; 1102 1103 /* 1104 * In CLOSING STATE in addition to the processing for 1105 * the ESTABLISHED state if the ACK acknowledges our FIN 1106 * then enter the TIME-WAIT state, otherwise ignore 1107 * the segment. 1108 */ 1109 case TCPS_CLOSING: 1110 if (ourfinisacked) { 1111 tp->t_state = TCPS_TIME_WAIT; 1112 tcp_canceltimers(tp); 1113 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1114 soisdisconnected(so); 1115 } 1116 break; 1117 1118 /* 1119 * In LAST_ACK, we may still be waiting for data to drain 1120 * and/or to be acked, as well as for the ack of our FIN. 1121 * If our FIN is now acknowledged, delete the TCB, 1122 * enter the closed state and return. 1123 */ 1124 case TCPS_LAST_ACK: 1125 if (ourfinisacked) { 1126 tp = tcp_close(tp); 1127 goto drop; 1128 } 1129 break; 1130 1131 /* 1132 * In TIME_WAIT state the only thing that should arrive 1133 * is a retransmission of the remote FIN. Acknowledge 1134 * it and restart the finack timer. 1135 */ 1136 case TCPS_TIME_WAIT: 1137 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1138 goto dropafterack; 1139 } 1140 } 1141 1142 step6: 1143 /* 1144 * Update window information. 1145 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
1146 */ 1147 if (((tiflags & TH_ACK) && SEQ_LT(tp->snd_wl1, ti->ti_seq)) || 1148 (tp->snd_wl1 == ti->ti_seq && SEQ_LT(tp->snd_wl2, ti->ti_ack)) || 1149 (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)) { 1150 /* keep track of pure window updates */ 1151 if (ti->ti_len == 0 && 1152 tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) 1153 tcpstat.tcps_rcvwinupd++; 1154 tp->snd_wnd = tiwin; 1155 tp->snd_wl1 = ti->ti_seq; 1156 tp->snd_wl2 = ti->ti_ack; 1157 if (tp->snd_wnd > tp->max_sndwnd) 1158 tp->max_sndwnd = tp->snd_wnd; 1159 needoutput = 1; 1160 } 1161 1162 /* 1163 * Process segments with URG. 1164 */ 1165 if ((tiflags & TH_URG) && ti->ti_urp && 1166 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1167 /* 1168 * This is a kludge, but if we receive and accept 1169 * random urgent pointers, we'll crash in 1170 * soreceive. It's hard to imagine someone 1171 * actually wanting to send this much urgent data. 1172 */ 1173 if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { 1174 ti->ti_urp = 0; /* XXX */ 1175 tiflags &= ~TH_URG; /* XXX */ 1176 goto dodata; /* XXX */ 1177 } 1178 /* 1179 * If this segment advances the known urgent pointer, 1180 * then mark the data stream. This should not happen 1181 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1182 * a FIN has been received from the remote side. 1183 * In these states we ignore the URG. 1184 * 1185 * According to RFC961 (Assigned Protocols), 1186 * the urgent pointer points to the last octet 1187 * of urgent data. We continue, however, 1188 * to consider it to indicate the first octet 1189 * of data past the urgent section as the original 1190 * spec states (in one of two places). 
1191 */ 1192 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { 1193 tp->rcv_up = ti->ti_seq + ti->ti_urp; 1194 so->so_oobmark = so->so_rcv.sb_cc + 1195 (tp->rcv_up - tp->rcv_nxt) - 1; 1196 if (so->so_oobmark == 0) 1197 so->so_state |= SS_RCVATMARK; 1198 sohasoutofband(so); 1199 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1200 } 1201 /* 1202 * Remove out of band data so doesn't get presented to user. 1203 * This can happen independent of advancing the URG pointer, 1204 * but if two URG's are pending at once, some out-of-band 1205 * data may creep in... ick. 1206 */ 1207 if (ti->ti_urp <= (u_int16_t) ti->ti_len 1208 #ifdef SO_OOBINLINE 1209 && (so->so_options & SO_OOBINLINE) == 0 1210 #endif 1211 ) 1212 tcp_pulloutofband(so, ti, m); 1213 } else 1214 /* 1215 * If no out of band data is expected, 1216 * pull receive urgent pointer along 1217 * with the receive window. 1218 */ 1219 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1220 tp->rcv_up = tp->rcv_nxt; 1221 dodata: /* XXX */ 1222 1223 /* 1224 * Process the segment text, merging it into the TCP sequencing queue, 1225 * and arranging for acknowledgment of receipt if necessary. 1226 * This process logically involves adjusting tp->rcv_wnd as data 1227 * is presented to the user (this happens in tcp_usrreq.c, 1228 * case PRU_RCVD). If a FIN has already been received on this 1229 * connection then we just ignore the text. 1230 */ 1231 if ((ti->ti_len || (tiflags & TH_FIN)) && 1232 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1233 TCP_REASS(tp, ti, m, so, tiflags); 1234 /* 1235 * Note the amount of data that peer has sent into 1236 * our window, in order to estimate the sender's 1237 * buffer size. 1238 */ 1239 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1240 } else { 1241 m_freem(m); 1242 tiflags &= ~TH_FIN; 1243 } 1244 1245 /* 1246 * If FIN is received ACK the FIN and let the user know 1247 * that the connection is closing. Ignore a FIN received before 1248 * the connection is fully established. 
1249 */ 1250 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1251 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1252 socantrcvmore(so); 1253 tp->t_flags |= TF_ACKNOW; 1254 tp->rcv_nxt++; 1255 } 1256 switch (tp->t_state) { 1257 1258 /* 1259 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1260 */ 1261 case TCPS_ESTABLISHED: 1262 tp->t_state = TCPS_CLOSE_WAIT; 1263 break; 1264 1265 /* 1266 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1267 * enter the CLOSING state. 1268 */ 1269 case TCPS_FIN_WAIT_1: 1270 tp->t_state = TCPS_CLOSING; 1271 break; 1272 1273 /* 1274 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1275 * starting the time-wait timer, turning off the other 1276 * standard timers. 1277 */ 1278 case TCPS_FIN_WAIT_2: 1279 tp->t_state = TCPS_TIME_WAIT; 1280 tcp_canceltimers(tp); 1281 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1282 soisdisconnected(so); 1283 break; 1284 1285 /* 1286 * In TIME_WAIT state restart the 2 MSL time_wait timer. 1287 */ 1288 case TCPS_TIME_WAIT: 1289 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1290 break; 1291 } 1292 } 1293 if (so->so_options & SO_DEBUG) 1294 tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); 1295 1296 /* 1297 * Return any desired output. 1298 */ 1299 if (needoutput || (tp->t_flags & TF_ACKNOW)) 1300 (void) tcp_output(tp); 1301 return; 1302 1303 dropafterack: 1304 /* 1305 * Generate an ACK dropping incoming segment if it occupies 1306 * sequence space, where the ACK reflects our state. 1307 */ 1308 if (tiflags & TH_RST) 1309 goto drop; 1310 m_freem(m); 1311 tp->t_flags |= TF_ACKNOW; 1312 (void) tcp_output(tp); 1313 return; 1314 1315 dropwithreset: 1316 /* 1317 * Generate a RST, dropping incoming segment. 1318 * Make ACK acceptable to originator of segment. 1319 * Don't bother to respond if destination was broadcast/multicast. 
1320 */ 1321 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || 1322 IN_MULTICAST(ti->ti_dst.s_addr)) 1323 goto drop; 1324 if (tiflags & TH_ACK) 1325 (void)tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); 1326 else { 1327 if (tiflags & TH_SYN) 1328 ti->ti_len++; 1329 (void)tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, 1330 TH_RST|TH_ACK); 1331 } 1332 return; 1333 1334 drop: 1335 /* 1336 * Drop space held by incoming segment and return. 1337 */ 1338 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1339 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); 1340 m_freem(m); 1341 return; 1342 #ifndef TUBA_INCLUDE 1343 } 1344 1345 void 1346 tcp_dooptions(tp, cp, cnt, ti, oi) 1347 struct tcpcb *tp; 1348 u_char *cp; 1349 int cnt; 1350 struct tcpiphdr *ti; 1351 struct tcp_opt_info *oi; 1352 { 1353 u_int16_t mss; 1354 int opt, optlen; 1355 1356 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1357 opt = cp[0]; 1358 if (opt == TCPOPT_EOL) 1359 break; 1360 if (opt == TCPOPT_NOP) 1361 optlen = 1; 1362 else { 1363 optlen = cp[1]; 1364 if (optlen <= 0) 1365 break; 1366 } 1367 switch (opt) { 1368 1369 default: 1370 continue; 1371 1372 case TCPOPT_MAXSEG: 1373 if (optlen != TCPOLEN_MAXSEG) 1374 continue; 1375 if (!(ti->ti_flags & TH_SYN)) 1376 continue; 1377 bcopy(cp + 2, &mss, sizeof(mss)); 1378 oi->maxseg = ntohs(mss); 1379 break; 1380 1381 case TCPOPT_WINDOW: 1382 if (optlen != TCPOLEN_WINDOW) 1383 continue; 1384 if (!(ti->ti_flags & TH_SYN)) 1385 continue; 1386 tp->t_flags |= TF_RCVD_SCALE; 1387 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 1388 break; 1389 1390 case TCPOPT_TIMESTAMP: 1391 if (optlen != TCPOLEN_TIMESTAMP) 1392 continue; 1393 oi->ts_present = 1; 1394 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); 1395 NTOHL(oi->ts_val); 1396 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); 1397 NTOHL(oi->ts_ecr); 1398 1399 /* 1400 * A timestamp received in a SYN makes 1401 * it ok to send timestamp requests and replies. 
1402 */ 1403 if (ti->ti_flags & TH_SYN) { 1404 tp->t_flags |= TF_RCVD_TSTMP; 1405 tp->ts_recent = oi->ts_val; 1406 tp->ts_recent_age = tcp_now; 1407 } 1408 break; 1409 } 1410 } 1411 } 1412 1413 /* 1414 * Pull out of band byte out of a segment so 1415 * it doesn't appear in the user's data queue. 1416 * It is still reflected in the segment length for 1417 * sequencing purposes. 1418 */ 1419 void 1420 tcp_pulloutofband(so, ti, m) 1421 struct socket *so; 1422 struct tcpiphdr *ti; 1423 register struct mbuf *m; 1424 { 1425 int cnt = ti->ti_urp - 1; 1426 1427 while (cnt >= 0) { 1428 if (m->m_len > cnt) { 1429 char *cp = mtod(m, caddr_t) + cnt; 1430 struct tcpcb *tp = sototcpcb(so); 1431 1432 tp->t_iobc = *cp; 1433 tp->t_oobflags |= TCPOOB_HAVEDATA; 1434 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 1435 m->m_len--; 1436 return; 1437 } 1438 cnt -= m->m_len; 1439 m = m->m_next; 1440 if (m == 0) 1441 break; 1442 } 1443 panic("tcp_pulloutofband"); 1444 } 1445 1446 /* 1447 * Collect new round-trip time estimate 1448 * and update averages and current timeout. 1449 */ 1450 void 1451 tcp_xmit_timer(tp, rtt) 1452 register struct tcpcb *tp; 1453 short rtt; 1454 { 1455 register short delta; 1456 1457 tcpstat.tcps_rttupdated++; 1458 --rtt; 1459 if (tp->t_srtt != 0) { 1460 /* 1461 * srtt is stored as fixed point with 3 bits after the 1462 * binary point (i.e., scaled by 8). The following magic 1463 * is equivalent to the smoothing algorithm in rfc793 with 1464 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 1465 * point). Adjust rtt to origin 0. 1466 */ 1467 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 1468 if ((tp->t_srtt += delta) <= 0) 1469 tp->t_srtt = 1 << 2; 1470 /* 1471 * We accumulate a smoothed rtt variance (actually, a 1472 * smoothed mean difference), then set the retransmit 1473 * timer to smoothed rtt + 4 times the smoothed variance. 1474 * rttvar is stored as fixed point with 2 bits after the 1475 * binary point (scaled by 4). 
The following is 1476 * equivalent to rfc793 smoothing with an alpha of .75 1477 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 1478 * rfc793's wired-in beta. 1479 */ 1480 if (delta < 0) 1481 delta = -delta; 1482 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 1483 if ((tp->t_rttvar += delta) <= 0) 1484 tp->t_rttvar = 1 << 2; 1485 } else { 1486 /* 1487 * No rtt measurement yet - use the unsmoothed rtt. 1488 * Set the variance to half the rtt (so our first 1489 * retransmit happens at 3*rtt). 1490 */ 1491 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 1492 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 1493 } 1494 tp->t_rtt = 0; 1495 tp->t_rxtshift = 0; 1496 1497 /* 1498 * the retransmit should happen at rtt + 4 * rttvar. 1499 * Because of the way we do the smoothing, srtt and rttvar 1500 * will each average +1/2 tick of bias. When we compute 1501 * the retransmit timer, we want 1/2 tick of rounding and 1502 * 1 extra tick because of +-1/2 tick uncertainty in the 1503 * firing of the timer. The bias will give us exactly the 1504 * 1.5 tick we need. But, because the bias is 1505 * statistical, we have to test that we don't drop below 1506 * the minimum feasible timer (which is 2 ticks). 1507 */ 1508 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 1509 rtt + 2, TCPTV_REXMTMAX); 1510 1511 /* 1512 * We received an ack for a packet that wasn't retransmitted; 1513 * it is probably safe to discard any error indications we've 1514 * received recently. This isn't quite right, but close enough 1515 * for now (a route might have failed after we sent a segment, 1516 * and the return path might not be symmetrical). 1517 */ 1518 tp->t_softerror = 0; 1519 } 1520 1521 /* 1522 * TCP compressed state engine. Currently used to hold compressed 1523 * state for SYN_RECEIVED. 
1524 */ 1525 1526 u_long syn_cache_count; 1527 u_int32_t syn_hash1, syn_hash2; 1528 1529 #define SYN_HASH(sa, sp, dp) \ 1530 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ 1531 ((u_int32_t)(sp)))^syn_hash2)) \ 1532 & 0x7fffffff) 1533 1534 #define eptosp(ep, e, s) ((struct s *)((char *)(ep) - \ 1535 ((char *)(&((struct s *)0)->e) - (char *)0))) 1536 1537 #define SYN_CACHE_RM(sc, p, scp) { \ 1538 *(p) = (sc)->sc_next; \ 1539 if ((sc)->sc_next) \ 1540 (sc)->sc_next->sc_timer += (sc)->sc_timer; \ 1541 else { \ 1542 (scp)->sch_timer_sum -= (sc)->sc_timer; \ 1543 if ((scp)->sch_timer_sum <= 0) \ 1544 (scp)->sch_timer_sum = -1; \ 1545 /* If need be, fix up the last pointer */ \ 1546 if ((scp)->sch_first) \ 1547 (scp)->sch_last = eptosp(p, sc_next, syn_cache); \ 1548 } \ 1549 (scp)->sch_length--; \ 1550 syn_cache_count--; \ 1551 } 1552 1553 void 1554 syn_cache_insert(sc, prevp, headp) 1555 struct syn_cache *sc; 1556 struct syn_cache ***prevp; 1557 struct syn_cache_head **headp; 1558 { 1559 struct syn_cache_head *scp, *scp2, *sce; 1560 struct syn_cache *sc2; 1561 static u_int timeo_val; 1562 int s; 1563 1564 /* Initialize the hash secrets when adding the first entry */ 1565 if (syn_cache_count == 0) { 1566 struct timeval tv; 1567 microtime(&tv); 1568 syn_hash1 = random() ^ (u_long)≻ 1569 syn_hash2 = random() ^ tv.tv_usec; 1570 } 1571 1572 sc->sc_hash = SYN_HASH(&sc->sc_src, sc->sc_sport, sc->sc_dport); 1573 sc->sc_next = NULL; 1574 scp = &tcp_syn_cache[sc->sc_hash % tcp_syn_cache_size]; 1575 *headp = scp; 1576 1577 /* 1578 * Make sure that we don't overflow the per-bucket 1579 * limit or the total cache size limit. 1580 */ 1581 s = splsoftnet(); 1582 if (scp->sch_length >= tcp_syn_bucket_limit) { 1583 tcpstat.tcps_sc_bucketoverflow++; 1584 sc2 = scp->sch_first; 1585 scp->sch_first = sc2->sc_next; 1586 FREE(sc2, M_PCB); 1587 } else if (syn_cache_count >= tcp_syn_cache_limit) { 1588 tcpstat.tcps_sc_overflowed++; 1589 /* 1590 * The cache is full. 
Toss the first (i.e, oldest) 1591 * element in this bucket. 1592 */ 1593 scp2 = scp; 1594 if (scp2->sch_first == NULL) { 1595 sce = &tcp_syn_cache[tcp_syn_cache_size]; 1596 for (++scp2; scp2 != scp; scp2++) { 1597 if (scp2 >= sce) 1598 scp2 = &tcp_syn_cache[0]; 1599 if (scp2->sch_first) 1600 break; 1601 } 1602 } 1603 sc2 = scp2->sch_first; 1604 if (sc2 == NULL) { 1605 FREE(sc, M_PCB); 1606 return; 1607 } 1608 if ((scp2->sch_first = sc2->sc_next) == NULL) 1609 scp2->sch_last = NULL; 1610 else 1611 sc2->sc_next->sc_timer += sc2->sc_timer; 1612 FREE(sc2, M_PCB); 1613 } else { 1614 scp->sch_length++; 1615 syn_cache_count++; 1616 } 1617 tcpstat.tcps_sc_added++; 1618 1619 /* 1620 * Put it into the bucket. 1621 */ 1622 if (scp->sch_first == NULL) 1623 *prevp = &scp->sch_first; 1624 else { 1625 *prevp = &scp->sch_last->sc_next; 1626 tcpstat.tcps_sc_collisions++; 1627 } 1628 **prevp = sc; 1629 scp->sch_last = sc; 1630 1631 /* 1632 * If the timeout value has changed 1633 * 1) force it to fit in a u_char 1634 * 2) Run the timer routine to truncate all 1635 * existing entries to the new timeout value. 1636 */ 1637 if (timeo_val != tcp_syn_cache_timeo) { 1638 tcp_syn_cache_timeo = min(tcp_syn_cache_timeo, UCHAR_MAX); 1639 if (timeo_val > tcp_syn_cache_timeo) 1640 syn_cache_timer(timeo_val - tcp_syn_cache_timeo); 1641 timeo_val = tcp_syn_cache_timeo; 1642 } 1643 if (scp->sch_timer_sum > 0) 1644 sc->sc_timer = tcp_syn_cache_timeo - scp->sch_timer_sum; 1645 else if (scp->sch_timer_sum == 0) { 1646 /* When the bucket timer is 0, it is not in the cache queue. */ 1647 scp->sch_headq = tcp_syn_cache_first; 1648 tcp_syn_cache_first = scp; 1649 sc->sc_timer = tcp_syn_cache_timeo; 1650 } 1651 scp->sch_timer_sum = tcp_syn_cache_timeo; 1652 splx(s); 1653 } 1654 1655 /* 1656 * Walk down the cache list, decrementing the timer of 1657 * the first element on each entry. If the timer goes 1658 * to zero, remove it and all successive entries with 1659 * a zero timer. 
1660 */ 1661 void 1662 syn_cache_timer(interval) 1663 int interval; 1664 { 1665 struct syn_cache_head *scp, **pscp; 1666 struct syn_cache *sc, *scn; 1667 int n, s; 1668 1669 pscp = &tcp_syn_cache_first; 1670 scp = tcp_syn_cache_first; 1671 s = splsoftnet(); 1672 while (scp) { 1673 /* 1674 * Remove any empty hash buckets 1675 * from the cache queue. 1676 */ 1677 if ((sc = scp->sch_first) == NULL) { 1678 *pscp = scp->sch_headq; 1679 scp->sch_headq = NULL; 1680 scp->sch_timer_sum = 0; 1681 scp->sch_first = scp->sch_last = NULL; 1682 scp->sch_length = 0; 1683 scp = *pscp; 1684 continue; 1685 } 1686 1687 scp->sch_timer_sum -= interval; 1688 if (scp->sch_timer_sum <= 0) 1689 scp->sch_timer_sum = -1; 1690 n = interval; 1691 while (sc->sc_timer <= n) { 1692 n -= sc->sc_timer; 1693 scn = sc->sc_next; 1694 tcpstat.tcps_sc_timed_out++; 1695 syn_cache_count--; 1696 FREE(sc, M_PCB); 1697 scp->sch_length--; 1698 if ((sc = scn) == NULL) 1699 break; 1700 } 1701 if ((scp->sch_first = sc) != NULL) { 1702 sc->sc_timer -= n; 1703 pscp = &scp->sch_headq; 1704 scp = scp->sch_headq; 1705 } 1706 } 1707 splx(s); 1708 } 1709 1710 /* 1711 * Find an entry in the syn cache. 
1712 */ 1713 struct syn_cache * 1714 syn_cache_lookup(ti, prevp, headp) 1715 struct tcpiphdr *ti; 1716 struct syn_cache ***prevp; 1717 struct syn_cache_head **headp; 1718 { 1719 struct syn_cache *sc, **prev; 1720 struct syn_cache_head *head; 1721 u_int32_t hash; 1722 int s; 1723 1724 hash = SYN_HASH(&ti->ti_src, ti->ti_sport, ti->ti_dport); 1725 1726 head = &tcp_syn_cache[hash % tcp_syn_cache_size]; 1727 *headp = head; 1728 prev = &head->sch_first; 1729 s = splsoftnet(); 1730 for (sc = head->sch_first; sc; prev = &sc->sc_next, sc = sc->sc_next) { 1731 if (sc->sc_hash != hash) 1732 continue; 1733 if (sc->sc_src.s_addr == ti->ti_src.s_addr && 1734 sc->sc_sport == ti->ti_sport && 1735 sc->sc_dport == ti->ti_dport && 1736 sc->sc_dst.s_addr == ti->ti_dst.s_addr) { 1737 *prevp = prev; 1738 splx(s); 1739 return (sc); 1740 } 1741 } 1742 splx(s); 1743 return (NULL); 1744 } 1745 1746 /* 1747 * This function gets called when we receive an ACK for a 1748 * socket in the LISTEN state. We look up the connection 1749 * in the syn cache, and if its there, we pull it out of 1750 * the cache and turn it into a full-blown connection in 1751 * the SYN-RECEIVED state. 1752 * 1753 * The return values may not be immediately obvious, and their effects 1754 * can be subtle, so here they are: 1755 * 1756 * NULL SYN was not found in cache; caller should drop the 1757 * packet and send an RST. 1758 * 1759 * -1 We were unable to create the new connection, and are 1760 * aborting it. An ACK,RST is being sent to the peer 1761 * (unless we got screwey sequence numbners; see below), 1762 * because the 3-way handshake has been completed. Caller 1763 * should not free the mbuf, since we may be using it. If 1764 * we are not, we will free it. 1765 * 1766 * Otherwise, the return value is a pointer to the new socket 1767 * associated with the connection. 
1768 */ 1769 struct socket * 1770 syn_cache_get(so, m) 1771 struct socket *so; 1772 struct mbuf *m; 1773 { 1774 struct syn_cache *sc, **sc_prev; 1775 struct syn_cache_head *head; 1776 register struct inpcb *inp; 1777 register struct tcpcb *tp = 0; 1778 register struct tcpiphdr *ti; 1779 struct sockaddr_in *sin; 1780 struct mbuf *am; 1781 long win; 1782 int s; 1783 1784 ti = mtod(m, struct tcpiphdr *); 1785 s = splsoftnet(); 1786 if ((sc = syn_cache_lookup(ti, &sc_prev, &head)) == NULL) { 1787 splx(s); 1788 return (NULL); 1789 } 1790 1791 win = sbspace(&so->so_rcv); 1792 if (win > TCP_MAXWIN) 1793 win = TCP_MAXWIN; 1794 1795 /* 1796 * Verify the sequence and ack numbers. 1797 */ 1798 if ((ti->ti_ack != sc->sc_iss + 1) || 1799 SEQ_LEQ(ti->ti_seq, sc->sc_irs) || 1800 SEQ_GT(ti->ti_seq, sc->sc_irs + 1 + win)) { 1801 (void) syn_cache_respond(sc, m, ti, win, 0); 1802 splx(s); 1803 return ((struct socket *)(-1)); 1804 } 1805 1806 /* Remove this cache entry */ 1807 SYN_CACHE_RM(sc, sc_prev, head); 1808 splx(s); 1809 1810 /* 1811 * Ok, create the full blown connection, and set things up 1812 * as they would have been set up if we had created the 1813 * connection when the SYN arrived. If we can't create 1814 * the connection, abort it. 
1815 */ 1816 so = sonewconn(so, SS_ISCONNECTED); 1817 if (so == NULL) 1818 goto resetandabort; 1819 1820 inp = sotoinpcb(so); 1821 inp->inp_laddr = sc->sc_dst; 1822 inp->inp_lport = sc->sc_dport; 1823 in_pcbstate(inp, INP_BOUND); 1824 #if BSD>=43 1825 inp->inp_options = ip_srcroute(); 1826 #endif 1827 1828 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 1829 if (am == NULL) { 1830 m_freem(m); 1831 goto resetandabort; 1832 } 1833 am->m_len = sizeof(struct sockaddr_in); 1834 sin = mtod(am, struct sockaddr_in *); 1835 sin->sin_family = AF_INET; 1836 sin->sin_len = sizeof(*sin); 1837 sin->sin_addr = sc->sc_src; 1838 sin->sin_port = sc->sc_sport; 1839 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); 1840 if (in_pcbconnect(inp, am)) { 1841 (void) m_free(am); 1842 m_freem(m); 1843 goto resetandabort; 1844 } 1845 (void) m_free(am); 1846 1847 tp = intotcpcb(inp); 1848 if (sc->sc_request_r_scale != 15) { 1849 tp->requested_s_scale = sc->sc_requested_s_scale; 1850 tp->request_r_scale = sc->sc_request_r_scale; 1851 tp->snd_scale = sc->sc_requested_s_scale; 1852 tp->rcv_scale = sc->sc_request_r_scale; 1853 tp->t_flags |= TF_RCVD_SCALE; 1854 } 1855 if (sc->sc_tstmp) 1856 tp->t_flags |= TF_RCVD_TSTMP; 1857 1858 tp->t_template = tcp_template(tp); 1859 if (tp->t_template == 0) { 1860 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 1861 so = NULL; 1862 m_freem(m); 1863 goto abort; 1864 } 1865 1866 tp->iss = sc->sc_iss; 1867 tp->irs = sc->sc_irs; 1868 tcp_sendseqinit(tp); 1869 tcp_rcvseqinit(tp); 1870 tp->t_state = TCPS_SYN_RECEIVED; 1871 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 1872 tcpstat.tcps_accepts++; 1873 1874 /* Initialize tp->t_ourmss before we deal with the peer's! */ 1875 tp->t_ourmss = sc->sc_ourmaxseg; 1876 tcp_mss_from_peer(tp, sc->sc_peermaxseg); 1877 tcp_rmx_rtt(tp); 1878 tp->snd_wl1 = sc->sc_irs; 1879 tp->rcv_up = sc->sc_irs + 1; 1880 1881 /* 1882 * This is what whould have happened in tcp_ouput() when 1883 * the SYN,ACK was sent. 
1884 */ 1885 tp->snd_up = tp->snd_una; 1886 tp->snd_max = tp->snd_nxt = tp->iss+1; 1887 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 1888 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1889 tp->rcv_adv = tp->rcv_nxt + win; 1890 tp->last_ack_sent = tp->rcv_nxt; 1891 1892 tcpstat.tcps_sc_completed++; 1893 FREE(sc, M_PCB); 1894 return (so); 1895 1896 resetandabort: 1897 (void) tcp_respond(NULL, ti, m, ti->ti_seq+ti->ti_len, 1898 (tcp_seq)0, TH_RST|TH_ACK); 1899 abort: 1900 if (so != NULL) 1901 (void) soabort(so); 1902 FREE(sc, M_PCB); 1903 tcpstat.tcps_sc_aborted++; 1904 return ((struct socket *)(-1)); 1905 } 1906 1907 /* 1908 * This function is called when we get a RST for a 1909 * non-existant connection, so that we can see if the 1910 * connection is in the syn cache. If it is, zap it. 1911 */ 1912 1913 void 1914 syn_cache_reset(ti) 1915 register struct tcpiphdr *ti; 1916 { 1917 struct syn_cache *sc, **sc_prev; 1918 struct syn_cache_head *head; 1919 int s = splsoftnet(); 1920 1921 if ((sc = syn_cache_lookup(ti, &sc_prev, &head)) == NULL) { 1922 splx(s); 1923 return; 1924 } 1925 if (SEQ_LT(ti->ti_seq,sc->sc_irs) || 1926 SEQ_GT(ti->ti_seq, sc->sc_irs+1)) { 1927 splx(s); 1928 return; 1929 } 1930 SYN_CACHE_RM(sc, sc_prev, head); 1931 splx(s); 1932 tcpstat.tcps_sc_reset++; 1933 FREE(sc, M_PCB); 1934 } 1935 1936 void 1937 syn_cache_unreach(ip, th) 1938 struct ip *ip; 1939 struct tcphdr *th; 1940 { 1941 struct syn_cache *sc, **sc_prev; 1942 struct syn_cache_head *head; 1943 struct tcpiphdr ti2; 1944 int s; 1945 1946 ti2.ti_src.s_addr = ip->ip_dst.s_addr; 1947 ti2.ti_dst.s_addr = ip->ip_src.s_addr; 1948 ti2.ti_sport = th->th_dport; 1949 ti2.ti_dport = th->th_sport; 1950 1951 s = splsoftnet(); 1952 if ((sc = syn_cache_lookup(&ti2, &sc_prev, &head)) == NULL) { 1953 splx(s); 1954 return; 1955 } 1956 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 1957 if (ntohl (th->th_seq) != sc->sc_iss) { 1958 splx(s); 1959 return; 1960 } 1961 SYN_CACHE_RM(sc, sc_prev, 
head); 1962 splx(s); 1963 tcpstat.tcps_sc_unreach++; 1964 FREE(sc, M_PCB); 1965 } 1966 1967 /* 1968 * Given a LISTEN socket and an inbound SYN request, add 1969 * this to the syn cache, and send back a segment: 1970 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 1971 * to the source. 1972 * 1973 * XXX We don't properly handle SYN-with-data! 1974 */ 1975 1976 int 1977 syn_cache_add(so, m, optp, optlen, oi) 1978 struct socket *so; 1979 struct mbuf *m; 1980 u_char *optp; 1981 int optlen; 1982 struct tcp_opt_info *oi; 1983 { 1984 register struct tcpiphdr *ti; 1985 struct tcpcb tb, *tp; 1986 long win; 1987 struct syn_cache *sc, **sc_prev; 1988 struct syn_cache_head *scp; 1989 extern int tcp_do_rfc1323; 1990 1991 tp = sototcpcb(so); 1992 ti = mtod(m, struct tcpiphdr *); 1993 1994 /* 1995 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 1996 * in_broadcast() should never return true on a received 1997 * packet with M_BCAST not set. 1998 */ 1999 if (m->m_flags & (M_BCAST|M_MCAST) || 2000 IN_MULTICAST(ti->ti_src.s_addr) || 2001 IN_MULTICAST(ti->ti_dst.s_addr)) 2002 return (0); 2003 2004 /* 2005 * Initialize some local state. 2006 */ 2007 win = sbspace(&so->so_rcv); 2008 if (win > TCP_MAXWIN) 2009 win = TCP_MAXWIN; 2010 2011 if (optp) { 2012 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 2013 tcp_dooptions(&tb, optp, optlen, ti, oi); 2014 } else 2015 tb.t_flags = 0; 2016 2017 /* 2018 * See if we already have an entry for this connection. 2019 */ 2020 if ((sc = syn_cache_lookup(ti, &sc_prev, &scp)) != NULL) { 2021 tcpstat.tcps_sc_dupesyn++; 2022 if (syn_cache_respond(sc, m, ti, win, tb.ts_recent) == 0) { 2023 tcpstat.tcps_sndacks++; 2024 tcpstat.tcps_sndtotal++; 2025 } 2026 return (1); 2027 } 2028 2029 MALLOC(sc, struct syn_cache *, sizeof(*sc), M_PCB, M_NOWAIT); 2030 if (sc == NULL) 2031 return (0); 2032 /* 2033 * Fill in the cache, and put the necessary TCP 2034 * options into the reply. 
2035 */ 2036 sc->sc_src.s_addr = ti->ti_src.s_addr; 2037 sc->sc_dst.s_addr = ti->ti_dst.s_addr; 2038 sc->sc_sport = ti->ti_sport; 2039 sc->sc_dport = ti->ti_dport; 2040 sc->sc_irs = ti->ti_seq; 2041 sc->sc_iss = tcp_iss; 2042 tcp_iss += TCP_ISSINCR/2; 2043 sc->sc_peermaxseg = oi->maxseg; 2044 sc->sc_ourmaxseg = tcp_mss_to_advertise(tp); 2045 sc->sc_tstmp = (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) ? 1 : 0; 2046 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2047 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2048 sc->sc_requested_s_scale = tb.requested_s_scale; 2049 sc->sc_request_r_scale = 0; 2050 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 2051 TCP_MAXWIN << sc->sc_request_r_scale < 2052 so->so_rcv.sb_hiwat) 2053 sc->sc_request_r_scale++; 2054 } else { 2055 sc->sc_requested_s_scale = 15; 2056 sc->sc_request_r_scale = 15; 2057 } 2058 if (syn_cache_respond(sc, m, ti, win, tb.ts_recent) == 0) { 2059 syn_cache_insert(sc, &sc_prev, &scp); 2060 tcpstat.tcps_sndacks++; 2061 tcpstat.tcps_sndtotal++; 2062 } else { 2063 FREE(sc, M_PCB); 2064 tcpstat.tcps_sc_dropped++; 2065 } 2066 return (1); 2067 } 2068 2069 int 2070 syn_cache_respond(sc, m, ti, win, ts) 2071 struct syn_cache *sc; 2072 struct mbuf *m; 2073 register struct tcpiphdr *ti; 2074 long win; 2075 u_long ts; 2076 { 2077 u_int8_t *optp; 2078 int optlen; 2079 2080 /* 2081 * Tack on the TCP options. If there isn't enough trailing 2082 * space for them, move up the fixed header to make space. 2083 */ 2084 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 2085 (sc->sc_tstmp ? 
TCPOLEN_TSTAMP_APPA : 0); 2086 if (optlen > M_TRAILINGSPACE(m)) { 2087 if (M_LEADINGSPACE(m) >= optlen) { 2088 m->m_data -= optlen; 2089 m->m_len += optlen; 2090 } else { 2091 struct mbuf *m0 = m; 2092 if ((m = m_gethdr(M_DONTWAIT, MT_HEADER)) == NULL) { 2093 m_freem(m0); 2094 return (ENOBUFS); 2095 } 2096 MH_ALIGN(m, sizeof(*ti) + optlen); 2097 m->m_next = m0; /* this gets freed below */ 2098 } 2099 ovbcopy((caddr_t)ti, mtod(m, caddr_t), sizeof(*ti)); 2100 ti = mtod(m, struct tcpiphdr *); 2101 } 2102 2103 optp = (u_int8_t *)(ti + 1); 2104 optp[0] = TCPOPT_MAXSEG; 2105 optp[1] = 4; 2106 optp[2] = (sc->sc_ourmaxseg >> 8) & 0xff; 2107 optp[3] = sc->sc_ourmaxseg & 0xff; 2108 optlen = 4; 2109 2110 if (sc->sc_request_r_scale != 15) { 2111 *((u_int32_t *)(optp + optlen)) = htonl(TCPOPT_NOP << 24 | 2112 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 2113 sc->sc_request_r_scale); 2114 optlen += 4; 2115 } 2116 2117 if (sc->sc_tstmp) { 2118 u_int32_t *lp = (u_int32_t *)(optp + optlen); 2119 /* Form timestamp option as shown in appendix A of RFC 1323. */ 2120 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 2121 *lp++ = htonl(tcp_now); 2122 *lp = htonl(ts); 2123 optlen += TCPOLEN_TSTAMP_APPA; 2124 } 2125 2126 /* 2127 * Toss any trailing mbufs. No need to worry about 2128 * m_len and m_pkthdr.len, since tcp_respond() will 2129 * unconditionally set them. 2130 */ 2131 if (m->m_next) { 2132 m_freem(m->m_next); 2133 m->m_next = NULL; 2134 } 2135 2136 /* 2137 * Fill in the fields that tcp_respond() will not touch, and 2138 * then send the response. 2139 */ 2140 ti->ti_off = (sizeof(struct tcphdr) + optlen) >> 2; 2141 ti->ti_win = htons(win); 2142 return (tcp_respond(NULL, ti, m, sc->sc_irs + 1, sc->sc_iss, 2143 TH_SYN|TH_ACK)); 2144 } 2145 #endif /* TUBA_INCLUDE */ 2146