1 /* $NetBSD: tcp_subr.c,v 1.160 2004/01/07 19:15:43 matt Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1997, 1998, 2000, 2001 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 38 * Facility, NASA Ames Research Center. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. All advertising materials mentioning features or use of this software 49 * must display the following acknowledgement: 50 * This product includes software developed by the NetBSD 51 * Foundation, Inc. and its contributors. 52 * 4. Neither the name of The NetBSD Foundation nor the names of its 53 * contributors may be used to endorse or promote products derived 54 * from this software without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 57 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 58 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 59 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 60 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 66 * POSSIBILITY OF SUCH DAMAGE. 67 */ 68 69 /* 70 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 71 * The Regents of the University of California. All rights reserved. 72 * 73 * Redistribution and use in source and binary forms, with or without 74 * modification, are permitted provided that the following conditions 75 * are met: 76 * 1. Redistributions of source code must retain the above copyright 77 * notice, this list of conditions and the following disclaimer. 78 * 2. Redistributions in binary form must reproduce the above copyright 79 * notice, this list of conditions and the following disclaimer in the 80 * documentation and/or other materials provided with the distribution. 81 * 3. Neither the name of the University nor the names of its contributors 82 * may be used to endorse or promote products derived from this software 83 * without specific prior written permission. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 * SUCH DAMAGE. 96 * 97 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 98 */ 99 100 #include <sys/cdefs.h> 101 __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.160 2004/01/07 19:15:43 matt Exp $"); 102 103 #include "opt_inet.h" 104 #include "opt_ipsec.h" 105 #include "opt_tcp_compat_42.h" 106 #include "opt_inet_csum.h" 107 #include "opt_mbuftrace.h" 108 #include "rnd.h" 109 110 #include <sys/param.h> 111 #include <sys/proc.h> 112 #include <sys/systm.h> 113 #include <sys/malloc.h> 114 #include <sys/mbuf.h> 115 #include <sys/socket.h> 116 #include <sys/socketvar.h> 117 #include <sys/protosw.h> 118 #include <sys/errno.h> 119 #include <sys/kernel.h> 120 #include <sys/pool.h> 121 #if NRND > 0 122 #include <sys/md5.h> 123 #include <sys/rnd.h> 124 #endif 125 126 #include <net/route.h> 127 #include <net/if.h> 128 129 #include <netinet/in.h> 130 #include <netinet/in_systm.h> 131 #include <netinet/ip.h> 132 #include <netinet/in_pcb.h> 133 #include <netinet/ip_var.h> 134 #include <netinet/ip_icmp.h> 135 136 #ifdef INET6 137 #ifndef INET 138 #include <netinet/in.h> 139 #endif 140 #include <netinet/ip6.h> 141 #include <netinet6/in6_pcb.h> 142 #include <netinet6/ip6_var.h> 143 #include <netinet6/in6_var.h> 144 #include <netinet6/ip6protosw.h> 145 #include <netinet/icmp6.h> 146 #include <netinet6/nd6.h> 147 #endif 148 149 #include <netinet/tcp.h> 150 #include <netinet/tcp_fsm.h> 151 #include <netinet/tcp_seq.h> 152 #include <netinet/tcp_timer.h> 153 #include <netinet/tcp_var.h> 154 #include <netinet/tcpip.h> 155 156 #ifdef IPSEC 157 #include <netinet6/ipsec.h> 158 #endif /*IPSEC*/ 159 160 #ifdef FAST_IPSEC 161 #include <netipsec/ipsec.h> 162 #ifdef INET6 163 #include <netipsec/ipsec6.h> 164 #endif 165 #endif /* FAST_IPSEC*/ 166 167 168 struct inpcbtable tcbtable; /* head of queue of active tcpcb's */ 169 struct tcpstat tcpstat; /* tcp statistics */ 170 u_int32_t tcp_now; /* for RFC 1323 timestamps */ 171 172 /* patchable/settable parameters for tcp */ 173 int tcp_mssdflt = TCP_MSS; 174 int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 175 int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */ 176 #if NRND > 0 177 int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */ 178 #endif 179 int tcp_do_sack = 1; /* selective acknowledgement */ 180 int tcp_do_win_scale = 1; /* RFC1323 window scaling */ 181 int tcp_do_timestamps = 1; /* RFC1323 timestamps */ 182 int tcp_do_newreno = 0; /* Use the New Reno algorithms */ 183 int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */ 184 #ifndef TCP_INIT_WIN 185 #define TCP_INIT_WIN 1 /* initial slow start window */ 186 #endif 187 #ifndef TCP_INIT_WIN_LOCAL 188 #define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */ 189 #endif 190 int tcp_init_win = TCP_INIT_WIN; 191 int tcp_init_win_local = TCP_INIT_WIN_LOCAL; 192 int tcp_mss_ifmtu = 0; 193 #ifdef TCP_COMPAT_42 194 int tcp_compat_42 = 1; 195 #else 196 int tcp_compat_42 = 0; 197 #endif 198 int tcp_rst_ppslim = 100; /* 100pps */ 199 200 /* tcb hash */ 201 #ifndef TCBHASHSIZE 202 #define TCBHASHSIZE 128 203 #endif 204 int tcbhashsize = TCBHASHSIZE; 205 206 /* syn hash parameters */ 207 #define TCP_SYN_HASH_SIZE 293 208 #define TCP_SYN_BUCKET_SIZE 35 209 int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; 210 int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; 211 int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; 212 struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; 213 214 int tcp_freeq __P((struct tcpcb *)); 215 216 #ifdef INET 217 void tcp_mtudisc_callback __P((struct in_addr)); 218 #endif 219 #ifdef INET6 220 void tcp6_mtudisc_callback __P((struct in6_addr *)); 221 #endif 222 223 void tcp_mtudisc __P((struct inpcb *, int)); 224 #ifdef INET6 225 void tcp6_mtudisc __P((struct in6pcb *, int)); 226 #endif 227 228 struct pool tcpcb_pool; 229 230 #ifdef TCP_CSUM_COUNTERS 231 #include <sys/device.h> 232 233 struct evcnt tcp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 234 NULL, "tcp", "hwcsum bad"); 235 struct evcnt tcp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 236 NULL, "tcp", "hwcsum ok"); 237 struct evcnt tcp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 238 NULL, "tcp", "hwcsum data"); 239 struct evcnt tcp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 240 NULL, "tcp", "swcsum"); 241 #endif /* TCP_CSUM_COUNTERS */ 242 243 #ifdef TCP_OUTPUT_COUNTERS 244 #include <sys/device.h> 245 246 struct evcnt tcp_output_bigheader = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 247 NULL, "tcp", "output big header"); 248 struct evcnt tcp_output_predict_hit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 249 NULL, "tcp", "output predict hit"); 250 struct evcnt tcp_output_predict_miss = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 251 NULL, "tcp", "output predict miss"); 252 struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 253 NULL, "tcp", "output copy small"); 254 struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 255 NULL, "tcp", "output copy big"); 256 struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 257 NULL, "tcp", "output reference big"); 258 #endif /* TCP_OUTPUT_COUNTERS */ 259 260 #ifdef TCP_REASS_COUNTERS 261 #include <sys/device.h> 262 263 struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 264 NULL, "tcp_reass", "calls"); 265 struct evcnt tcp_reass_empty = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 266 &tcp_reass_, "tcp_reass", "insert into empty queue"); 267 struct evcnt tcp_reass_iteration[8] = { 268 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", ">7 iterations"), 269 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "1 iteration"), 270 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "2 iterations"), 271 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "3 iterations"), 272 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "4 iterations"), 273 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "5 iterations"), 274 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "6 iterations"), 275 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "7 iterations"), 276 }; 277 struct evcnt tcp_reass_prependfirst = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 278 &tcp_reass_, "tcp_reass", "prepend to first"); 279 struct evcnt tcp_reass_prepend = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 280 &tcp_reass_, "tcp_reass", "prepend"); 281 struct evcnt tcp_reass_insert = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 282 &tcp_reass_, "tcp_reass", "insert"); 283 struct evcnt tcp_reass_inserttail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 284 &tcp_reass_, "tcp_reass", "insert at tail"); 285 struct evcnt tcp_reass_append = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 286 &tcp_reass_, "tcp_reass", "append"); 287 struct evcnt tcp_reass_appendtail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 288 &tcp_reass_, "tcp_reass", "append to tail fragment"); 289 struct evcnt tcp_reass_overlaptail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 290 &tcp_reass_, "tcp_reass", "overlap at end"); 291 struct evcnt tcp_reass_overlapfront = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 292 &tcp_reass_, "tcp_reass", "overlap at start"); 293 struct evcnt tcp_reass_segdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 294 &tcp_reass_, "tcp_reass", "duplicate segment"); 295 struct evcnt tcp_reass_fragdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 296 &tcp_reass_, "tcp_reass", "duplicate fragment"); 297 298 #endif /* TCP_REASS_COUNTERS */ 299 300 #ifdef MBUFTRACE 301 struct mowner tcp_mowner = { "tcp" }; 302 struct mowner tcp_rx_mowner = { "tcp", "rx" }; 303 struct mowner tcp_tx_mowner = { "tcp", "tx" }; 304 #endif 305 306 /* 307 * Tcp initialization 308 */ 309 void 310 tcp_init() 311 { 312 int hlen; 313 314 /* Initialize the TCPCB template. */ 315 tcp_tcpcb_template(); 316 317 pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl", 318 NULL); 319 in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize); 320 321 hlen = sizeof(struct ip) + sizeof(struct tcphdr); 322 #ifdef INET6 323 if (sizeof(struct ip) < sizeof(struct ip6_hdr)) 324 hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 325 #endif 326 if (max_protohdr < hlen) 327 max_protohdr = hlen; 328 if (max_linkhdr + hlen > MHLEN) 329 panic("tcp_init"); 330 331 #ifdef INET 332 icmp_mtudisc_callback_register(tcp_mtudisc_callback); 333 #endif 334 #ifdef INET6 335 icmp6_mtudisc_callback_register(tcp6_mtudisc_callback); 336 #endif 337 338 /* Initialize timer state. */ 339 tcp_timer_init(); 340 341 /* Initialize the compressed state engine. */ 342 syn_cache_init(); 343 344 #ifdef TCP_CSUM_COUNTERS 345 evcnt_attach_static(&tcp_hwcsum_bad); 346 evcnt_attach_static(&tcp_hwcsum_ok); 347 evcnt_attach_static(&tcp_hwcsum_data); 348 evcnt_attach_static(&tcp_swcsum); 349 #endif /* TCP_CSUM_COUNTERS */ 350 351 #ifdef TCP_OUTPUT_COUNTERS 352 evcnt_attach_static(&tcp_output_bigheader); 353 evcnt_attach_static(&tcp_output_predict_hit); 354 evcnt_attach_static(&tcp_output_predict_miss); 355 evcnt_attach_static(&tcp_output_copysmall); 356 evcnt_attach_static(&tcp_output_copybig); 357 evcnt_attach_static(&tcp_output_refbig); 358 #endif /* TCP_OUTPUT_COUNTERS */ 359 360 #ifdef TCP_REASS_COUNTERS 361 evcnt_attach_static(&tcp_reass_); 362 evcnt_attach_static(&tcp_reass_empty); 363 evcnt_attach_static(&tcp_reass_iteration[0]); 364 evcnt_attach_static(&tcp_reass_iteration[1]); 365 evcnt_attach_static(&tcp_reass_iteration[2]); 366 evcnt_attach_static(&tcp_reass_iteration[3]); 367 evcnt_attach_static(&tcp_reass_iteration[4]); 368 evcnt_attach_static(&tcp_reass_iteration[5]); 369 evcnt_attach_static(&tcp_reass_iteration[6]); 370 evcnt_attach_static(&tcp_reass_iteration[7]); 371 evcnt_attach_static(&tcp_reass_prependfirst); 372 evcnt_attach_static(&tcp_reass_prepend); 373 evcnt_attach_static(&tcp_reass_insert); 374 evcnt_attach_static(&tcp_reass_inserttail); 375 evcnt_attach_static(&tcp_reass_append); 376 evcnt_attach_static(&tcp_reass_appendtail); 377 evcnt_attach_static(&tcp_reass_overlaptail); 378 evcnt_attach_static(&tcp_reass_overlapfront); 379 evcnt_attach_static(&tcp_reass_segdup); 380 evcnt_attach_static(&tcp_reass_fragdup); 381 #endif /* TCP_REASS_COUNTERS */ 382 383 MOWNER_ATTACH(&tcp_tx_mowner); 384 MOWNER_ATTACH(&tcp_rx_mowner); 385 MOWNER_ATTACH(&tcp_mowner); 386 } 387 388 /* 389 * Create template to be used to send tcp packets on a connection. 390 * Call after host entry created, allocates an mbuf and fills 391 * in a skeletal tcp/ip header, minimizing the amount of work 392 * necessary when the connection is used. 393 */ 394 struct mbuf * 395 tcp_template(tp) 396 struct tcpcb *tp; 397 { 398 struct inpcb *inp = tp->t_inpcb; 399 #ifdef INET6 400 struct in6pcb *in6p = tp->t_in6pcb; 401 #endif 402 struct tcphdr *n; 403 struct mbuf *m; 404 int hlen; 405 406 switch (tp->t_family) { 407 case AF_INET: 408 hlen = sizeof(struct ip); 409 if (inp) 410 break; 411 #ifdef INET6 412 if (in6p) { 413 /* mapped addr case */ 414 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_laddr) 415 && IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) 416 break; 417 } 418 #endif 419 return NULL; /*EINVAL*/ 420 #ifdef INET6 421 case AF_INET6: 422 hlen = sizeof(struct ip6_hdr); 423 if (in6p) { 424 /* more sainty check? */ 425 break; 426 } 427 return NULL; /*EINVAL*/ 428 #endif 429 default: 430 hlen = 0; /*pacify gcc*/ 431 return NULL; /*EAFNOSUPPORT*/ 432 } 433 #ifdef DIAGNOSTIC 434 if (hlen + sizeof(struct tcphdr) > MCLBYTES) 435 panic("mclbytes too small for t_template"); 436 #endif 437 m = tp->t_template; 438 if (m && m->m_len == hlen + sizeof(struct tcphdr)) 439 ; 440 else { 441 if (m) 442 m_freem(m); 443 m = tp->t_template = NULL; 444 MGETHDR(m, M_DONTWAIT, MT_HEADER); 445 if (m && hlen + sizeof(struct tcphdr) > MHLEN) { 446 MCLGET(m, M_DONTWAIT); 447 if ((m->m_flags & M_EXT) == 0) { 448 m_free(m); 449 m = NULL; 450 } 451 } 452 if (m == NULL) 453 return NULL; 454 MCLAIM(m, &tcp_mowner); 455 m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr); 456 } 457 458 bzero(mtod(m, caddr_t), m->m_len); 459 460 n = (struct tcphdr *)(mtod(m, caddr_t) + hlen); 461 462 switch (tp->t_family) { 463 case AF_INET: 464 { 465 struct ipovly *ipov; 466 mtod(m, struct ip *)->ip_v = 4; 467 mtod(m, struct ip *)->ip_hl = hlen >> 2; 468 ipov = mtod(m, struct ipovly *); 469 ipov->ih_pr = IPPROTO_TCP; 470 ipov->ih_len = htons(sizeof(struct tcphdr)); 471 if (inp) { 472 ipov->ih_src = inp->inp_laddr; 473 ipov->ih_dst = inp->inp_faddr; 474 } 475 #ifdef INET6 476 else if (in6p) { 477 /* mapped addr case */ 478 bcopy(&in6p->in6p_laddr.s6_addr32[3], &ipov->ih_src, 479 sizeof(ipov->ih_src)); 480 bcopy(&in6p->in6p_faddr.s6_addr32[3], &ipov->ih_dst, 481 sizeof(ipov->ih_dst)); 482 } 483 #endif 484 /* 485 * Compute the pseudo-header portion of the checksum 486 * now. We incrementally add in the TCP option and 487 * payload lengths later, and then compute the TCP 488 * checksum right before the packet is sent off onto 489 * the wire. 490 */ 491 n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr, 492 ipov->ih_dst.s_addr, 493 htons(sizeof(struct tcphdr) + IPPROTO_TCP)); 494 break; 495 } 496 #ifdef INET6 497 case AF_INET6: 498 { 499 struct ip6_hdr *ip6; 500 mtod(m, struct ip *)->ip_v = 6; 501 ip6 = mtod(m, struct ip6_hdr *); 502 ip6->ip6_nxt = IPPROTO_TCP; 503 ip6->ip6_plen = htons(sizeof(struct tcphdr)); 504 ip6->ip6_src = in6p->in6p_laddr; 505 ip6->ip6_dst = in6p->in6p_faddr; 506 ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK; 507 if (ip6_auto_flowlabel) { 508 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; 509 ip6->ip6_flow |= 510 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); 511 } 512 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 513 ip6->ip6_vfc |= IPV6_VERSION; 514 515 /* 516 * Compute the pseudo-header portion of the checksum 517 * now. We incrementally add in the TCP option and 518 * payload lengths later, and then compute the TCP 519 * checksum right before the packet is sent off onto 520 * the wire. 521 */ 522 n->th_sum = in6_cksum_phdr(&in6p->in6p_laddr, 523 &in6p->in6p_faddr, htonl(sizeof(struct tcphdr)), 524 htonl(IPPROTO_TCP)); 525 break; 526 } 527 #endif 528 } 529 if (inp) { 530 n->th_sport = inp->inp_lport; 531 n->th_dport = inp->inp_fport; 532 } 533 #ifdef INET6 534 else if (in6p) { 535 n->th_sport = in6p->in6p_lport; 536 n->th_dport = in6p->in6p_fport; 537 } 538 #endif 539 n->th_seq = 0; 540 n->th_ack = 0; 541 n->th_x2 = 0; 542 n->th_off = 5; 543 n->th_flags = 0; 544 n->th_win = 0; 545 n->th_urp = 0; 546 return (m); 547 } 548 549 /* 550 * Send a single message to the TCP at address specified by 551 * the given TCP/IP header. If m == 0, then we make a copy 552 * of the tcpiphdr at ti and send directly to the addressed host. 553 * This is used to force keep alive messages out using the TCP 554 * template for a connection tp->t_template. If flags are given 555 * then we send a message back to the TCP which originated the 556 * segment ti, and discard the mbuf containing it and any other 557 * attached mbufs. 558 * 559 * In any case the ack and sequence number of the transmitted 560 * segment are as specified by the parameters. 561 */ 562 int 563 tcp_respond(tp, template, m, th0, ack, seq, flags) 564 struct tcpcb *tp; 565 struct mbuf *template; 566 struct mbuf *m; 567 struct tcphdr *th0; 568 tcp_seq ack, seq; 569 int flags; 570 { 571 struct route *ro; 572 int error, tlen, win = 0; 573 int hlen; 574 struct ip *ip; 575 #ifdef INET6 576 struct ip6_hdr *ip6; 577 #endif 578 int family; /* family on packet, not inpcb/in6pcb! */ 579 struct tcphdr *th; 580 struct socket *so; 581 582 if (tp != NULL && (flags & TH_RST) == 0) { 583 #ifdef DIAGNOSTIC 584 if (tp->t_inpcb && tp->t_in6pcb) 585 panic("tcp_respond: both t_inpcb and t_in6pcb are set"); 586 #endif 587 #ifdef INET 588 if (tp->t_inpcb) 589 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); 590 #endif 591 #ifdef INET6 592 if (tp->t_in6pcb) 593 win = sbspace(&tp->t_in6pcb->in6p_socket->so_rcv); 594 #endif 595 } 596 597 th = NULL; /* Quell uninitialized warning */ 598 ip = NULL; 599 #ifdef INET6 600 ip6 = NULL; 601 #endif 602 if (m == 0) { 603 if (!template) 604 return EINVAL; 605 606 /* get family information from template */ 607 switch (mtod(template, struct ip *)->ip_v) { 608 case 4: 609 family = AF_INET; 610 hlen = sizeof(struct ip); 611 break; 612 #ifdef INET6 613 case 6: 614 family = AF_INET6; 615 hlen = sizeof(struct ip6_hdr); 616 break; 617 #endif 618 default: 619 return EAFNOSUPPORT; 620 } 621 622 MGETHDR(m, M_DONTWAIT, MT_HEADER); 623 if (m) { 624 MCLAIM(m, &tcp_tx_mowner); 625 MCLGET(m, M_DONTWAIT); 626 if ((m->m_flags & M_EXT) == 0) { 627 m_free(m); 628 m = NULL; 629 } 630 } 631 if (m == NULL) 632 return (ENOBUFS); 633 634 if (tcp_compat_42) 635 tlen = 1; 636 else 637 tlen = 0; 638 639 m->m_data += max_linkhdr; 640 bcopy(mtod(template, caddr_t), mtod(m, caddr_t), 641 template->m_len); 642 switch (family) { 643 case AF_INET: 644 ip = mtod(m, struct ip *); 645 th = (struct tcphdr *)(ip + 1); 646 break; 647 #ifdef INET6 648 case AF_INET6: 649 ip6 = mtod(m, struct ip6_hdr *); 650 th = (struct tcphdr *)(ip6 + 1); 651 break; 652 #endif 653 #if 0 654 default: 655 /* noone will visit here */ 656 m_freem(m); 657 return EAFNOSUPPORT; 658 #endif 659 } 660 flags = TH_ACK; 661 } else { 662 663 if ((m->m_flags & M_PKTHDR) == 0) { 664 #if 0 665 printf("non PKTHDR to tcp_respond\n"); 666 #endif 667 m_freem(m); 668 return EINVAL; 669 } 670 #ifdef DIAGNOSTIC 671 if (!th0) 672 panic("th0 == NULL in tcp_respond"); 673 #endif 674 675 /* get family information from m */ 676 switch (mtod(m, struct ip *)->ip_v) { 677 case 4: 678 family = AF_INET; 679 hlen = sizeof(struct ip); 680 ip = mtod(m, struct ip *); 681 break; 682 #ifdef INET6 683 case 6: 684 family = AF_INET6; 685 hlen = sizeof(struct ip6_hdr); 686 ip6 = mtod(m, struct ip6_hdr *); 687 break; 688 #endif 689 default: 690 m_freem(m); 691 return EAFNOSUPPORT; 692 } 693 if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2)) 694 tlen = sizeof(*th0); 695 else 696 tlen = th0->th_off << 2; 697 698 if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 && 699 mtod(m, caddr_t) + hlen == (caddr_t)th0) { 700 m->m_len = hlen + tlen; 701 m_freem(m->m_next); 702 m->m_next = NULL; 703 } else { 704 struct mbuf *n; 705 706 #ifdef DIAGNOSTIC 707 if (max_linkhdr + hlen + tlen > MCLBYTES) { 708 m_freem(m); 709 return EMSGSIZE; 710 } 711 #endif 712 MGETHDR(n, M_DONTWAIT, MT_HEADER); 713 if (n && max_linkhdr + hlen + tlen > MHLEN) { 714 MCLGET(n, M_DONTWAIT); 715 if ((n->m_flags & M_EXT) == 0) { 716 m_freem(n); 717 n = NULL; 718 } 719 } 720 if (!n) { 721 m_freem(m); 722 return ENOBUFS; 723 } 724 725 MCLAIM(n, &tcp_tx_mowner); 726 n->m_data += max_linkhdr; 727 n->m_len = hlen + tlen; 728 m_copyback(n, 0, hlen, mtod(m, caddr_t)); 729 m_copyback(n, hlen, tlen, (caddr_t)th0); 730 731 m_freem(m); 732 m = n; 733 n = NULL; 734 } 735 736 #define xchg(a,b,type) { type t; t=a; a=b; b=t; } 737 switch (family) { 738 case AF_INET: 739 ip = mtod(m, struct ip *); 740 th = (struct tcphdr *)(ip + 1); 741 ip->ip_p = IPPROTO_TCP; 742 xchg(ip->ip_dst, ip->ip_src, struct in_addr); 743 ip->ip_p = IPPROTO_TCP; 744 break; 745 #ifdef INET6 746 case AF_INET6: 747 ip6 = mtod(m, struct ip6_hdr *); 748 th = (struct tcphdr *)(ip6 + 1); 749 ip6->ip6_nxt = IPPROTO_TCP; 750 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 751 ip6->ip6_nxt = IPPROTO_TCP; 752 break; 753 #endif 754 #if 0 755 default: 756 /* noone will visit here */ 757 m_freem(m); 758 return EAFNOSUPPORT; 759 #endif 760 } 761 xchg(th->th_dport, th->th_sport, u_int16_t); 762 #undef xchg 763 tlen = 0; /*be friendly with the following code*/ 764 } 765 th->th_seq = htonl(seq); 766 th->th_ack = htonl(ack); 767 th->th_x2 = 0; 768 if ((flags & TH_SYN) == 0) { 769 if (tp) 770 win >>= tp->rcv_scale; 771 if (win > TCP_MAXWIN) 772 win = TCP_MAXWIN; 773 th->th_win = htons((u_int16_t)win); 774 th->th_off = sizeof (struct tcphdr) >> 2; 775 tlen += sizeof(*th); 776 } else 777 tlen += th->th_off << 2; 778 m->m_len = hlen + tlen; 779 m->m_pkthdr.len = hlen + tlen; 780 m->m_pkthdr.rcvif = (struct ifnet *) 0; 781 th->th_flags = flags; 782 th->th_urp = 0; 783 784 switch (family) { 785 #ifdef INET 786 case AF_INET: 787 { 788 struct ipovly *ipov = (struct ipovly *)ip; 789 bzero(ipov->ih_x1, sizeof ipov->ih_x1); 790 ipov->ih_len = htons((u_int16_t)tlen); 791 792 th->th_sum = 0; 793 th->th_sum = in_cksum(m, hlen + tlen); 794 ip->ip_len = htons(hlen + tlen); 795 ip->ip_ttl = ip_defttl; 796 break; 797 } 798 #endif 799 #ifdef INET6 800 case AF_INET6: 801 { 802 th->th_sum = 0; 803 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 804 tlen); 805 ip6->ip6_plen = ntohs(tlen); 806 if (tp && tp->t_in6pcb) { 807 struct ifnet *oifp; 808 ro = (struct route *)&tp->t_in6pcb->in6p_route; 809 oifp = ro->ro_rt ? ro->ro_rt->rt_ifp : NULL; 810 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb, oifp); 811 } else 812 ip6->ip6_hlim = ip6_defhlim; 813 ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK; 814 if (ip6_auto_flowlabel) { 815 ip6->ip6_flow |= 816 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); 817 } 818 break; 819 } 820 #endif 821 } 822 823 if (tp && tp->t_inpcb) 824 so = tp->t_inpcb->inp_socket; 825 #ifdef INET6 826 else if (tp && tp->t_in6pcb) 827 so = tp->t_in6pcb->in6p_socket; 828 #endif 829 else 830 so = NULL; 831 832 if (tp != NULL && tp->t_inpcb != NULL) { 833 ro = &tp->t_inpcb->inp_route; 834 #ifdef DIAGNOSTIC 835 if (family != AF_INET) 836 panic("tcp_respond: address family mismatch"); 837 if (!in_hosteq(ip->ip_dst, tp->t_inpcb->inp_faddr)) { 838 panic("tcp_respond: ip_dst %x != inp_faddr %x", 839 ntohl(ip->ip_dst.s_addr), 840 ntohl(tp->t_inpcb->inp_faddr.s_addr)); 841 } 842 #endif 843 } 844 #ifdef INET6 845 else if (tp != NULL && tp->t_in6pcb != NULL) { 846 ro = (struct route *)&tp->t_in6pcb->in6p_route; 847 #ifdef DIAGNOSTIC 848 if (family == AF_INET) { 849 if (!IN6_IS_ADDR_V4MAPPED(&tp->t_in6pcb->in6p_faddr)) 850 panic("tcp_respond: not mapped addr"); 851 if (bcmp(&ip->ip_dst, 852 &tp->t_in6pcb->in6p_faddr.s6_addr32[3], 853 sizeof(ip->ip_dst)) != 0) { 854 panic("tcp_respond: ip_dst != in6p_faddr"); 855 } 856 } else if (family == AF_INET6) { 857 if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, 858 &tp->t_in6pcb->in6p_faddr)) 859 panic("tcp_respond: ip6_dst != in6p_faddr"); 860 } else 861 panic("tcp_respond: address family mismatch"); 862 #endif 863 } 864 #endif 865 else 866 ro = NULL; 867 868 switch (family) { 869 #ifdef INET 870 case AF_INET: 871 error = ip_output(m, NULL, ro, 872 (tp && tp->t_mtudisc ? IP_MTUDISC : 0), 873 (struct ip_moptions *)0, so); 874 break; 875 #endif 876 #ifdef INET6 877 case AF_INET6: 878 error = ip6_output(m, NULL, (struct route_in6 *)ro, 0, 879 (struct ip6_moptions *)0, so, NULL); 880 break; 881 #endif 882 default: 883 error = EAFNOSUPPORT; 884 break; 885 } 886 887 return (error); 888 } 889 890 /* 891 * Template TCPCB. Rather than zeroing a new TCPCB and initializing 892 * a bunch of members individually, we maintain this template for the 893 * static and mostly-static components of the TCPCB, and copy it into 894 * the new TCPCB instead. 895 */ 896 static struct tcpcb tcpcb_template = { 897 /* 898 * If TCP_NTIMERS ever changes, we'll need to update this 899 * initializer. 900 */ 901 .t_timer = { 902 CALLOUT_INITIALIZER, 903 CALLOUT_INITIALIZER, 904 CALLOUT_INITIALIZER, 905 CALLOUT_INITIALIZER, 906 }, 907 .t_delack_ch = CALLOUT_INITIALIZER, 908 909 .t_srtt = TCPTV_SRTTBASE, 910 .t_rttmin = TCPTV_MIN, 911 912 .snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT, 913 .snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT, 914 }; 915 916 /* 917 * Updates the TCPCB template whenever a parameter that would affect 918 * the template is changed. 919 */ 920 void 921 tcp_tcpcb_template(void) 922 { 923 struct tcpcb *tp = &tcpcb_template; 924 int flags; 925 926 tp->t_peermss = tcp_mssdflt; 927 tp->t_ourmss = tcp_mssdflt; 928 tp->t_segsz = tcp_mssdflt; 929 930 flags = 0; 931 if (tcp_do_rfc1323 && tcp_do_win_scale) 932 flags |= TF_REQ_SCALE; 933 if (tcp_do_rfc1323 && tcp_do_timestamps) 934 flags |= TF_REQ_TSTMP; 935 if (tcp_do_sack == 2) 936 flags |= TF_WILL_SACK; 937 else if (tcp_do_sack == 1) 938 flags |= TF_WILL_SACK|TF_IGNR_RXSACK; 939 flags |= TF_CANT_TXSACK; 940 tp->t_flags = flags; 941 942 /* 943 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 944 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives 945 * reasonable initial retransmit time. 946 */ 947 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1); 948 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 949 TCPTV_MIN, TCPTV_REXMTMAX); 950 } 951 952 /* 953 * Create a new TCP control block, making an 954 * empty reassembly queue and hooking it to the argument 955 * protocol control block. 956 */ 957 struct tcpcb * 958 tcp_newtcpcb(family, aux) 959 int family; /* selects inpcb, or in6pcb */ 960 void *aux; 961 { 962 struct tcpcb *tp; 963 int i; 964 965 /* XXX Consider using a pool_cache for speed. */ 966 tp = pool_get(&tcpcb_pool, PR_NOWAIT); 967 if (tp == NULL) 968 return (NULL); 969 memcpy(tp, &tcpcb_template, sizeof(*tp)); 970 TAILQ_INIT(&tp->segq); 971 TAILQ_INIT(&tp->timeq); 972 tp->t_family = family; /* may be overridden later on */ 973 LIST_INIT(&tp->t_sc); /* XXX can template this */ 974 975 /* Don't sweat this loop; hopefully the compiler will unroll it. */ 976 for (i = 0; i < TCPT_NTIMERS; i++) 977 TCP_TIMER_INIT(tp, i); 978 979 switch (family) { 980 case AF_INET: 981 { 982 struct inpcb *inp = (struct inpcb *)aux; 983 984 inp->inp_ip.ip_ttl = ip_defttl; 985 inp->inp_ppcb = (caddr_t)tp; 986 987 tp->t_inpcb = inp; 988 tp->t_mtudisc = ip_mtudisc; 989 break; 990 } 991 #ifdef INET6 992 case AF_INET6: 993 { 994 struct in6pcb *in6p = (struct in6pcb *)aux; 995 996 in6p->in6p_ip6.ip6_hlim = in6_selecthlim(in6p, 997 in6p->in6p_route.ro_rt ? in6p->in6p_route.ro_rt->rt_ifp 998 : NULL); 999 in6p->in6p_ppcb = (caddr_t)tp; 1000 1001 tp->t_in6pcb = in6p; 1002 /* for IPv6, always try to run path MTU discovery */ 1003 tp->t_mtudisc = 1; 1004 break; 1005 } 1006 #endif /* INET6 */ 1007 default: 1008 pool_put(&tcpcb_pool, tp); 1009 return (NULL); 1010 } 1011 1012 /* 1013 * Initialize our timebase. When we send timestamps, we take 1014 * the delta from tcp_now -- this means each connection always 1015 * gets a timebase of 0, which makes it, among other things, 1016 * more difficult to determine how long a system has been up, 1017 * and thus how many TCP sequence increments have occurred. 1018 */ 1019 tp->ts_timebase = tcp_now; 1020 1021 return (tp); 1022 } 1023 1024 /* 1025 * Drop a TCP connection, reporting 1026 * the specified error. If connection is synchronized, 1027 * then send a RST to peer. 1028 */ 1029 struct tcpcb * 1030 tcp_drop(tp, errno) 1031 struct tcpcb *tp; 1032 int errno; 1033 { 1034 struct socket *so = NULL; 1035 1036 #ifdef DIAGNOSTIC 1037 if (tp->t_inpcb && tp->t_in6pcb) 1038 panic("tcp_drop: both t_inpcb and t_in6pcb are set"); 1039 #endif 1040 #ifdef INET 1041 if (tp->t_inpcb) 1042 so = tp->t_inpcb->inp_socket; 1043 #endif 1044 #ifdef INET6 1045 if (tp->t_in6pcb) 1046 so = tp->t_in6pcb->in6p_socket; 1047 #endif 1048 if (!so) 1049 return NULL; 1050 1051 if (TCPS_HAVERCVDSYN(tp->t_state)) { 1052 tp->t_state = TCPS_CLOSED; 1053 (void) tcp_output(tp); 1054 tcpstat.tcps_drops++; 1055 } else 1056 tcpstat.tcps_conndrops++; 1057 if (errno == ETIMEDOUT && tp->t_softerror) 1058 errno = tp->t_softerror; 1059 so->so_error = errno; 1060 return (tcp_close(tp)); 1061 } 1062 1063 /* 1064 * Return whether this tcpcb is marked as dead, indicating 1065 * to the calling timer function that no further action should 1066 * be taken, as we are about to release this tcpcb. The release 1067 * of the storage will be done if this is the last timer running. 1068 * 1069 * This is typically called from the callout handler function before 1070 * callout_ack() is done, therefore we need to test the number of 1071 * running timer functions against 1 below, not 0. 1072 */ 1073 int 1074 tcp_isdead(tp) 1075 struct tcpcb *tp; 1076 { 1077 int dead = (tp->t_flags & TF_DEAD); 1078 1079 if (__predict_false(dead)) { 1080 if (tcp_timers_invoking(tp) > 1) 1081 /* not quite there yet -- count separately? */ 1082 return dead; 1083 tcpstat.tcps_delayed_free++; 1084 pool_put(&tcpcb_pool, tp); 1085 } 1086 return dead; 1087 } 1088 1089 /* 1090 * Close a TCP control block: 1091 * discard all space held by the tcp 1092 * discard internet protocol block 1093 * wake up any sleepers 1094 */ 1095 struct tcpcb * 1096 tcp_close(tp) 1097 struct tcpcb *tp; 1098 { 1099 struct inpcb *inp; 1100 #ifdef INET6 1101 struct in6pcb *in6p; 1102 #endif 1103 struct socket *so; 1104 #ifdef RTV_RTT 1105 struct rtentry *rt; 1106 #endif 1107 struct route *ro; 1108 1109 inp = tp->t_inpcb; 1110 #ifdef INET6 1111 in6p = tp->t_in6pcb; 1112 #endif 1113 so = NULL; 1114 ro = NULL; 1115 if (inp) { 1116 so = inp->inp_socket; 1117 ro = &inp->inp_route; 1118 } 1119 #ifdef INET6 1120 else if (in6p) { 1121 so = in6p->in6p_socket; 1122 ro = (struct route *)&in6p->in6p_route; 1123 } 1124 #endif 1125 1126 #ifdef RTV_RTT 1127 /* 1128 * If we sent enough data to get some meaningful characteristics, 1129 * save them in the routing entry. 'Enough' is arbitrarily 1130 * defined as the sendpipesize (default 4K) * 16. This would 1131 * give us 16 rtt samples assuming we only get one sample per 1132 * window (the usual case on a long haul net). 16 samples is 1133 * enough for the srtt filter to converge to within 5% of the correct 1134 * value; fewer samples and we could save a very bogus rtt. 1135 * 1136 * Don't update the default route's characteristics and don't 1137 * update anything that the user "locked". 1138 */ 1139 if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) && 1140 ro && (rt = ro->ro_rt) && 1141 !in_nullhost(satosin(rt_key(rt))->sin_addr)) { 1142 u_long i = 0; 1143 1144 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { 1145 i = tp->t_srtt * 1146 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2)); 1147 if (rt->rt_rmx.rmx_rtt && i) 1148 /* 1149 * filter this update to half the old & half 1150 * the new values, converting scale. 1151 * See route.h and tcp_var.h for a 1152 * description of the scaling constants. 1153 */ 1154 rt->rt_rmx.rmx_rtt = 1155 (rt->rt_rmx.rmx_rtt + i) / 2; 1156 else 1157 rt->rt_rmx.rmx_rtt = i; 1158 } 1159 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { 1160 i = tp->t_rttvar * 1161 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2)); 1162 if (rt->rt_rmx.rmx_rttvar && i) 1163 rt->rt_rmx.rmx_rttvar = 1164 (rt->rt_rmx.rmx_rttvar + i) / 2; 1165 else 1166 rt->rt_rmx.rmx_rttvar = i; 1167 } 1168 /* 1169 * update the pipelimit (ssthresh) if it has been updated 1170 * already or if a pipesize was specified & the threshhold 1171 * got below half the pipesize. I.e., wait for bad news 1172 * before we start updating, then update on both good 1173 * and bad news. 1174 */ 1175 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && 1176 (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) || 1177 i < (rt->rt_rmx.rmx_sendpipe / 2)) { 1178 /* 1179 * convert the limit from user data bytes to 1180 * packets then to packet data bytes. 1181 */ 1182 i = (i + tp->t_segsz / 2) / tp->t_segsz; 1183 if (i < 2) 1184 i = 2; 1185 i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr)); 1186 if (rt->rt_rmx.rmx_ssthresh) 1187 rt->rt_rmx.rmx_ssthresh = 1188 (rt->rt_rmx.rmx_ssthresh + i) / 2; 1189 else 1190 rt->rt_rmx.rmx_ssthresh = i; 1191 } 1192 } 1193 #endif /* RTV_RTT */ 1194 /* free the reassembly queue, if any */ 1195 TCP_REASS_LOCK(tp); 1196 (void) tcp_freeq(tp); 1197 TCP_REASS_UNLOCK(tp); 1198 1199 tcp_canceltimers(tp); 1200 TCP_CLEAR_DELACK(tp); 1201 syn_cache_cleanup(tp); 1202 1203 if (tp->t_template) { 1204 m_free(tp->t_template); 1205 tp->t_template = NULL; 1206 } 1207 if (tcp_timers_invoking(tp)) 1208 tp->t_flags |= TF_DEAD; 1209 else 1210 pool_put(&tcpcb_pool, tp); 1211 1212 if (inp) { 1213 inp->inp_ppcb = 0; 1214 soisdisconnected(so); 1215 in_pcbdetach(inp); 1216 } 1217 #ifdef INET6 1218 else if (in6p) { 1219 in6p->in6p_ppcb = 0; 1220 soisdisconnected(so); 1221 in6_pcbdetach(in6p); 1222 } 1223 #endif 1224 tcpstat.tcps_closed++; 1225 return ((struct tcpcb *)0); 1226 } 1227 1228 int 1229 tcp_freeq(tp) 1230 struct tcpcb *tp; 1231 { 1232 struct ipqent *qe; 1233 int rv = 0; 1234 #ifdef TCPREASS_DEBUG 1235 int i = 0; 1236 #endif 1237 1238 TCP_REASS_LOCK_CHECK(tp); 1239 1240 while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) { 1241 #ifdef TCPREASS_DEBUG 1242 printf("tcp_freeq[%p,%d]: %u:%u(%u) 0x%02x\n", 1243 tp, i++, qe->ipqe_seq, qe->ipqe_seq + qe->ipqe_len, 1244 qe->ipqe_len, qe->ipqe_flags & (TH_SYN|TH_FIN|TH_RST)); 1245 #endif 1246 TAILQ_REMOVE(&tp->segq, qe, ipqe_q); 1247 TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq); 1248 m_freem(qe->ipqe_m); 1249 pool_put(&ipqent_pool, qe); 1250 rv = 1; 1251 } 1252 return (rv); 1253 } 1254 1255 /* 1256 * Protocol drain routine. Called when memory is in short supply. 1257 */ 1258 void 1259 tcp_drain() 1260 { 1261 struct inpcb_hdr *inph; 1262 struct tcpcb *tp; 1263 1264 /* 1265 * Free the sequence queue of all TCP connections. 1266 */ 1267 CIRCLEQ_FOREACH(inph, &tcbtable.inpt_queue, inph_queue) { 1268 switch (inph->inph_af) { 1269 case AF_INET: 1270 tp = intotcpcb((struct inpcb *)inph); 1271 break; 1272 #ifdef INET6 1273 case AF_INET6: 1274 tp = in6totcpcb((struct in6pcb *)inph); 1275 break; 1276 #endif 1277 default: 1278 tp = NULL; 1279 break; 1280 } 1281 if (tp != NULL) { 1282 /* 1283 * We may be called from a device's interrupt 1284 * context. If the tcpcb is already busy, 1285 * just bail out now. 1286 */ 1287 if (tcp_reass_lock_try(tp) == 0) 1288 continue; 1289 if (tcp_freeq(tp)) 1290 tcpstat.tcps_connsdrained++; 1291 TCP_REASS_UNLOCK(tp); 1292 } 1293 } 1294 } 1295 1296 /* 1297 * Notify a tcp user of an asynchronous error; 1298 * store error as soft error, but wake up user 1299 * (for now, won't do anything until can select for soft error). 1300 */ 1301 void 1302 tcp_notify(inp, error) 1303 struct inpcb *inp; 1304 int error; 1305 { 1306 struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; 1307 struct socket *so = inp->inp_socket; 1308 1309 /* 1310 * Ignore some errors if we are hooked up. 1311 * If connection hasn't completed, has retransmitted several times, 1312 * and receives a second error, give up now. This is better 1313 * than waiting a long time to establish a connection that 1314 * can never complete. 1315 */ 1316 if (tp->t_state == TCPS_ESTABLISHED && 1317 (error == EHOSTUNREACH || error == ENETUNREACH || 1318 error == EHOSTDOWN)) { 1319 return; 1320 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 && 1321 tp->t_rxtshift > 3 && tp->t_softerror) 1322 so->so_error = error; 1323 else 1324 tp->t_softerror = error; 1325 wakeup((caddr_t) &so->so_timeo); 1326 sorwakeup(so); 1327 sowwakeup(so); 1328 } 1329 1330 #ifdef INET6 1331 void 1332 tcp6_notify(in6p, error) 1333 struct in6pcb *in6p; 1334 int error; 1335 { 1336 struct tcpcb *tp = (struct tcpcb *)in6p->in6p_ppcb; 1337 struct socket *so = in6p->in6p_socket; 1338 1339 /* 1340 * Ignore some errors if we are hooked up. 1341 * If connection hasn't completed, has retransmitted several times, 1342 * and receives a second error, give up now. This is better 1343 * than waiting a long time to establish a connection that 1344 * can never complete. 1345 */ 1346 if (tp->t_state == TCPS_ESTABLISHED && 1347 (error == EHOSTUNREACH || error == ENETUNREACH || 1348 error == EHOSTDOWN)) { 1349 return; 1350 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 && 1351 tp->t_rxtshift > 3 && tp->t_softerror) 1352 so->so_error = error; 1353 else 1354 tp->t_softerror = error; 1355 wakeup((caddr_t) &so->so_timeo); 1356 sorwakeup(so); 1357 sowwakeup(so); 1358 } 1359 #endif 1360 1361 #ifdef INET6 1362 void 1363 tcp6_ctlinput(cmd, sa, d) 1364 int cmd; 1365 struct sockaddr *sa; 1366 void *d; 1367 { 1368 struct tcphdr th; 1369 void (*notify) __P((struct in6pcb *, int)) = tcp6_notify; 1370 int nmatch; 1371 struct ip6_hdr *ip6; 1372 const struct sockaddr_in6 *sa6_src = NULL; 1373 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; 1374 struct mbuf *m; 1375 int off; 1376 1377 if (sa->sa_family != AF_INET6 || 1378 sa->sa_len != sizeof(struct sockaddr_in6)) 1379 return; 1380 if ((unsigned)cmd >= PRC_NCMDS) 1381 return; 1382 else if (cmd == PRC_QUENCH) { 1383 /* XXX there's no PRC_QUENCH in IPv6 */ 1384 notify = tcp6_quench; 1385 } else if (PRC_IS_REDIRECT(cmd)) 1386 notify = in6_rtchange, d = NULL; 1387 else if (cmd == PRC_MSGSIZE) 1388 ; /* special code is present, see below */ 1389 else if (cmd == PRC_HOSTDEAD) 1390 d = NULL; 1391 else if (inet6ctlerrmap[cmd] == 0) 1392 return; 1393 1394 /* if the parameter is from icmp6, decode it. */ 1395 if (d != NULL) { 1396 struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; 1397 m = ip6cp->ip6c_m; 1398 ip6 = ip6cp->ip6c_ip6; 1399 off = ip6cp->ip6c_off; 1400 sa6_src = ip6cp->ip6c_src; 1401 } else { 1402 m = NULL; 1403 ip6 = NULL; 1404 sa6_src = &sa6_any; 1405 off = 0; 1406 } 1407 1408 if (ip6) { 1409 /* 1410 * XXX: We assume that when ip6 is non NULL, 1411 * M and OFF are valid. 1412 */ 1413 1414 /* check if we can safely examine src and dst ports */ 1415 if (m->m_pkthdr.len < off + sizeof(th)) { 1416 if (cmd == PRC_MSGSIZE) 1417 icmp6_mtudisc_update((struct ip6ctlparam *)d, 0); 1418 return; 1419 } 1420 1421 bzero(&th, sizeof(th)); 1422 m_copydata(m, off, sizeof(th), (caddr_t)&th); 1423 1424 if (cmd == PRC_MSGSIZE) { 1425 int valid = 0; 1426 1427 /* 1428 * Check to see if we have a valid TCP connection 1429 * corresponding to the address in the ICMPv6 message 1430 * payload. 1431 */ 1432 if (in6_pcblookup_connect(&tcbtable, &sa6->sin6_addr, 1433 th.th_dport, (struct in6_addr *)&sa6_src->sin6_addr, 1434 th.th_sport, 0)) 1435 valid++; 1436 1437 /* 1438 * Depending on the value of "valid" and routing table 1439 * size (mtudisc_{hi,lo}wat), we will: 1440 * - recalcurate the new MTU and create the 1441 * corresponding routing entry, or 1442 * - ignore the MTU change notification. 1443 */ 1444 icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); 1445 1446 /* 1447 * no need to call in6_pcbnotify, it should have been 1448 * called via callback if necessary 1449 */ 1450 return; 1451 } 1452 1453 nmatch = in6_pcbnotify(&tcbtable, sa, th.th_dport, 1454 (struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify); 1455 if (nmatch == 0 && syn_cache_count && 1456 (inet6ctlerrmap[cmd] == EHOSTUNREACH || 1457 inet6ctlerrmap[cmd] == ENETUNREACH || 1458 inet6ctlerrmap[cmd] == EHOSTDOWN)) 1459 syn_cache_unreach((struct sockaddr *)sa6_src, 1460 sa, &th); 1461 } else { 1462 (void) in6_pcbnotify(&tcbtable, sa, 0, 1463 (struct sockaddr *)sa6_src, 0, cmd, NULL, notify); 1464 } 1465 } 1466 #endif 1467 1468 #ifdef INET 1469 /* assumes that ip header and tcp header are contiguous on mbuf */ 1470 void * 1471 tcp_ctlinput(cmd, sa, v) 1472 int cmd; 1473 struct sockaddr *sa; 1474 void *v; 1475 { 1476 struct ip *ip = v; 1477 struct tcphdr *th; 1478 struct icmp *icp; 1479 extern const int inetctlerrmap[]; 1480 void (*notify) __P((struct inpcb *, int)) = tcp_notify; 1481 int errno; 1482 int nmatch; 1483 #ifdef INET6 1484 struct in6_addr src6, dst6; 1485 #endif 1486 1487 if (sa->sa_family != AF_INET || 1488 sa->sa_len != sizeof(struct sockaddr_in)) 1489 return NULL; 1490 if ((unsigned)cmd >= PRC_NCMDS) 1491 return NULL; 1492 errno = inetctlerrmap[cmd]; 1493 if (cmd == PRC_QUENCH) 1494 notify = tcp_quench; 1495 else if (PRC_IS_REDIRECT(cmd)) 1496 notify = in_rtchange, ip = 0; 1497 else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) { 1498 /* 1499 * Check to see if we have a valid TCP connection 1500 * corresponding to the address in the ICMP message 1501 * payload. 1502 * 1503 * Boundary check is made in icmp_input(), with ICMP_ADVLENMIN. 1504 */ 1505 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 1506 #ifdef INET6 1507 memset(&src6, 0, sizeof(src6)); 1508 memset(&dst6, 0, sizeof(dst6)); 1509 src6.s6_addr16[5] = dst6.s6_addr16[5] = 0xffff; 1510 memcpy(&src6.s6_addr32[3], &ip->ip_src, sizeof(struct in_addr)); 1511 memcpy(&dst6.s6_addr32[3], &ip->ip_dst, sizeof(struct in_addr)); 1512 #endif 1513 if (in_pcblookup_connect(&tcbtable, ip->ip_dst, th->th_dport, 1514 ip->ip_src, th->th_sport) != NULL) 1515 ; 1516 #ifdef INET6 1517 else if (in6_pcblookup_connect(&tcbtable, &dst6, 1518 th->th_dport, &src6, th->th_sport, 0) != NULL) 1519 ; 1520 #endif 1521 else 1522 return NULL; 1523 1524 /* 1525 * Now that we've validated that we are actually communicating 1526 * with the host indicated in the ICMP message, locate the 1527 * ICMP header, recalculate the new MTU, and create the 1528 * corresponding routing entry. 1529 */ 1530 icp = (struct icmp *)((caddr_t)ip - 1531 offsetof(struct icmp, icmp_ip)); 1532 icmp_mtudisc(icp, ip->ip_dst); 1533 1534 return NULL; 1535 } else if (cmd == PRC_HOSTDEAD) 1536 ip = 0; 1537 else if (errno == 0) 1538 return NULL; 1539 if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) { 1540 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 1541 nmatch = in_pcbnotify(&tcbtable, satosin(sa)->sin_addr, 1542 th->th_dport, ip->ip_src, th->th_sport, errno, notify); 1543 if (nmatch == 0 && syn_cache_count && 1544 (inetctlerrmap[cmd] == EHOSTUNREACH || 1545 inetctlerrmap[cmd] == ENETUNREACH || 1546 inetctlerrmap[cmd] == EHOSTDOWN)) { 1547 struct sockaddr_in sin; 1548 bzero(&sin, sizeof(sin)); 1549 sin.sin_len = sizeof(sin); 1550 sin.sin_family = AF_INET; 1551 sin.sin_port = th->th_sport; 1552 sin.sin_addr = ip->ip_src; 1553 syn_cache_unreach((struct sockaddr *)&sin, sa, th); 1554 } 1555 1556 /* XXX mapped address case */ 1557 } else 1558 in_pcbnotifyall(&tcbtable, satosin(sa)->sin_addr, errno, 1559 notify); 1560 return NULL; 1561 } 1562 1563 /* 1564 * When a source quence is received, we are being notifed of congestion. 1565 * Close the congestion window down to the Loss Window (one segment). 1566 * We will gradually open it again as we proceed. 1567 */ 1568 void 1569 tcp_quench(inp, errno) 1570 struct inpcb *inp; 1571 int errno; 1572 { 1573 struct tcpcb *tp = intotcpcb(inp); 1574 1575 if (tp) 1576 tp->snd_cwnd = tp->t_segsz; 1577 } 1578 #endif 1579 1580 #ifdef INET6 1581 void 1582 tcp6_quench(in6p, errno) 1583 struct in6pcb *in6p; 1584 int errno; 1585 { 1586 struct tcpcb *tp = in6totcpcb(in6p); 1587 1588 if (tp) 1589 tp->snd_cwnd = tp->t_segsz; 1590 } 1591 #endif 1592 1593 #ifdef INET 1594 /* 1595 * Path MTU Discovery handlers. 1596 */ 1597 void 1598 tcp_mtudisc_callback(faddr) 1599 struct in_addr faddr; 1600 { 1601 #ifdef INET6 1602 struct in6_addr in6; 1603 #endif 1604 1605 in_pcbnotifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc); 1606 #ifdef INET6 1607 memset(&in6, 0, sizeof(in6)); 1608 in6.s6_addr16[5] = 0xffff; 1609 memcpy(&in6.s6_addr32[3], &faddr, sizeof(struct in_addr)); 1610 tcp6_mtudisc_callback(&in6); 1611 #endif 1612 } 1613 1614 /* 1615 * On receipt of path MTU corrections, flush old route and replace it 1616 * with the new one. Retransmit all unacknowledged packets, to ensure 1617 * that all packets will be received. 1618 */ 1619 void 1620 tcp_mtudisc(inp, errno) 1621 struct inpcb *inp; 1622 int errno; 1623 { 1624 struct tcpcb *tp = intotcpcb(inp); 1625 struct rtentry *rt = in_pcbrtentry(inp); 1626 1627 if (tp != 0) { 1628 if (rt != 0) { 1629 /* 1630 * If this was not a host route, remove and realloc. 1631 */ 1632 if ((rt->rt_flags & RTF_HOST) == 0) { 1633 in_rtchange(inp, errno); 1634 if ((rt = in_pcbrtentry(inp)) == 0) 1635 return; 1636 } 1637 1638 /* 1639 * Slow start out of the error condition. We 1640 * use the MTU because we know it's smaller 1641 * than the previously transmitted segment. 1642 * 1643 * Note: This is more conservative than the 1644 * suggestion in draft-floyd-incr-init-win-03. 1645 */ 1646 if (rt->rt_rmx.rmx_mtu != 0) 1647 tp->snd_cwnd = 1648 TCP_INITIAL_WINDOW(tcp_init_win, 1649 rt->rt_rmx.rmx_mtu); 1650 } 1651 1652 /* 1653 * Resend unacknowledged packets. 1654 */ 1655 tp->snd_nxt = tp->snd_una; 1656 tcp_output(tp); 1657 } 1658 } 1659 #endif 1660 1661 #ifdef INET6 1662 /* 1663 * Path MTU Discovery handlers. 1664 */ 1665 void 1666 tcp6_mtudisc_callback(faddr) 1667 struct in6_addr *faddr; 1668 { 1669 struct sockaddr_in6 sin6; 1670 1671 bzero(&sin6, sizeof(sin6)); 1672 sin6.sin6_family = AF_INET6; 1673 sin6.sin6_len = sizeof(struct sockaddr_in6); 1674 sin6.sin6_addr = *faddr; 1675 (void) in6_pcbnotify(&tcbtable, (struct sockaddr *)&sin6, 0, 1676 (struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc); 1677 } 1678 1679 void 1680 tcp6_mtudisc(in6p, errno) 1681 struct in6pcb *in6p; 1682 int errno; 1683 { 1684 struct tcpcb *tp = in6totcpcb(in6p); 1685 struct rtentry *rt = in6_pcbrtentry(in6p); 1686 1687 if (tp != 0) { 1688 if (rt != 0) { 1689 /* 1690 * If this was not a host route, remove and realloc. 1691 */ 1692 if ((rt->rt_flags & RTF_HOST) == 0) { 1693 in6_rtchange(in6p, errno); 1694 if ((rt = in6_pcbrtentry(in6p)) == 0) 1695 return; 1696 } 1697 1698 /* 1699 * Slow start out of the error condition. We 1700 * use the MTU because we know it's smaller 1701 * than the previously transmitted segment. 1702 * 1703 * Note: This is more conservative than the 1704 * suggestion in draft-floyd-incr-init-win-03. 1705 */ 1706 if (rt->rt_rmx.rmx_mtu != 0) 1707 tp->snd_cwnd = 1708 TCP_INITIAL_WINDOW(tcp_init_win, 1709 rt->rt_rmx.rmx_mtu); 1710 } 1711 1712 /* 1713 * Resend unacknowledged packets. 1714 */ 1715 tp->snd_nxt = tp->snd_una; 1716 tcp_output(tp); 1717 } 1718 } 1719 #endif /* INET6 */ 1720 1721 /* 1722 * Compute the MSS to advertise to the peer. Called only during 1723 * the 3-way handshake. If we are the server (peer initiated 1724 * connection), we are called with a pointer to the interface 1725 * on which the SYN packet arrived. If we are the client (we 1726 * initiated connection), we are called with a pointer to the 1727 * interface out which this connection should go. 1728 * 1729 * NOTE: Do not subtract IP option/extension header size nor IPsec 1730 * header size from MSS advertisement. MSS option must hold the maximum 1731 * segment size we can accept, so it must always be: 1732 * max(if mtu) - ip header - tcp header 1733 */ 1734 u_long 1735 tcp_mss_to_advertise(ifp, af) 1736 const struct ifnet *ifp; 1737 int af; 1738 { 1739 extern u_long in_maxmtu; 1740 u_long mss = 0; 1741 u_long hdrsiz; 1742 1743 /* 1744 * In order to avoid defeating path MTU discovery on the peer, 1745 * we advertise the max MTU of all attached networks as our MSS, 1746 * per RFC 1191, section 3.1. 1747 * 1748 * We provide the option to advertise just the MTU of 1749 * the interface on which we hope this connection will 1750 * be receiving. If we are responding to a SYN, we 1751 * will have a pretty good idea about this, but when 1752 * initiating a connection there is a bit more doubt. 1753 * 1754 * We also need to ensure that loopback has a large enough 1755 * MSS, as the loopback MTU is never included in in_maxmtu. 1756 */ 1757 1758 if (ifp != NULL) 1759 switch (af) { 1760 case AF_INET: 1761 mss = ifp->if_mtu; 1762 break; 1763 #ifdef INET6 1764 case AF_INET6: 1765 mss = IN6_LINKMTU(ifp); 1766 break; 1767 #endif 1768 } 1769 1770 if (tcp_mss_ifmtu == 0) 1771 switch (af) { 1772 case AF_INET: 1773 mss = max(in_maxmtu, mss); 1774 break; 1775 #ifdef INET6 1776 case AF_INET6: 1777 mss = max(in6_maxmtu, mss); 1778 break; 1779 #endif 1780 } 1781 1782 switch (af) { 1783 case AF_INET: 1784 hdrsiz = sizeof(struct ip); 1785 break; 1786 #ifdef INET6 1787 case AF_INET6: 1788 hdrsiz = sizeof(struct ip6_hdr); 1789 break; 1790 #endif 1791 default: 1792 hdrsiz = 0; 1793 break; 1794 } 1795 hdrsiz += sizeof(struct tcphdr); 1796 if (mss > hdrsiz) 1797 mss -= hdrsiz; 1798 1799 mss = max(tcp_mssdflt, mss); 1800 return (mss); 1801 } 1802 1803 /* 1804 * Set connection variables based on the peer's advertised MSS. 1805 * We are passed the TCPCB for the actual connection. If we 1806 * are the server, we are called by the compressed state engine 1807 * when the 3-way handshake is complete. If we are the client, 1808 * we are called when we receive the SYN,ACK from the server. 1809 * 1810 * NOTE: Our advertised MSS value must be initialized in the TCPCB 1811 * before this routine is called! 1812 */ 1813 void 1814 tcp_mss_from_peer(tp, offer) 1815 struct tcpcb *tp; 1816 int offer; 1817 { 1818 struct socket *so; 1819 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH) 1820 struct rtentry *rt; 1821 #endif 1822 u_long bufsize; 1823 int mss; 1824 1825 #ifdef DIAGNOSTIC 1826 if (tp->t_inpcb && tp->t_in6pcb) 1827 panic("tcp_mss_from_peer: both t_inpcb and t_in6pcb are set"); 1828 #endif 1829 so = NULL; 1830 rt = NULL; 1831 #ifdef INET 1832 if (tp->t_inpcb) { 1833 so = tp->t_inpcb->inp_socket; 1834 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH) 1835 rt = in_pcbrtentry(tp->t_inpcb); 1836 #endif 1837 } 1838 #endif 1839 #ifdef INET6 1840 if (tp->t_in6pcb) { 1841 so = tp->t_in6pcb->in6p_socket; 1842 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH) 1843 rt = in6_pcbrtentry(tp->t_in6pcb); 1844 #endif 1845 } 1846 #endif 1847 1848 /* 1849 * As per RFC1122, use the default MSS value, unless they 1850 * sent us an offer. Do not accept offers less than 256 bytes. 1851 */ 1852 mss = tcp_mssdflt; 1853 if (offer) 1854 mss = offer; 1855 mss = max(mss, 256); /* sanity */ 1856 tp->t_peermss = mss; 1857 mss -= tcp_optlen(tp); 1858 #ifdef INET 1859 if (tp->t_inpcb) 1860 mss -= ip_optlen(tp->t_inpcb); 1861 #endif 1862 #ifdef INET6 1863 if (tp->t_in6pcb) 1864 mss -= ip6_optlen(tp->t_in6pcb); 1865 #endif 1866 1867 /* 1868 * If there's a pipesize, change the socket buffer to that size. 1869 * Make the socket buffer an integral number of MSS units. If 1870 * the MSS is larger than the socket buffer, artificially decrease 1871 * the MSS. 1872 */ 1873 #ifdef RTV_SPIPE 1874 if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0) 1875 bufsize = rt->rt_rmx.rmx_sendpipe; 1876 else 1877 #endif 1878 bufsize = so->so_snd.sb_hiwat; 1879 if (bufsize < mss) 1880 mss = bufsize; 1881 else { 1882 bufsize = roundup(bufsize, mss); 1883 if (bufsize > sb_max) 1884 bufsize = sb_max; 1885 (void) sbreserve(&so->so_snd, bufsize); 1886 } 1887 tp->t_segsz = mss; 1888 1889 #ifdef RTV_SSTHRESH 1890 if (rt != NULL && rt->rt_rmx.rmx_ssthresh) { 1891 /* 1892 * There's some sort of gateway or interface buffer 1893 * limit on the path. Use this to set the slow 1894 * start threshold, but set the threshold to no less 1895 * than 2 * MSS. 1896 */ 1897 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); 1898 } 1899 #endif 1900 } 1901 1902 /* 1903 * Processing necessary when a TCP connection is established. 1904 */ 1905 void 1906 tcp_established(tp) 1907 struct tcpcb *tp; 1908 { 1909 struct socket *so; 1910 #ifdef RTV_RPIPE 1911 struct rtentry *rt; 1912 #endif 1913 u_long bufsize; 1914 1915 #ifdef DIAGNOSTIC 1916 if (tp->t_inpcb && tp->t_in6pcb) 1917 panic("tcp_established: both t_inpcb and t_in6pcb are set"); 1918 #endif 1919 so = NULL; 1920 rt = NULL; 1921 #ifdef INET 1922 if (tp->t_inpcb) { 1923 so = tp->t_inpcb->inp_socket; 1924 #if defined(RTV_RPIPE) 1925 rt = in_pcbrtentry(tp->t_inpcb); 1926 #endif 1927 } 1928 #endif 1929 #ifdef INET6 1930 if (tp->t_in6pcb) { 1931 so = tp->t_in6pcb->in6p_socket; 1932 #if defined(RTV_RPIPE) 1933 rt = in6_pcbrtentry(tp->t_in6pcb); 1934 #endif 1935 } 1936 #endif 1937 1938 tp->t_state = TCPS_ESTABLISHED; 1939 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1940 1941 #ifdef RTV_RPIPE 1942 if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0) 1943 bufsize = rt->rt_rmx.rmx_recvpipe; 1944 else 1945 #endif 1946 bufsize = so->so_rcv.sb_hiwat; 1947 if (bufsize > tp->t_ourmss) { 1948 bufsize = roundup(bufsize, tp->t_ourmss); 1949 if (bufsize > sb_max) 1950 bufsize = sb_max; 1951 (void) sbreserve(&so->so_rcv, bufsize); 1952 } 1953 } 1954 1955 /* 1956 * Check if there's an initial rtt or rttvar. Convert from the 1957 * route-table units to scaled multiples of the slow timeout timer. 1958 * Called only during the 3-way handshake. 1959 */ 1960 void 1961 tcp_rmx_rtt(tp) 1962 struct tcpcb *tp; 1963 { 1964 #ifdef RTV_RTT 1965 struct rtentry *rt = NULL; 1966 int rtt; 1967 1968 #ifdef DIAGNOSTIC 1969 if (tp->t_inpcb && tp->t_in6pcb) 1970 panic("tcp_rmx_rtt: both t_inpcb and t_in6pcb are set"); 1971 #endif 1972 #ifdef INET 1973 if (tp->t_inpcb) 1974 rt = in_pcbrtentry(tp->t_inpcb); 1975 #endif 1976 #ifdef INET6 1977 if (tp->t_in6pcb) 1978 rt = in6_pcbrtentry(tp->t_in6pcb); 1979 #endif 1980 if (rt == NULL) 1981 return; 1982 1983 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { 1984 /* 1985 * XXX The lock bit for MTU indicates that the value 1986 * is also a minimum value; this is subject to time. 1987 */ 1988 if (rt->rt_rmx.rmx_locks & RTV_RTT) 1989 TCPT_RANGESET(tp->t_rttmin, 1990 rtt / (RTM_RTTUNIT / PR_SLOWHZ), 1991 TCPTV_MIN, TCPTV_REXMTMAX); 1992 tp->t_srtt = rtt / 1993 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2)); 1994 if (rt->rt_rmx.rmx_rttvar) { 1995 tp->t_rttvar = rt->rt_rmx.rmx_rttvar / 1996 ((RTM_RTTUNIT / PR_SLOWHZ) >> 1997 (TCP_RTTVAR_SHIFT + 2)); 1998 } else { 1999 /* Default variation is +- 1 rtt */ 2000 tp->t_rttvar = 2001 tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT); 2002 } 2003 TCPT_RANGESET(tp->t_rxtcur, 2004 ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2), 2005 tp->t_rttmin, TCPTV_REXMTMAX); 2006 } 2007 #endif 2008 } 2009 2010 tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */ 2011 #if NRND > 0 2012 u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */ 2013 #endif 2014 2015 /* 2016 * Get a new sequence value given a tcp control block 2017 */ 2018 tcp_seq 2019 tcp_new_iss(struct tcpcb *tp, tcp_seq addin) 2020 { 2021 2022 #ifdef INET 2023 if (tp->t_inpcb != NULL) { 2024 return (tcp_new_iss1(&tp->t_inpcb->inp_laddr, 2025 &tp->t_inpcb->inp_faddr, tp->t_inpcb->inp_lport, 2026 tp->t_inpcb->inp_fport, sizeof(tp->t_inpcb->inp_laddr), 2027 addin)); 2028 } 2029 #endif 2030 #ifdef INET6 2031 if (tp->t_in6pcb != NULL) { 2032 return (tcp_new_iss1(&tp->t_in6pcb->in6p_laddr, 2033 &tp->t_in6pcb->in6p_faddr, tp->t_in6pcb->in6p_lport, 2034 tp->t_in6pcb->in6p_fport, sizeof(tp->t_in6pcb->in6p_laddr), 2035 addin)); 2036 } 2037 #endif 2038 /* Not possible. */ 2039 panic("tcp_new_iss"); 2040 } 2041 2042 /* 2043 * This routine actually generates a new TCP initial sequence number. 2044 */ 2045 tcp_seq 2046 tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport, 2047 size_t addrsz, tcp_seq addin) 2048 { 2049 tcp_seq tcp_iss; 2050 2051 #if NRND > 0 2052 static int beenhere; 2053 2054 /* 2055 * If we haven't been here before, initialize our cryptographic 2056 * hash secret. 2057 */ 2058 if (beenhere == 0) { 2059 rnd_extract_data(tcp_iss_secret, sizeof(tcp_iss_secret), 2060 RND_EXTRACT_ANY); 2061 beenhere = 1; 2062 } 2063 2064 if (tcp_do_rfc1948) { 2065 MD5_CTX ctx; 2066 u_int8_t hash[16]; /* XXX MD5 knowledge */ 2067 2068 /* 2069 * Compute the base value of the ISS. It is a hash 2070 * of (saddr, sport, daddr, dport, secret). 2071 */ 2072 MD5Init(&ctx); 2073 2074 MD5Update(&ctx, (u_char *) laddr, addrsz); 2075 MD5Update(&ctx, (u_char *) &lport, sizeof(lport)); 2076 2077 MD5Update(&ctx, (u_char *) faddr, addrsz); 2078 MD5Update(&ctx, (u_char *) &fport, sizeof(fport)); 2079 2080 MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret)); 2081 2082 MD5Final(hash, &ctx); 2083 2084 memcpy(&tcp_iss, hash, sizeof(tcp_iss)); 2085 2086 /* 2087 * Now increment our "timer", and add it in to 2088 * the computed value. 2089 * 2090 * XXX Use `addin'? 2091 * XXX TCP_ISSINCR too large to use? 2092 */ 2093 tcp_iss_seq += TCP_ISSINCR; 2094 #ifdef TCPISS_DEBUG 2095 printf("ISS hash 0x%08x, ", tcp_iss); 2096 #endif 2097 tcp_iss += tcp_iss_seq + addin; 2098 #ifdef TCPISS_DEBUG 2099 printf("new ISS 0x%08x\n", tcp_iss); 2100 #endif 2101 } else 2102 #endif /* NRND > 0 */ 2103 { 2104 /* 2105 * Randomize. 2106 */ 2107 #if NRND > 0 2108 rnd_extract_data(&tcp_iss, sizeof(tcp_iss), RND_EXTRACT_ANY); 2109 #else 2110 tcp_iss = arc4random(); 2111 #endif 2112 2113 /* 2114 * If we were asked to add some amount to a known value, 2115 * we will take a random value obtained above, mask off 2116 * the upper bits, and add in the known value. We also 2117 * add in a constant to ensure that we are at least a 2118 * certain distance from the original value. 2119 * 2120 * This is used when an old connection is in timed wait 2121 * and we have a new one coming in, for instance. 2122 */ 2123 if (addin != 0) { 2124 #ifdef TCPISS_DEBUG 2125 printf("Random %08x, ", tcp_iss); 2126 #endif 2127 tcp_iss &= TCP_ISS_RANDOM_MASK; 2128 tcp_iss += addin + TCP_ISSINCR; 2129 #ifdef TCPISS_DEBUG 2130 printf("Old ISS %08x, ISS %08x\n", addin, tcp_iss); 2131 #endif 2132 } else { 2133 tcp_iss &= TCP_ISS_RANDOM_MASK; 2134 tcp_iss += tcp_iss_seq; 2135 tcp_iss_seq += TCP_ISSINCR; 2136 #ifdef TCPISS_DEBUG 2137 printf("ISS %08x\n", tcp_iss); 2138 #endif 2139 } 2140 } 2141 2142 if (tcp_compat_42) { 2143 /* 2144 * Limit it to the positive range for really old TCP 2145 * implementations. 2146 * Just AND off the top bit instead of checking if 2147 * is set first - saves a branch 50% of the time. 2148 */ 2149 tcp_iss &= 0x7fffffff; /* XXX */ 2150 } 2151 2152 return (tcp_iss); 2153 } 2154 2155 #if defined(IPSEC) || defined(FAST_IPSEC) 2156 /* compute ESP/AH header size for TCP, including outer IP header. */ 2157 size_t 2158 ipsec4_hdrsiz_tcp(tp) 2159 struct tcpcb *tp; 2160 { 2161 struct inpcb *inp; 2162 size_t hdrsiz; 2163 2164 /* XXX mapped addr case (tp->t_in6pcb) */ 2165 if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) 2166 return 0; 2167 switch (tp->t_family) { 2168 case AF_INET: 2169 /* XXX: should use currect direction. */ 2170 hdrsiz = ipsec4_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp); 2171 break; 2172 default: 2173 hdrsiz = 0; 2174 break; 2175 } 2176 2177 return hdrsiz; 2178 } 2179 2180 #ifdef INET6 2181 size_t 2182 ipsec6_hdrsiz_tcp(tp) 2183 struct tcpcb *tp; 2184 { 2185 struct in6pcb *in6p; 2186 size_t hdrsiz; 2187 2188 if (!tp || !tp->t_template || !(in6p = tp->t_in6pcb)) 2189 return 0; 2190 switch (tp->t_family) { 2191 case AF_INET6: 2192 /* XXX: should use currect direction. */ 2193 hdrsiz = ipsec6_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, in6p); 2194 break; 2195 case AF_INET: 2196 /* mapped address case - tricky */ 2197 default: 2198 hdrsiz = 0; 2199 break; 2200 } 2201 2202 return hdrsiz; 2203 } 2204 #endif 2205 #endif /*IPSEC*/ 2206 2207 /* 2208 * Determine the length of the TCP options for this connection. 2209 * 2210 * XXX: What do we do for SACK, when we add that? Just reserve 2211 * all of the space? Otherwise we can't exactly be incrementing 2212 * cwnd by an amount that varies depending on the amount we last 2213 * had to SACK! 2214 */ 2215 2216 u_int 2217 tcp_optlen(tp) 2218 struct tcpcb *tp; 2219 { 2220 if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == 2221 (TF_REQ_TSTMP | TF_RCVD_TSTMP)) 2222 return TCPOLEN_TSTAMP_APPA; 2223 else 2224 return 0; 2225 } 2226