1 /* $OpenBSD: tcp_subr.c,v 1.75 2004/01/31 19:40:10 markus Exp $ */ 2 /* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/proc.h> 74 #include <sys/mbuf.h> 75 #include <sys/socket.h> 76 #include <sys/socketvar.h> 77 #include <sys/protosw.h> 78 #include <sys/kernel.h> 79 80 #include <net/route.h> 81 #include <net/if.h> 82 83 #include <netinet/in.h> 84 #include <netinet/in_systm.h> 85 #include <netinet/ip.h> 86 #include <netinet/in_pcb.h> 87 #include <netinet/ip_var.h> 88 #include <netinet/ip_icmp.h> 89 #include <netinet/tcp.h> 90 #include <netinet/tcp_fsm.h> 91 #include <netinet/tcp_seq.h> 92 #include <netinet/tcp_timer.h> 93 #include <netinet/tcp_var.h> 94 #include <netinet/tcpip.h> 95 #include <dev/rndvar.h> 96 97 #ifdef INET6 98 #include <netinet6/in6_var.h> 99 #include <netinet6/ip6protosw.h> 100 #endif /* INET6 */ 101 102 #ifdef TCP_SIGNATURE 103 #include <sys/md5k.h> 104 #endif /* TCP_SIGNATURE */ 105 106 /* patchable/settable parameters for tcp */ 107 int tcp_mssdflt = TCP_MSS; 108 int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 109 110 /* 111 * Configure kernel with options "TCP_DO_RFC1323=0" to disable RFC1323 stuff. 112 * This is a good idea over slow SLIP/PPP links, because the timestamp 113 * pretty well destroys the VJ compression (any packet with a timestamp 114 * different from the previous one can't be compressed), as well as adding 115 * more overhead. 116 * XXX And it should be a settable per route characteristic (with this just 117 * used as the default). 
 */
#ifndef TCP_DO_RFC1323
#define TCP_DO_RFC1323	1
#endif
int	tcp_do_rfc1323 = TCP_DO_RFC1323;

/* Default SACK on iff the kernel was built with option TCP_SACK. */
#ifndef TCP_DO_SACK
#ifdef TCP_SACK
#define TCP_DO_SACK	1
#else
#define TCP_DO_SACK	0
#endif
#endif
int	tcp_do_sack = TCP_DO_SACK;	/* RFC 2018 selective ACKs */
int	tcp_ack_on_push = 0;	/* set to enable immediate ACK-on-PUSH */
int	tcp_do_ecn = 0;		/* RFC3168 ECN enabled/disabled? */
int	tcp_do_rfc3390 = 0;	/* RFC3390 Increasing TCP's Initial Window */

/* Free-running slow-timer clock; seeded randomly in tcp_init(). */
u_int32_t	tcp_now;

#ifndef TCBHASHSIZE
#define	TCBHASHSIZE	128
#endif
int	tcbhashsize = TCBHASHSIZE;

/* syn hash parameters */
#define	TCP_SYN_HASH_SIZE	293
#define	TCP_SYN_BUCKET_SIZE	35
int	tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
int	tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int	tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];

#ifdef INET6
extern	int ip6_defhlim;
#endif /* INET6 */

/* Allocation pools for tcpcbs and (optionally) SACK holes. */
struct pool tcpcb_pool;
#ifdef TCP_SACK
struct pool sackhl_pool;
#endif

int	tcp_freeq(struct tcpcb *);

struct tcpstat tcpstat;		/* tcp statistics */
tcp_seq	tcp_iss;

/*
 * Tcp initialization: set up the tcpcb (and SACK hole) pools, the
 * PCB hash table, the randomized tcp_now base, the compressed
 * SYN-cache state engine and the timer machinery.  Called once at
 * protocol attach time.
 */
void
tcp_init()
{
#ifdef TCP_COMPAT_42
	tcp_iss = 1;		/* wrong */
#endif /* TCP_COMPAT_42 */
	pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
	    NULL);
#ifdef TCP_SACK
	pool_init(&sackhl_pool, sizeof(struct sackhole), 0, 0, 0, "sackhlpl",
	    NULL);
#endif /* TCP_SACK */
	in_pcbinit(&tcbtable, tcbhashsize);
	/* Divide by 2 to leave headroom before the counter wraps. */
	tcp_now = arc4random() / 2;

#ifdef INET6
	/*
	 * Since sizeof(struct ip6_hdr) > sizeof(struct ip), we
	 * do max length checks/computations only on the former.
	 */
	if (max_protohdr < (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)))
		max_protohdr = (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
	if ((max_linkhdr + sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) >
	    MHLEN)
		panic("tcp_init");

	icmp6_mtudisc_callback_register(tcp6_mtudisc_callback);
#endif /* INET6 */

	/* Initialize the compressed state engine. */
	syn_cache_init();

	/* Initialize timer state. */
	tcp_timer_init();
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Call after host entry created, allocates an mbuf and fills
 * in a skeletal tcp/ip header, minimizing the amount of work
 * necessary when the connection is used.
 *
 * To support IPv6 in addition to IPv4 and considering that the sizes of
 * the IPv4 and IPv6 headers are not the same, we now use a separate pointer
 * for the TCP header.  Also, we made the former tcpiphdr header pointer
 * into just an IP overlay pointer, with casting as appropriate for v6. rja
 *
 * Returns the (possibly reused) template mbuf, or 0 (NULL) if mbuf or
 * cluster allocation fails; the caller must check the result.
 */
struct mbuf *
tcp_template(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
	struct mbuf *m;
	struct tcphdr *th;

	/* Reuse an existing template mbuf if we already have one. */
	if ((m = tp->t_template) == 0) {
		m = m_get(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return (0);

		switch (tp->pf) {
		case 0:	/*default to PF_INET*/
#ifdef INET
		case AF_INET:
			m->m_len = sizeof(struct ip);
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			m->m_len = sizeof(struct ip6_hdr);
			break;
#endif /* INET6 */
		}
		m->m_len += sizeof (struct tcphdr);

		/*
		 * The link header, network header, TCP header, and TCP options
		 * all must fit in this mbuf. For now, assume the worst case of
		 * TCP options size. Eventually, compute this from tp flags.
		 */
		if (m->m_len + MAX_TCPOPTLEN + max_linkhdr >= MHLEN) {
			MCLGET(m, M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_free(m);
				return (0);
			}
		}
	}

	/* Fill in the skeletal network header for the connection's family. */
	switch(tp->pf) {
#ifdef INET
	case AF_INET:
		{
			struct ipovly *ipovly;

			ipovly = mtod(m, struct ipovly *);

			bzero(ipovly->ih_x1, sizeof ipovly->ih_x1);
			ipovly->ih_pr = IPPROTO_TCP;
			ipovly->ih_len = htons(sizeof (struct tcphdr));
			ipovly->ih_src = inp->inp_laddr;
			ipovly->ih_dst = inp->inp_faddr;

			th = (struct tcphdr *)(mtod(m, caddr_t) +
			    sizeof(struct ip));
			/* Pre-seed the pseudo-header checksum. */
			th->th_sum = in_cksum_phdr(ipovly->ih_src.s_addr,
			    ipovly->ih_dst.s_addr,
			    htons(sizeof (struct tcphdr) + IPPROTO_TCP));
		}
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		{
			struct ip6_hdr *ip6;

			ip6 = mtod(m, struct ip6_hdr *);

			ip6->ip6_src = inp->inp_laddr6;
			ip6->ip6_dst = inp->inp_faddr6;
			ip6->ip6_flow = htonl(0x60000000) |
			    (inp->inp_flowinfo & IPV6_FLOWLABEL_MASK);

			ip6->ip6_nxt = IPPROTO_TCP;
			ip6->ip6_plen = htons(sizeof(struct tcphdr)); /*XXX*/
			ip6->ip6_hlim = in6_selecthlim(inp, NULL); /*XXX*/

			th = (struct tcphdr *)(mtod(m, caddr_t) +
			    sizeof(struct ip6_hdr));
			th->th_sum = 0;
		}
		break;
#endif /* INET6 */
	}

	/* Common skeletal TCP header; seq/ack/flags filled per packet. */
	th->th_sport = inp->inp_lport;
	th->th_dport = inp->inp_fport;
	th->th_seq = 0;
	th->th_ack = 0;
	th->th_x2 = 0;
	th->th_off = 5;
	th->th_flags = 0;
	th->th_win = 0;
	th->th_urp = 0;
	return (m);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header. If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection tp->t_template.
 * If flags are given
 * then we send a message back to the TCP which originated the
 * segment ti, and discard the mbuf containing it and any other
 * attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 */
#ifdef INET6
/* This function looks hairy, because it was so IPv4-dependent. */
#endif /* INET6 */
void
tcp_respond(tp, template, m, ack, seq, flags)
	struct tcpcb *tp;
	caddr_t template;
	struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	int tlen;
	int win = 0;
	struct route *ro = 0;
	struct tcphdr *th;
	struct tcpiphdr *ti = (struct tcpiphdr *)template;
	int af;		/* af on wire */

	if (tp) {
		win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
		/*
		 * If this is called with an unconnected
		 * socket/tp/pcb (tp->pf is 0), we lose.
		 */
		af = tp->pf;

		/*
		 * The route/route6 distinction is meaningless
		 * unless you're allocating space or passing parameters.
		 */
		ro = &tp->t_inpcb->inp_route;
	} else
		/* No tcpcb: deduce the family from the IP version field. */
		af = (((struct ip *)ti)->ip_v == 6) ? AF_INET6 : AF_INET;
	if (m == 0) {
		/* Keepalive path: build a fresh mbuf from the template. */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
#ifdef TCP_COMPAT_42
		tlen = 1;
#else
		tlen = 0;
#endif
		m->m_data += max_linkhdr;
		switch (af) {
#ifdef INET6
		case AF_INET6:
			bcopy(ti, mtod(m, caddr_t), sizeof(struct tcphdr) +
			    sizeof(struct ip6_hdr));
			break;
#endif /* INET6 */
		case AF_INET:
			bcopy(ti, mtod(m, caddr_t), sizeof(struct tcphdr) +
			    sizeof(struct ip));
			break;
		}

		ti = mtod(m, struct tcpiphdr *);
		flags = TH_ACK;
	} else {
		/*
		 * Reply path: reuse the received segment's first mbuf and
		 * swap source/destination so it goes back to the sender.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ti;
		tlen = 0;
#define xchg(a,b,type) do { type t; t=a; a=b; b=t; } while (0)
		switch (af) {
#ifdef INET6
		case AF_INET6:
			m->m_len = sizeof(struct tcphdr) + sizeof(struct ip6_hdr);
			xchg(((struct ip6_hdr *)ti)->ip6_dst,
			    ((struct ip6_hdr *)ti)->ip6_src, struct in6_addr);
			th = (void *)((caddr_t)ti + sizeof(struct ip6_hdr));
			break;
#endif /* INET6 */
		case AF_INET:
			m->m_len = sizeof (struct tcpiphdr);
			xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_int32_t);
			th = (void *)((caddr_t)ti + sizeof(struct ip));
			break;
		}
		xchg(th->th_dport, th->th_sport, u_int16_t);
#undef xchg
	}
	/* Fix up total length and locate the TCP header for either path. */
	switch (af) {
#ifdef INET6
	case AF_INET6:
		tlen += sizeof(struct tcphdr) + sizeof(struct ip6_hdr);
		th = (struct tcphdr *)((caddr_t)ti + sizeof(struct ip6_hdr));
		break;
#endif /* INET6 */
	case AF_INET:
		ti->ti_len = htons((u_int16_t)(sizeof (struct tcphdr) + tlen));
		tlen += sizeof (struct tcpiphdr);
		th = (struct tcphdr *)((caddr_t)ti + sizeof(struct ip));
		break;
	}

	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
	th->th_seq = htonl(seq);
	th->th_ack = htonl(ack);
	th->th_x2 = 0;
	th->th_off = sizeof (struct tcphdr) >> 2;
	th->th_flags = flags;
	if (tp)
		win >>= tp->rcv_scale;
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;
	th->th_win = htons((u_int16_t)win);
	th->th_urp = 0;

	/* Finish the network header, checksum and transmit. */
	switch (af) {
#ifdef INET6
	case AF_INET6:
		((struct ip6_hdr *)ti)->ip6_flow = htonl(0x60000000);
		((struct ip6_hdr *)ti)->ip6_nxt = IPPROTO_TCP;
		((struct ip6_hdr *)ti)->ip6_hlim =
		    in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);	/*XXX*/
		((struct ip6_hdr *)ti)->ip6_plen = tlen - sizeof(struct ip6_hdr);
		th->th_sum = 0;
		/* ip6_plen is still host order here; in6_cksum wants that. */
		th->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr), ((struct ip6_hdr *)ti)->ip6_plen);
		HTONS(((struct ip6_hdr *)ti)->ip6_plen);
		ip6_output(m, tp ? tp->t_inpcb->inp_outputopts6 : NULL,
		    (struct route_in6 *)ro, 0, NULL, NULL);
		break;
#endif /* INET6 */
	case AF_INET:
		bzero(ti->ti_x1, sizeof ti->ti_x1);
		ti->ti_len = htons((u_short)tlen - sizeof(struct ip));

		/*
		 * There's no point deferring to hardware checksum processing
		 * here, as we only send a minimal TCP packet whose checksum
		 * we need to compute in any case.
		 */
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		((struct ip *)ti)->ip_len = htons(tlen);
		((struct ip *)ti)->ip_ttl = ip_defttl;
		ip_output(m, (void *)NULL, ro, ip_mtudisc ? IP_MTUDISC : 0,
		    (void *)NULL, tp ? tp->t_inpcb : (void *)NULL);
	}
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.
 *
 * Returns the new tcpcb (also stored in inp->inp_ppcb), or NULL if the
 * pool allocation fails.
 */
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
	struct tcpcb *tp;
	int i;

	tp = pool_get(&tcpcb_pool, PR_NOWAIT);
	if (tp == NULL)
		return ((struct tcpcb *)0);
	bzero((char *) tp, sizeof(struct tcpcb));
	LIST_INIT(&tp->segq);
	tp->t_maxseg = tcp_mssdflt;
	tp->t_maxopd = 0;

	TCP_INIT_DELACK(tp);
	for (i = 0; i < TCPT_NTIMERS; i++)
		TCP_TIMER_INIT(tp, i);

#ifdef TCP_SACK
	tp->sack_enable = tcp_do_sack;
#endif
	/* Request window scaling and timestamps iff RFC1323 is enabled. */
	tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
	tp->t_inpcb = inp;
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 2 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
	tp->t_rttmin = TCPTV_MIN;
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	    TCPTV_MIN, TCPTV_REXMTMAX);
	/* Start cwnd/ssthresh effectively unbounded; tcp_mss() trims later. */
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if ((inp->inp_flags & INP_IPV6) == 0)
		tp->pf = PF_INET;
	else
		tp->pf = PF_INET6;
#else
	tp->pf = PF_INET;
#endif

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		inp->inp_ipv6.ip6_hlim = ip6_defhlim;
	else
#endif /* INET6 */
		inp->inp_ip.ip_ttl = ip_defttl;

	inp->inp_ppcb = (caddr_t)tp;
	return (tp);
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(tp, errno)
	struct tcpcb *tp;
	int errno;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		/* Synchronized: moving to CLOSED makes tcp_output send RST. */
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	/* Prefer a previously recorded soft error over a bare timeout. */
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	/* tcp_close() always returns NULL; propagate that to the caller. */
	return (tcp_close(tp));
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 *
 * Also caches RTT/RTTVAR/ssthresh metrics into the route (when RTV_RTT
 * is configured) so future connections to the same destination start
 * with better estimates.  Always returns NULL; tp is freed.
 */
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef TCP_SACK
	struct sackhole *p, *q;
#endif
#ifdef RTV_RTT
	struct rtentry *rt;
#ifdef INET6
	int bound_to_specific = 0;	/* I.e. non-default */

	/*
	 * This code checks the nature of the route for this connection.
	 * Normally this is done by two simple checks in the next
	 * INET/INET6 ifdef block, but because of two possible lower layers,
	 * that check is done here.
	 *
	 * Perhaps should be doing this only for a RTF_HOST route.
	 */
	rt = inp->inp_route.ro_rt;	/* Same for route or route6. */
	if (tp->pf == PF_INET6) {
		if (rt)
			bound_to_specific =
			    !(IN6_IS_ADDR_UNSPECIFIED(&
			    ((struct sockaddr_in6 *)rt_key(rt))->sin6_addr));
	} else {
		if (rt)
			bound_to_specific =
			    (((struct sockaddr_in *)rt_key(rt))->
			    sin_addr.s_addr != INADDR_ANY);
	}
#endif /* INET6 */

	/*
	 * If we sent enough data to get some meaningful characteristics,
	 * save them in the routing entry. 'Enough' is arbitrarily
	 * defined as the sendpipesize (default 4K) * 16. This would
	 * give us 16 rtt samples assuming we only get one sample per
	 * window (the usual case on a long haul net). 16 samples is
	 * enough for the srtt filter to converge to within 5% of the correct
	 * value; fewer samples and we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
#ifdef INET6
	/*
	 * Note that rt and bound_to_specific are set above.
	 */
	if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
	    rt && bound_to_specific) {
#else /* INET6 */
	if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
	    (rt = inp->inp_route.ro_rt) &&
	    satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY) {
#endif /* INET6 */
		u_long i = 0;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
		}
		/*
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshhold
		 * got below half the pipesize. I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 */
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
		    i < (rt->rt_rmx.rmx_sendpipe / 2)) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
#ifdef INET6
			if (tp->pf == PF_INET6)
				i *= (u_long)(tp->t_maxseg + sizeof (struct tcphdr)
				    + sizeof(struct ip6_hdr));
			else
#endif /* INET6 */
				i *= (u_long)(tp->t_maxseg +
				    sizeof (struct tcpiphdr));

			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
		}
	}
#endif /* RTV_RTT */

	/* free the reassembly queue, if any */
	tcp_freeq(tp);

	tcp_canceltimers(tp);
	TCP_CLEAR_DELACK(tp);
	syn_cache_cleanup(tp);

#ifdef TCP_SACK
	/* Free SACK holes. */
	q = p = tp->snd_holes;
	while (p != 0) {
		q = p->next;
		pool_put(&sackhl_pool, p);
		p = q;
	}
#endif
	if (tp->t_template)
		(void) m_free(tp->t_template);
	pool_put(&tcpcb_pool, tp);
	inp->inp_ppcb = 0;
	soisdisconnected(so);
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}

/*
 * Free the connection's reassembly queue.
 * Returns 1 if any queue entries were freed, 0 if the queue was empty.
 */
int
tcp_freeq(struct tcpcb *tp)
{
	struct ipqent *qe;
	int rv = 0;

	while ((qe = LIST_FIRST(&tp->segq)) != NULL) {
		LIST_REMOVE(qe, ipqe_q);
		m_freem(qe->ipqe_m);
		pool_put(&ipqent_pool, qe);
		rv = 1;
	}
	return (rv);
}

/* Protocol drain hook; TCP currently releases nothing under pressure. */
void
tcp_drain()
{

}

/*
 * Compute proper scaling value for receiver window from buffer space
 */

void
tcp_rscale(struct tcpcb *tp, u_long hiwat)
{
	/* Smallest shift such that TCP_MAXWIN << scale covers hiwat. */
	tp->request_r_scale = 0;
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    TCP_MAXWIN << tp->request_r_scale < hiwat)
		tp->request_r_scale++;
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 */
void
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
	struct socket *so = inp->inp_socket;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now. This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	    error == EHOSTDOWN)) {
		return;
	} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
	    tp->t_rxtshift > 3 && tp->t_softerror)
		so->so_error = error;
	else
		tp->t_softerror = error;
	wakeup((caddr_t) &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

#ifdef INET6
/*
 * IPv6 control input: dispatch ICMPv6-derived errors (unreachables,
 * redirects, packet-too-big) to the affected TCP connections and the
 * SYN cache.
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	void (*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	const struct sockaddr_in6 *sa6_src = NULL;
	struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
	struct mbuf *m;
	int off;
	/* Only the port pair is needed from the embedded TCP header. */
	struct {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;
	if ((unsigned)cmd >= PRC_NCMDS)
		return;
	else if (cmd == PRC_QUENCH) {
		/* XXX there's no PRC_QUENCH in IPv6 */
		notify = tcp_quench;
	} else if (PRC_IS_REDIRECT(cmd))
		notify = in_rtchange, d = NULL;
	else if (cmd == PRC_MSGSIZE)
		; /* special code is present, see below */
	else if (cmd == PRC_HOSTDEAD)
		d = NULL;
	else if (inet6ctlerrmap[cmd] == 0)
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		sa6_src = &sa6_any;
	}

	if (ip6) {
		/*
		 * XXX: We assume that when ip6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		bzero(&th, sizeof(th));
#ifdef DIAGNOSTIC
		if (sizeof(*thp) > sizeof(th))
			panic("assumption failed in tcp6_ctlinput");
#endif
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		if (cmd == PRC_MSGSIZE) {
			int valid = 0;

			/*
			 * Check to see if we have a valid TCP connection
			 * corresponding to the address in the ICMPv6 message
			 * payload.
			 */
			if (in6_pcbhashlookup(&tcbtable, &sa6->sin6_addr,
			    th.th_dport, (struct in6_addr *)&sa6_src->sin6_addr,
			    th.th_sport))
				valid++;

			/*
			 * Depending on the value of "valid" and routing table
			 * size (mtudisc_{hi,lo}wat), we will:
			 * - recalcurate the new MTU and create the
			 *   corresponding routing entry, or
			 * - ignore the MTU change notification.
			 */
			icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);

			return;
		}

		/* No matching pcb: give the SYN cache a chance to react. */
		if (in6_pcbnotify(&tcbtable, sa, th.th_dport,
		    (struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify) == 0 &&
		    syn_cache_count &&
		    (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
		    inet6ctlerrmap[cmd] == ENETUNREACH ||
		    inet6ctlerrmap[cmd] == EHOSTDOWN))
			syn_cache_unreach((struct sockaddr *)sa6_src,
			    sa, &th);
	} else {
		(void) in6_pcbnotify(&tcbtable, sa, 0,
		    (struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
	}
}
#endif

/*
 * IPv4 control input: map a PRC_* command from ICMP onto the matching
 * TCP connections (or the SYN cache), choosing the appropriate notify
 * handler.  Always returns NULL.
 */
void *
tcp_ctlinput(cmd, sa, v)
	int cmd;
	struct sockaddr *sa;
	void *v;
{
	struct ip *ip = v;
	struct tcphdr *th;
	extern int inetctlerrmap[];
	void (*notify)(struct inpcb *, int) = tcp_notify;
	int errno;

	if (sa->sa_family != AF_INET)
		return NULL;

	if ((unsigned)cmd >= PRC_NCMDS)
		return NULL;
	errno = inetctlerrmap[cmd];
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (PRC_IS_REDIRECT(cmd))
		notify = in_rtchange, ip = 0;
	else if (cmd == PRC_MSGSIZE && ip_mtudisc) {
		th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
		/*
		 * Verify that the packet in the icmp payload refers
		 * to an existing TCP connection.
		 */
		/*
		 * XXX is it possible to get a valid PRC_MSGSIZE error for
		 * a non-established connection?
		 */
		if (in_pcbhashlookup(&tcbtable,
		    ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport)) {
			struct icmp *icp;
			/* Recover the enclosing ICMP header from the
			 * embedded IP header pointer. */
			icp = (struct icmp *)((caddr_t)ip -
			    offsetof(struct icmp, icmp_ip));

			/* Calculate new mtu and create corresponding route */
			icmp_mtudisc(icp);
		}
		notify = tcp_mtudisc, ip = 0;
	} else if (cmd == PRC_MTUINC)
		notify = tcp_mtudisc_increase, ip = 0;
	else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	else if (errno == 0)
		return NULL;

	if (ip) {
		th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
		/* No matching pcb: give the SYN cache a chance to react. */
		if (in_pcbnotify(&tcbtable, sa, th->th_dport, ip->ip_src,
		    th->th_sport, errno, notify) == 0 &&
		    syn_cache_count &&
		    (inetctlerrmap[cmd] == EHOSTUNREACH ||
		    inetctlerrmap[cmd] == ENETUNREACH ||
		    inetctlerrmap[cmd] == EHOSTDOWN)) {
			struct sockaddr_in sin;

			bzero(&sin, sizeof(sin));
			sin.sin_len = sizeof(sin);
			sin.sin_family = AF_INET;
			sin.sin_port = th->th_sport;
			sin.sin_addr = ip->ip_src;
			syn_cache_unreach((struct sockaddr *)&sin,
			    sa, th);
		}
	} else
		in_pcbnotifyall(&tcbtable, sa, errno, notify);

	return NULL;
}

/*
 * When a source quench is received, close congestion window
 * to one segment. We will gradually open it again as we proceed.
 */
void
tcp_quench(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp)
		tp->snd_cwnd = tp->t_maxseg;
}

#ifdef INET6
/*
 * Path MTU Discovery handlers.
 */
/*
 * ICMPv6 packet-too-big callback: notify every TCP pcb connected to
 * faddr so tcp_mtudisc() can re-evaluate the path MTU.
 */
void
tcp6_mtudisc_callback(faddr)
	struct in6_addr *faddr;
{
	struct sockaddr_in6 sin6;

	bzero(&sin6, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(struct sockaddr_in6);
	sin6.sin6_addr = *faddr;
	(void) in6_pcbnotify(&tcbtable, (struct sockaddr *)&sin6, 0,
	    (struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp_mtudisc);
}
#endif /* INET6 */

/*
 * On receipt of path MTU corrections, flush old route and replace it
 * with the new one.  Retransmit all unacknowledged packets, to ensure
 * that all packets will be received.
 */
void
tcp_mtudisc(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	struct rtentry *rt = in_pcbrtentry(inp);

	if (tp != 0) {
		if (rt != 0) {
			/*
			 * If this was not a host route, remove and realloc.
			 */
			if ((rt->rt_flags & RTF_HOST) == 0) {
				in_rtchange(inp, errno);
				if ((rt = in_pcbrtentry(inp)) == 0)
					return;
			}

			if (rt->rt_rmx.rmx_mtu != 0) {
				/* also takes care of congestion window */
				tcp_mss(tp, -1);
			}
		}

		/*
		 * Resend unacknowledged packets.
		 */
		tp->snd_nxt = tp->snd_una;
		tcp_output(tp);
	}
}

/*
 * Handle a path-MTU increase notification: drop the (host) route so a
 * fresh one is allocated, then let tcp_mss() pick up the larger MTU.
 */
void
tcp_mtudisc_increase(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	struct rtentry *rt = in_pcbrtentry(inp);

	if (tp != 0 && rt != 0) {
		/*
		 * If this was a host route, remove and realloc.
		 */
		if (rt->rt_flags & RTF_HOST)
			in_rtchange(inp, errno);

		/* also takes care of congestion window */
		tcp_mss(tp, -1);
	}
}

#ifdef TCP_SIGNATURE
/* IPsec-framework hooks for TCP MD5 signatures (RFC 2385). */

/* Nothing to set up at transform attach time. */
int
tcp_signature_tdb_attach()
{
	return (0);
}

/*
 * Copy the MD5 key into the tdb.  Key length must be 1..80 bytes.
 * Returns 0, EINVAL on bad key length, or ENOMEM.
 */
int
tcp_signature_tdb_init(tdbp, xsp, ii)
	struct tdb *tdbp;
	struct xformsw *xsp;
	struct ipsecinit *ii;
{
	if ((ii->ii_authkeylen < 1) || (ii->ii_authkeylen > 80))
		return (EINVAL);

	tdbp->tdb_amxkey = malloc(ii->ii_authkeylen, M_XDATA, M_DONTWAIT);
	if (tdbp->tdb_amxkey == NULL)
		return (ENOMEM);
	bcopy(ii->ii_authkey, tdbp->tdb_amxkey, ii->ii_authkeylen);
	tdbp->tdb_amxkeylen = ii->ii_authkeylen;

	return (0);
}

/* Scrub and release the MD5 key material. */
int
tcp_signature_tdb_zeroize(tdbp)
	struct tdb *tdbp;
{
	if (tdbp->tdb_amxkey) {
		bzero(tdbp->tdb_amxkey, tdbp->tdb_amxkeylen);
		free(tdbp->tdb_amxkey, M_XDATA);
		tdbp->tdb_amxkey = NULL;
	}

	return (0);
}

/* Input hook: signature checking happens elsewhere; accept as-is. */
int
tcp_signature_tdb_input(m, tdbp, skip, protoff)
	struct mbuf *m;
	struct tdb *tdbp;
	int skip, protoff;
{
	return (0);
}

/* Output hook: not supported through this path. */
int
tcp_signature_tdb_output(m, tdbp, mp, skip, protoff)
	struct mbuf *m;
	struct tdb *tdbp;
	struct mbuf **mp;
	int skip, protoff;
{
	return (EINVAL);
}

/* m_apply() callback: fold a data region into the running MD5 context. */
int
tcp_signature_apply(fstate, data, len)
	caddr_t fstate;
	caddr_t data;
	unsigned int len;
{
	MD5Update((MD5_CTX *)fstate, (char *)data, len);
	return 0;
}
#endif /* TCP_SIGNATURE */

/*
 * Randomized initial sequence number generation: a keyed 16-bit
 * permutation of a counter forms the high half of the ISS, re-keyed
 * periodically, with random low-order bits.
 */
#define TCP_RNDISS_ROUNDS	16
#define TCP_RNDISS_OUT	7200
#define TCP_RNDISS_MAX	30000

u_int8_t tcp_rndiss_sbox[128];		/* random S-box, refreshed on reseed */
u_int16_t tcp_rndiss_msb;		/* alternates per reseed epoch */
u_int16_t tcp_rndiss_cnt;		/* counter fed through the permutation */
long tcp_rndiss_reseed;			/* next reseed time (seconds) */

/*
 * 16-round keyed permutation of a 16-bit value, driven by the S-box.
 */
u_int16_t
tcp_rndiss_encrypt(val)
	u_int16_t val;
{
	u_int16_t sum = 0, i;

	for (i = 0; i < TCP_RNDISS_ROUNDS; i++) {
		sum += 0x79b9;
		val ^= ((u_int16_t)tcp_rndiss_sbox[(val^sum) & 0x7f]) << 7;
		val = ((val & 0xff) << 7) | (val >> 8);
	}

	return val;
}

/* Re-key the S-box, flip the epoch bit and restart the counter. */
void
tcp_rndiss_init()
{
	get_random_bytes(tcp_rndiss_sbox, sizeof(tcp_rndiss_sbox));

	tcp_rndiss_reseed = time.tv_sec + TCP_RNDISS_OUT;
	tcp_rndiss_msb = tcp_rndiss_msb == 0x8000 ? 0 : 0x8000;
	tcp_rndiss_cnt = 0;
}

/*
 * Produce the next initial send sequence number, reseeding when the
 * counter is exhausted or the reseed interval has elapsed.
 */
tcp_seq
tcp_rndiss_next()
{
	if (tcp_rndiss_cnt >= TCP_RNDISS_MAX ||
	    time.tv_sec > tcp_rndiss_reseed)
		tcp_rndiss_init();

	/* (arc4random() & 0x7fff) ensures a 32768 byte gap between ISS */
	return ((tcp_rndiss_encrypt(tcp_rndiss_cnt++) | tcp_rndiss_msb) <<16) |
	    (arc4random() & 0x7fff);
}