/*	$NetBSD: tcp_output.c,v 1.74 2001/09/10 22:14:27 thorpej Exp $	*/

/*
%%% portions-copyright-nrl-95
Portions of this software are Copyright 1995-1998 by Randall Atkinson,
Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
Reserved. All rights under this copyright have been assigned to the US
Naval Research Laboratory (NRL). The NRL Copyright Notice and License
Agreement Version 1.1 (January 17, 1995) applies to these portions of the
software.
You should have received a copy of the license with this software. If you
didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.

*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1997, 1998, 2001 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 115 */ 116 117 #include "opt_inet.h" 118 #include "opt_ipsec.h" 119 #include "opt_tcp_debug.h" 120 121 #include <sys/param.h> 122 #include <sys/systm.h> 123 #include <sys/malloc.h> 124 #include <sys/mbuf.h> 125 #include <sys/protosw.h> 126 #include <sys/socket.h> 127 #include <sys/socketvar.h> 128 #include <sys/errno.h> 129 #include <sys/domain.h> 130 #include <sys/kernel.h> 131 132 #include <net/if.h> 133 #include <net/route.h> 134 135 #include <netinet/in.h> 136 #include <netinet/in_systm.h> 137 #include <netinet/ip.h> 138 #include <netinet/in_pcb.h> 139 #include <netinet/ip_var.h> 140 141 #ifdef INET6 142 #ifndef INET 143 #include <netinet/in.h> 144 #endif 145 #include <netinet/ip6.h> 146 #include <netinet6/in6_pcb.h> 147 #include <netinet6/ip6_var.h> 148 #endif 149 150 #include <netinet/tcp.h> 151 #define TCPOUTFLAGS 152 #include <netinet/tcp_fsm.h> 153 #include <netinet/tcp_seq.h> 154 #include <netinet/tcp_timer.h> 155 #include <netinet/tcp_var.h> 156 #include <netinet/tcpip.h> 157 #include <netinet/tcp_debug.h> 158 159 #ifdef notyet 160 extern struct mbuf *m_copypack(); 161 #endif 162 163 #define MAX_TCPOPTLEN 32 
/* max # bytes that go in options */ 164 165 /* 166 * Knob to enable Congestion Window Monitoring, and control the 167 * the burst size it allows. Default burst is 4 packets, per 168 * the Internet draft. 169 */ 170 int tcp_cwm = 1; 171 int tcp_cwm_burstsize = 4; 172 173 static 174 #ifndef GPROF 175 __inline 176 #endif 177 void 178 tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep) 179 { 180 #ifdef INET 181 struct inpcb *inp = tp->t_inpcb; 182 #endif 183 #ifdef INET6 184 struct in6pcb *in6p = tp->t_in6pcb; 185 #endif 186 struct rtentry *rt; 187 struct ifnet *ifp; 188 int size; 189 int iphlen; 190 191 #ifdef DIAGNOSTIC 192 if (tp->t_inpcb && tp->t_in6pcb) 193 panic("tcp_segsize: both t_inpcb and t_in6pcb are set"); 194 #endif 195 switch (tp->t_family) { 196 #ifdef INET 197 case AF_INET: 198 iphlen = sizeof(struct ip); 199 break; 200 #endif 201 #ifdef INET6 202 case AF_INET6: 203 iphlen = sizeof(struct ip6_hdr); 204 break; 205 #endif 206 default: 207 size = tcp_mssdflt; 208 goto out; 209 } 210 211 rt = NULL; 212 #ifdef INET 213 if (inp) 214 rt = in_pcbrtentry(inp); 215 #endif 216 #ifdef INET6 217 if (in6p) 218 rt = in6_pcbrtentry(in6p); 219 #endif 220 if (rt == NULL) { 221 size = tcp_mssdflt; 222 goto out; 223 } 224 225 ifp = rt->rt_ifp; 226 227 size = tcp_mssdflt; 228 if (rt->rt_rmx.rmx_mtu != 0) 229 size = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr); 230 else if (ifp->if_flags & IFF_LOOPBACK) 231 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 232 #ifdef INET 233 else if (inp && ip_mtudisc) 234 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 235 else if (inp && in_localaddr(inp->inp_faddr)) 236 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 237 #endif 238 #ifdef INET6 239 else if (in6p) { 240 #ifdef INET 241 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) { 242 /* mapped addr case */ 243 struct in_addr d; 244 bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d)); 245 if (ip_mtudisc || in_localaddr(d)) 246 size = ifp->if_mtu - iphlen 
- sizeof(struct tcphdr); 247 } else 248 #endif 249 { 250 /* 251 * for IPv6, path MTU discovery is always turned on, 252 * or the node must use packet size <= 1280. 253 */ 254 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 255 } 256 } 257 #endif 258 size -= tcp_optlen(tp); 259 /* 260 * XXX tp->t_ourmss should have the right size, but without this code 261 * fragmentation will occur... need more investigation 262 */ 263 #ifdef INET 264 if (inp) { 265 #ifdef IPSEC 266 size -= ipsec4_hdrsiz_tcp(tp); 267 #endif 268 size -= ip_optlen(inp); 269 } 270 #endif 271 #ifdef INET6 272 #ifdef INET 273 if (in6p && tp->t_family == AF_INET) { 274 #ifdef IPSEC 275 size -= ipsec4_hdrsiz_tcp(tp); 276 #endif 277 /* XXX size -= ip_optlen(in6p); */ 278 } else 279 #endif 280 if (in6p && tp->t_family == AF_INET6) { 281 #ifdef IPSEC 282 size -= ipsec6_hdrsiz_tcp(tp); 283 #endif 284 size -= ip6_optlen(in6p); 285 } 286 #endif 287 288 out: 289 /* 290 * *rxsegsizep holds *estimated* inbound segment size (estimation 291 * assumes that path MTU is the same for both ways). this is only 292 * for silly window avoidance, do not use the value for other purposes. 293 * 294 * ipseclen is subtracted from both sides, this may not be right. 295 * I'm not quite sure about this (could someone comment). 296 */ 297 *txsegsizep = min(tp->t_peermss, size); 298 *rxsegsizep = min(tp->t_ourmss, size); 299 300 if (*txsegsizep != tp->t_segsz) { 301 /* 302 * If the new segment size is larger, we don't want to 303 * mess up the congestion window, but if it is smaller 304 * we'll have to reduce the congestion window to ensure 305 * that we don't get into trouble with initial windows 306 * and the rest. In any case, if the segment size 307 * has changed, chances are the path has, too, and 308 * our congestion window will be different. 
309 */ 310 if (*txsegsizep < tp->t_segsz) { 311 tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz) 312 * *txsegsizep, *txsegsizep); 313 tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz) 314 * *txsegsizep, *txsegsizep); 315 } 316 tp->t_segsz = *txsegsizep; 317 } 318 } 319 320 static 321 #ifndef GPROF 322 __inline 323 #endif 324 int 325 tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off, 326 long len, int hdrlen, struct mbuf **mp) 327 { 328 struct mbuf *m; 329 330 if (tp->t_force && len == 1) 331 tcpstat.tcps_sndprobe++; 332 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 333 tcpstat.tcps_sndrexmitpack++; 334 tcpstat.tcps_sndrexmitbyte += len; 335 } else { 336 tcpstat.tcps_sndpack++; 337 tcpstat.tcps_sndbyte += len; 338 } 339 #ifdef notyet 340 if ((m = m_copypack(so->so_snd.sb_mb, off, 341 (int)len, max_linkhdr + hdrlen)) == 0) 342 return (ENOBUFS); 343 /* 344 * m_copypack left space for our hdr; use it. 345 */ 346 m->m_len += hdrlen; 347 m->m_data -= hdrlen; 348 #else 349 MGETHDR(m, M_DONTWAIT, MT_HEADER); 350 if (m != NULL && 351 (max_linkhdr + hdrlen > MHLEN || 352 max_linkhdr + hdrlen + len <= MCLBYTES)) { 353 MCLGET(m, M_DONTWAIT); 354 if ((m->m_flags & M_EXT) == 0) { 355 m_freem(m); 356 m = NULL; 357 } 358 } 359 if (m == NULL) 360 return (ENOBUFS); 361 m->m_data += max_linkhdr; 362 m->m_len = hdrlen; 363 if (len <= M_TRAILINGSPACE(m)) { 364 m_copydata(so->so_snd.sb_mb, off, (int) len, 365 mtod(m, caddr_t) + hdrlen); 366 m->m_len += len; 367 } else { 368 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 369 if (m->m_next == NULL) { 370 m_freem(m); 371 return (ENOBUFS); 372 } 373 } 374 #endif 375 376 *mp = m; 377 return (0); 378 } 379 380 /* 381 * Tcp output routine: figure out what should be sent and send it. 
382 */ 383 int 384 tcp_output(tp) 385 struct tcpcb *tp; 386 { 387 struct socket *so; 388 struct route *ro; 389 long len, win; 390 int off, flags, error; 391 struct mbuf *m; 392 struct ip *ip; 393 #ifdef INET6 394 struct ip6_hdr *ip6; 395 #endif 396 struct tcphdr *th; 397 u_char opt[MAX_TCPOPTLEN]; 398 unsigned optlen, hdrlen; 399 int idle, sendalot, txsegsize, rxsegsize; 400 int maxburst = TCP_MAXBURST; 401 int af; /* address family on the wire */ 402 int iphdrlen; 403 404 #ifdef DIAGNOSTIC 405 if (tp->t_inpcb && tp->t_in6pcb) 406 panic("tcp_output: both t_inpcb and t_in6pcb are set"); 407 #endif 408 so = NULL; 409 ro = NULL; 410 if (tp->t_inpcb) { 411 so = tp->t_inpcb->inp_socket; 412 ro = &tp->t_inpcb->inp_route; 413 } 414 #ifdef INET6 415 else if (tp->t_in6pcb) { 416 so = tp->t_in6pcb->in6p_socket; 417 ro = (struct route *)&tp->t_in6pcb->in6p_route; 418 } 419 #endif 420 421 switch (af = tp->t_family) { 422 #ifdef INET 423 case AF_INET: 424 if (tp->t_inpcb) 425 break; 426 #ifdef INET6 427 /* mapped addr case */ 428 if (tp->t_in6pcb) 429 break; 430 #endif 431 return EINVAL; 432 #endif 433 #ifdef INET6 434 case AF_INET6: 435 if (tp->t_in6pcb) 436 break; 437 return EINVAL; 438 #endif 439 default: 440 return EAFNOSUPPORT; 441 } 442 443 tcp_segsize(tp, &txsegsize, &rxsegsize); 444 445 idle = (tp->snd_max == tp->snd_una); 446 447 /* 448 * Restart Window computation. From draft-floyd-incr-init-win-03: 449 * 450 * Optionally, a TCP MAY set the restart window to the 451 * minimum of the value used for the initial window and 452 * the current value of cwnd (in other words, using a 453 * larger value for the restart window should never increase 454 * the size of cwnd). 455 */ 456 if (tcp_cwm) { 457 /* 458 * Hughes/Touch/Heidemann Congestion Window Monitoring. 459 * Count the number of packets currently pending 460 * acknowledgement, and limit our congestion window 461 * to a pre-determined allowed burst size plus that count. 
462 * This prevents bursting once all pending packets have 463 * been acknowledged (i.e. transmission is idle). 464 * 465 * XXX Link this to Initial Window? 466 */ 467 tp->snd_cwnd = min(tp->snd_cwnd, 468 (tcp_cwm_burstsize * txsegsize) + 469 (tp->snd_nxt - tp->snd_una)); 470 } else { 471 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) { 472 /* 473 * We have been idle for "a while" and no acks are 474 * expected to clock out any data we send -- 475 * slow start to get ack "clock" running again. 476 */ 477 tp->snd_cwnd = min(tp->snd_cwnd, 478 TCP_INITIAL_WINDOW(tcp_init_win, txsegsize)); 479 } 480 } 481 482 again: 483 /* 484 * Determine length of data that should be transmitted, and 485 * flags that should be used. If there is some data or critical 486 * controls (SYN, RST) to send, then transmit; otherwise, 487 * investigate further. 488 */ 489 sendalot = 0; 490 off = tp->snd_nxt - tp->snd_una; 491 win = min(tp->snd_wnd, tp->snd_cwnd); 492 493 flags = tcp_outflags[tp->t_state]; 494 /* 495 * If in persist timeout with window of 0, send 1 byte. 496 * Otherwise, if window is small but nonzero 497 * and timer expired, we will send what we can 498 * and go to transmit state. 499 */ 500 if (tp->t_force) { 501 if (win == 0) { 502 /* 503 * If we still have some data to send, then 504 * clear the FIN bit. Usually this would 505 * happen below when it realizes that we 506 * aren't sending all the data. However, 507 * if we have exactly 1 byte of unset data, 508 * then it won't clear the FIN bit below, 509 * and if we are in persist state, we wind 510 * up sending the packet without recording 511 * that we sent the FIN bit. 512 * 513 * We can't just blindly clear the FIN bit, 514 * because if we don't have any more data 515 * to send then the probe will be the FIN 516 * itself. 
517 */ 518 if (off < so->so_snd.sb_cc) 519 flags &= ~TH_FIN; 520 win = 1; 521 } else { 522 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 523 tp->t_rxtshift = 0; 524 } 525 } 526 527 if (win < so->so_snd.sb_cc) { 528 len = win - off; 529 flags &= ~TH_FIN; 530 } else 531 len = so->so_snd.sb_cc - off; 532 533 if (len < 0) { 534 /* 535 * If FIN has been sent but not acked, 536 * but we haven't been called to retransmit, 537 * len will be -1. Otherwise, window shrank 538 * after we sent into it. If window shrank to 0, 539 * cancel pending retransmit, pull snd_nxt back 540 * to (closed) window, and set the persist timer 541 * if it isn't already going. If the window didn't 542 * close completely, just wait for an ACK. 543 * 544 * If we have a pending FIN, either it has already been 545 * transmitted or it is outside the window, so drop it. 546 * If the FIN has been transmitted, but this is not a 547 * retransmission, then len must be -1. Therefore we also 548 * prevent here the sending of `gratuitous FINs'. This 549 * eliminates the need to check for that case below (e.g. 550 * to back up snd_nxt before the FIN so that the sequence 551 * number is correct). 552 */ 553 len = 0; 554 flags &= ~TH_FIN; 555 if (win == 0) { 556 TCP_TIMER_DISARM(tp, TCPT_REXMT); 557 tp->t_rxtshift = 0; 558 tp->snd_nxt = tp->snd_una; 559 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 560 tcp_setpersist(tp); 561 } 562 } 563 if (len > txsegsize) { 564 len = txsegsize; 565 flags &= ~TH_FIN; 566 sendalot = 1; 567 } 568 569 win = sbspace(&so->so_rcv); 570 571 /* 572 * Sender silly window avoidance. If connection is idle 573 * and can send all data, a maximum segment, 574 * at least a maximum default-size segment do it, 575 * or are forced, do it; otherwise don't bother. 576 * If peer's buffer is tiny, then send 577 * when window is at least half open. 578 * If retransmitting (possibly after persist timer forced us 579 * to send into a small window), then must resend. 
580 */ 581 if (len) { 582 if (len == txsegsize) 583 goto send; 584 if ((so->so_state & SS_MORETOCOME) == 0 && 585 ((idle || tp->t_flags & TF_NODELAY) && 586 len + off >= so->so_snd.sb_cc)) 587 goto send; 588 if (tp->t_force) 589 goto send; 590 if (len >= tp->max_sndwnd / 2) 591 goto send; 592 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 593 goto send; 594 } 595 596 /* 597 * Compare available window to amount of window known to peer 598 * (as advertised window less next expected input). If the 599 * difference is at least twice the size of the largest segment 600 * we expect to receive (i.e. two segments) or at least 50% of 601 * the maximum possible window, then want to send a window update 602 * to peer. 603 */ 604 if (win > 0) { 605 /* 606 * "adv" is the amount we can increase the window, 607 * taking into account that we are limited by 608 * TCP_MAXWIN << tp->rcv_scale. 609 */ 610 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - 611 (tp->rcv_adv - tp->rcv_nxt); 612 613 if (adv >= (long) (2 * rxsegsize)) 614 goto send; 615 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 616 goto send; 617 } 618 619 /* 620 * Send if we owe peer an ACK. 621 */ 622 if (tp->t_flags & TF_ACKNOW) 623 goto send; 624 if (flags & (TH_SYN|TH_FIN|TH_RST)) 625 goto send; 626 if (SEQ_GT(tp->snd_up, tp->snd_una)) 627 goto send; 628 629 /* 630 * TCP window updates are not reliable, rather a polling protocol 631 * using ``persist'' packets is used to insure receipt of window 632 * updates. The three ``states'' for the output side are: 633 * idle not doing retransmits or persists 634 * persisting to move a small or zero window 635 * (re)transmitting and thereby not persisting 636 * 637 * tp->t_timer[TCPT_PERSIST] 638 * is set when we are in persist state. 639 * tp->t_force 640 * is set when we are called to send a persist packet. 641 * tp->t_timer[TCPT_REXMT] 642 * is set when we are retransmitting 643 * The output side is idle when both timers are zero. 
644 * 645 * If send window is too small, there is data to transmit, and no 646 * retransmit or persist is pending, then go to persist state. 647 * If nothing happens soon, send when timer expires: 648 * if window is nonzero, transmit what we can, 649 * otherwise force out a byte. 650 */ 651 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 652 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 653 tp->t_rxtshift = 0; 654 tcp_setpersist(tp); 655 } 656 657 /* 658 * No reason to send a segment, just return. 659 */ 660 return (0); 661 662 send: 663 /* 664 * Before ESTABLISHED, force sending of initial options 665 * unless TCP set not to do any options. 666 * NOTE: we assume that the IP/TCP header plus TCP options 667 * always fit in a single mbuf, leaving room for a maximum 668 * link header, i.e. 669 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES 670 */ 671 optlen = 0; 672 switch (af) { 673 #ifdef INET 674 case AF_INET: 675 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 676 break; 677 #endif 678 #ifdef INET6 679 case AF_INET6: 680 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 681 break; 682 #endif 683 default: /*pacify gcc*/ 684 iphdrlen = 0; 685 break; 686 } 687 hdrlen = iphdrlen; 688 if (flags & TH_SYN) { 689 struct rtentry *rt; 690 691 rt = NULL; 692 #ifdef INET 693 if (tp->t_inpcb) 694 rt = in_pcbrtentry(tp->t_inpcb); 695 #endif 696 #ifdef INET6 697 if (tp->t_in6pcb) 698 rt = in6_pcbrtentry(tp->t_in6pcb); 699 #endif 700 701 tp->snd_nxt = tp->iss; 702 tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ? 
703 rt->rt_ifp : NULL, af); 704 if ((tp->t_flags & TF_NOOPT) == 0) { 705 opt[0] = TCPOPT_MAXSEG; 706 opt[1] = 4; 707 opt[2] = (tp->t_ourmss >> 8) & 0xff; 708 opt[3] = tp->t_ourmss & 0xff; 709 optlen = 4; 710 711 if ((tp->t_flags & TF_REQ_SCALE) && 712 ((flags & TH_ACK) == 0 || 713 (tp->t_flags & TF_RCVD_SCALE))) { 714 *((u_int32_t *) (opt + optlen)) = htonl( 715 TCPOPT_NOP << 24 | 716 TCPOPT_WINDOW << 16 | 717 TCPOLEN_WINDOW << 8 | 718 tp->request_r_scale); 719 optlen += 4; 720 } 721 } 722 } 723 724 /* 725 * Send a timestamp and echo-reply if this is a SYN and our side 726 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 727 * and our peer have sent timestamps in our SYN's. 728 */ 729 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 730 (flags & TH_RST) == 0 && 731 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 732 (tp->t_flags & TF_RCVD_TSTMP))) { 733 u_int32_t *lp = (u_int32_t *)(opt + optlen); 734 735 /* Form timestamp option as shown in appendix A of RFC 1323. */ 736 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 737 *lp++ = htonl(TCP_TIMESTAMP(tp)); 738 *lp = htonl(tp->ts_recent); 739 optlen += TCPOLEN_TSTAMP_APPA; 740 } 741 742 hdrlen += optlen; 743 744 #ifdef DIAGNOSTIC 745 if (len > txsegsize) 746 panic("tcp data to be sent is larger than segment"); 747 if (max_linkhdr + hdrlen > MCLBYTES) 748 panic("tcphdr too big"); 749 #endif 750 751 /* 752 * Grab a header mbuf, attaching a copy of data to 753 * be transmitted, and initialize the header from 754 * the template for sends on this connection. 755 */ 756 if (len) { 757 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m); 758 if (error) 759 goto out; 760 /* 761 * If we're sending everything we've got, set PUSH. 762 * (This will keep happy those implementations which only 763 * give data to the user when a buffer fills or 764 * a PUSH comes in.) 
765 */ 766 if (off + len == so->so_snd.sb_cc) 767 flags |= TH_PUSH; 768 } else { 769 if (tp->t_flags & TF_ACKNOW) 770 tcpstat.tcps_sndacks++; 771 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 772 tcpstat.tcps_sndctrl++; 773 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 774 tcpstat.tcps_sndurg++; 775 else 776 tcpstat.tcps_sndwinup++; 777 778 MGETHDR(m, M_DONTWAIT, MT_HEADER); 779 if (m != NULL && max_linkhdr + hdrlen > MHLEN) { 780 MCLGET(m, M_DONTWAIT); 781 if ((m->m_flags & M_EXT) == 0) { 782 m_freem(m); 783 m = NULL; 784 } 785 } 786 if (m == NULL) { 787 error = ENOBUFS; 788 goto out; 789 } 790 m->m_data += max_linkhdr; 791 m->m_len = hdrlen; 792 } 793 m->m_pkthdr.rcvif = (struct ifnet *)0; 794 switch (af) { 795 #ifdef INET 796 case AF_INET: 797 ip = mtod(m, struct ip *); 798 #ifdef INET6 799 ip6 = NULL; 800 #endif 801 th = (struct tcphdr *)(ip + 1); 802 break; 803 #endif 804 #ifdef INET6 805 case AF_INET6: 806 ip = NULL; 807 ip6 = mtod(m, struct ip6_hdr *); 808 th = (struct tcphdr *)(ip6 + 1); 809 break; 810 #endif 811 default: /*pacify gcc*/ 812 ip = NULL; 813 #ifdef INET6 814 ip6 = NULL; 815 #endif 816 th = NULL; 817 break; 818 } 819 if (tp->t_template == 0) 820 panic("tcp_output"); 821 if (tp->t_template->m_len < iphdrlen) 822 panic("tcp_output"); 823 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen); 824 825 /* 826 * If we are doing retransmissions, then snd_nxt will 827 * not reflect the first unsent octet. For ACK only 828 * packets, we do not want the sequence number of the 829 * retransmitted packet, we want the sequence number 830 * of the next unsent octet. So, if there is no data 831 * (and no SYN or FIN), use snd_max instead of snd_nxt 832 * when filling in ti_seq. But if we are in persist 833 * state, snd_max might reflect one byte beyond the 834 * right edge of the window, so use snd_nxt in that 835 * case, since we know we aren't doing a retransmission. 836 * (retransmit and persist are mutually exclusive...) 
837 */ 838 if (len || (flags & (TH_SYN|TH_FIN)) || 839 TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) 840 th->th_seq = htonl(tp->snd_nxt); 841 else 842 th->th_seq = htonl(tp->snd_max); 843 th->th_ack = htonl(tp->rcv_nxt); 844 if (optlen) { 845 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); 846 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 847 } 848 th->th_flags = flags; 849 /* 850 * Calculate receive window. Don't shrink window, 851 * but avoid silly window syndrome. 852 */ 853 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize) 854 win = 0; 855 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 856 win = (long)TCP_MAXWIN << tp->rcv_scale; 857 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 858 win = (long)(tp->rcv_adv - tp->rcv_nxt); 859 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 860 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 861 u_int32_t urp = tp->snd_up - tp->snd_nxt; 862 if (urp > IP_MAXPACKET) 863 urp = IP_MAXPACKET; 864 th->th_urp = htons((u_int16_t)urp); 865 th->th_flags |= TH_URG; 866 } else 867 /* 868 * If no urgent pointer to send, then we pull 869 * the urgent pointer to the left edge of the send window 870 * so that it doesn't drift into the send window on sequence 871 * number wraparound. 872 */ 873 tp->snd_up = tp->snd_una; /* drag it along */ 874 875 /* 876 * Set ourselves up to be checksummed just before the packet 877 * hits the wire. 878 */ 879 switch (af) { 880 #ifdef INET 881 case AF_INET: 882 m->m_pkthdr.csum_flags = M_CSUM_TCPv4; 883 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 884 if (len + optlen) { 885 /* Fixup the pseudo-header checksum. */ 886 /* XXXJRT Not IP Jumbogram safe. 
*/ 887 th->th_sum = in_cksum_addword(th->th_sum, 888 htons((u_int16_t) (len + optlen))); 889 } 890 break; 891 #endif 892 #ifdef INET6 893 case AF_INET6: 894 /* 895 * XXX Actually delaying the checksum is Hard 896 * XXX (well, maybe not for Itojun, but it is 897 * XXX for me), but we can still take advantage 898 * XXX of the cached pseudo-header checksum. 899 */ 900 /* equals to hdrlen + len */ 901 m->m_pkthdr.len = sizeof(struct ip6_hdr) 902 + sizeof(struct tcphdr) + optlen + len; 903 #ifdef notyet 904 m->m_pkthdr.csum_flags = M_CSUM_TCPv6; 905 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 906 #endif 907 if (len + optlen) { 908 /* Fixup the pseudo-header checksum. */ 909 /* XXXJRT: Not IPv6 Jumbogram safe. */ 910 th->th_sum = in_cksum_addword(th->th_sum, 911 htons((u_int16_t) (len + optlen))); 912 } 913 #ifndef notyet 914 th->th_sum = in6_cksum(m, 0, sizeof(struct ip6_hdr), 915 sizeof(struct tcphdr) + optlen + len); 916 #endif 917 break; 918 #endif 919 } 920 921 /* 922 * In transmit state, time the transmission and arrange for 923 * the retransmit. In persist state, just set snd_max. 924 */ 925 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 926 tcp_seq startseq = tp->snd_nxt; 927 928 /* 929 * Advance snd_nxt over sequence space of this segment. 930 * There are no states in which we send both a SYN and a FIN, 931 * so we collapse the tests for these flags. 932 */ 933 if (flags & (TH_SYN|TH_FIN)) 934 tp->snd_nxt++; 935 tp->snd_nxt += len; 936 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 937 tp->snd_max = tp->snd_nxt; 938 /* 939 * Time this transmission if not a retransmission and 940 * not currently timing anything. 941 */ 942 if (tp->t_rtttime == 0) { 943 tp->t_rtttime = tcp_now; 944 tp->t_rtseq = startseq; 945 tcpstat.tcps_segstimed++; 946 } 947 } 948 949 /* 950 * Set retransmit timer if not currently set, 951 * and not doing an ack or a keep-alive probe. 
952 * Initial value for retransmit timer is smoothed 953 * round-trip time + 2 * round-trip time variance. 954 * Initialize shift counter which is used for backoff 955 * of retransmit time. 956 */ 957 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 958 tp->snd_nxt != tp->snd_una) { 959 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 960 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { 961 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 962 tp->t_rxtshift = 0; 963 } 964 } 965 } else 966 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 967 tp->snd_max = tp->snd_nxt + len; 968 969 #ifdef TCP_DEBUG 970 /* 971 * Trace. 972 */ 973 if (so->so_options & SO_DEBUG) { 974 /* 975 * need to recover version # field, which was overwritten 976 * on ip_cksum computation. 977 */ 978 struct ip *sip; 979 sip = mtod(m, struct ip *); 980 switch (af) { 981 #ifdef INET 982 case AF_INET: 983 sip->ip_v = 4; 984 break; 985 #endif 986 #ifdef INET6 987 case AF_INET6: 988 sip->ip_v = 6; 989 break; 990 #endif 991 } 992 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0); 993 } 994 #endif 995 996 /* 997 * Fill in IP length and desired time to live and 998 * send to IP level. There should be a better way 999 * to handle ttl and tos; we could keep them in 1000 * the template, but need a way to checksum without them. 1001 */ 1002 m->m_pkthdr.len = hdrlen + len; 1003 1004 switch (af) { 1005 #ifdef INET 1006 case AF_INET: 1007 ip->ip_len = m->m_pkthdr.len; 1008 if (tp->t_inpcb) { 1009 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 1010 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; 1011 } 1012 #ifdef INET6 1013 else if (tp->t_in6pcb) { 1014 ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/ 1015 ip->ip_tos = 0; /*XXX*/ 1016 } 1017 #endif 1018 break; 1019 #endif 1020 #ifdef INET6 1021 case AF_INET6: 1022 ip6->ip6_nxt = IPPROTO_TCP; 1023 if (tp->t_in6pcb) { 1024 /* 1025 * we separately set hoplimit for every segment, since 1026 * the user might want to change the value via 1027 * setsockopt. 
Also, desired default hop limit might 1028 * be changed via Neighbor Discovery. 1029 */ 1030 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb, 1031 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 1032 } 1033 /* ip6->ip6_flow = ??? */ 1034 /* ip6_plen will be filled in ip6_output(). */ 1035 break; 1036 #endif 1037 } 1038 1039 #ifdef IPSEC 1040 if (ipsec_setsocket(m, so) != 0) { 1041 m_freem(m); 1042 error = ENOBUFS; 1043 goto out; 1044 } 1045 #endif /*IPSEC*/ 1046 1047 switch (af) { 1048 #ifdef INET 1049 case AF_INET: 1050 { 1051 struct mbuf *opts; 1052 1053 if (tp->t_inpcb) 1054 opts = tp->t_inpcb->inp_options; 1055 else 1056 opts = NULL; 1057 error = ip_output(m, opts, ro, 1058 (ip_mtudisc ? IP_MTUDISC : 0) | 1059 (so->so_options & SO_DONTROUTE), 1060 0); 1061 break; 1062 } 1063 #endif 1064 #ifdef INET6 1065 case AF_INET6: 1066 { 1067 struct ip6_pktopts *opts; 1068 1069 if (tp->t_in6pcb) 1070 opts = tp->t_in6pcb->in6p_outputopts; 1071 else 1072 opts = NULL; 1073 error = ip6_output(m, opts, (struct route_in6 *)ro, 1074 so->so_options & SO_DONTROUTE, 0, NULL); 1075 break; 1076 } 1077 #endif 1078 default: 1079 error = EAFNOSUPPORT; 1080 break; 1081 } 1082 if (error) { 1083 out: 1084 if (error == ENOBUFS) { 1085 tcpstat.tcps_selfquench++; 1086 #ifdef INET 1087 if (tp->t_inpcb) 1088 tcp_quench(tp->t_inpcb, 0); 1089 #endif 1090 #ifdef INET6 1091 if (tp->t_in6pcb) 1092 tcp6_quench(tp->t_in6pcb, 0); 1093 #endif 1094 error = 0; 1095 } else if ((error == EHOSTUNREACH || error == ENETDOWN) && 1096 TCPS_HAVERCVDSYN(tp->t_state)) { 1097 tp->t_softerror = error; 1098 error = 0; 1099 } 1100 1101 /* Restart the delayed ACK timer, if necessary. */ 1102 if (tp->t_flags & TF_DELACK) 1103 TCP_RESTART_DELACK(tp); 1104 1105 return (error); 1106 } 1107 tcpstat.tcps_sndtotal++; 1108 if (tp->t_flags & TF_DELACK) 1109 tcpstat.tcps_delack++; 1110 1111 /* 1112 * Data sent (as far as we can tell). 
1113 * If this advertises a larger window than any other segment, 1114 * then remember the size of the advertised window. 1115 * Any pending ACK has now been sent. 1116 */ 1117 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1118 tp->rcv_adv = tp->rcv_nxt + win; 1119 tp->last_ack_sent = tp->rcv_nxt; 1120 tp->t_flags &= ~TF_ACKNOW; 1121 TCP_CLEAR_DELACK(tp); 1122 #ifdef DIAGNOSTIC 1123 if (maxburst < 0) 1124 printf("tcp_output: maxburst exceeded by %d\n", -maxburst); 1125 #endif 1126 if (sendalot && (!tcp_do_newreno || --maxburst)) 1127 goto again; 1128 return (0); 1129 } 1130 1131 void 1132 tcp_setpersist(tp) 1133 struct tcpcb *tp; 1134 { 1135 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2); 1136 int nticks; 1137 1138 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT)) 1139 panic("tcp_output REXMT"); 1140 /* 1141 * Start/restart persistance timer. 1142 */ 1143 if (t < tp->t_rttmin) 1144 t = tp->t_rttmin; 1145 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift], 1146 TCPTV_PERSMIN, TCPTV_PERSMAX); 1147 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks); 1148 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1149 tp->t_rxtshift++; 1150 } 1151