1 /* $NetBSD: tcp_output.c,v 1.76 2001/12/03 01:45:43 jmcneill Exp $ */ 2 3 /* 4 %%% portions-copyright-nrl-95 5 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 7 Reserved. All rights under this copyright have been assigned to the US 8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 10 software. 11 You should have received a copy of the license with this software. If you 12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 13 14 */ 15 16 /* 17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 18 * All rights reserved. 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 1. Redistributions of source code must retain the above copyright 24 * notice, this list of conditions and the following disclaimer. 25 * 2. Redistributions in binary form must reproduce the above copyright 26 * notice, this list of conditions and the following disclaimer in the 27 * documentation and/or other materials provided with the distribution. 28 * 3. Neither the name of the project nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 45 /*- 46 * Copyright (c) 1997, 1998, 2001 The NetBSD Foundation, Inc. 47 * All rights reserved. 48 * 49 * This code is derived from software contributed to The NetBSD Foundation 50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 51 * Facility, NASA Ames Research Center. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 3. All advertising materials mentioning features or use of this software 62 * must display the following acknowledgement: 63 * This product includes software developed by the NetBSD 64 * Foundation, Inc. and its contributors. 65 * 4. Neither the name of The NetBSD Foundation nor the names of its 66 * contributors may be used to endorse or promote products derived 67 * from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 * POSSIBILITY OF SUCH DAMAGE. 80 */ 81 82 /* 83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 84 * The Regents of the University of California. All rights reserved. 85 * 86 * Redistribution and use in source and binary forms, with or without 87 * modification, are permitted provided that the following conditions 88 * are met: 89 * 1. Redistributions of source code must retain the above copyright 90 * notice, this list of conditions and the following disclaimer. 91 * 2. Redistributions in binary form must reproduce the above copyright 92 * notice, this list of conditions and the following disclaimer in the 93 * documentation and/or other materials provided with the distribution. 94 * 3. All advertising materials mentioning features or use of this software 95 * must display the following acknowledgement: 96 * This product includes software developed by the University of 97 * California, Berkeley and its contributors. 98 * 4. Neither the name of the University nor the names of its contributors 99 * may be used to endorse or promote products derived from this software 100 * without specific prior written permission. 101 * 102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 112 * SUCH DAMAGE. 113 * 114 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 115 */ 116 117 #include <sys/cdefs.h> 118 __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.76 2001/12/03 01:45:43 jmcneill Exp $"); 119 120 #include "opt_inet.h" 121 #include "opt_ipsec.h" 122 #include "opt_tcp_debug.h" 123 124 #include <sys/param.h> 125 #include <sys/systm.h> 126 #include <sys/malloc.h> 127 #include <sys/mbuf.h> 128 #include <sys/protosw.h> 129 #include <sys/socket.h> 130 #include <sys/socketvar.h> 131 #include <sys/errno.h> 132 #include <sys/domain.h> 133 #include <sys/kernel.h> 134 135 #include <net/if.h> 136 #include <net/route.h> 137 138 #include <netinet/in.h> 139 #include <netinet/in_systm.h> 140 #include <netinet/ip.h> 141 #include <netinet/in_pcb.h> 142 #include <netinet/ip_var.h> 143 144 #ifdef INET6 145 #ifndef INET 146 #include <netinet/in.h> 147 #endif 148 #include <netinet/ip6.h> 149 #include <netinet6/in6_pcb.h> 150 #include <netinet6/ip6_var.h> 151 #endif 152 153 #include <netinet/tcp.h> 154 #define TCPOUTFLAGS 155 #include <netinet/tcp_fsm.h> 156 #include <netinet/tcp_seq.h> 157 #include <netinet/tcp_timer.h> 158 #include <netinet/tcp_var.h> 159 #include <netinet/tcpip.h> 160 #include <netinet/tcp_debug.h> 161 162 #ifdef notyet 163 extern struct mbuf *m_copypack(); 164 #endif 165 166 #define MAX_TCPOPTLEN 32 /* max # bytes that go in options */ 167 168 /* 169 * Knob to enable Congestion Window Monitoring, and control the 170 * the burst size it allows. Default burst is 4 packets, per 171 * the Internet draft. 172 */ 173 int tcp_cwm = 1; 174 int tcp_cwm_burstsize = 4; 175 176 static 177 #ifndef GPROF 178 __inline 179 #endif 180 void 181 tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep) 182 { 183 #ifdef INET 184 struct inpcb *inp = tp->t_inpcb; 185 #endif 186 #ifdef INET6 187 struct in6pcb *in6p = tp->t_in6pcb; 188 #endif 189 struct rtentry *rt; 190 struct ifnet *ifp; 191 int size; 192 int iphlen; 193 int optlen; 194 195 #ifdef DIAGNOSTIC 196 if (tp->t_inpcb && tp->t_in6pcb) 197 panic("tcp_segsize: both t_inpcb and t_in6pcb are set"); 198 #endif 199 switch (tp->t_family) { 200 #ifdef INET 201 case AF_INET: 202 iphlen = sizeof(struct ip); 203 break; 204 #endif 205 #ifdef INET6 206 case AF_INET6: 207 iphlen = sizeof(struct ip6_hdr); 208 break; 209 #endif 210 default: 211 size = tcp_mssdflt; 212 goto out; 213 } 214 215 rt = NULL; 216 #ifdef INET 217 if (inp) 218 rt = in_pcbrtentry(inp); 219 #endif 220 #ifdef INET6 221 if (in6p) 222 rt = in6_pcbrtentry(in6p); 223 #endif 224 if (rt == NULL) { 225 size = tcp_mssdflt; 226 goto out; 227 } 228 229 ifp = rt->rt_ifp; 230 231 size = tcp_mssdflt; 232 if (rt->rt_rmx.rmx_mtu != 0) 233 size = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr); 234 else if (ifp->if_flags & IFF_LOOPBACK) 235 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 236 #ifdef INET 237 else if (inp && ip_mtudisc) 238 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 239 else if (inp && in_localaddr(inp->inp_faddr)) 240 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 241 #endif 242 #ifdef INET6 243 else if (in6p) { 244 #ifdef INET 245 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) { 246 /* mapped addr case */ 247 struct in_addr d; 248 bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d)); 249 if (ip_mtudisc || in_localaddr(d)) 250 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 251 } else 252 #endif 253 { 254 /* 255 * for IPv6, path MTU discovery is always turned on, 256 * or the node must use packet size <= 1280. 257 */ 258 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 259 } 260 } 261 #endif 262 /* 263 * Now we must make room for whatever extra TCP/IP options are in 264 * the packet. 265 */ 266 optlen = tcp_optlen(tp); 267 268 /* 269 * XXX tp->t_ourmss should have the right size, but without this code 270 * fragmentation will occur... need more investigation 271 */ 272 #ifdef INET 273 if (inp) { 274 #ifdef IPSEC 275 optlen += ipsec4_hdrsiz_tcp(tp); 276 #endif 277 optlen += ip_optlen(inp); 278 } 279 #endif 280 #ifdef INET6 281 #ifdef INET 282 if (in6p && tp->t_family == AF_INET) { 283 #ifdef IPSEC 284 optlen += ipsec4_hdrsiz_tcp(tp); 285 #endif 286 /* XXX size -= ip_optlen(in6p); */ 287 } else 288 #endif 289 if (in6p && tp->t_family == AF_INET6) { 290 #ifdef IPSEC 291 optlen += ipsec6_hdrsiz_tcp(tp); 292 #endif 293 optlen += ip6_optlen(in6p); 294 } 295 #endif 296 size -= optlen; 297 298 out: 299 /* 300 * *rxsegsizep holds *estimated* inbound segment size (estimation 301 * assumes that path MTU is the same for both ways). this is only 302 * for silly window avoidance, do not use the value for other purposes. 303 * 304 * ipseclen is subtracted from both sides, this may not be right. 305 * I'm not quite sure about this (could someone comment). 306 */ 307 *txsegsizep = min(tp->t_peermss - optlen, size); 308 *rxsegsizep = min(tp->t_ourmss - optlen, size); 309 310 if (*txsegsizep != tp->t_segsz) { 311 /* 312 * If the new segment size is larger, we don't want to 313 * mess up the congestion window, but if it is smaller 314 * we'll have to reduce the congestion window to ensure 315 * that we don't get into trouble with initial windows 316 * and the rest. In any case, if the segment size 317 * has changed, chances are the path has, too, and 318 * our congestion window will be different. 319 */ 320 if (*txsegsizep < tp->t_segsz) { 321 tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz) 322 * *txsegsizep, *txsegsizep); 323 tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz) 324 * *txsegsizep, *txsegsizep); 325 } 326 tp->t_segsz = *txsegsizep; 327 } 328 } 329 330 static 331 #ifndef GPROF 332 __inline 333 #endif 334 int 335 tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off, 336 long len, int hdrlen, struct mbuf **mp) 337 { 338 struct mbuf *m; 339 340 if (tp->t_force && len == 1) 341 tcpstat.tcps_sndprobe++; 342 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 343 tcpstat.tcps_sndrexmitpack++; 344 tcpstat.tcps_sndrexmitbyte += len; 345 } else { 346 tcpstat.tcps_sndpack++; 347 tcpstat.tcps_sndbyte += len; 348 } 349 #ifdef notyet 350 if ((m = m_copypack(so->so_snd.sb_mb, off, 351 (int)len, max_linkhdr + hdrlen)) == 0) 352 return (ENOBUFS); 353 /* 354 * m_copypack left space for our hdr; use it. 355 */ 356 m->m_len += hdrlen; 357 m->m_data -= hdrlen; 358 #else 359 MGETHDR(m, M_DONTWAIT, MT_HEADER); 360 if (m != NULL && 361 (max_linkhdr + hdrlen > MHLEN || 362 max_linkhdr + hdrlen + len <= MCLBYTES)) { 363 MCLGET(m, M_DONTWAIT); 364 if ((m->m_flags & M_EXT) == 0) { 365 m_freem(m); 366 m = NULL; 367 } 368 } 369 if (m == NULL) 370 return (ENOBUFS); 371 m->m_data += max_linkhdr; 372 m->m_len = hdrlen; 373 if (len <= M_TRAILINGSPACE(m)) { 374 m_copydata(so->so_snd.sb_mb, off, (int) len, 375 mtod(m, caddr_t) + hdrlen); 376 m->m_len += len; 377 } else { 378 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 379 if (m->m_next == NULL) { 380 m_freem(m); 381 return (ENOBUFS); 382 } 383 } 384 #endif 385 386 *mp = m; 387 return (0); 388 } 389 390 /* 391 * Tcp output routine: figure out what should be sent and send it. 392 */ 393 int 394 tcp_output(tp) 395 struct tcpcb *tp; 396 { 397 struct socket *so; 398 struct route *ro; 399 long len, win; 400 int off, flags, error; 401 struct mbuf *m; 402 struct ip *ip; 403 #ifdef INET6 404 struct ip6_hdr *ip6; 405 #endif 406 struct tcphdr *th; 407 u_char opt[MAX_TCPOPTLEN]; 408 unsigned optlen, hdrlen; 409 int idle, sendalot, txsegsize, rxsegsize; 410 int maxburst = TCP_MAXBURST; 411 int af; /* address family on the wire */ 412 int iphdrlen; 413 414 #ifdef DIAGNOSTIC 415 if (tp->t_inpcb && tp->t_in6pcb) 416 panic("tcp_output: both t_inpcb and t_in6pcb are set"); 417 #endif 418 so = NULL; 419 ro = NULL; 420 if (tp->t_inpcb) { 421 so = tp->t_inpcb->inp_socket; 422 ro = &tp->t_inpcb->inp_route; 423 } 424 #ifdef INET6 425 else if (tp->t_in6pcb) { 426 so = tp->t_in6pcb->in6p_socket; 427 ro = (struct route *)&tp->t_in6pcb->in6p_route; 428 } 429 #endif 430 431 switch (af = tp->t_family) { 432 #ifdef INET 433 case AF_INET: 434 if (tp->t_inpcb) 435 break; 436 #ifdef INET6 437 /* mapped addr case */ 438 if (tp->t_in6pcb) 439 break; 440 #endif 441 return EINVAL; 442 #endif 443 #ifdef INET6 444 case AF_INET6: 445 if (tp->t_in6pcb) 446 break; 447 return EINVAL; 448 #endif 449 default: 450 return EAFNOSUPPORT; 451 } 452 453 tcp_segsize(tp, &txsegsize, &rxsegsize); 454 455 idle = (tp->snd_max == tp->snd_una); 456 457 /* 458 * Restart Window computation. From draft-floyd-incr-init-win-03: 459 * 460 * Optionally, a TCP MAY set the restart window to the 461 * minimum of the value used for the initial window and 462 * the current value of cwnd (in other words, using a 463 * larger value for the restart window should never increase 464 * the size of cwnd). 465 */ 466 if (tcp_cwm) { 467 /* 468 * Hughes/Touch/Heidemann Congestion Window Monitoring. 469 * Count the number of packets currently pending 470 * acknowledgement, and limit our congestion window 471 * to a pre-determined allowed burst size plus that count. 472 * This prevents bursting once all pending packets have 473 * been acknowledged (i.e. transmission is idle). 474 * 475 * XXX Link this to Initial Window? 476 */ 477 tp->snd_cwnd = min(tp->snd_cwnd, 478 (tcp_cwm_burstsize * txsegsize) + 479 (tp->snd_nxt - tp->snd_una)); 480 } else { 481 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) { 482 /* 483 * We have been idle for "a while" and no acks are 484 * expected to clock out any data we send -- 485 * slow start to get ack "clock" running again. 486 */ 487 tp->snd_cwnd = min(tp->snd_cwnd, 488 TCP_INITIAL_WINDOW(tcp_init_win, txsegsize)); 489 } 490 } 491 492 again: 493 /* 494 * Determine length of data that should be transmitted, and 495 * flags that should be used. If there is some data or critical 496 * controls (SYN, RST) to send, then transmit; otherwise, 497 * investigate further. 498 */ 499 sendalot = 0; 500 off = tp->snd_nxt - tp->snd_una; 501 win = min(tp->snd_wnd, tp->snd_cwnd); 502 503 flags = tcp_outflags[tp->t_state]; 504 /* 505 * If in persist timeout with window of 0, send 1 byte. 506 * Otherwise, if window is small but nonzero 507 * and timer expired, we will send what we can 508 * and go to transmit state. 509 */ 510 if (tp->t_force) { 511 if (win == 0) { 512 /* 513 * If we still have some data to send, then 514 * clear the FIN bit. Usually this would 515 * happen below when it realizes that we 516 * aren't sending all the data. However, 517 * if we have exactly 1 byte of unset data, 518 * then it won't clear the FIN bit below, 519 * and if we are in persist state, we wind 520 * up sending the packet without recording 521 * that we sent the FIN bit. 522 * 523 * We can't just blindly clear the FIN bit, 524 * because if we don't have any more data 525 * to send then the probe will be the FIN 526 * itself. 527 */ 528 if (off < so->so_snd.sb_cc) 529 flags &= ~TH_FIN; 530 win = 1; 531 } else { 532 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 533 tp->t_rxtshift = 0; 534 } 535 } 536 537 if (win < so->so_snd.sb_cc) { 538 len = win - off; 539 flags &= ~TH_FIN; 540 } else 541 len = so->so_snd.sb_cc - off; 542 543 if (len < 0) { 544 /* 545 * If FIN has been sent but not acked, 546 * but we haven't been called to retransmit, 547 * len will be -1. Otherwise, window shrank 548 * after we sent into it. If window shrank to 0, 549 * cancel pending retransmit, pull snd_nxt back 550 * to (closed) window, and set the persist timer 551 * if it isn't already going. If the window didn't 552 * close completely, just wait for an ACK. 553 * 554 * If we have a pending FIN, either it has already been 555 * transmitted or it is outside the window, so drop it. 556 * If the FIN has been transmitted, but this is not a 557 * retransmission, then len must be -1. Therefore we also 558 * prevent here the sending of `gratuitous FINs'. This 559 * eliminates the need to check for that case below (e.g. 560 * to back up snd_nxt before the FIN so that the sequence 561 * number is correct). 562 */ 563 len = 0; 564 flags &= ~TH_FIN; 565 if (win == 0) { 566 TCP_TIMER_DISARM(tp, TCPT_REXMT); 567 tp->t_rxtshift = 0; 568 tp->snd_nxt = tp->snd_una; 569 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 570 tcp_setpersist(tp); 571 } 572 } 573 if (len > txsegsize) { 574 len = txsegsize; 575 flags &= ~TH_FIN; 576 sendalot = 1; 577 } 578 579 win = sbspace(&so->so_rcv); 580 581 /* 582 * Sender silly window avoidance. If connection is idle 583 * and can send all data, a maximum segment, 584 * at least a maximum default-size segment do it, 585 * or are forced, do it; otherwise don't bother. 586 * If peer's buffer is tiny, then send 587 * when window is at least half open. 588 * If retransmitting (possibly after persist timer forced us 589 * to send into a small window), then must resend. 590 */ 591 if (len) { 592 if (len == txsegsize) 593 goto send; 594 if ((so->so_state & SS_MORETOCOME) == 0 && 595 ((idle || tp->t_flags & TF_NODELAY) && 596 len + off >= so->so_snd.sb_cc)) 597 goto send; 598 if (tp->t_force) 599 goto send; 600 if (len >= tp->max_sndwnd / 2) 601 goto send; 602 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 603 goto send; 604 } 605 606 /* 607 * Compare available window to amount of window known to peer 608 * (as advertised window less next expected input). If the 609 * difference is at least twice the size of the largest segment 610 * we expect to receive (i.e. two segments) or at least 50% of 611 * the maximum possible window, then want to send a window update 612 * to peer. 613 */ 614 if (win > 0) { 615 /* 616 * "adv" is the amount we can increase the window, 617 * taking into account that we are limited by 618 * TCP_MAXWIN << tp->rcv_scale. 619 */ 620 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - 621 (tp->rcv_adv - tp->rcv_nxt); 622 623 if (adv >= (long) (2 * rxsegsize)) 624 goto send; 625 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 626 goto send; 627 } 628 629 /* 630 * Send if we owe peer an ACK. 631 */ 632 if (tp->t_flags & TF_ACKNOW) 633 goto send; 634 if (flags & (TH_SYN|TH_FIN|TH_RST)) 635 goto send; 636 if (SEQ_GT(tp->snd_up, tp->snd_una)) 637 goto send; 638 639 /* 640 * TCP window updates are not reliable, rather a polling protocol 641 * using ``persist'' packets is used to insure receipt of window 642 * updates. The three ``states'' for the output side are: 643 * idle not doing retransmits or persists 644 * persisting to move a small or zero window 645 * (re)transmitting and thereby not persisting 646 * 647 * tp->t_timer[TCPT_PERSIST] 648 * is set when we are in persist state. 649 * tp->t_force 650 * is set when we are called to send a persist packet. 651 * tp->t_timer[TCPT_REXMT] 652 * is set when we are retransmitting 653 * The output side is idle when both timers are zero. 654 * 655 * If send window is too small, there is data to transmit, and no 656 * retransmit or persist is pending, then go to persist state. 657 * If nothing happens soon, send when timer expires: 658 * if window is nonzero, transmit what we can, 659 * otherwise force out a byte. 660 */ 661 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 662 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 663 tp->t_rxtshift = 0; 664 tcp_setpersist(tp); 665 } 666 667 /* 668 * No reason to send a segment, just return. 669 */ 670 return (0); 671 672 send: 673 /* 674 * Before ESTABLISHED, force sending of initial options 675 * unless TCP set not to do any options. 676 * NOTE: we assume that the IP/TCP header plus TCP options 677 * always fit in a single mbuf, leaving room for a maximum 678 * link header, i.e. 679 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES 680 */ 681 optlen = 0; 682 switch (af) { 683 #ifdef INET 684 case AF_INET: 685 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 686 break; 687 #endif 688 #ifdef INET6 689 case AF_INET6: 690 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 691 break; 692 #endif 693 default: /*pacify gcc*/ 694 iphdrlen = 0; 695 break; 696 } 697 hdrlen = iphdrlen; 698 if (flags & TH_SYN) { 699 struct rtentry *rt; 700 701 rt = NULL; 702 #ifdef INET 703 if (tp->t_inpcb) 704 rt = in_pcbrtentry(tp->t_inpcb); 705 #endif 706 #ifdef INET6 707 if (tp->t_in6pcb) 708 rt = in6_pcbrtentry(tp->t_in6pcb); 709 #endif 710 711 tp->snd_nxt = tp->iss; 712 tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ? 713 rt->rt_ifp : NULL, af); 714 if ((tp->t_flags & TF_NOOPT) == 0) { 715 opt[0] = TCPOPT_MAXSEG; 716 opt[1] = 4; 717 opt[2] = (tp->t_ourmss >> 8) & 0xff; 718 opt[3] = tp->t_ourmss & 0xff; 719 optlen = 4; 720 721 if ((tp->t_flags & TF_REQ_SCALE) && 722 ((flags & TH_ACK) == 0 || 723 (tp->t_flags & TF_RCVD_SCALE))) { 724 *((u_int32_t *) (opt + optlen)) = htonl( 725 TCPOPT_NOP << 24 | 726 TCPOPT_WINDOW << 16 | 727 TCPOLEN_WINDOW << 8 | 728 tp->request_r_scale); 729 optlen += 4; 730 } 731 } 732 } 733 734 /* 735 * Send a timestamp and echo-reply if this is a SYN and our side 736 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 737 * and our peer have sent timestamps in our SYN's. 738 */ 739 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 740 (flags & TH_RST) == 0 && 741 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 742 (tp->t_flags & TF_RCVD_TSTMP))) { 743 u_int32_t *lp = (u_int32_t *)(opt + optlen); 744 745 /* Form timestamp option as shown in appendix A of RFC 1323. */ 746 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 747 *lp++ = htonl(TCP_TIMESTAMP(tp)); 748 *lp = htonl(tp->ts_recent); 749 optlen += TCPOLEN_TSTAMP_APPA; 750 } 751 752 hdrlen += optlen; 753 754 #ifdef DIAGNOSTIC 755 if (len > txsegsize) 756 panic("tcp data to be sent is larger than segment"); 757 if (max_linkhdr + hdrlen > MCLBYTES) 758 panic("tcphdr too big"); 759 #endif 760 761 /* 762 * Grab a header mbuf, attaching a copy of data to 763 * be transmitted, and initialize the header from 764 * the template for sends on this connection. 765 */ 766 if (len) { 767 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m); 768 if (error) 769 goto out; 770 /* 771 * If we're sending everything we've got, set PUSH. 772 * (This will keep happy those implementations which only 773 * give data to the user when a buffer fills or 774 * a PUSH comes in.) 775 */ 776 if (off + len == so->so_snd.sb_cc) 777 flags |= TH_PUSH; 778 } else { 779 if (tp->t_flags & TF_ACKNOW) 780 tcpstat.tcps_sndacks++; 781 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 782 tcpstat.tcps_sndctrl++; 783 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 784 tcpstat.tcps_sndurg++; 785 else 786 tcpstat.tcps_sndwinup++; 787 788 MGETHDR(m, M_DONTWAIT, MT_HEADER); 789 if (m != NULL && max_linkhdr + hdrlen > MHLEN) { 790 MCLGET(m, M_DONTWAIT); 791 if ((m->m_flags & M_EXT) == 0) { 792 m_freem(m); 793 m = NULL; 794 } 795 } 796 if (m == NULL) { 797 error = ENOBUFS; 798 goto out; 799 } 800 m->m_data += max_linkhdr; 801 m->m_len = hdrlen; 802 } 803 m->m_pkthdr.rcvif = (struct ifnet *)0; 804 switch (af) { 805 #ifdef INET 806 case AF_INET: 807 ip = mtod(m, struct ip *); 808 #ifdef INET6 809 ip6 = NULL; 810 #endif 811 th = (struct tcphdr *)(ip + 1); 812 break; 813 #endif 814 #ifdef INET6 815 case AF_INET6: 816 ip = NULL; 817 ip6 = mtod(m, struct ip6_hdr *); 818 th = (struct tcphdr *)(ip6 + 1); 819 break; 820 #endif 821 default: /*pacify gcc*/ 822 ip = NULL; 823 #ifdef INET6 824 ip6 = NULL; 825 #endif 826 th = NULL; 827 break; 828 } 829 if (tp->t_template == 0) 830 panic("tcp_output"); 831 if (tp->t_template->m_len < iphdrlen) 832 panic("tcp_output"); 833 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen); 834 835 /* 836 * If we are doing retransmissions, then snd_nxt will 837 * not reflect the first unsent octet. For ACK only 838 * packets, we do not want the sequence number of the 839 * retransmitted packet, we want the sequence number 840 * of the next unsent octet. So, if there is no data 841 * (and no SYN or FIN), use snd_max instead of snd_nxt 842 * when filling in ti_seq. But if we are in persist 843 * state, snd_max might reflect one byte beyond the 844 * right edge of the window, so use snd_nxt in that 845 * case, since we know we aren't doing a retransmission. 846 * (retransmit and persist are mutually exclusive...) 847 */ 848 if (len || (flags & (TH_SYN|TH_FIN)) || 849 TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) 850 th->th_seq = htonl(tp->snd_nxt); 851 else 852 th->th_seq = htonl(tp->snd_max); 853 th->th_ack = htonl(tp->rcv_nxt); 854 if (optlen) { 855 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); 856 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 857 } 858 th->th_flags = flags; 859 /* 860 * Calculate receive window. Don't shrink window, 861 * but avoid silly window syndrome. 862 */ 863 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize) 864 win = 0; 865 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 866 win = (long)TCP_MAXWIN << tp->rcv_scale; 867 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 868 win = (long)(tp->rcv_adv - tp->rcv_nxt); 869 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 870 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 871 u_int32_t urp = tp->snd_up - tp->snd_nxt; 872 if (urp > IP_MAXPACKET) 873 urp = IP_MAXPACKET; 874 th->th_urp = htons((u_int16_t)urp); 875 th->th_flags |= TH_URG; 876 } else 877 /* 878 * If no urgent pointer to send, then we pull 879 * the urgent pointer to the left edge of the send window 880 * so that it doesn't drift into the send window on sequence 881 * number wraparound. 882 */ 883 tp->snd_up = tp->snd_una; /* drag it along */ 884 885 /* 886 * Set ourselves up to be checksummed just before the packet 887 * hits the wire. 888 */ 889 switch (af) { 890 #ifdef INET 891 case AF_INET: 892 m->m_pkthdr.csum_flags = M_CSUM_TCPv4; 893 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 894 if (len + optlen) { 895 /* Fixup the pseudo-header checksum. */ 896 /* XXXJRT Not IP Jumbogram safe. */ 897 th->th_sum = in_cksum_addword(th->th_sum, 898 htons((u_int16_t) (len + optlen))); 899 } 900 break; 901 #endif 902 #ifdef INET6 903 case AF_INET6: 904 /* 905 * XXX Actually delaying the checksum is Hard 906 * XXX (well, maybe not for Itojun, but it is 907 * XXX for me), but we can still take advantage 908 * XXX of the cached pseudo-header checksum. 909 */ 910 /* equals to hdrlen + len */ 911 m->m_pkthdr.len = sizeof(struct ip6_hdr) 912 + sizeof(struct tcphdr) + optlen + len; 913 #ifdef notyet 914 m->m_pkthdr.csum_flags = M_CSUM_TCPv6; 915 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 916 #endif 917 if (len + optlen) { 918 /* Fixup the pseudo-header checksum. */ 919 /* XXXJRT: Not IPv6 Jumbogram safe. */ 920 th->th_sum = in_cksum_addword(th->th_sum, 921 htons((u_int16_t) (len + optlen))); 922 } 923 #ifndef notyet 924 th->th_sum = in6_cksum(m, 0, sizeof(struct ip6_hdr), 925 sizeof(struct tcphdr) + optlen + len); 926 #endif 927 break; 928 #endif 929 } 930 931 /* 932 * In transmit state, time the transmission and arrange for 933 * the retransmit. In persist state, just set snd_max. 934 */ 935 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 936 tcp_seq startseq = tp->snd_nxt; 937 938 /* 939 * Advance snd_nxt over sequence space of this segment. 940 * There are no states in which we send both a SYN and a FIN, 941 * so we collapse the tests for these flags. 942 */ 943 if (flags & (TH_SYN|TH_FIN)) 944 tp->snd_nxt++; 945 tp->snd_nxt += len; 946 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 947 tp->snd_max = tp->snd_nxt; 948 /* 949 * Time this transmission if not a retransmission and 950 * not currently timing anything. 951 */ 952 if (tp->t_rtttime == 0) { 953 tp->t_rtttime = tcp_now; 954 tp->t_rtseq = startseq; 955 tcpstat.tcps_segstimed++; 956 } 957 } 958 959 /* 960 * Set retransmit timer if not currently set, 961 * and not doing an ack or a keep-alive probe. 962 * Initial value for retransmit timer is smoothed 963 * round-trip time + 2 * round-trip time variance. 964 * Initialize shift counter which is used for backoff 965 * of retransmit time. 966 */ 967 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 968 tp->snd_nxt != tp->snd_una) { 969 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 970 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { 971 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 972 tp->t_rxtshift = 0; 973 } 974 } 975 } else 976 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 977 tp->snd_max = tp->snd_nxt + len; 978 979 #ifdef TCP_DEBUG 980 /* 981 * Trace. 982 */ 983 if (so->so_options & SO_DEBUG) { 984 /* 985 * need to recover version # field, which was overwritten 986 * on ip_cksum computation. 987 */ 988 struct ip *sip; 989 sip = mtod(m, struct ip *); 990 switch (af) { 991 #ifdef INET 992 case AF_INET: 993 sip->ip_v = 4; 994 break; 995 #endif 996 #ifdef INET6 997 case AF_INET6: 998 sip->ip_v = 6; 999 break; 1000 #endif 1001 } 1002 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0); 1003 } 1004 #endif 1005 1006 /* 1007 * Fill in IP length and desired time to live and 1008 * send to IP level. There should be a better way 1009 * to handle ttl and tos; we could keep them in 1010 * the template, but need a way to checksum without them. 1011 */ 1012 m->m_pkthdr.len = hdrlen + len; 1013 1014 switch (af) { 1015 #ifdef INET 1016 case AF_INET: 1017 ip->ip_len = m->m_pkthdr.len; 1018 if (tp->t_inpcb) { 1019 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 1020 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; 1021 } 1022 #ifdef INET6 1023 else if (tp->t_in6pcb) { 1024 ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/ 1025 ip->ip_tos = 0; /*XXX*/ 1026 } 1027 #endif 1028 break; 1029 #endif 1030 #ifdef INET6 1031 case AF_INET6: 1032 ip6->ip6_nxt = IPPROTO_TCP; 1033 if (tp->t_in6pcb) { 1034 /* 1035 * we separately set hoplimit for every segment, since 1036 * the user might want to change the value via 1037 * setsockopt. Also, desired default hop limit might 1038 * be changed via Neighbor Discovery. 1039 */ 1040 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb, 1041 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 1042 } 1043 /* ip6->ip6_flow = ??? */ 1044 /* ip6_plen will be filled in ip6_output(). */ 1045 break; 1046 #endif 1047 } 1048 1049 #ifdef IPSEC 1050 if (ipsec_setsocket(m, so) != 0) { 1051 m_freem(m); 1052 error = ENOBUFS; 1053 goto out; 1054 } 1055 #endif /*IPSEC*/ 1056 1057 switch (af) { 1058 #ifdef INET 1059 case AF_INET: 1060 { 1061 struct mbuf *opts; 1062 1063 if (tp->t_inpcb) 1064 opts = tp->t_inpcb->inp_options; 1065 else 1066 opts = NULL; 1067 error = ip_output(m, opts, ro, 1068 (ip_mtudisc ? IP_MTUDISC : 0) | 1069 (so->so_options & SO_DONTROUTE), 1070 0); 1071 break; 1072 } 1073 #endif 1074 #ifdef INET6 1075 case AF_INET6: 1076 { 1077 struct ip6_pktopts *opts; 1078 1079 if (tp->t_in6pcb) 1080 opts = tp->t_in6pcb->in6p_outputopts; 1081 else 1082 opts = NULL; 1083 error = ip6_output(m, opts, (struct route_in6 *)ro, 1084 so->so_options & SO_DONTROUTE, 0, NULL); 1085 break; 1086 } 1087 #endif 1088 default: 1089 error = EAFNOSUPPORT; 1090 break; 1091 } 1092 if (error) { 1093 out: 1094 if (error == ENOBUFS) { 1095 tcpstat.tcps_selfquench++; 1096 #ifdef INET 1097 if (tp->t_inpcb) 1098 tcp_quench(tp->t_inpcb, 0); 1099 #endif 1100 #ifdef INET6 1101 if (tp->t_in6pcb) 1102 tcp6_quench(tp->t_in6pcb, 0); 1103 #endif 1104 error = 0; 1105 } else if ((error == EHOSTUNREACH || error == ENETDOWN) && 1106 TCPS_HAVERCVDSYN(tp->t_state)) { 1107 tp->t_softerror = error; 1108 error = 0; 1109 } 1110 1111 /* Restart the delayed ACK timer, if necessary. */ 1112 if (tp->t_flags & TF_DELACK) 1113 TCP_RESTART_DELACK(tp); 1114 1115 return (error); 1116 } 1117 tcpstat.tcps_sndtotal++; 1118 if (tp->t_flags & TF_DELACK) 1119 tcpstat.tcps_delack++; 1120 1121 /* 1122 * Data sent (as far as we can tell). 1123 * If this advertises a larger window than any other segment, 1124 * then remember the size of the advertised window. 1125 * Any pending ACK has now been sent. 1126 */ 1127 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1128 tp->rcv_adv = tp->rcv_nxt + win; 1129 tp->last_ack_sent = tp->rcv_nxt; 1130 tp->t_flags &= ~TF_ACKNOW; 1131 TCP_CLEAR_DELACK(tp); 1132 #ifdef DIAGNOSTIC 1133 if (maxburst < 0) 1134 printf("tcp_output: maxburst exceeded by %d\n", -maxburst); 1135 #endif 1136 if (sendalot && (!tcp_do_newreno || --maxburst)) 1137 goto again; 1138 return (0); 1139 } 1140 1141 void 1142 tcp_setpersist(tp) 1143 struct tcpcb *tp; 1144 { 1145 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2); 1146 int nticks; 1147 1148 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT)) 1149 panic("tcp_output REXMT"); 1150 /* 1151 * Start/restart persistance timer. 1152 */ 1153 if (t < tp->t_rttmin) 1154 t = tp->t_rttmin; 1155 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift], 1156 TCPTV_PERSMIN, TCPTV_PERSMAX); 1157 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks); 1158 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1159 tp->t_rxtshift++; 1160 } 1161