1 /* $OpenBSD: tcp_output.c,v 1.25 1999/12/08 06:50:20 itojun Exp $ */ 2 /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)tcp_output.c 8.3 (Berkeley) 12/30/93 37 */ 38 39 /* 40 %%% portions-copyright-nrl-95 41 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 43 Reserved. All rights under this copyright have been assigned to the US 44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 46 software. 47 You should have received a copy of the license with this software. If you 48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 49 */ 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/protosw.h> 56 #include <sys/socket.h> 57 #include <sys/socketvar.h> 58 #include <sys/errno.h> 59 #include <sys/domain.h> 60 61 #include <net/route.h> 62 63 #include <netinet/in.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/ip.h> 66 #include <netinet/in_pcb.h> 67 #include <netinet/ip_var.h> 68 #include <netinet/tcp.h> 69 #define TCPOUTFLAGS 70 #include <netinet/tcp_fsm.h> 71 #include <netinet/tcp_seq.h> 72 #include <netinet/tcp_timer.h> 73 #include <netinet/tcp_var.h> 74 #include <netinet/tcpip.h> 75 #include <netinet/tcp_debug.h> 76 77 #ifdef TUBA 78 #include <netiso/iso.h> 79 #include <netiso/tuba_table.h> 80 #endif 81 82 #ifdef INET6 83 #include <netinet6/tcpipv6.h> 84 #endif /* INET6 */ 85 86 #ifdef TCP_SIGNATURE 87 #include <sys/md5k.h> 88 #endif /* TCP_SIGNATURE */ 89 90 #ifdef notyet 91 extern struct mbuf *m_copypack(); 92 #endif 93 94 #ifdef TCP_SACK 95 extern int tcprexmtthresh; 96 #endif 97 98 #ifdef TCP_SACK 99 #ifdef TCP_SACK_DEBUG 100 void 101 tcp_print_holes(tp) 102 struct tcpcb *tp; 103 { 104 struct sackhole *p = tp->snd_holes; 105 if (p == 0) 106 return; 107 printf("Hole report: start--end dups rxmit\n"); 108 while (p) { 109 printf("%x--%x d %d r %x\n", p->start, p->end, p->dups, 110 p->rxmit); 111 p = p->next; 112 } 113 printf("\n"); 114 } 115 #endif /* TCP_SACK_DEBUG */ 116 117 /* 118 * Returns pointer to a sackhole if there are any pending retransmissions; 119 * NULL otherwise. 120 */ 121 struct sackhole * 122 tcp_sack_output(tp) 123 register struct tcpcb *tp; 124 { 125 struct sackhole *p; 126 if (tp->sack_disable) 127 return 0; 128 p = tp->snd_holes; 129 while (p) { 130 if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) { 131 if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ 132 p = p->next; 133 continue; 134 } 135 #ifdef TCP_SACK_DEBUG 136 if (p) 137 tcp_print_holes(tp); 138 #endif 139 return p; 140 } 141 p = p->next; 142 } 143 return 0; 144 } 145 146 /* 147 * After a timeout, the SACK list may be rebuilt. This SACK information 148 * should be used to avoid retransmitting SACKed data. This function 149 * traverses the SACK list to see if snd_nxt should be moved forward. 150 */ 151 void 152 tcp_sack_adjust(tp) 153 struct tcpcb *tp; 154 { 155 int i; 156 157 for (i = 0; i < tp->rcv_numsacks; i++) { 158 if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].start)) 159 break; 160 if (SEQ_LEQ(tp->sackblks[i].end, tp->snd_nxt)) 161 continue; 162 if (tp->sackblks[i].start == 0 && tp->sackblks[i].end == 0) 163 continue; 164 /* snd_nxt must be in middle of block of SACKed data */ 165 tp->snd_nxt = tp->sackblks[i].end; 166 break; 167 } 168 } 169 #endif /* TCP_SACK */ 170 171 /* 172 * Tcp output routine: figure out what should be sent and send it. 173 */ 174 int 175 tcp_output(tp) 176 register struct tcpcb *tp; 177 { 178 register struct socket *so = tp->t_inpcb->inp_socket; 179 register long len, win; 180 int off, flags, error; 181 register struct mbuf *m; 182 register struct tcphdr *th; 183 u_char opt[MAX_TCPOPTLEN]; 184 unsigned int optlen, hdrlen; 185 int idle, sendalot; 186 #ifdef TCP_SACK 187 int i, sack_rxmit = 0; 188 struct sackhole *p; 189 #endif 190 #if defined(TCP_SACK) || defined(TCP_NEWRENO) 191 int maxburst = TCP_MAXBURST; 192 #endif 193 #ifdef TCP_SIGNATURE 194 unsigned int sigoff; 195 #endif /* TCP_SIGNATURE */ 196 197 #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) 198 if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE)) 199 return (EINVAL); 200 #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */ 201 202 /* 203 * Determine length of data that should be transmitted, 204 * and flags that will be used. 205 * If there is some data or critical controls (SYN, RST) 206 * to send, then transmit; otherwise, investigate further. 207 */ 208 idle = (tp->snd_max == tp->snd_una); 209 if (idle && tp->t_idle >= tp->t_rxtcur) 210 /* 211 * We have been idle for "a while" and no acks are 212 * expected to clock out any data we send -- 213 * slow start to get ack "clock" running again. 214 */ 215 tp->snd_cwnd = tp->t_maxseg; 216 again: 217 sendalot = 0; 218 #ifdef TCP_SACK 219 /* 220 * If we've recently taken a timeout, snd_max will be greater than 221 * snd_nxt. There may be SACK information that allows us to avoid 222 * resending already delivered data. Adjust snd_nxt accordingly. 223 */ 224 if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max)) 225 tcp_sack_adjust(tp); 226 #endif 227 off = tp->snd_nxt - tp->snd_una; 228 win = ulmin(tp->snd_wnd, tp->snd_cwnd); 229 230 flags = tcp_outflags[tp->t_state]; 231 /* 232 * If in persist timeout with window of 0, send 1 byte. 233 * Otherwise, if window is small but nonzero 234 * and timer expired, we will send what we can 235 * and go to transmit state. 236 */ 237 238 #ifdef TCP_SACK 239 /* 240 * Send any SACK-generated retransmissions. If we're explicitly trying 241 * to send out new data (when sendalot is 1), bypass this function. 242 * If we retransmit in fast recovery mode, decrement snd_cwnd, since 243 * we're replacing a (future) new transmission with a retransmission 244 * now, and we previously incremented snd_cwnd in tcp_input(). 245 */ 246 if (!tp->sack_disable && !sendalot) { 247 if ((p = tcp_sack_output(tp))) { 248 off = p->rxmit - tp->snd_una; 249 sack_rxmit = 1; 250 #if 0 251 /* Coalesce holes into a single retransmission */ 252 #endif 253 len = min(tp->t_maxseg, p->end - p->rxmit); 254 #ifndef TCP_FACK 255 /* in FACK, hold snd_cwnd constant during recovery */ 256 if (SEQ_LT(tp->snd_una, tp->snd_last)) 257 tp->snd_cwnd -= tp->t_maxseg; 258 #endif 259 } 260 } 261 #endif /* TCP_SACK */ 262 263 if (tp->t_force) { 264 if (win == 0) { 265 /* 266 * If we still have some data to send, then 267 * clear the FIN bit. Usually this would 268 * happen below when it realizes that we 269 * aren't sending all the data. However, 270 * if we have exactly 1 byte of unset data, 271 * then it won't clear the FIN bit below, 272 * and if we are in persist state, we wind 273 * up sending the packet without recording 274 * that we sent the FIN bit. 275 * 276 * We can't just blindly clear the FIN bit, 277 * because if we don't have any more data 278 * to send then the probe will be the FIN 279 * itself. 280 */ 281 if (off < so->so_snd.sb_cc) 282 flags &= ~TH_FIN; 283 win = 1; 284 } else { 285 tp->t_timer[TCPT_PERSIST] = 0; 286 tp->t_rxtshift = 0; 287 } 288 } 289 290 #ifdef TCP_SACK 291 if (!sack_rxmit) { 292 #endif 293 len = ulmin(so->so_snd.sb_cc, win) - off; 294 295 #if defined(TCP_SACK) && defined(TCP_FACK) 296 /* 297 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and 298 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then 299 * do not send data (like zero window conditions) 300 */ 301 if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && 302 (tp->snd_awnd >= tp->snd_cwnd)) 303 len = 0; 304 #endif /* TCP_FACK */ 305 #ifdef TCP_SACK 306 } 307 #endif 308 309 if (len < 0) { 310 /* 311 * If FIN has been sent but not acked, 312 * but we haven't been called to retransmit, 313 * len will be -1. Otherwise, window shrank 314 * after we sent into it. If window shrank to 0, 315 * cancel pending retransmit and pull snd_nxt 316 * back to (closed) window. We will enter persist 317 * state below. If the window didn't close completely, 318 * just wait for an ACK. 319 */ 320 len = 0; 321 if (win == 0) { 322 tp->t_timer[TCPT_REXMT] = 0; 323 tp->snd_nxt = tp->snd_una; 324 } 325 } 326 if (len > tp->t_maxseg) { 327 len = tp->t_maxseg; 328 sendalot = 1; 329 } 330 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 331 flags &= ~TH_FIN; 332 333 win = sbspace(&so->so_rcv); 334 335 /* 336 * Sender silly window avoidance. If connection is idle 337 * and can send all data, a maximum segment, 338 * at least a maximum default-size segment do it, 339 * or are forced, do it; otherwise don't bother. 340 * If peer's buffer is tiny, then send 341 * when window is at least half open. 342 * If retransmitting (possibly after persist timer forced us 343 * to send into a small window), then must resend. 344 */ 345 if (len) { 346 if (len == tp->t_maxseg) 347 goto send; 348 if ((idle || tp->t_flags & TF_NODELAY) && 349 len + off >= so->so_snd.sb_cc) 350 goto send; 351 if (tp->t_force) 352 goto send; 353 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 354 goto send; 355 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 356 goto send; 357 #ifdef TCP_SACK 358 if (sack_rxmit) 359 goto send; 360 #endif 361 } 362 363 /* 364 * Compare available window to amount of window 365 * known to peer (as advertised window less 366 * next expected input). If the difference is at least two 367 * max size segments, or at least 50% of the maximum possible 368 * window, then want to send a window update to peer. 369 */ 370 if (win > 0) { 371 /* 372 * "adv" is the amount we can increase the window, 373 * taking into account that we are limited by 374 * TCP_MAXWIN << tp->rcv_scale. 375 */ 376 long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) - 377 (tp->rcv_adv - tp->rcv_nxt); 378 379 if (adv >= (long) (2 * tp->t_maxseg)) 380 goto send; 381 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 382 goto send; 383 } 384 385 /* 386 * Send if we owe peer an ACK. 387 */ 388 if (tp->t_flags & TF_ACKNOW) 389 goto send; 390 if (flags & (TH_SYN|TH_RST)) 391 goto send; 392 if (SEQ_GT(tp->snd_up, tp->snd_una)) 393 goto send; 394 /* 395 * If our state indicates that FIN should be sent 396 * and we have not yet done so, or we're retransmitting the FIN, 397 * then we need to send. 398 */ 399 if (flags & TH_FIN && 400 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 401 goto send; 402 403 /* 404 * TCP window updates are not reliable, rather a polling protocol 405 * using ``persist'' packets is used to insure receipt of window 406 * updates. The three ``states'' for the output side are: 407 * idle not doing retransmits or persists 408 * persisting to move a small or zero window 409 * (re)transmitting and thereby not persisting 410 * 411 * tp->t_timer[TCPT_PERSIST] 412 * is set when we are in persist state. 413 * tp->t_force 414 * is set when we are called to send a persist packet. 415 * tp->t_timer[TCPT_REXMT] 416 * is set when we are retransmitting 417 * The output side is idle when both timers are zero. 418 * 419 * If send window is too small, there is data to transmit, and no 420 * retransmit or persist is pending, then go to persist state. 421 * If nothing happens soon, send when timer expires: 422 * if window is nonzero, transmit what we can, 423 * otherwise force out a byte. 424 */ 425 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 426 tp->t_timer[TCPT_PERSIST] == 0) { 427 tp->t_rxtshift = 0; 428 tcp_setpersist(tp); 429 } 430 431 /* 432 * No reason to send a segment, just return. 433 */ 434 return (0); 435 436 send: 437 /* 438 * Before ESTABLISHED, force sending of initial options 439 * unless TCP set not to do any options. 440 * NOTE: we assume that the IP/TCP header plus TCP options 441 * always fit in a single mbuf, leaving room for a maximum 442 * link header, i.e. 443 * max_linkhdr + sizeof(network header) + sizeof(struct tcphdr + 444 * optlen <= MHLEN 445 */ 446 optlen = 0; 447 448 switch (tp->pf) { 449 case 0: /*default to PF_INET*/ 450 #ifdef INET 451 case PF_INET: 452 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 453 break; 454 #endif /* INET */ 455 #ifdef INET6 456 case PF_INET6: 457 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 458 break; 459 #endif /* INET6 */ 460 default: 461 return (EPFNOSUPPORT); 462 } 463 464 if (flags & TH_SYN) { 465 tp->snd_nxt = tp->iss; 466 if ((tp->t_flags & TF_NOOPT) == 0) { 467 u_int16_t mss; 468 469 opt[0] = TCPOPT_MAXSEG; 470 opt[1] = 4; 471 mss = htons((u_int16_t) tcp_mss(tp, 0)); 472 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); 473 optlen = 4; 474 #ifdef TCP_SACK 475 /* 476 * If this is the first SYN of connection (not a SYN 477 * ACK), include SACK_PERMIT_HDR option. If this is a 478 * SYN ACK, include SACK_PERMIT_HDR option if peer has 479 * already done so. 480 */ 481 if (!tp->sack_disable && ((flags & TH_ACK) == 0 || 482 (tp->t_flags & TF_SACK_PERMIT))) { 483 *((u_int32_t *) (opt + optlen)) = 484 htonl(TCPOPT_SACK_PERMIT_HDR); 485 optlen += 4; 486 } 487 #endif 488 489 if ((tp->t_flags & TF_REQ_SCALE) && 490 ((flags & TH_ACK) == 0 || 491 (tp->t_flags & TF_RCVD_SCALE))) { 492 *((u_int32_t *) (opt + optlen)) = htonl( 493 TCPOPT_NOP << 24 | 494 TCPOPT_WINDOW << 16 | 495 TCPOLEN_WINDOW << 8 | 496 tp->request_r_scale); 497 optlen += 4; 498 } 499 } 500 } 501 502 /* 503 * Send a timestamp and echo-reply if this is a SYN and our side 504 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 505 * and our peer have sent timestamps in our SYN's. 506 */ 507 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 508 (flags & TH_RST) == 0 && 509 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 510 (tp->t_flags & TF_RCVD_TSTMP))) { 511 u_int32_t *lp = (u_int32_t *)(opt + optlen); 512 513 /* Form timestamp option as shown in appendix A of RFC 1323. */ 514 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 515 *lp++ = htonl(tcp_now); 516 *lp = htonl(tp->ts_recent); 517 optlen += TCPOLEN_TSTAMP_APPA; 518 } 519 520 #ifdef TCP_SIGNATURE 521 if (tp->t_flags & TF_SIGNATURE) { 522 u_int8_t *bp = (u_int8_t *)(opt + optlen); 523 524 /* Send signature option */ 525 *(bp++) = TCPOPT_SIGNATURE; 526 *(bp++) = TCPOLEN_SIGNATURE; 527 sigoff = optlen + 2; 528 529 { 530 unsigned int i; 531 532 for (i = 0; i < 16; i++) 533 *(bp++) = 0; 534 } 535 536 optlen += TCPOLEN_SIGNATURE; 537 538 /* Pad options list to the next 32 bit boundary and 539 * terminate it. 540 */ 541 *bp++ = TCPOPT_NOP; 542 *bp++ = TCPOPT_EOL; 543 optlen += 2; 544 } 545 #endif /* TCP_SIGNATURE */ 546 547 #ifdef TCP_SACK 548 /* 549 * Send SACKs if necessary. This should be the last option processed. 550 * Only as many SACKs are sent as are permitted by the maximum options 551 * size. No more than three SACKs are sent. 552 */ 553 if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED && 554 (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && 555 tp->rcv_numsacks) { 556 u_int32_t *lp = (u_int32_t *)(opt + optlen); 557 u_int32_t *olp = lp++; 558 int count = 0; /* actual number of SACKs inserted */ 559 int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; 560 561 maxsack = min(maxsack, TCP_MAX_SACK); 562 for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { 563 struct sackblk sack = tp->sackblks[i]; 564 if (sack.start == 0 && sack.end == 0) 565 continue; 566 *lp++ = htonl(sack.start); 567 *lp++ = htonl(sack.end); 568 count++; 569 } 570 *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); 571 optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ 572 } 573 #endif /* TCP_SACK */ 574 575 #ifdef DIAGNOSTIC 576 if (optlen > MAX_TCPOPTLEN) 577 panic("tcp_output: options too long"); 578 #endif /* DIAGNOSTIC */ 579 580 hdrlen += optlen; 581 582 /* 583 * Adjust data length if insertion of options will 584 * bump the packet length beyond the t_maxopd length. 585 */ 586 if (len > tp->t_maxopd - optlen) { 587 len = tp->t_maxopd - optlen; 588 sendalot = 1; 589 flags &= ~TH_FIN; 590 } 591 592 #ifdef DIAGNOSTIC 593 if (max_linkhdr + hdrlen > MHLEN) 594 panic("tcphdr too big"); 595 #endif 596 597 /* 598 * Grab a header mbuf, attaching a copy of data to 599 * be transmitted, and initialize the header from 600 * the template for sends on this connection. 601 */ 602 if (len) { 603 if (tp->t_force && len == 1) 604 tcpstat.tcps_sndprobe++; 605 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 606 tcpstat.tcps_sndrexmitpack++; 607 tcpstat.tcps_sndrexmitbyte += len; 608 } else { 609 tcpstat.tcps_sndpack++; 610 tcpstat.tcps_sndbyte += len; 611 } 612 #ifdef notyet 613 if ((m = m_copypack(so->so_snd.sb_mb, off, 614 (int)len, max_linkhdr + hdrlen)) == 0) { 615 error = ENOBUFS; 616 goto out; 617 } 618 /* 619 * m_copypack left space for our hdr; use it. 620 */ 621 m->m_len += hdrlen; 622 m->m_data -= hdrlen; 623 #else 624 MGETHDR(m, M_DONTWAIT, MT_HEADER); 625 if (m != NULL) { 626 MCLGET(m, M_DONTWAIT); 627 if ((m->m_flags & M_EXT) == 0) { 628 m_freem(m); 629 m = NULL; 630 } 631 } 632 if (m == NULL) { 633 error = ENOBUFS; 634 goto out; 635 } 636 m->m_data += max_linkhdr; 637 m->m_len = hdrlen; 638 if (len <= MCLBYTES - hdrlen - max_linkhdr) { 639 m_copydata(so->so_snd.sb_mb, off, (int) len, 640 mtod(m, caddr_t) + hdrlen); 641 m->m_len += len; 642 } else { 643 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 644 if (m->m_next == 0) { 645 (void) m_free(m); 646 error = ENOBUFS; 647 goto out; 648 } 649 } 650 #endif 651 /* 652 * If we're sending everything we've got, set PUSH. 653 * (This will keep happy those implementations which only 654 * give data to the user when a buffer fills or 655 * a PUSH comes in.) 656 */ 657 if (off + len == so->so_snd.sb_cc) 658 flags |= TH_PUSH; 659 } else { 660 if (tp->t_flags & TF_ACKNOW) 661 tcpstat.tcps_sndacks++; 662 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 663 tcpstat.tcps_sndctrl++; 664 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 665 tcpstat.tcps_sndurg++; 666 else 667 tcpstat.tcps_sndwinup++; 668 669 MGETHDR(m, M_DONTWAIT, MT_HEADER); 670 if (m != NULL) { 671 MCLGET(m, M_DONTWAIT); 672 if ((m->m_flags & M_EXT) == 0) { 673 m_freem(m); 674 m = NULL; 675 } 676 } 677 if (m == NULL) { 678 error = ENOBUFS; 679 goto out; 680 } 681 m->m_data += max_linkhdr; 682 m->m_len = hdrlen; 683 } 684 m->m_pkthdr.rcvif = (struct ifnet *)0; 685 686 if (!tp->t_template) 687 panic("tcp_output"); 688 #ifdef DIAGNOSTIC 689 if (tp->t_template->m_len != hdrlen - optlen) 690 panic("tcp_output: template len != hdrlen - optlen"); 691 #endif /* DIAGNOSTIC */ 692 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), 693 tp->t_template->m_len); 694 th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len - 695 sizeof(struct tcphdr)); 696 697 /* 698 * Fill in fields, remembering maximum advertised 699 * window for use in delaying messages about window sizes. 700 * If resending a FIN, be sure not to use a new sequence number. 701 */ 702 if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && 703 (tp->snd_nxt == tp->snd_max)) 704 tp->snd_nxt--; 705 /* 706 * If we are doing retransmissions, then snd_nxt will 707 * not reflect the first unsent octet. For ACK only 708 * packets, we do not want the sequence number of the 709 * retransmitted packet, we want the sequence number 710 * of the next unsent octet. So, if there is no data 711 * (and no SYN or FIN), use snd_max instead of snd_nxt 712 * when filling in ti_seq. But if we are in persist 713 * state, snd_max might reflect one byte beyond the 714 * right edge of the window, so use snd_nxt in that 715 * case, since we know we aren't doing a retransmission. 716 * (retransmit and persist are mutually exclusive...) 717 */ 718 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) 719 th->th_seq = htonl(tp->snd_nxt); 720 else 721 th->th_seq = htonl(tp->snd_max); 722 723 #ifdef TCP_SACK 724 if (sack_rxmit) { 725 /* 726 * If sendalot was turned on (due to option stuffing), turn it 727 * off. Properly set th_seq field. Advance the ret'x pointer 728 * by len. 729 */ 730 if (sendalot) 731 sendalot = 0; 732 th->th_seq = htonl(p->rxmit); 733 p->rxmit += len; 734 #if defined(TCP_SACK) && defined(TCP_FACK) 735 tp->retran_data += len; 736 #endif /* TCP_FACK */ 737 } 738 #endif /* TCP_SACK */ 739 740 th->th_ack = htonl(tp->rcv_nxt); 741 if (optlen) { 742 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); 743 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 744 } 745 th->th_flags = flags; 746 747 /* 748 * Calculate receive window. Don't shrink window, 749 * but avoid silly window syndrome. 750 */ 751 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 752 win = 0; 753 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 754 win = (long)TCP_MAXWIN << tp->rcv_scale; 755 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 756 win = (long)(tp->rcv_adv - tp->rcv_nxt); 757 if (flags & TH_RST) 758 win = 0; 759 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 760 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 761 u_int32_t urp = tp->snd_up - tp->snd_nxt; 762 if (urp > IP_MAXPACKET) 763 urp = IP_MAXPACKET; 764 th->th_urp = htons((u_int16_t)urp); 765 th->th_flags |= TH_URG; 766 } else 767 /* 768 * If no urgent pointer to send, then we pull 769 * the urgent pointer to the left edge of the send window 770 * so that it doesn't drift into the send window on sequence 771 * number wraparound. 772 */ 773 tp->snd_up = tp->snd_una; /* drag it along */ 774 775 /* Put TCP length in pseudo-header */ 776 switch (tp->pf) { 777 case 0: /*default to PF_INET*/ 778 #ifdef INET 779 case AF_INET: 780 if (len + optlen) 781 mtod(m, struct ipovly *)->ih_len = htons((u_int16_t)( 782 sizeof (struct tcphdr) + optlen + len)); 783 break; 784 #endif /* INET */ 785 #ifdef INET6 786 case AF_INET6: 787 break; 788 #endif /* INET6 */ 789 } 790 791 #ifdef TCP_SIGNATURE 792 if (tp->t_flags & TF_SIGNATURE) { 793 MD5_CTX ctx; 794 union sockaddr_union sa; 795 struct tdb *tdb; 796 797 memset(&sa, 0, sizeof(union sockaddr_union)); 798 799 switch (tp->pf) { 800 case 0: /*default to PF_INET*/ 801 #ifdef INET 802 case AF_INET: 803 sa.sa.sa_len = sizeof(struct sockaddr_in); 804 sa.sa.sa_family = AF_INET; 805 sa.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 806 break; 807 #endif /* INET */ 808 #ifdef INET6 809 case AF_INET6: 810 sa.sa.sa_len = sizeof(struct sockaddr_in6); 811 sa.sa.sa_family = AF_INET6; 812 sa.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 813 break; 814 #endif /* INET6 */ 815 } 816 817 /* XXX gettdb() should really be called at spltdb(). */ 818 /* XXX this is splsoftnet(), currently they are the same. */ 819 tdb = gettdb(0, &sa, IPPROTO_TCP); 820 if (tdb == NULL) 821 return (EPERM); 822 823 MD5Init(&ctx); 824 825 switch (tp->pf) { 826 case 0: /*default to PF_INET*/ 827 #ifdef INET 828 case AF_INET: 829 { 830 struct ippseudo ippseudo; 831 struct ipovly *ipovly; 832 833 ipovly = mtod(m, struct ipovly *); 834 835 ippseudo.ippseudo_src = ipovly->ih_src; 836 ippseudo.ippseudo_dst = ipovly->ih_dst; 837 ippseudo.ippseudo_pad = 0; 838 ippseudo.ippseudo_p = IPPROTO_TCP; 839 ippseudo.ippseudo_len = ipovly->ih_len; 840 MD5Update(&ctx, (char *)&ippseudo, 841 sizeof(struct ippseudo)); 842 MD5Update(&ctx, mtod(m, caddr_t) + 843 sizeof(struct ip), 844 sizeof(struct tcphdr)); 845 } 846 break; 847 #endif /* INET */ 848 #ifdef INET6 849 case AF_INET6: 850 { 851 static int printed = 0; 852 853 if (!printed) { 854 printf("error: TCP MD5 support for " 855 "IPv6 not yet implemented.\n"); 856 printed = 1; 857 } 858 } 859 break; 860 #endif /* INET6 */ 861 } 862 863 if (len && m_apply(m, hdrlen, len, tcp_signature_apply, 864 (caddr_t)&ctx)) 865 return (EINVAL); 866 867 MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen); 868 MD5Final(mtod(m, caddr_t) + hdrlen - optlen + sigoff, &ctx); 869 } 870 #endif /* TCP_SIGNATURE */ 871 872 /* 873 * Put TCP length in extended header, and then 874 * checksum extended header and data. 875 */ 876 switch (tp->pf) { 877 case 0: /*default to PF_INET*/ 878 #ifdef INET 879 case AF_INET: 880 th->th_sum = in_cksum(m, (int)(hdrlen + len)); 881 break; 882 #endif /* INET */ 883 #ifdef INET6 884 case AF_INET6: 885 m->m_pkthdr.len = hdrlen + len; 886 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 887 hdrlen - sizeof(struct ip6_hdr) + len); 888 break; 889 #endif /* INET6 */ 890 } 891 892 /* 893 * In transmit state, time the transmission and arrange for 894 * the retransmit. In persist state, just set snd_max. 895 */ 896 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 897 tcp_seq startseq = tp->snd_nxt; 898 899 /* 900 * Advance snd_nxt over sequence space of this segment. 901 */ 902 if (flags & (TH_SYN|TH_FIN)) { 903 if (flags & TH_SYN) 904 tp->snd_nxt++; 905 if (flags & TH_FIN) { 906 tp->snd_nxt++; 907 tp->t_flags |= TF_SENTFIN; 908 } 909 } 910 #ifdef TCP_SACK 911 if (!tp->sack_disable) { 912 if (sack_rxmit && (p->rxmit != tp->snd_nxt)) { 913 goto timer; 914 } 915 } 916 #endif 917 tp->snd_nxt += len; 918 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 919 tp->snd_max = tp->snd_nxt; 920 /* 921 * Time this transmission if not a retransmission and 922 * not currently timing anything. 923 */ 924 if (tp->t_rtt == 0) { 925 tp->t_rtt = 1; 926 tp->t_rtseq = startseq; 927 tcpstat.tcps_segstimed++; 928 } 929 } 930 931 /* 932 * Set retransmit timer if not currently set, 933 * and not doing an ack or a keep-alive probe. 934 * Initial value for retransmit timer is smoothed 935 * round-trip time + 2 * round-trip time variance. 936 * Initialize shift counter which is used for backoff 937 * of retransmit time. 938 */ 939 #ifdef TCP_SACK 940 timer: 941 if (!tp->sack_disable && sack_rxmit && 942 tp->t_timer[TCPT_REXMT] == 0 && 943 tp->snd_nxt != tp->snd_max) { 944 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 945 if (tp->t_timer[TCPT_PERSIST]) { 946 tp->t_timer[TCPT_PERSIST] = 0; 947 tp->t_rxtshift = 0; 948 } 949 } 950 #endif 951 952 if (tp->t_timer[TCPT_REXMT] == 0 && 953 tp->snd_nxt != tp->snd_una) { 954 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 955 if (tp->t_timer[TCPT_PERSIST]) { 956 tp->t_timer[TCPT_PERSIST] = 0; 957 tp->t_rxtshift = 0; 958 } 959 } 960 } else 961 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 962 tp->snd_max = tp->snd_nxt + len; 963 964 /* 965 * Trace. 966 */ 967 if (so->so_options & SO_DEBUG) 968 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0, 969 len); 970 971 /* 972 * Fill in IP length and desired time to live and 973 * send to IP level. There should be a better way 974 * to handle ttl and tos; we could keep them in 975 * the template, but need a way to checksum without them. 976 */ 977 m->m_pkthdr.len = hdrlen + len; 978 979 switch (tp->pf) { 980 case 0: /*default to PF_INET*/ 981 #ifdef INET 982 case AF_INET: 983 { 984 struct ip *ip; 985 986 ip = mtod(m, struct ip *); 987 ip->ip_len = m->m_pkthdr.len; 988 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 989 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; 990 } 991 error = ip_output(m, tp->t_inpcb->inp_options, 992 &tp->t_inpcb->inp_route, so->so_options & SO_DONTROUTE, 993 0, tp->t_inpcb); 994 break; 995 #endif /* INET */ 996 #ifdef INET6 997 case AF_INET6: 998 { 999 struct ip6_hdr *ipv6; 1000 1001 ipv6 = mtod(m, struct ip6_hdr *); 1002 ipv6->ip6_plen = m->m_pkthdr.len - 1003 sizeof(struct ip6_hdr); 1004 ipv6->ip6_nxt = IPPROTO_TCP; 1005 ipv6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); 1006 } 1007 error = ip6_output(m, tp->t_inpcb->inp_outputopts6, 1008 &tp->t_inpcb->inp_route6, 1009 (so->so_options & SO_DONTROUTE), NULL, NULL); 1010 break; 1011 #endif /* INET6 */ 1012 #ifdef TUBA 1013 case AF_ISO: 1014 if (tp->t_tuba_pcb) 1015 error = tuba_output(m, tp); 1016 break; 1017 #endif /* TUBA */ 1018 } 1019 1020 #if defined(TCP_SACK) && defined(TCP_FACK) 1021 /* Update snd_awnd to reflect the new data that was sent. */ 1022 tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) + 1023 tp->retran_data; 1024 #endif /* defined(TCP_SACK) && defined(TCP_FACK) */ 1025 1026 if (error) { 1027 out: 1028 if (error == ENOBUFS) { 1029 tcp_quench(tp->t_inpcb, 0); 1030 return (0); 1031 } 1032 if ((error == EHOSTUNREACH || error == ENETDOWN) 1033 && TCPS_HAVERCVDSYN(tp->t_state)) { 1034 tp->t_softerror = error; 1035 return (0); 1036 } 1037 return (error); 1038 } 1039 tcpstat.tcps_sndtotal++; 1040 1041 /* 1042 * Data sent (as far as we can tell). 1043 * If this advertises a larger window than any other segment, 1044 * then remember the size of the advertised window. 1045 * Any pending ACK has now been sent. 1046 */ 1047 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1048 tp->rcv_adv = tp->rcv_nxt + win; 1049 tp->last_ack_sent = tp->rcv_nxt; 1050 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 1051 #if defined(TCP_SACK) || defined(TCP_NEWRENO) 1052 if (sendalot && --maxburst) 1053 #else 1054 if (sendalot) 1055 #endif 1056 goto again; 1057 return (0); 1058 } 1059 1060 void 1061 tcp_setpersist(tp) 1062 register struct tcpcb *tp; 1063 { 1064 register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 1065 1066 if (tp->t_timer[TCPT_REXMT]) 1067 panic("tcp_output REXMT"); 1068 /* 1069 * Start/restart persistance timer. 1070 */ 1071 if (t < tp->t_rttmin) 1072 t = tp->t_rttmin; 1073 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 1074 t * tcp_backoff[tp->t_rxtshift], 1075 TCPTV_PERSMIN, TCPTV_PERSMAX); 1076 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1077 tp->t_rxtshift++; 1078 } 1079