1 /* $OpenBSD: tcp_output.c,v 1.21 1999/07/06 20:17:53 cmetz Exp $ */ 2 /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)tcp_output.c 8.3 (Berkeley) 12/30/93 37 */ 38 39 /* 40 %%% portions-copyright-nrl-95 41 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 43 Reserved. All rights under this copyright have been assigned to the US 44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 46 software. 47 You should have received a copy of the license with this software. If you 48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 49 */ 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/protosw.h> 56 #include <sys/socket.h> 57 #include <sys/socketvar.h> 58 #include <sys/errno.h> 59 #include <sys/domain.h> 60 61 #include <net/route.h> 62 63 #include <netinet/in.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/ip.h> 66 #include <netinet/in_pcb.h> 67 #include <netinet/ip_var.h> 68 #include <netinet/tcp.h> 69 #define TCPOUTFLAGS 70 #include <netinet/tcp_fsm.h> 71 #include <netinet/tcp_seq.h> 72 #include <netinet/tcp_timer.h> 73 #include <netinet/tcp_var.h> 74 #include <netinet/tcpip.h> 75 #include <netinet/tcp_debug.h> 76 77 #ifdef TUBA 78 #include <netiso/iso.h> 79 #include <netiso/tuba_table.h> 80 #endif 81 82 #ifdef INET6 83 #include <netinet6/tcpipv6.h> 84 #endif /* INET6 */ 85 86 #ifdef TCP_SIGNATURE 87 #include <sys/md5k.h> 88 #endif /* TCP_SIGNATURE */ 89 90 #ifdef notyet 91 extern struct mbuf *m_copypack(); 92 #endif 93 94 #ifdef TCP_SACK 95 extern int tcprexmtthresh; 96 #endif 97 98 #ifdef TCP_SACK 99 #ifdef TCP_SACK_DEBUG 100 void 101 tcp_print_holes(tp) 102 struct tcpcb *tp; 103 { 104 struct sackhole *p = tp->snd_holes; 105 if (p == 0) 106 return; 107 printf("Hole report: start--end dups rxmit\n"); 108 while (p) { 109 printf("%x--%x d %d r %x\n", p->start, p->end, p->dups, 110 p->rxmit); 111 p = p->next; 112 } 113 printf("\n"); 114 } 115 #endif /* TCP_SACK_DEBUG */ 116 117 /* 118 * Returns pointer to a sackhole if there are any pending retransmissions; 119 * NULL otherwise. 120 */ 121 struct sackhole * 122 tcp_sack_output(tp) 123 register struct tcpcb *tp; 124 { 125 struct sackhole *p; 126 if (tp->sack_disable) 127 return 0; 128 p = tp->snd_holes; 129 while (p) { 130 if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) { 131 if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ 132 p = p->next; 133 continue; 134 } 135 #ifdef TCP_SACK_DEBUG 136 if (p) 137 tcp_print_holes(tp); 138 #endif 139 return p; 140 } 141 p = p->next; 142 } 143 return 0; 144 } 145 146 /* 147 * After a timeout, the SACK list may be rebuilt. This SACK information 148 * should be used to avoid retransmitting SACKed data. This function 149 * traverses the SACK list to see if snd_nxt should be moved forward. 150 */ 151 void 152 tcp_sack_adjust(tp) 153 struct tcpcb *tp; 154 { 155 int i; 156 157 for (i = 0; i < tp->rcv_numsacks; i++) { 158 if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].start)) 159 break; 160 if (SEQ_LEQ(tp->sackblks[i].end, tp->snd_nxt)) 161 continue; 162 if (tp->sackblks[i].start == 0 && tp->sackblks[i].end == 0) 163 continue; 164 /* snd_nxt must be in middle of block of SACKed data */ 165 tp->snd_nxt = tp->sackblks[i].end; 166 break; 167 } 168 } 169 #endif /* TCP_SACK */ 170 171 /* 172 * Tcp output routine: figure out what should be sent and send it. 173 */ 174 int 175 tcp_output(tp) 176 register struct tcpcb *tp; 177 { 178 register struct socket *so = tp->t_inpcb->inp_socket; 179 register long len, win; 180 int off, flags, error; 181 register struct mbuf *m; 182 register struct tcphdr *th; 183 u_char opt[MAX_TCPOPTLEN]; 184 unsigned int optlen, hdrlen; 185 int idle, sendalot; 186 #ifdef TCP_SACK 187 int i, sack_rxmit = 0; 188 struct sackhole *p; 189 #endif 190 #if defined(TCP_SACK) || defined(TCP_NEWRENO) 191 int maxburst = TCP_MAXBURST; 192 #endif 193 #ifdef TCP_SIGNATURE 194 unsigned int sigoff; 195 #endif /* TCP_SIGNATURE */ 196 197 #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) 198 if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE)) 199 return (EINVAL); 200 #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */ 201 202 /* 203 * Determine length of data that should be transmitted, 204 * and flags that will be used. 205 * If there is some data or critical controls (SYN, RST) 206 * to send, then transmit; otherwise, investigate further. 207 */ 208 idle = (tp->snd_max == tp->snd_una); 209 if (idle && tp->t_idle >= tp->t_rxtcur) 210 /* 211 * We have been idle for "a while" and no acks are 212 * expected to clock out any data we send -- 213 * slow start to get ack "clock" running again. 214 */ 215 tp->snd_cwnd = tp->t_maxseg; 216 again: 217 sendalot = 0; 218 #ifdef TCP_SACK 219 /* 220 * If we've recently taken a timeout, snd_max will be greater than 221 * snd_nxt. There may be SACK information that allows us to avoid 222 * resending already delivered data. Adjust snd_nxt accordingly. 223 */ 224 if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max)) 225 tcp_sack_adjust(tp); 226 #endif 227 off = tp->snd_nxt - tp->snd_una; 228 win = ulmin(tp->snd_wnd, tp->snd_cwnd); 229 230 flags = tcp_outflags[tp->t_state]; 231 /* 232 * If in persist timeout with window of 0, send 1 byte. 233 * Otherwise, if window is small but nonzero 234 * and timer expired, we will send what we can 235 * and go to transmit state. 236 */ 237 238 #ifdef TCP_SACK 239 /* 240 * Send any SACK-generated retransmissions. If we're explicitly trying 241 * to send out new data (when sendalot is 1), bypass this function. 242 * If we retransmit in fast recovery mode, decrement snd_cwnd, since 243 * we're replacing a (future) new transmission with a retransmission 244 * now, and we previously incremented snd_cwnd in tcp_input(). 245 */ 246 if (!tp->sack_disable && !sendalot) { 247 if ((p = tcp_sack_output(tp))) { 248 off = p->rxmit - tp->snd_una; 249 sack_rxmit = 1; 250 #if 0 251 /* Coalesce holes into a single retransmission */ 252 #endif 253 len = min(tp->t_maxseg, p->end - p->rxmit); 254 #ifndef TCP_FACK 255 /* in FACK, hold snd_cwnd constant during recovery */ 256 if (SEQ_LT(tp->snd_una, tp->snd_last)) 257 tp->snd_cwnd -= tp->t_maxseg; 258 #endif 259 } 260 } 261 #endif /* TCP_SACK */ 262 263 if (tp->t_force) { 264 if (win == 0) { 265 /* 266 * If we still have some data to send, then 267 * clear the FIN bit. Usually this would 268 * happen below when it realizes that we 269 * aren't sending all the data. However, 270 * if we have exactly 1 byte of unset data, 271 * then it won't clear the FIN bit below, 272 * and if we are in persist state, we wind 273 * up sending the packet without recording 274 * that we sent the FIN bit. 275 * 276 * We can't just blindly clear the FIN bit, 277 * because if we don't have any more data 278 * to send then the probe will be the FIN 279 * itself. 280 */ 281 if (off < so->so_snd.sb_cc) 282 flags &= ~TH_FIN; 283 win = 1; 284 } else { 285 tp->t_timer[TCPT_PERSIST] = 0; 286 tp->t_rxtshift = 0; 287 } 288 } 289 290 #ifdef TCP_SACK 291 if (!sack_rxmit) { 292 #endif 293 len = ulmin(so->so_snd.sb_cc, win) - off; 294 295 #if defined(TCP_SACK) && defined(TCP_FACK) 296 /* 297 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and 298 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then 299 * do not send data (like zero window conditions) 300 */ 301 if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && 302 (tp->snd_awnd >= tp->snd_cwnd)) 303 len = 0; 304 #endif /* TCP_FACK */ 305 #ifdef TCP_SACK 306 } 307 #endif 308 309 if (len < 0) { 310 /* 311 * If FIN has been sent but not acked, 312 * but we haven't been called to retransmit, 313 * len will be -1. Otherwise, window shrank 314 * after we sent into it. If window shrank to 0, 315 * cancel pending retransmit and pull snd_nxt 316 * back to (closed) window. We will enter persist 317 * state below. If the window didn't close completely, 318 * just wait for an ACK. 319 */ 320 len = 0; 321 if (win == 0) { 322 tp->t_timer[TCPT_REXMT] = 0; 323 tp->snd_nxt = tp->snd_una; 324 } 325 } 326 if (len > tp->t_maxseg) { 327 len = tp->t_maxseg; 328 sendalot = 1; 329 } 330 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 331 flags &= ~TH_FIN; 332 333 win = sbspace(&so->so_rcv); 334 335 /* 336 * Sender silly window avoidance. If connection is idle 337 * and can send all data, a maximum segment, 338 * at least a maximum default-size segment do it, 339 * or are forced, do it; otherwise don't bother. 340 * If peer's buffer is tiny, then send 341 * when window is at least half open. 342 * If retransmitting (possibly after persist timer forced us 343 * to send into a small window), then must resend. 344 */ 345 if (len) { 346 if (len == tp->t_maxseg) 347 goto send; 348 if ((idle || tp->t_flags & TF_NODELAY) && 349 len + off >= so->so_snd.sb_cc) 350 goto send; 351 if (tp->t_force) 352 goto send; 353 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 354 goto send; 355 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 356 goto send; 357 #ifdef TCP_SACK 358 if (sack_rxmit) 359 goto send; 360 #endif 361 } 362 363 /* 364 * Compare available window to amount of window 365 * known to peer (as advertised window less 366 * next expected input). If the difference is at least two 367 * max size segments, or at least 50% of the maximum possible 368 * window, then want to send a window update to peer. 369 */ 370 if (win > 0) { 371 /* 372 * "adv" is the amount we can increase the window, 373 * taking into account that we are limited by 374 * TCP_MAXWIN << tp->rcv_scale. 375 */ 376 long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) - 377 (tp->rcv_adv - tp->rcv_nxt); 378 379 if (adv >= (long) (2 * tp->t_maxseg)) 380 goto send; 381 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 382 goto send; 383 } 384 385 /* 386 * Send if we owe peer an ACK. 387 */ 388 if (tp->t_flags & TF_ACKNOW) 389 goto send; 390 if (flags & (TH_SYN|TH_RST)) 391 goto send; 392 if (SEQ_GT(tp->snd_up, tp->snd_una)) 393 goto send; 394 /* 395 * If our state indicates that FIN should be sent 396 * and we have not yet done so, or we're retransmitting the FIN, 397 * then we need to send. 398 */ 399 if (flags & TH_FIN && 400 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 401 goto send; 402 403 /* 404 * TCP window updates are not reliable, rather a polling protocol 405 * using ``persist'' packets is used to insure receipt of window 406 * updates. The three ``states'' for the output side are: 407 * idle not doing retransmits or persists 408 * persisting to move a small or zero window 409 * (re)transmitting and thereby not persisting 410 * 411 * tp->t_timer[TCPT_PERSIST] 412 * is set when we are in persist state. 413 * tp->t_force 414 * is set when we are called to send a persist packet. 415 * tp->t_timer[TCPT_REXMT] 416 * is set when we are retransmitting 417 * The output side is idle when both timers are zero. 418 * 419 * If send window is too small, there is data to transmit, and no 420 * retransmit or persist is pending, then go to persist state. 421 * If nothing happens soon, send when timer expires: 422 * if window is nonzero, transmit what we can, 423 * otherwise force out a byte. 424 */ 425 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 426 tp->t_timer[TCPT_PERSIST] == 0) { 427 tp->t_rxtshift = 0; 428 tcp_setpersist(tp); 429 } 430 431 /* 432 * No reason to send a segment, just return. 433 */ 434 return (0); 435 436 send: 437 /* 438 * Before ESTABLISHED, force sending of initial options 439 * unless TCP set not to do any options. 440 * NOTE: we assume that the IP/TCP header plus TCP options 441 * always fit in a single mbuf, leaving room for a maximum 442 * link header, i.e. 443 * max_linkhdr + sizeof(network header) + sizeof(struct tcphdr) + 444 * optlen <= MHLEN 445 */ 446 optlen = 0; 447 448 #if defined(INET) && defined(INET6) 449 switch (tp->pf) { 450 #else /* defined(INET) && defined(INET6) */ 451 switch (0) { 452 #endif /* defined(INET) && defined(INET6) */ 453 case 0: /* If tp->pf is 0, then assume IPv4 unless not avail */ 454 #ifdef INET 455 case PF_INET: 456 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 457 break; 458 #endif /* INET */ 459 #ifdef INET6 460 case PF_INET6: 461 hdrlen = sizeof(struct ipv6) + sizeof(struct tcphdr); 462 break; 463 #endif /* INET6 */ 464 default: 465 return (EPFNOSUPPORT); 466 } 467 468 if (flags & TH_SYN) { 469 tp->snd_nxt = tp->iss; 470 if ((tp->t_flags & TF_NOOPT) == 0) { 471 u_int16_t mss; 472 473 opt[0] = TCPOPT_MAXSEG; 474 opt[1] = 4; 475 mss = htons((u_int16_t) tcp_mss(tp, 0)); 476 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); 477 optlen = 4; 478 #ifdef TCP_SACK 479 /* 480 * If this is the first SYN of connection (not a SYN 481 * ACK), include SACK_PERMIT_HDR option. If this is a 482 * SYN ACK, include SACK_PERMIT_HDR option if peer has 483 * already done so. 484 */ 485 if (!tp->sack_disable && ((flags & TH_ACK) == 0 || 486 (tp->t_flags & TF_SACK_PERMIT))) { 487 *((u_int32_t *) (opt + optlen)) = 488 htonl(TCPOPT_SACK_PERMIT_HDR); 489 optlen += 4; 490 } 491 #endif 492 493 if ((tp->t_flags & TF_REQ_SCALE) && 494 ((flags & TH_ACK) == 0 || 495 (tp->t_flags & TF_RCVD_SCALE))) { 496 *((u_int32_t *) (opt + optlen)) = htonl( 497 TCPOPT_NOP << 24 | 498 TCPOPT_WINDOW << 16 | 499 TCPOLEN_WINDOW << 8 | 500 tp->request_r_scale); 501 optlen += 4; 502 } 503 } 504 } 505 506 /* 507 * Send a timestamp and echo-reply if this is a SYN and our side 508 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 509 * and our peer have sent timestamps in our SYN's. 510 */ 511 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 512 (flags & TH_RST) == 0 && 513 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 514 (tp->t_flags & TF_RCVD_TSTMP))) { 515 u_int32_t *lp = (u_int32_t *)(opt + optlen); 516 517 /* Form timestamp option as shown in appendix A of RFC 1323. */ 518 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 519 *lp++ = htonl(tcp_now); 520 *lp = htonl(tp->ts_recent); 521 optlen += TCPOLEN_TSTAMP_APPA; 522 } 523 524 #ifdef TCP_SIGNATURE 525 if (tp->t_flags & TF_SIGNATURE) { 526 u_int8_t *bp = (u_int8_t *)(opt + optlen); 527 528 /* Send signature option */ 529 *(bp++) = TCPOPT_SIGNATURE; 530 *(bp++) = TCPOLEN_SIGNATURE; 531 sigoff = optlen + 2; 532 533 { 534 unsigned int i; 535 536 for (i = 0; i < 16; i++) 537 *(bp++) = 0; 538 } 539 540 optlen += TCPOLEN_SIGNATURE; 541 542 /* Pad options list to the next 32 bit boundary and 543 * terminate it. 544 */ 545 *bp++ = TCPOPT_NOP; 546 *bp++ = TCPOPT_EOL; 547 optlen += 2; 548 } 549 #endif /* TCP_SIGNATURE */ 550 551 #ifdef TCP_SACK 552 /* 553 * Send SACKs if necessary. This should be the last option processed. 554 * Only as many SACKs are sent as are permitted by the maximum options 555 * size. No more than three SACKs are sent. 556 */ 557 if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED && 558 (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && 559 tp->rcv_numsacks) { 560 u_int32_t *lp = (u_int32_t *)(opt + optlen); 561 u_int32_t *olp = lp++; 562 int count = 0; /* actual number of SACKs inserted */ 563 int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; 564 565 maxsack = min(maxsack, TCP_MAX_SACK); 566 for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { 567 struct sackblk sack = tp->sackblks[i]; 568 if (sack.start == 0 && sack.end == 0) 569 continue; 570 *lp++ = htonl(sack.start); 571 *lp++ = htonl(sack.end); 572 count++; 573 } 574 *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); 575 optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ 576 } 577 #endif /* TCP_SACK */ 578 579 #ifdef DIAGNOSTIC 580 if (optlen > MAX_TCPOPTLEN) 581 panic("tcp_output: options too long"); 582 #endif /* DIAGNOSTIC */ 583 584 hdrlen += optlen; 585 586 /* 587 * Adjust data length if insertion of options will 588 * bump the packet length beyond the t_maxopd length. 589 */ 590 if (len > tp->t_maxopd - optlen) { 591 len = tp->t_maxopd - optlen; 592 sendalot = 1; 593 flags &= ~TH_FIN; 594 } 595 596 #ifdef DIAGNOSTIC 597 if (max_linkhdr + hdrlen > MHLEN) 598 panic("tcphdr too big"); 599 #endif 600 601 /* 602 * Grab a header mbuf, attaching a copy of data to 603 * be transmitted, and initialize the header from 604 * the template for sends on this connection. 605 */ 606 if (len) { 607 if (tp->t_force && len == 1) 608 tcpstat.tcps_sndprobe++; 609 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 610 tcpstat.tcps_sndrexmitpack++; 611 tcpstat.tcps_sndrexmitbyte += len; 612 } else { 613 tcpstat.tcps_sndpack++; 614 tcpstat.tcps_sndbyte += len; 615 } 616 #ifdef notyet 617 if ((m = m_copypack(so->so_snd.sb_mb, off, 618 (int)len, max_linkhdr + hdrlen)) == 0) { 619 error = ENOBUFS; 620 goto out; 621 } 622 /* 623 * m_copypack left space for our hdr; use it. 624 */ 625 m->m_len += hdrlen; 626 m->m_data -= hdrlen; 627 #else 628 MGETHDR(m, M_DONTWAIT, MT_HEADER); 629 if (m == NULL) { 630 error = ENOBUFS; 631 goto out; 632 } 633 m->m_data += max_linkhdr; 634 m->m_len = hdrlen; 635 if (len <= MHLEN - hdrlen - max_linkhdr) { 636 m_copydata(so->so_snd.sb_mb, off, (int) len, 637 mtod(m, caddr_t) + hdrlen); 638 m->m_len += len; 639 } else { 640 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 641 if (m->m_next == 0) { 642 (void) m_free(m); 643 error = ENOBUFS; 644 goto out; 645 } 646 } 647 #endif 648 /* 649 * If we're sending everything we've got, set PUSH. 650 * (This will keep happy those implementations which only 651 * give data to the user when a buffer fills or 652 * a PUSH comes in.) 653 */ 654 if (off + len == so->so_snd.sb_cc) 655 flags |= TH_PUSH; 656 } else { 657 if (tp->t_flags & TF_ACKNOW) 658 tcpstat.tcps_sndacks++; 659 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 660 tcpstat.tcps_sndctrl++; 661 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 662 tcpstat.tcps_sndurg++; 663 else 664 tcpstat.tcps_sndwinup++; 665 666 MGETHDR(m, M_DONTWAIT, MT_HEADER); 667 if (m == NULL) { 668 error = ENOBUFS; 669 goto out; 670 } 671 m->m_data += max_linkhdr; 672 m->m_len = hdrlen; 673 } 674 m->m_pkthdr.rcvif = (struct ifnet *)0; 675 676 if (!tp->t_template) 677 panic("tcp_output"); 678 #ifdef DIAGNOSTIC 679 if (tp->t_template->m_len != hdrlen - optlen) 680 panic("tcp_output: template len != hdrlen - optlen"); 681 #endif /* DIAGNOSTIC */ 682 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), 683 tp->t_template->m_len); 684 th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len - 685 sizeof(struct tcphdr)); 686 687 /* 688 * Fill in fields, remembering maximum advertised 689 * window for use in delaying messages about window sizes. 690 * If resending a FIN, be sure not to use a new sequence number. 691 */ 692 if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && 693 (tp->snd_nxt == tp->snd_max)) 694 tp->snd_nxt--; 695 /* 696 * If we are doing retransmissions, then snd_nxt will 697 * not reflect the first unsent octet. For ACK only 698 * packets, we do not want the sequence number of the 699 * retransmitted packet, we want the sequence number 700 * of the next unsent octet. So, if there is no data 701 * (and no SYN or FIN), use snd_max instead of snd_nxt 702 * when filling in ti_seq. But if we are in persist 703 * state, snd_max might reflect one byte beyond the 704 * right edge of the window, so use snd_nxt in that 705 * case, since we know we aren't doing a retransmission. 706 * (retransmit and persist are mutually exclusive...) 707 */ 708 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) 709 th->th_seq = htonl(tp->snd_nxt); 710 else 711 th->th_seq = htonl(tp->snd_max); 712 713 #ifdef TCP_SACK 714 if (sack_rxmit) { 715 /* 716 * If sendalot was turned on (due to option stuffing), turn it 717 * off. Properly set th_seq field. Advance the ret'x pointer 718 * by len. 719 */ 720 if (sendalot) 721 sendalot = 0; 722 th->th_seq = htonl(p->rxmit); 723 p->rxmit += len; 724 #if defined(TCP_SACK) && defined(TCP_FACK) 725 tp->retran_data += len; 726 #endif /* TCP_FACK */ 727 } 728 #endif /* TCP_SACK */ 729 730 th->th_ack = htonl(tp->rcv_nxt); 731 if (optlen) { 732 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); 733 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 734 } 735 th->th_flags = flags; 736 737 /* 738 * Calculate receive window. Don't shrink window, 739 * but avoid silly window syndrome. 740 */ 741 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 742 win = 0; 743 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 744 win = (long)TCP_MAXWIN << tp->rcv_scale; 745 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 746 win = (long)(tp->rcv_adv - tp->rcv_nxt); 747 if (flags & TH_RST) 748 win = 0; 749 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 750 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 751 u_int32_t urp = tp->snd_up - tp->snd_nxt; 752 if (urp > IP_MAXPACKET) 753 urp = IP_MAXPACKET; 754 th->th_urp = htons((u_int16_t)urp); 755 th->th_flags |= TH_URG; 756 } else 757 /* 758 * If no urgent pointer to send, then we pull 759 * the urgent pointer to the left edge of the send window 760 * so that it doesn't drift into the send window on sequence 761 * number wraparound. 762 */ 763 tp->snd_up = tp->snd_una; /* drag it along */ 764 765 /* Put TCP length in pseudo-header */ 766 #if defined(INET) && defined(INET6) 767 switch (tp->pf) { 768 #else /* defined(INET) && defined(INET6) */ 769 switch (0) { 770 #endif /* defined(INET) && defined(INET6) */ 771 case 0: 772 #ifdef INET 773 case AF_INET: 774 if (len + optlen) 775 mtod(m, struct ipovly *)->ih_len = htons((u_int16_t)( 776 sizeof (struct tcphdr) + optlen + len)); 777 break; 778 #endif /* INET */ 779 #ifdef INET6 780 case AF_INET6: 781 break; 782 #endif /* INET6 */ 783 } 784 785 #ifdef TCP_SIGNATURE 786 if (tp->t_flags & TF_SIGNATURE) { 787 MD5_CTX ctx; 788 union sockaddr_union sa; 789 struct tdb *tdb; 790 791 memset(&sa, 0, sizeof(union sockaddr_union)); 792 793 #if defined(INET) && defined(INET6) 794 switch(tp->pf) { 795 #else /* defined(INET) && defined(INET6) */ 796 switch (0) { 797 #endif /* defined(INET) && defined(INET6) */ 798 case 0: 799 #ifdef INET 800 case AF_INET: 801 sa.sa.sa_len = sizeof(struct sockaddr_in); 802 sa.sa.sa_family = AF_INET; 803 sa.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 804 break; 805 #endif /* INET */ 806 #ifdef INET6 807 case AF_INET6: 808 sa.sa.sa_len = sizeof(struct sockaddr_in6); 809 sa.sa.sa_family = AF_INET6; 810 sa.sin6.sin6_addr = mtod(m, struct ipv6 *)->ipv6_dst; 811 break; 812 #endif /* INET6 */ 813 } 814 815 tdb = gettdb(0, &sa, IPPROTO_TCP); 816 if (tdb == NULL) 817 return (EPERM); 818 819 MD5Init(&ctx); 820 821 #if defined(INET) && defined(INET6) 822 switch(tp->pf) { 823 #else /* defined(INET) && defined(INET6) */ 824 switch (0) { 825 #endif /* defined(INET) && defined(INET6) */ 826 case 0: 827 #ifdef INET 828 case AF_INET: 829 { 830 struct ippseudo ippseudo; 831 struct ipovly *ipovly; 832 833 ipovly = mtod(m, struct ipovly *); 834 835 ippseudo.ippseudo_src = ipovly->ih_src; 836 ippseudo.ippseudo_dst = ipovly->ih_dst; 837 ippseudo.ippseudo_pad = 0; 838 ippseudo.ippseudo_p = IPPROTO_TCP; 839 ippseudo.ippseudo_len = ipovly->ih_len; 840 MD5Update(&ctx, (char *)&ippseudo, 841 sizeof(struct ippseudo)); 842 MD5Update(&ctx, mtod(m, caddr_t) + 843 sizeof(struct ip), 844 sizeof(struct tcphdr)); 845 } 846 break; 847 #endif /* INET */ 848 #ifdef INET6 849 case AF_INET6: 850 { 851 static int printed = 0; 852 853 if (!printed) { 854 printf("error: TCP MD5 support for " 855 "IPv6 not yet implemented.\n"); 856 printed = 1; 857 } 858 } 859 break; 860 #endif /* INET6 */ 861 } 862 863 if (len && m_apply(m, hdrlen, len, tcp_signature_apply, 864 (caddr_t)&ctx)) 865 return (EINVAL); 866 867 MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen); 868 MD5Final(mtod(m, caddr_t) + hdrlen - optlen + sigoff, &ctx); 869 } 870 #endif /* TCP_SIGNATURE */ 871 872 /* 873 * Put TCP length in extended header, and then 874 * checksum extended header and data. 875 */ 876 #if defined(INET) && defined(INET6) 877 switch (tp->pf) { 878 #else /* defined(INET) && defined(INET6) */ 879 switch (0) { 880 #endif /* defined(INET) && defined(INET6) */ 881 case 0: 882 #ifdef INET 883 case AF_INET: 884 th->th_sum = in_cksum(m, (int)(hdrlen + len)); 885 break; 886 #endif /* INET */ 887 #ifdef INET6 888 case AF_INET6: 889 th->th_sum = in6_cksum(m, IPPROTO_TCP, hdrlen + len, 890 sizeof(struct ipv6)); 891 break; 892 #endif /* INET6 */ 893 } 894 895 /* 896 * In transmit state, time the transmission and arrange for 897 * the retransmit. In persist state, just set snd_max. 898 */ 899 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 900 tcp_seq startseq = tp->snd_nxt; 901 902 /* 903 * Advance snd_nxt over sequence space of this segment. 904 */ 905 if (flags & (TH_SYN|TH_FIN)) { 906 if (flags & TH_SYN) 907 tp->snd_nxt++; 908 if (flags & TH_FIN) { 909 tp->snd_nxt++; 910 tp->t_flags |= TF_SENTFIN; 911 } 912 } 913 #ifdef TCP_SACK 914 if (!tp->sack_disable) { 915 if (sack_rxmit && (p->rxmit != tp->snd_nxt)) { 916 goto timer; 917 } 918 } 919 #endif 920 tp->snd_nxt += len; 921 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 922 tp->snd_max = tp->snd_nxt; 923 /* 924 * Time this transmission if not a retransmission and 925 * not currently timing anything. 926 */ 927 if (tp->t_rtt == 0) { 928 tp->t_rtt = 1; 929 tp->t_rtseq = startseq; 930 tcpstat.tcps_segstimed++; 931 } 932 } 933 934 /* 935 * Set retransmit timer if not currently set, 936 * and not doing an ack or a keep-alive probe. 937 * Initial value for retransmit timer is smoothed 938 * round-trip time + 2 * round-trip time variance. 939 * Initialize shift counter which is used for backoff 940 * of retransmit time. 941 */ 942 #ifdef TCP_SACK 943 timer: 944 if (!tp->sack_disable && sack_rxmit && 945 tp->t_timer[TCPT_REXMT] == 0 && 946 tp->snd_nxt != tp->snd_max) { 947 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 948 if (tp->t_timer[TCPT_PERSIST]) { 949 tp->t_timer[TCPT_PERSIST] = 0; 950 tp->t_rxtshift = 0; 951 } 952 } 953 #endif 954 955 if (tp->t_timer[TCPT_REXMT] == 0 && 956 tp->snd_nxt != tp->snd_una) { 957 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 958 if (tp->t_timer[TCPT_PERSIST]) { 959 tp->t_timer[TCPT_PERSIST] = 0; 960 tp->t_rxtshift = 0; 961 } 962 } 963 } else 964 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 965 tp->snd_max = tp->snd_nxt + len; 966 967 /* 968 * Trace. 969 */ 970 if (so->so_options & SO_DEBUG) 971 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0, 972 len); 973 974 /* 975 * Fill in IP length and desired time to live and 976 * send to IP level. There should be a better way 977 * to handle ttl and tos; we could keep them in 978 * the template, but need a way to checksum without them. 979 */ 980 m->m_pkthdr.len = hdrlen + len; 981 982 #if defined(INET) && defined(INET6) 983 switch (tp->pf) { 984 #else /* defined(INET) && defined(INET6) */ 985 switch (0) { 986 #endif /* defined(INET) && defined(INET6) */ 987 case 0: 988 #ifdef INET 989 case AF_INET: 990 { 991 struct ip *ip; 992 993 ip = mtod(m, struct ip *); 994 ip->ip_len = m->m_pkthdr.len; 995 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 996 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; 997 } 998 999 error = ip_output(m, tp->t_inpcb->inp_options, 1000 &tp->t_inpcb->inp_route, so->so_options & SO_DONTROUTE, 1001 0, tp->t_inpcb); 1002 break; 1003 #endif /* INET */ 1004 #ifdef INET6 1005 case AF_INET6: 1006 { 1007 struct ipv6 *ipv6; 1008 1009 ipv6->ipv6_length = m->m_pkthdr.len - 1010 sizeof(struct ipv6); 1011 ipv6->ipv6_nexthdr = IPPROTO_TCP; 1012 } 1013 1014 error = ipv6_output(m, &tp->t_inpcb->inp_route6, 1015 (so->so_options & SO_DONTROUTE), NULL, NULL, 1016 tp->t_inpcb->inp_socket); 1017 break; 1018 #endif /* INET6 */ 1019 #ifdef TUBA 1020 case AF_ISO: 1021 if (tp->t_tuba_pcb) 1022 error = tuba_output(m, tp); 1023 break; 1024 #endif /* TUBA */ 1025 } 1026 1027 #if defined(TCP_SACK) && defined(TCP_FACK) 1028 /* Update snd_awnd to reflect the new data that was sent. */ 1029 tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) + 1030 tp->retran_data; 1031 #endif /* defined(TCP_SACK) && defined(TCP_FACK) */ 1032 1033 if (error) { 1034 out: 1035 if (error == ENOBUFS) { 1036 tcp_quench(tp->t_inpcb, 0); 1037 return (0); 1038 } 1039 if ((error == EHOSTUNREACH || error == ENETDOWN) 1040 && TCPS_HAVERCVDSYN(tp->t_state)) { 1041 tp->t_softerror = error; 1042 return (0); 1043 } 1044 return (error); 1045 } 1046 tcpstat.tcps_sndtotal++; 1047 1048 /* 1049 * Data sent (as far as we can tell). 1050 * If this advertises a larger window than any other segment, 1051 * then remember the size of the advertised window. 1052 * Any pending ACK has now been sent. 1053 */ 1054 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1055 tp->rcv_adv = tp->rcv_nxt + win; 1056 tp->last_ack_sent = tp->rcv_nxt; 1057 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 1058 #if defined(TCP_SACK) || defined(TCP_NEWRENO) 1059 if (sendalot && --maxburst) 1060 #else 1061 if (sendalot) 1062 #endif 1063 goto again; 1064 return (0); 1065 } 1066 1067 void 1068 tcp_setpersist(tp) 1069 register struct tcpcb *tp; 1070 { 1071 register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 1072 1073 if (tp->t_timer[TCPT_REXMT]) 1074 panic("tcp_output REXMT"); 1075 /* 1076 * Start/restart persistance timer. 1077 */ 1078 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 1079 t * tcp_backoff[tp->t_rxtshift], 1080 TCPTV_PERSMIN, TCPTV_PERSMAX); 1081 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1082 tp->t_rxtshift++; 1083 } 1084