/* $OpenBSD: tcp_output.c,v 1.33 2000/09/20 17:00:22 provos Exp $ */
/* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 * This product includes software developed by the University of
 * California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)tcp_output.c 8.3 (Berkeley) 12/30/93
 */

/*
%%% portions-copyright-nrl-95
Portions of this software are Copyright 1995-1998 by Randall Atkinson,
Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
Reserved. All rights under this copyright have been assigned to the US
Naval Research Laboratory (NRL). The NRL Copyright Notice and License
Agreement Version 1.1 (January 17, 1995) applies to these portions of the
software.
You should have received a copy of the license with this software. If you
didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
49 */ 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/protosw.h> 56 #include <sys/socket.h> 57 #include <sys/socketvar.h> 58 #include <sys/errno.h> 59 #include <sys/domain.h> 60 61 #include <net/route.h> 62 63 #include <netinet/in.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/ip.h> 66 #include <netinet/in_pcb.h> 67 #include <netinet/ip_var.h> 68 #include <netinet/tcp.h> 69 #define TCPOUTFLAGS 70 #include <netinet/tcp_fsm.h> 71 #include <netinet/tcp_seq.h> 72 #include <netinet/tcp_timer.h> 73 #include <netinet/tcp_var.h> 74 #include <netinet/tcpip.h> 75 #include <netinet/tcp_debug.h> 76 77 #ifdef TUBA 78 #include <netiso/iso.h> 79 #include <netiso/tuba_table.h> 80 #endif 81 82 #ifdef INET6 83 #include <netinet6/tcpipv6.h> 84 #endif /* INET6 */ 85 86 #ifdef TCP_SIGNATURE 87 #include <sys/md5k.h> 88 #endif /* TCP_SIGNATURE */ 89 90 #ifdef notyet 91 extern struct mbuf *m_copypack(); 92 #endif 93 94 #ifdef TCP_SACK 95 extern int tcprexmtthresh; 96 #endif 97 98 #ifdef TCP_SACK 99 #ifdef TCP_SACK_DEBUG 100 void 101 tcp_print_holes(tp) 102 struct tcpcb *tp; 103 { 104 struct sackhole *p = tp->snd_holes; 105 if (p == 0) 106 return; 107 printf("Hole report: start--end dups rxmit\n"); 108 while (p) { 109 printf("%x--%x d %d r %x\n", p->start, p->end, p->dups, 110 p->rxmit); 111 p = p->next; 112 } 113 printf("\n"); 114 } 115 #endif /* TCP_SACK_DEBUG */ 116 117 /* 118 * Returns pointer to a sackhole if there are any pending retransmissions; 119 * NULL otherwise. 
120 */ 121 struct sackhole * 122 tcp_sack_output(tp) 123 register struct tcpcb *tp; 124 { 125 struct sackhole *p; 126 if (tp->sack_disable) 127 return 0; 128 p = tp->snd_holes; 129 while (p) { 130 #ifndef TCP_FACK 131 if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) { 132 #else 133 /* In FACK, if p->dups is less than tcprexmtthresh, but 134 * snd_fack advances more than tcprextmtthresh * tp->t_maxseg, 135 * tcp_input() will try fast retransmit. This forces output. 136 */ 137 if ((p->dups >= tcprexmtthresh || 138 tp->t_dupacks == tcprexmtthresh) && 139 SEQ_LT(p->rxmit, p->end)) { 140 #endif /* TCP_FACK */ 141 if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ 142 p = p->next; 143 continue; 144 } 145 #ifdef TCP_SACK_DEBUG 146 if (p) 147 tcp_print_holes(tp); 148 #endif 149 return p; 150 } 151 p = p->next; 152 } 153 return 0; 154 } 155 156 /* 157 * After a timeout, the SACK list may be rebuilt. This SACK information 158 * should be used to avoid retransmitting SACKed data. This function 159 * traverses the SACK list to see if snd_nxt should be moved forward. 160 */ 161 void 162 tcp_sack_adjust(tp) 163 struct tcpcb *tp; 164 { 165 struct sackhole *cur = tp->snd_holes; 166 if (cur == 0) 167 return; /* No holes */ 168 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) 169 return; /* We're already beyond any SACKed blocks */ 170 /* 171 * Two cases for which we want to advance snd_nxt: 172 * i) snd_nxt lies between end of one hole and beginning of another 173 * ii) snd_nxt lies between end of last hole and rcv_lastsack 174 */ 175 while (cur->next) { 176 if (SEQ_LT(tp->snd_nxt, cur->end)) 177 return; 178 if (SEQ_GEQ(tp->snd_nxt, cur->next->start)) 179 cur = cur->next; 180 else { 181 tp->snd_nxt = cur->next->start; 182 return; 183 } 184 } 185 if (SEQ_LT(tp->snd_nxt, cur->end)) 186 return; 187 tp->snd_nxt = tp->rcv_lastsack; 188 return; 189 } 190 #endif /* TCP_SACK */ 191 192 /* 193 * Tcp output routine: figure out what should be sent and send it. 
194 */ 195 int 196 tcp_output(tp) 197 register struct tcpcb *tp; 198 { 199 register struct socket *so = tp->t_inpcb->inp_socket; 200 register long len, win; 201 int off, flags, error; 202 register struct mbuf *m; 203 register struct tcphdr *th; 204 u_char opt[MAX_TCPOPTLEN]; 205 unsigned int optlen, hdrlen; 206 int idle, sendalot = 0; 207 #ifdef TCP_SACK 208 int i, sack_rxmit = 0; 209 struct sackhole *p; 210 #endif 211 #if defined(TCP_SACK) 212 int maxburst = TCP_MAXBURST; 213 #endif 214 #ifdef TCP_SIGNATURE 215 unsigned int sigoff; 216 #endif /* TCP_SIGNATURE */ 217 218 #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) 219 if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE)) 220 return (EINVAL); 221 #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */ 222 223 /* 224 * Determine length of data that should be transmitted, 225 * and flags that will be used. 226 * If there is some data or critical controls (SYN, RST) 227 * to send, then transmit; otherwise, investigate further. 228 */ 229 idle = (tp->snd_max == tp->snd_una); 230 if (idle && tp->t_idle >= tp->t_rxtcur) 231 /* 232 * We have been idle for "a while" and no acks are 233 * expected to clock out any data we send -- 234 * slow start to get ack "clock" running again. 235 */ 236 tp->snd_cwnd = tp->t_maxseg; 237 again: 238 #ifdef TCP_SACK 239 /* 240 * If we've recently taken a timeout, snd_max will be greater than 241 * snd_nxt. There may be SACK information that allows us to avoid 242 * resending already delivered data. Adjust snd_nxt accordingly. 243 */ 244 if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max)) 245 tcp_sack_adjust(tp); 246 #endif 247 off = tp->snd_nxt - tp->snd_una; 248 #if defined(TCP_SACK) && defined(TCP_FACK) 249 /* Normally, sendable data is limited by off < tp->snd_cwnd. 250 * But in FACK, sendable data is limited by snd_awnd < snd_cwnd, 251 * regardless of offset. 
252 */ 253 if (!tp->sack_disable && (tp->t_dupacks > tcprexmtthresh)) 254 win = tp->snd_wnd; 255 else 256 #endif 257 win = ulmin(tp->snd_wnd, tp->snd_cwnd); 258 259 flags = tcp_outflags[tp->t_state]; 260 261 #ifdef TCP_SACK 262 /* 263 * Send any SACK-generated retransmissions. If we're explicitly trying 264 * to send out new data (when sendalot is 1), bypass this function. 265 * If we retransmit in fast recovery mode, decrement snd_cwnd, since 266 * we're replacing a (future) new transmission with a retransmission 267 * now, and we previously incremented snd_cwnd in tcp_input(). 268 */ 269 if (!tp->sack_disable && !sendalot) { 270 if (tp->t_dupacks >= tcprexmtthresh && 271 (p = tcp_sack_output(tp))) { 272 off = p->rxmit - tp->snd_una; 273 sack_rxmit = 1; 274 #if 0 275 /* Coalesce holes into a single retransmission */ 276 #endif 277 len = min(tp->t_maxseg, p->end - p->rxmit); 278 #ifndef TCP_FACK 279 /* in FACK, hold snd_cwnd constant during recovery */ 280 if (SEQ_LT(tp->snd_una, tp->snd_last)) 281 tp->snd_cwnd -= tp->t_maxseg; 282 #endif 283 } 284 } 285 #endif /* TCP_SACK */ 286 287 sendalot = 0; 288 /* 289 * If in persist timeout with window of 0, send 1 byte. 290 * Otherwise, if window is small but nonzero 291 * and timer expired, we will send what we can 292 * and go to transmit state. 293 */ 294 if (tp->t_force) { 295 if (win == 0) { 296 /* 297 * If we still have some data to send, then 298 * clear the FIN bit. Usually this would 299 * happen below when it realizes that we 300 * aren't sending all the data. However, 301 * if we have exactly 1 byte of unset data, 302 * then it won't clear the FIN bit below, 303 * and if we are in persist state, we wind 304 * up sending the packet without recording 305 * that we sent the FIN bit. 306 * 307 * We can't just blindly clear the FIN bit, 308 * because if we don't have any more data 309 * to send then the probe will be the FIN 310 * itself. 
311 */ 312 if (off < so->so_snd.sb_cc) 313 flags &= ~TH_FIN; 314 win = 1; 315 } else { 316 tp->t_timer[TCPT_PERSIST] = 0; 317 tp->t_rxtshift = 0; 318 } 319 } 320 321 #ifdef TCP_SACK 322 if (!sack_rxmit) { 323 #endif 324 len = ulmin(so->so_snd.sb_cc, win) - off; 325 326 #if defined(TCP_SACK) && defined(TCP_FACK) 327 /* 328 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and 329 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then 330 * do not send data (like zero window conditions) 331 */ 332 if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && 333 (tp->snd_awnd >= tp->snd_cwnd)) 334 len = 0; 335 #endif /* TCP_FACK */ 336 #ifdef TCP_SACK 337 } 338 #endif 339 340 if (len < 0) { 341 /* 342 * If FIN has been sent but not acked, 343 * but we haven't been called to retransmit, 344 * len will be -1. Otherwise, window shrank 345 * after we sent into it. If window shrank to 0, 346 * cancel pending retransmit and pull snd_nxt 347 * back to (closed) window. We will enter persist 348 * state below. If the window didn't close completely, 349 * just wait for an ACK. 350 */ 351 len = 0; 352 if (win == 0) { 353 tp->t_timer[TCPT_REXMT] = 0; 354 tp->snd_nxt = tp->snd_una; 355 } 356 } 357 if (len > tp->t_maxseg) { 358 len = tp->t_maxseg; 359 sendalot = 1; 360 } 361 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 362 flags &= ~TH_FIN; 363 364 win = sbspace(&so->so_rcv); 365 366 /* 367 * Sender silly window avoidance. If connection is idle 368 * and can send all data, a maximum segment, 369 * at least a maximum default-size segment do it, 370 * or are forced, do it; otherwise don't bother. 371 * If peer's buffer is tiny, then send 372 * when window is at least half open. 373 * If retransmitting (possibly after persist timer forced us 374 * to send into a small window), then must resend. 
375 */ 376 if (len) { 377 if (len == tp->t_maxseg) 378 goto send; 379 if ((idle || tp->t_flags & TF_NODELAY) && 380 len + off >= so->so_snd.sb_cc) 381 goto send; 382 if (tp->t_force) 383 goto send; 384 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 385 goto send; 386 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 387 goto send; 388 #ifdef TCP_SACK 389 if (sack_rxmit) 390 goto send; 391 #endif 392 } 393 394 /* 395 * Compare available window to amount of window 396 * known to peer (as advertised window less 397 * next expected input). If the difference is at least two 398 * max size segments, or at least 50% of the maximum possible 399 * window, then want to send a window update to peer. 400 */ 401 if (win > 0) { 402 /* 403 * "adv" is the amount we can increase the window, 404 * taking into account that we are limited by 405 * TCP_MAXWIN << tp->rcv_scale. 406 */ 407 long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) - 408 (tp->rcv_adv - tp->rcv_nxt); 409 410 if (adv >= (long) (2 * tp->t_maxseg)) 411 goto send; 412 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 413 goto send; 414 } 415 416 /* 417 * Send if we owe peer an ACK. 418 */ 419 if (tp->t_flags & TF_ACKNOW) 420 goto send; 421 if (flags & (TH_SYN|TH_RST)) 422 goto send; 423 if (SEQ_GT(tp->snd_up, tp->snd_una)) 424 goto send; 425 /* 426 * If our state indicates that FIN should be sent 427 * and we have not yet done so, or we're retransmitting the FIN, 428 * then we need to send. 429 */ 430 if (flags & TH_FIN && 431 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 432 goto send; 433 #ifdef TCP_SACK 434 /* 435 * In SACK, it is possible for tcp_output to fail to send a segment 436 * after the retransmission timer has been turned off. Make sure 437 * that the retransmission timer is set. 
438 */ 439 if (SEQ_GT(tp->snd_max, tp->snd_una) && 440 tp->t_timer[TCPT_REXMT] == 0 && 441 tp->t_timer[TCPT_PERSIST] == 0) { 442 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 443 return (0); 444 } 445 #endif /* TCP_SACK */ 446 447 /* 448 * TCP window updates are not reliable, rather a polling protocol 449 * using ``persist'' packets is used to insure receipt of window 450 * updates. The three ``states'' for the output side are: 451 * idle not doing retransmits or persists 452 * persisting to move a small or zero window 453 * (re)transmitting and thereby not persisting 454 * 455 * tp->t_timer[TCPT_PERSIST] 456 * is set when we are in persist state. 457 * tp->t_force 458 * is set when we are called to send a persist packet. 459 * tp->t_timer[TCPT_REXMT] 460 * is set when we are retransmitting 461 * The output side is idle when both timers are zero. 462 * 463 * If send window is too small, there is data to transmit, and no 464 * retransmit or persist is pending, then go to persist state. 465 * If nothing happens soon, send when timer expires: 466 * if window is nonzero, transmit what we can, 467 * otherwise force out a byte. 468 */ 469 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 470 tp->t_timer[TCPT_PERSIST] == 0) { 471 tp->t_rxtshift = 0; 472 tcp_setpersist(tp); 473 } 474 475 /* 476 * No reason to send a segment, just return. 477 */ 478 return (0); 479 480 send: 481 /* 482 * Before ESTABLISHED, force sending of initial options 483 * unless TCP set not to do any options. 484 * NOTE: we assume that the IP/TCP header plus TCP options 485 * always fit in a single mbuf, leaving room for a maximum 486 * link header, i.e. 
487 * max_linkhdr + sizeof(network header) + sizeof(struct tcphdr + 488 * optlen <= MHLEN 489 */ 490 optlen = 0; 491 492 switch (tp->pf) { 493 case 0: /*default to PF_INET*/ 494 #ifdef INET 495 case PF_INET: 496 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 497 break; 498 #endif /* INET */ 499 #ifdef INET6 500 case PF_INET6: 501 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 502 break; 503 #endif /* INET6 */ 504 default: 505 return (EPFNOSUPPORT); 506 } 507 508 if (flags & TH_SYN) { 509 tp->snd_nxt = tp->iss; 510 if ((tp->t_flags & TF_NOOPT) == 0) { 511 u_int16_t mss; 512 513 opt[0] = TCPOPT_MAXSEG; 514 opt[1] = 4; 515 mss = htons((u_int16_t) tcp_mss(tp, flags & TH_ACK ? 516 tp->t_maxopd : 0)); 517 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); 518 optlen = 4; 519 520 if (flags & TH_ACK) 521 tcp_mss_update(tp); 522 #ifdef TCP_SACK 523 /* 524 * If this is the first SYN of connection (not a SYN 525 * ACK), include SACK_PERMIT_HDR option. If this is a 526 * SYN ACK, include SACK_PERMIT_HDR option if peer has 527 * already done so. 528 */ 529 if (!tp->sack_disable && ((flags & TH_ACK) == 0 || 530 (tp->t_flags & TF_SACK_PERMIT))) { 531 *((u_int32_t *) (opt + optlen)) = 532 htonl(TCPOPT_SACK_PERMIT_HDR); 533 optlen += 4; 534 } 535 #endif 536 537 if ((tp->t_flags & TF_REQ_SCALE) && 538 ((flags & TH_ACK) == 0 || 539 (tp->t_flags & TF_RCVD_SCALE))) { 540 *((u_int32_t *) (opt + optlen)) = htonl( 541 TCPOPT_NOP << 24 | 542 TCPOPT_WINDOW << 16 | 543 TCPOLEN_WINDOW << 8 | 544 tp->request_r_scale); 545 optlen += 4; 546 } 547 } 548 } 549 550 /* 551 * Send a timestamp and echo-reply if this is a SYN and our side 552 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 553 * and our peer have sent timestamps in our SYN's. 
554 */ 555 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 556 (flags & TH_RST) == 0 && 557 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 558 (tp->t_flags & TF_RCVD_TSTMP))) { 559 u_int32_t *lp = (u_int32_t *)(opt + optlen); 560 561 /* Form timestamp option as shown in appendix A of RFC 1323. */ 562 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 563 *lp++ = htonl(tcp_now); 564 *lp = htonl(tp->ts_recent); 565 optlen += TCPOLEN_TSTAMP_APPA; 566 } 567 568 #ifdef TCP_SIGNATURE 569 if (tp->t_flags & TF_SIGNATURE) { 570 u_int8_t *bp = (u_int8_t *)(opt + optlen); 571 572 /* Send signature option */ 573 *(bp++) = TCPOPT_SIGNATURE; 574 *(bp++) = TCPOLEN_SIGNATURE; 575 sigoff = optlen + 2; 576 577 { 578 unsigned int i; 579 580 for (i = 0; i < 16; i++) 581 *(bp++) = 0; 582 } 583 584 optlen += TCPOLEN_SIGNATURE; 585 586 /* Pad options list to the next 32 bit boundary and 587 * terminate it. 588 */ 589 *bp++ = TCPOPT_NOP; 590 *bp++ = TCPOPT_EOL; 591 optlen += 2; 592 } 593 #endif /* TCP_SIGNATURE */ 594 595 #ifdef TCP_SACK 596 /* 597 * Send SACKs if necessary. This should be the last option processed. 598 * Only as many SACKs are sent as are permitted by the maximum options 599 * size. No more than three SACKs are sent. 
600 */ 601 if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED && 602 (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && 603 tp->rcv_numsacks) { 604 u_int32_t *lp = (u_int32_t *)(opt + optlen); 605 u_int32_t *olp = lp++; 606 int count = 0; /* actual number of SACKs inserted */ 607 int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; 608 609 maxsack = min(maxsack, TCP_MAX_SACK); 610 for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { 611 struct sackblk sack = tp->sackblks[i]; 612 if (sack.start == 0 && sack.end == 0) 613 continue; 614 *lp++ = htonl(sack.start); 615 *lp++ = htonl(sack.end); 616 count++; 617 } 618 *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); 619 optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ 620 } 621 #endif /* TCP_SACK */ 622 623 #ifdef DIAGNOSTIC 624 if (optlen > MAX_TCPOPTLEN) 625 panic("tcp_output: options too long"); 626 #endif /* DIAGNOSTIC */ 627 628 hdrlen += optlen; 629 630 /* 631 * Adjust data length if insertion of options will 632 * bump the packet length beyond the t_maxopd length. 633 */ 634 if (len > tp->t_maxopd - optlen) { 635 len = tp->t_maxopd - optlen; 636 sendalot = 1; 637 flags &= ~TH_FIN; 638 } 639 640 #ifdef DIAGNOSTIC 641 if (max_linkhdr + hdrlen > MCLBYTES) 642 panic("tcphdr too big"); 643 #endif 644 645 /* 646 * Grab a header mbuf, attaching a copy of data to 647 * be transmitted, and initialize the header from 648 * the template for sends on this connection. 649 */ 650 if (len) { 651 if (tp->t_force && len == 1) 652 tcpstat.tcps_sndprobe++; 653 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 654 tcpstat.tcps_sndrexmitpack++; 655 tcpstat.tcps_sndrexmitbyte += len; 656 } else { 657 tcpstat.tcps_sndpack++; 658 tcpstat.tcps_sndbyte += len; 659 } 660 #ifdef notyet 661 if ((m = m_copypack(so->so_snd.sb_mb, off, 662 (int)len, max_linkhdr + hdrlen)) == 0) { 663 error = ENOBUFS; 664 goto out; 665 } 666 /* 667 * m_copypack left space for our hdr; use it. 
668 */ 669 m->m_len += hdrlen; 670 m->m_data -= hdrlen; 671 #else 672 MGETHDR(m, M_DONTWAIT, MT_HEADER); 673 if (m != NULL) { 674 MCLGET(m, M_DONTWAIT); 675 if ((m->m_flags & M_EXT) == 0) { 676 m_freem(m); 677 m = NULL; 678 } 679 } 680 if (m == NULL) { 681 error = ENOBUFS; 682 goto out; 683 } 684 m->m_data += max_linkhdr; 685 m->m_len = hdrlen; 686 if (len <= MCLBYTES - hdrlen - max_linkhdr) { 687 m_copydata(so->so_snd.sb_mb, off, (int) len, 688 mtod(m, caddr_t) + hdrlen); 689 m->m_len += len; 690 } else { 691 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 692 if (m->m_next == 0) { 693 (void) m_free(m); 694 error = ENOBUFS; 695 goto out; 696 } 697 } 698 #endif 699 /* 700 * If we're sending everything we've got, set PUSH. 701 * (This will keep happy those implementations which only 702 * give data to the user when a buffer fills or 703 * a PUSH comes in.) 704 */ 705 if (off + len == so->so_snd.sb_cc) 706 flags |= TH_PUSH; 707 } else { 708 if (tp->t_flags & TF_ACKNOW) 709 tcpstat.tcps_sndacks++; 710 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 711 tcpstat.tcps_sndctrl++; 712 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 713 tcpstat.tcps_sndurg++; 714 else 715 tcpstat.tcps_sndwinup++; 716 717 MGETHDR(m, M_DONTWAIT, MT_HEADER); 718 if (m != NULL) { 719 MCLGET(m, M_DONTWAIT); 720 if ((m->m_flags & M_EXT) == 0) { 721 m_freem(m); 722 m = NULL; 723 } 724 } 725 if (m == NULL) { 726 error = ENOBUFS; 727 goto out; 728 } 729 m->m_data += max_linkhdr; 730 m->m_len = hdrlen; 731 } 732 m->m_pkthdr.rcvif = (struct ifnet *)0; 733 734 if (!tp->t_template) 735 panic("tcp_output"); 736 #ifdef DIAGNOSTIC 737 if (tp->t_template->m_len != hdrlen - optlen) 738 panic("tcp_output: template len != hdrlen - optlen"); 739 #endif /* DIAGNOSTIC */ 740 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), 741 tp->t_template->m_len); 742 th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len - 743 sizeof(struct tcphdr)); 744 745 /* 746 * Fill in fields, remembering maximum advertised 747 
* window for use in delaying messages about window sizes. 748 * If resending a FIN, be sure not to use a new sequence number. 749 */ 750 if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && 751 (tp->snd_nxt == tp->snd_max)) 752 tp->snd_nxt--; 753 /* 754 * If we are doing retransmissions, then snd_nxt will 755 * not reflect the first unsent octet. For ACK only 756 * packets, we do not want the sequence number of the 757 * retransmitted packet, we want the sequence number 758 * of the next unsent octet. So, if there is no data 759 * (and no SYN or FIN), use snd_max instead of snd_nxt 760 * when filling in ti_seq. But if we are in persist 761 * state, snd_max might reflect one byte beyond the 762 * right edge of the window, so use snd_nxt in that 763 * case, since we know we aren't doing a retransmission. 764 * (retransmit and persist are mutually exclusive...) 765 */ 766 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) 767 th->th_seq = htonl(tp->snd_nxt); 768 else 769 th->th_seq = htonl(tp->snd_max); 770 771 #ifdef TCP_SACK 772 if (sack_rxmit) { 773 /* 774 * If sendalot was turned on (due to option stuffing), turn it 775 * off. Properly set th_seq field. Advance the ret'x pointer 776 * by len. 777 */ 778 if (sendalot) 779 sendalot = 0; 780 th->th_seq = htonl(p->rxmit); 781 p->rxmit += len; 782 #if defined(TCP_SACK) && defined(TCP_FACK) 783 tp->retran_data += len; 784 #endif /* TCP_FACK */ 785 } 786 #endif /* TCP_SACK */ 787 788 th->th_ack = htonl(tp->rcv_nxt); 789 if (optlen) { 790 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); 791 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 792 } 793 th->th_flags = flags; 794 795 /* 796 * Calculate receive window. Don't shrink window, 797 * but avoid silly window syndrome. 
798 */ 799 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 800 win = 0; 801 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 802 win = (long)TCP_MAXWIN << tp->rcv_scale; 803 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 804 win = (long)(tp->rcv_adv - tp->rcv_nxt); 805 if (flags & TH_RST) 806 win = 0; 807 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 808 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 809 u_int32_t urp = tp->snd_up - tp->snd_nxt; 810 if (urp > IP_MAXPACKET) 811 urp = IP_MAXPACKET; 812 th->th_urp = htons((u_int16_t)urp); 813 th->th_flags |= TH_URG; 814 } else 815 /* 816 * If no urgent pointer to send, then we pull 817 * the urgent pointer to the left edge of the send window 818 * so that it doesn't drift into the send window on sequence 819 * number wraparound. 820 */ 821 tp->snd_up = tp->snd_una; /* drag it along */ 822 823 /* Put TCP length in pseudo-header */ 824 switch (tp->pf) { 825 case 0: /*default to PF_INET*/ 826 #ifdef INET 827 case AF_INET: 828 if (len + optlen) 829 mtod(m, struct ipovly *)->ih_len = htons((u_int16_t)( 830 sizeof (struct tcphdr) + optlen + len)); 831 break; 832 #endif /* INET */ 833 #ifdef INET6 834 case AF_INET6: 835 break; 836 #endif /* INET6 */ 837 } 838 839 #ifdef TCP_SIGNATURE 840 if (tp->t_flags & TF_SIGNATURE) { 841 MD5_CTX ctx; 842 union sockaddr_union sa; 843 struct tdb *tdb; 844 845 bzero(&sa, sizeof(union sockaddr_union)); 846 847 switch (tp->pf) { 848 case 0: /*default to PF_INET*/ 849 #ifdef INET 850 case AF_INET: 851 sa.sa.sa_len = sizeof(struct sockaddr_in); 852 sa.sa.sa_family = AF_INET; 853 sa.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 854 break; 855 #endif /* INET */ 856 #ifdef INET6 857 case AF_INET6: 858 sa.sa.sa_len = sizeof(struct sockaddr_in6); 859 sa.sa.sa_family = AF_INET6; 860 sa.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 861 break; 862 #endif /* INET6 */ 863 } 864 865 /* XXX gettdb() should really be called at spltdb(). 
*/ 866 /* XXX this is splsoftnet(), currently they are the same. */ 867 tdb = gettdb(0, &sa, IPPROTO_TCP); 868 if (tdb == NULL) 869 return (EPERM); 870 871 MD5Init(&ctx); 872 873 switch (tp->pf) { 874 case 0: /*default to PF_INET*/ 875 #ifdef INET 876 case AF_INET: 877 { 878 struct ippseudo ippseudo; 879 struct ipovly *ipovly; 880 881 ipovly = mtod(m, struct ipovly *); 882 883 ippseudo.ippseudo_src = ipovly->ih_src; 884 ippseudo.ippseudo_dst = ipovly->ih_dst; 885 ippseudo.ippseudo_pad = 0; 886 ippseudo.ippseudo_p = IPPROTO_TCP; 887 ippseudo.ippseudo_len = ipovly->ih_len; 888 MD5Update(&ctx, (char *)&ippseudo, 889 sizeof(struct ippseudo)); 890 MD5Update(&ctx, mtod(m, caddr_t) + 891 sizeof(struct ip), 892 sizeof(struct tcphdr)); 893 } 894 break; 895 #endif /* INET */ 896 #ifdef INET6 897 case AF_INET6: 898 { 899 static int printed = 0; 900 901 if (!printed) { 902 printf("error: TCP MD5 support for " 903 "IPv6 not yet implemented.\n"); 904 printed = 1; 905 } 906 } 907 break; 908 #endif /* INET6 */ 909 } 910 911 if (len && m_apply(m, hdrlen, len, tcp_signature_apply, 912 (caddr_t)&ctx)) 913 return (EINVAL); 914 915 MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen); 916 MD5Final(mtod(m, caddr_t) + hdrlen - optlen + sigoff, &ctx); 917 } 918 #endif /* TCP_SIGNATURE */ 919 920 /* 921 * Put TCP length in extended header, and then 922 * checksum extended header and data. 923 */ 924 switch (tp->pf) { 925 case 0: /*default to PF_INET*/ 926 #ifdef INET 927 case AF_INET: 928 th->th_sum = in_cksum(m, (int)(hdrlen + len)); 929 break; 930 #endif /* INET */ 931 #ifdef INET6 932 case AF_INET6: 933 m->m_pkthdr.len = hdrlen + len; 934 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 935 hdrlen - sizeof(struct ip6_hdr) + len); 936 break; 937 #endif /* INET6 */ 938 } 939 940 /* 941 * In transmit state, time the transmission and arrange for 942 * the retransmit. In persist state, just set snd_max. 
943 */ 944 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 945 tcp_seq startseq = tp->snd_nxt; 946 947 /* 948 * Advance snd_nxt over sequence space of this segment. 949 */ 950 if (flags & (TH_SYN|TH_FIN)) { 951 if (flags & TH_SYN) 952 tp->snd_nxt++; 953 if (flags & TH_FIN) { 954 tp->snd_nxt++; 955 tp->t_flags |= TF_SENTFIN; 956 } 957 } 958 #ifdef TCP_SACK 959 if (!tp->sack_disable) { 960 if (sack_rxmit && (p->rxmit != tp->snd_nxt)) { 961 goto timer; 962 } 963 } 964 #endif 965 tp->snd_nxt += len; 966 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 967 tp->snd_max = tp->snd_nxt; 968 /* 969 * Time this transmission if not a retransmission and 970 * not currently timing anything. 971 */ 972 if (tp->t_rtt == 0) { 973 tp->t_rtt = 1; 974 tp->t_rtseq = startseq; 975 tcpstat.tcps_segstimed++; 976 } 977 } 978 979 /* 980 * Set retransmit timer if not currently set, 981 * and not doing an ack or a keep-alive probe. 982 * Initial value for retransmit timer is smoothed 983 * round-trip time + 2 * round-trip time variance. 984 * Initialize shift counter which is used for backoff 985 * of retransmit time. 986 */ 987 #ifdef TCP_SACK 988 timer: 989 if (!tp->sack_disable && sack_rxmit && 990 tp->t_timer[TCPT_REXMT] == 0 && 991 tp->snd_nxt != tp->snd_max) { 992 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 993 if (tp->t_timer[TCPT_PERSIST]) { 994 tp->t_timer[TCPT_PERSIST] = 0; 995 tp->t_rxtshift = 0; 996 } 997 } 998 #endif 999 1000 if (tp->t_timer[TCPT_REXMT] == 0 && 1001 tp->snd_nxt != tp->snd_una) { 1002 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 1003 if (tp->t_timer[TCPT_PERSIST]) { 1004 tp->t_timer[TCPT_PERSIST] = 0; 1005 tp->t_rxtshift = 0; 1006 } 1007 } 1008 } else 1009 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 1010 tp->snd_max = tp->snd_nxt + len; 1011 1012 /* 1013 * Trace. 1014 */ 1015 if (so->so_options & SO_DEBUG) 1016 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0, 1017 len); 1018 1019 /* 1020 * Fill in IP length and desired time to live and 1021 * send to IP level. 
There should be a better way 1022 * to handle ttl and tos; we could keep them in 1023 * the template, but need a way to checksum without them. 1024 */ 1025 m->m_pkthdr.len = hdrlen + len; 1026 1027 switch (tp->pf) { 1028 case 0: /*default to PF_INET*/ 1029 #ifdef INET 1030 case AF_INET: 1031 { 1032 struct ip *ip; 1033 1034 ip = mtod(m, struct ip *); 1035 ip->ip_len = m->m_pkthdr.len; 1036 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 1037 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; 1038 } 1039 error = ip_output(m, tp->t_inpcb->inp_options, 1040 &tp->t_inpcb->inp_route, 1041 (ip_mtudisc ? IP_MTUDISC : 0) | 1042 (so->so_options & SO_DONTROUTE), 1043 0, tp->t_inpcb); 1044 break; 1045 #endif /* INET */ 1046 #ifdef INET6 1047 case AF_INET6: 1048 { 1049 struct ip6_hdr *ipv6; 1050 1051 ipv6 = mtod(m, struct ip6_hdr *); 1052 ipv6->ip6_plen = m->m_pkthdr.len - 1053 sizeof(struct ip6_hdr); 1054 ipv6->ip6_nxt = IPPROTO_TCP; 1055 ipv6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); 1056 } 1057 error = ip6_output(m, tp->t_inpcb->inp_outputopts6, 1058 &tp->t_inpcb->inp_route6, 1059 (so->so_options & SO_DONTROUTE), NULL, NULL); 1060 break; 1061 #endif /* INET6 */ 1062 #ifdef TUBA 1063 case AF_ISO: 1064 if (tp->t_tuba_pcb) 1065 error = tuba_output(m, tp); 1066 break; 1067 #endif /* TUBA */ 1068 } 1069 1070 #if defined(TCP_SACK) && defined(TCP_FACK) 1071 /* Update snd_awnd to reflect the new data that was sent. */ 1072 tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) + 1073 tp->retran_data; 1074 #endif /* defined(TCP_SACK) && defined(TCP_FACK) */ 1075 1076 if (error) { 1077 out: 1078 if (error == ENOBUFS) { 1079 tcp_quench(tp->t_inpcb, 0); 1080 return (0); 1081 } 1082 if ((error == EHOSTUNREACH || error == ENETDOWN) 1083 && TCPS_HAVERCVDSYN(tp->t_state)) { 1084 tp->t_softerror = error; 1085 return (0); 1086 } 1087 return (error); 1088 } 1089 tcpstat.tcps_sndtotal++; 1090 1091 /* 1092 * Data sent (as far as we can tell). 
1093 * If this advertises a larger window than any other segment, 1094 * then remember the size of the advertised window. 1095 * Any pending ACK has now been sent. 1096 */ 1097 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1098 tp->rcv_adv = tp->rcv_nxt + win; 1099 tp->last_ack_sent = tp->rcv_nxt; 1100 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 1101 #if defined(TCP_SACK) 1102 if (sendalot && --maxburst) 1103 #else 1104 if (sendalot) 1105 #endif 1106 goto again; 1107 return (0); 1108 } 1109 1110 void 1111 tcp_setpersist(tp) 1112 register struct tcpcb *tp; 1113 { 1114 register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 1115 1116 if (tp->t_timer[TCPT_REXMT]) 1117 panic("tcp_output REXMT"); 1118 /* 1119 * Start/restart persistance timer. 1120 */ 1121 if (t < tp->t_rttmin) 1122 t = tp->t_rttmin; 1123 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 1124 t * tcp_backoff[tp->t_rxtshift], 1125 TCPTV_PERSMIN, TCPTV_PERSMAX); 1126 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1127 tp->t_rxtshift++; 1128 } 1129