1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * from: @(#)tcp_output.c 7.22 (Berkeley) 8/31/90 34 * $Id: tcp_output.c,v 1.3 1993/05/22 11:42:42 cgd Exp $ 35 */ 36 37 #include "param.h" 38 #include "systm.h" 39 #include "malloc.h" 40 #include "mbuf.h" 41 #include "protosw.h" 42 #include "socket.h" 43 #include "socketvar.h" 44 #include "errno.h" 45 46 #include "../net/route.h" 47 48 #include "in.h" 49 #include "in_systm.h" 50 #include "ip.h" 51 #include "in_pcb.h" 52 #include "ip_var.h" 53 #include "tcp.h" 54 #define TCPOUTFLAGS 55 #include "tcp_fsm.h" 56 #include "tcp_seq.h" 57 #include "tcp_timer.h" 58 #include "tcp_var.h" 59 #include "tcpip.h" 60 #include "tcp_debug.h" 61 62 #ifdef notyet 63 extern struct mbuf *m_copypack(); 64 #endif 65 66 /* 67 * Initial options. 68 */ 69 u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; 70 71 /* 72 * Tcp output routine: figure out what should be sent and send it. 73 */ 74 tcp_output(tp) 75 register struct tcpcb *tp; 76 { 77 register struct socket *so = tp->t_inpcb->inp_socket; 78 register long len, win; 79 int off, flags, error; 80 register struct mbuf *m; 81 register struct tcpiphdr *ti; 82 u_char *opt; 83 unsigned optlen, hdrlen; 84 int idle, sendalot; 85 86 /* 87 * Determine length of data that should be transmitted, 88 * and flags that will be used. 89 * If there is some data or critical controls (SYN, RST) 90 * to send, then transmit; otherwise, investigate further. 91 */ 92 idle = (tp->snd_max == tp->snd_una); 93 if (idle && tp->t_idle >= tp->t_rxtcur) 94 /* 95 * We have been idle for "a while" and no acks are 96 * expected to clock out any data we send -- 97 * slow start to get ack "clock" running again. 98 */ 99 tp->snd_cwnd = tp->t_maxseg; 100 again: 101 sendalot = 0; 102 off = tp->snd_nxt - tp->snd_una; 103 win = min(tp->snd_wnd, tp->snd_cwnd); 104 105 /* 106 * If in persist timeout with window of 0, send 1 byte. 107 * Otherwise, if window is small but nonzero 108 * and timer expired, we will send what we can 109 * and go to transmit state. 110 */ 111 if (tp->t_force) { 112 if (win == 0) 113 win = 1; 114 else { 115 tp->t_timer[TCPT_PERSIST] = 0; 116 tp->t_rxtshift = 0; 117 } 118 } 119 120 flags = tcp_outflags[tp->t_state]; 121 len = min(so->so_snd.sb_cc, win) - off; 122 123 if (len < 0) { 124 /* 125 * If FIN has been sent but not acked, 126 * but we haven't been called to retransmit, 127 * len will be -1. Otherwise, window shrank 128 * after we sent into it. If window shrank to 0, 129 * cancel pending retransmit and pull snd_nxt 130 * back to (closed) window. We will enter persist 131 * state below. If the window didn't close completely, 132 * just wait for an ACK. 133 */ 134 len = 0; 135 if (win == 0) { 136 tp->t_timer[TCPT_REXMT] = 0; 137 tp->snd_nxt = tp->snd_una; 138 } 139 } 140 if (len > tp->t_maxseg) { 141 len = tp->t_maxseg; 142 sendalot = 1; 143 } 144 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 145 flags &= ~TH_FIN; 146 147 win = sbspace(&so->so_rcv); 148 149 /* 150 * Sender silly window avoidance. If connection is idle 151 * and can send all data, a maximum segment, 152 * at least a maximum default-size segment do it, 153 * or are forced, do it; otherwise don't bother. 154 * If peer's buffer is tiny, then send 155 * when window is at least half open. 156 * If retransmitting (possibly after persist timer forced us 157 * to send into a small window), then must resend. 158 */ 159 if (len) { 160 if (len == tp->t_maxseg) 161 goto send; 162 if ((idle || tp->t_flags & TF_NODELAY) && 163 len + off >= so->so_snd.sb_cc) 164 goto send; 165 if (tp->t_force) 166 goto send; 167 if (len >= tp->max_sndwnd / 2) 168 goto send; 169 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 170 goto send; 171 } 172 173 /* 174 * Compare available window to amount of window 175 * known to peer (as advertised window less 176 * next expected input). If the difference is at least two 177 * max size segments, or at least 50% of the maximum possible 178 * window, then want to send a window update to peer. 179 */ 180 if (win > 0) { 181 long adv = win - (tp->rcv_adv - tp->rcv_nxt); 182 183 if (adv >= (long) (2 * tp->t_maxseg)) 184 goto send; 185 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 186 goto send; 187 } 188 189 /* 190 * Send if we owe peer an ACK. 191 */ 192 if (tp->t_flags & TF_ACKNOW) 193 goto send; 194 if (flags & (TH_SYN|TH_RST)) 195 goto send; 196 if (SEQ_GT(tp->snd_up, tp->snd_una)) 197 goto send; 198 /* 199 * If our state indicates that FIN should be sent 200 * and we have not yet done so, or we're retransmitting the FIN, 201 * then we need to send. 202 */ 203 if (flags & TH_FIN && 204 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 205 goto send; 206 207 /* 208 * TCP window updates are not reliable, rather a polling protocol 209 * using ``persist'' packets is used to insure receipt of window 210 * updates. The three ``states'' for the output side are: 211 * idle not doing retransmits or persists 212 * persisting to move a small or zero window 213 * (re)transmitting and thereby not persisting 214 * 215 * tp->t_timer[TCPT_PERSIST] 216 * is set when we are in persist state. 217 * tp->t_force 218 * is set when we are called to send a persist packet. 219 * tp->t_timer[TCPT_REXMT] 220 * is set when we are retransmitting 221 * The output side is idle when both timers are zero. 222 * 223 * If send window is too small, there is data to transmit, and no 224 * retransmit or persist is pending, then go to persist state. 225 * If nothing happens soon, send when timer expires: 226 * if window is nonzero, transmit what we can, 227 * otherwise force out a byte. 228 */ 229 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 230 tp->t_timer[TCPT_PERSIST] == 0) { 231 tp->t_rxtshift = 0; 232 tcp_setpersist(tp); 233 } 234 235 /* 236 * No reason to send a segment, just return. 237 */ 238 return (0); 239 240 send: 241 /* 242 * Before ESTABLISHED, force sending of initial options 243 * unless TCP set not to do any options. 244 * NOTE: we assume that the IP/TCP header plus TCP options 245 * always fit in a single mbuf, leaving room for a maximum 246 * link header, i.e. 247 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 248 */ 249 optlen = 0; 250 hdrlen = sizeof (struct tcpiphdr); 251 if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { 252 opt = tcp_initopt; 253 optlen = sizeof (tcp_initopt); 254 hdrlen += sizeof (tcp_initopt); 255 *(u_short *)(opt + 2) = htons((u_short) tcp_mss(tp, 0)); 256 #ifdef DIAGNOSTIC 257 if (max_linkhdr + hdrlen > MHLEN) 258 panic("tcphdr too big"); 259 #endif 260 } 261 262 /* 263 * Grab a header mbuf, attaching a copy of data to 264 * be transmitted, and initialize the header from 265 * the template for sends on this connection. 266 */ 267 if (len) { 268 if (tp->t_force && len == 1) 269 tcpstat.tcps_sndprobe++; 270 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 271 tcpstat.tcps_sndrexmitpack++; 272 tcpstat.tcps_sndrexmitbyte += len; 273 } else { 274 tcpstat.tcps_sndpack++; 275 tcpstat.tcps_sndbyte += len; 276 } 277 #ifdef notyet 278 if ((m = m_copypack(so->so_snd.sb_mb, off, 279 (int)len, max_linkhdr + hdrlen)) == 0) { 280 error = ENOBUFS; 281 goto out; 282 } 283 /* 284 * m_copypack left space for our hdr; use it. 285 */ 286 m->m_len += hdrlen; 287 m->m_data -= hdrlen; 288 #else 289 MGETHDR(m, M_DONTWAIT, MT_HEADER); 290 if (m == NULL) { 291 error = ENOBUFS; 292 goto out; 293 } 294 m->m_data += max_linkhdr; 295 m->m_len = hdrlen; 296 if (len <= MHLEN - hdrlen - max_linkhdr) { 297 m_copydata(so->so_snd.sb_mb, off, (int) len, 298 mtod(m, caddr_t) + hdrlen); 299 m->m_len += len; 300 } else { 301 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 302 if (m->m_next == 0) 303 len = 0; 304 } 305 #endif 306 /* 307 * If we're sending everything we've got, set PUSH. 308 * (This will keep happy those implementations which only 309 * give data to the user when a buffer fills or 310 * a PUSH comes in.) 311 */ 312 if (off + len == so->so_snd.sb_cc) 313 flags |= TH_PUSH; 314 } else { 315 if (tp->t_flags & TF_ACKNOW) 316 tcpstat.tcps_sndacks++; 317 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 318 tcpstat.tcps_sndctrl++; 319 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 320 tcpstat.tcps_sndurg++; 321 else 322 tcpstat.tcps_sndwinup++; 323 324 MGETHDR(m, M_DONTWAIT, MT_HEADER); 325 if (m == NULL) { 326 error = ENOBUFS; 327 goto out; 328 } 329 m->m_data += max_linkhdr; 330 m->m_len = hdrlen; 331 } 332 m->m_pkthdr.rcvif = (struct ifnet *)0; 333 ti = mtod(m, struct tcpiphdr *); 334 if (tp->t_template == 0) 335 panic("tcp_output"); 336 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); 337 338 /* 339 * Fill in fields, remembering maximum advertised 340 * window for use in delaying messages about window sizes. 341 * If resending a FIN, be sure not to use a new sequence number. 342 */ 343 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 344 tp->snd_nxt == tp->snd_max) 345 tp->snd_nxt--; 346 ti->ti_seq = htonl(tp->snd_nxt); 347 ti->ti_ack = htonl(tp->rcv_nxt); 348 if (optlen) { 349 bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); 350 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; 351 } 352 ti->ti_flags = flags; 353 /* 354 * Calculate receive window. Don't shrink window, 355 * but avoid silly window syndrome. 356 */ 357 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 358 win = 0; 359 if (win > TCP_MAXWIN) 360 win = TCP_MAXWIN; 361 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 362 win = (long)(tp->rcv_adv - tp->rcv_nxt); 363 ti->ti_win = htons((u_short)win); 364 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 365 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 366 ti->ti_flags |= TH_URG; 367 } else 368 /* 369 * If no urgent pointer to send, then we pull 370 * the urgent pointer to the left edge of the send window 371 * so that it doesn't drift into the send window on sequence 372 * number wraparound. 373 */ 374 tp->snd_up = tp->snd_una; /* drag it along */ 375 376 /* 377 * Put TCP length in extended header, and then 378 * checksum extended header and data. 379 */ 380 if (len + optlen) 381 ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + 382 optlen + len)); 383 ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); 384 385 /* 386 * In transmit state, time the transmission and arrange for 387 * the retransmit. In persist state, just set snd_max. 388 */ 389 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 390 tcp_seq startseq = tp->snd_nxt; 391 392 /* 393 * Advance snd_nxt over sequence space of this segment. 394 */ 395 if (flags & (TH_SYN|TH_FIN)) { 396 if (flags & TH_SYN) 397 tp->snd_nxt++; 398 if (flags & TH_FIN) { 399 tp->snd_nxt++; 400 tp->t_flags |= TF_SENTFIN; 401 } 402 } 403 tp->snd_nxt += len; 404 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 405 tp->snd_max = tp->snd_nxt; 406 /* 407 * Time this transmission if not a retransmission and 408 * not currently timing anything. 409 */ 410 if (tp->t_rtt == 0) { 411 tp->t_rtt = 1; 412 tp->t_rtseq = startseq; 413 tcpstat.tcps_segstimed++; 414 } 415 } 416 417 /* 418 * Set retransmit timer if not currently set, 419 * and not doing an ack or a keep-alive probe. 420 * Initial value for retransmit timer is smoothed 421 * round-trip time + 2 * round-trip time variance. 422 * Initialize shift counter which is used for backoff 423 * of retransmit time. 424 */ 425 if (tp->t_timer[TCPT_REXMT] == 0 && 426 tp->snd_nxt != tp->snd_una) { 427 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 428 if (tp->t_timer[TCPT_PERSIST]) { 429 tp->t_timer[TCPT_PERSIST] = 0; 430 tp->t_rxtshift = 0; 431 } 432 } 433 } else 434 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 435 tp->snd_max = tp->snd_nxt + len; 436 437 /* 438 * Trace. 439 */ 440 if (so->so_options & SO_DEBUG) 441 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); 442 443 /* 444 * Fill in IP length and desired time to live and 445 * send to IP level. There should be a better way 446 * to handle ttl and tos; we could keep them in 447 * the template, but need a way to checksum without them. 448 */ 449 m->m_pkthdr.len = hdrlen + len; 450 ((struct ip *)ti)->ip_len = m->m_pkthdr.len; 451 ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ 452 ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ 453 #if BSD >= 43 454 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, 455 so->so_options & SO_DONTROUTE); 456 #else 457 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, 458 so->so_options & SO_DONTROUTE); 459 #endif 460 if (error) { 461 out: 462 if (error == ENOBUFS) { 463 tcp_quench(tp->t_inpcb); 464 return (0); 465 } 466 if ((error == EHOSTUNREACH || error == ENETDOWN) 467 && TCPS_HAVERCVDSYN(tp->t_state)) { 468 tp->t_softerror = error; 469 return (0); 470 } 471 return (error); 472 } 473 tcpstat.tcps_sndtotal++; 474 475 /* 476 * Data sent (as far as we can tell). 477 * If this advertises a larger window than any other segment, 478 * then remember the size of the advertised window. 479 * Any pending ACK has now been sent. 480 */ 481 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 482 tp->rcv_adv = tp->rcv_nxt + win; 483 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 484 if (sendalot) 485 goto again; 486 return (0); 487 } 488 489 tcp_setpersist(tp) 490 register struct tcpcb *tp; 491 { 492 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 493 494 if (tp->t_timer[TCPT_REXMT]) 495 panic("tcp_output REXMT"); 496 /* 497 * Start/restart persistance timer. 498 */ 499 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 500 t * tcp_backoff[tp->t_rxtshift], 501 TCPTV_PERSMIN, TCPTV_PERSMAX); 502 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 503 tp->t_rxtshift++; 504 } 505