1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms are permitted 6 * provided that the above copyright notice and this paragraph are 7 * duplicated in all such forms and that any documentation, 8 * advertising materials, and other materials related to such 9 * distribution and use acknowledge that the software was developed 10 * by the University of California, Berkeley. The name of the 11 * University may not be used to endorse or promote products derived 12 * from this software without specific prior written permission. 13 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 14 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 15 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 16 * 17 * @(#)tcp_output.c 7.20 (Berkeley) 06/28/90 18 */ 19 20 #include "param.h" 21 #include "systm.h" 22 #include "malloc.h" 23 #include "mbuf.h" 24 #include "protosw.h" 25 #include "socket.h" 26 #include "socketvar.h" 27 #include "errno.h" 28 29 #include "../net/route.h" 30 31 #include "in.h" 32 #include "in_systm.h" 33 #include "ip.h" 34 #include "in_pcb.h" 35 #include "ip_var.h" 36 #include "tcp.h" 37 #define TCPOUTFLAGS 38 #include "tcp_fsm.h" 39 #include "tcp_seq.h" 40 #include "tcp_timer.h" 41 #include "tcp_var.h" 42 #include "tcpip.h" 43 #include "tcp_debug.h" 44 45 #ifdef notyet 46 extern struct mbuf *m_copypack(); 47 #endif 48 49 /* 50 * Initial options. 51 */ 52 u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; 53 54 /* 55 * Tcp output routine: figure out what should be sent and send it. 56 */ 57 tcp_output(tp) 58 register struct tcpcb *tp; 59 { 60 register struct socket *so = tp->t_inpcb->inp_socket; 61 register long len, win; 62 int off, flags, error; 63 register struct mbuf *m; 64 register struct tcpiphdr *ti; 65 u_char *opt; 66 unsigned optlen, hdrlen; 67 int idle, sendalot; 68 69 /* 70 * Determine length of data that should be transmitted, 71 * and flags that will be used. 72 * If there is some data or critical controls (SYN, RST) 73 * to send, then transmit; otherwise, investigate further. 74 */ 75 idle = (tp->snd_max == tp->snd_una); 76 if (idle && tp->t_idle >= tp->t_rxtcur) 77 /* 78 * We have been idle for "a while" and no acks are 79 * expected to clock out any data we send -- 80 * slow start to get ack "clock" running again. 81 */ 82 tp->snd_cwnd = tp->t_maxseg; 83 again: 84 sendalot = 0; 85 off = tp->snd_nxt - tp->snd_una; 86 win = min(tp->snd_wnd, tp->snd_cwnd); 87 88 /* 89 * If in persist timeout with window of 0, send 1 byte. 90 * Otherwise, if window is small but nonzero 91 * and timer expired, we will send what we can 92 * and go to transmit state. 93 */ 94 if (tp->t_force) { 95 if (win == 0) 96 win = 1; 97 else { 98 tp->t_timer[TCPT_PERSIST] = 0; 99 tp->t_rxtshift = 0; 100 } 101 } 102 103 flags = tcp_outflags[tp->t_state]; 104 len = min(so->so_snd.sb_cc, win) - off; 105 106 if (len < 0) { 107 /* 108 * If FIN has been sent but not acked, 109 * but we haven't been called to retransmit, 110 * len will be -1. Otherwise, window shrank 111 * after we sent into it. If window shrank to 0, 112 * cancel pending retransmit and pull snd_nxt 113 * back to (closed) window. We will enter persist 114 * state below. If the window didn't close completely, 115 * just wait for an ACK. 116 */ 117 len = 0; 118 if (win == 0) { 119 tp->t_timer[TCPT_REXMT] = 0; 120 tp->snd_nxt = tp->snd_una; 121 } 122 } 123 if (len > tp->t_maxseg) { 124 len = tp->t_maxseg; 125 sendalot = 1; 126 } 127 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 128 flags &= ~TH_FIN; 129 130 win = sbspace(&so->so_rcv); 131 132 /* 133 * Sender silly window avoidance. If connection is idle 134 * and can send all data, a maximum segment, 135 * at least a maximum default-size segment do it, 136 * or are forced, do it; otherwise don't bother. 137 * If peer's buffer is tiny, then send 138 * when window is at least half open. 139 * If retransmitting (possibly after persist timer forced us 140 * to send into a small window), then must resend. 141 */ 142 if (len) { 143 if (len == tp->t_maxseg) 144 goto send; 145 if ((idle || tp->t_flags & TF_NODELAY) && 146 len + off >= so->so_snd.sb_cc) 147 goto send; 148 if (tp->t_force) 149 goto send; 150 if (len >= tp->max_sndwnd / 2) 151 goto send; 152 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 153 goto send; 154 } 155 156 /* 157 * Compare available window to amount of window 158 * known to peer (as advertised window less 159 * next expected input). If the difference is at least two 160 * max size segments, or at least 50% of the maximum possible 161 * window, then want to send a window update to peer. 162 */ 163 if (win > 0) { 164 int adv = win - (tp->rcv_adv - tp->rcv_nxt); 165 166 /* this was: XXX 167 * if (so->so_rcv.sb_cc == 0 && adv >= 2 * tp->t_maxseg) 168 */ 169 if (adv >= 2 * tp->t_maxseg) 170 goto send; 171 if (2 * adv >= so->so_rcv.sb_hiwat) 172 goto send; 173 } 174 175 /* 176 * Send if we owe peer an ACK. 177 */ 178 if (tp->t_flags & TF_ACKNOW) 179 goto send; 180 if (flags & (TH_SYN|TH_RST)) 181 goto send; 182 if (SEQ_GT(tp->snd_up, tp->snd_una)) 183 goto send; 184 /* 185 * If our state indicates that FIN should be sent 186 * and we have not yet done so, or we're retransmitting the FIN, 187 * then we need to send. 188 */ 189 if (flags & TH_FIN && 190 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 191 goto send; 192 193 /* 194 * TCP window updates are not reliable, rather a polling protocol 195 * using ``persist'' packets is used to insure receipt of window 196 * updates. The three ``states'' for the output side are: 197 * idle not doing retransmits or persists 198 * persisting to move a small or zero window 199 * (re)transmitting and thereby not persisting 200 * 201 * tp->t_timer[TCPT_PERSIST] 202 * is set when we are in persist state. 203 * tp->t_force 204 * is set when we are called to send a persist packet. 205 * tp->t_timer[TCPT_REXMT] 206 * is set when we are retransmitting 207 * The output side is idle when both timers are zero. 208 * 209 * If send window is too small, there is data to transmit, and no 210 * retransmit or persist is pending, then go to persist state. 211 * If nothing happens soon, send when timer expires: 212 * if window is nonzero, transmit what we can, 213 * otherwise force out a byte. 214 */ 215 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 216 tp->t_timer[TCPT_PERSIST] == 0) { 217 tp->t_rxtshift = 0; 218 tcp_setpersist(tp); 219 } 220 221 /* 222 * No reason to send a segment, just return. 223 */ 224 return (0); 225 226 send: 227 /* 228 * Before ESTABLISHED, force sending of initial options 229 * unless TCP set not to do any options. 230 * NOTE: we assume that the IP/TCP header plus TCP options 231 * always fit in a single mbuf, leaving room for a maximum 232 * link header, i.e. 233 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 234 */ 235 optlen = 0; 236 hdrlen = sizeof (struct tcpiphdr); 237 if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { 238 opt = tcp_initopt; 239 optlen = sizeof (tcp_initopt); 240 hdrlen += sizeof (tcp_initopt); 241 *(u_short *)(opt + 2) = htons((u_short) tcp_mss(tp, 0)); 242 #ifdef DIAGNOSTIC 243 if (max_linkhdr + hdrlen > MHLEN) 244 panic("tcphdr too big"); 245 #endif 246 } 247 248 /* 249 * Grab a header mbuf, attaching a copy of data to 250 * be transmitted, and initialize the header from 251 * the template for sends on this connection. 252 */ 253 if (len) { 254 if (tp->t_force && len == 1) 255 tcpstat.tcps_sndprobe++; 256 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 257 tcpstat.tcps_sndrexmitpack++; 258 tcpstat.tcps_sndrexmitbyte += len; 259 } else { 260 tcpstat.tcps_sndpack++; 261 tcpstat.tcps_sndbyte += len; 262 } 263 #ifdef notyet 264 if ((m = m_copypack(so->so_snd.sb_mb, off, 265 (int)len, max_linkhdr + hdrlen)) == 0) { 266 error = ENOBUFS; 267 goto out; 268 } 269 /* 270 * m_copypack left space for our hdr; use it. 271 */ 272 m->m_len += hdrlen; 273 m->m_data -= hdrlen; 274 #else 275 MGETHDR(m, M_DONTWAIT, MT_HEADER); 276 if (m == NULL) { 277 error = ENOBUFS; 278 goto out; 279 } 280 m->m_data += max_linkhdr; 281 m->m_len = hdrlen; 282 if (len <= MHLEN - hdrlen - max_linkhdr) { 283 m_copydata(so->so_snd.sb_mb, off, (int) len, 284 mtod(m, caddr_t) + hdrlen); 285 m->m_len += len; 286 } else { 287 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 288 if (m->m_next == 0) 289 len = 0; 290 } 291 #endif 292 /* 293 * If we're sending everything we've got, set PUSH. 294 * (This will keep happy those implementations which only 295 * give data to the user when a buffer fills or 296 * a PUSH comes in.) 297 */ 298 if (off + len == so->so_snd.sb_cc) 299 flags |= TH_PUSH; 300 } else { 301 if (tp->t_flags & TF_ACKNOW) 302 tcpstat.tcps_sndacks++; 303 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 304 tcpstat.tcps_sndctrl++; 305 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 306 tcpstat.tcps_sndurg++; 307 else 308 tcpstat.tcps_sndwinup++; 309 310 MGETHDR(m, M_DONTWAIT, MT_HEADER); 311 if (m == NULL) { 312 error = ENOBUFS; 313 goto out; 314 } 315 m->m_data += max_linkhdr; 316 m->m_len = hdrlen; 317 } 318 m->m_pkthdr.rcvif = (struct ifnet *)0; 319 ti = mtod(m, struct tcpiphdr *); 320 if (tp->t_template == 0) 321 panic("tcp_output"); 322 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); 323 324 /* 325 * Fill in fields, remembering maximum advertised 326 * window for use in delaying messages about window sizes. 327 * If resending a FIN, be sure not to use a new sequence number. 328 */ 329 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 330 tp->snd_nxt == tp->snd_max) 331 tp->snd_nxt--; 332 ti->ti_seq = htonl(tp->snd_nxt); 333 ti->ti_ack = htonl(tp->rcv_nxt); 334 if (optlen) { 335 bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); 336 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; 337 } 338 ti->ti_flags = flags; 339 /* 340 * Calculate receive window. Don't shrink window, 341 * but avoid silly window syndrome. 342 */ 343 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 344 win = 0; 345 if (win > TCP_MAXWIN) 346 win = TCP_MAXWIN; 347 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 348 win = (long)(tp->rcv_adv - tp->rcv_nxt); 349 ti->ti_win = htons((u_short)win); 350 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 351 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 352 ti->ti_flags |= TH_URG; 353 } else 354 /* 355 * If no urgent pointer to send, then we pull 356 * the urgent pointer to the left edge of the send window 357 * so that it doesn't drift into the send window on sequence 358 * number wraparound. 359 */ 360 tp->snd_up = tp->snd_una; /* drag it along */ 361 362 /* 363 * Put TCP length in extended header, and then 364 * checksum extended header and data. 365 */ 366 if (len + optlen) 367 ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + 368 optlen + len)); 369 ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); 370 371 /* 372 * In transmit state, time the transmission and arrange for 373 * the retransmit. In persist state, just set snd_max. 374 */ 375 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 376 tcp_seq startseq = tp->snd_nxt; 377 378 /* 379 * Advance snd_nxt over sequence space of this segment. 380 */ 381 if (flags & (TH_SYN|TH_FIN)) { 382 if (flags & TH_SYN) 383 tp->snd_nxt++; 384 if (flags & TH_FIN) { 385 tp->snd_nxt++; 386 tp->t_flags |= TF_SENTFIN; 387 } 388 } 389 tp->snd_nxt += len; 390 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 391 tp->snd_max = tp->snd_nxt; 392 /* 393 * Time this transmission if not a retransmission and 394 * not currently timing anything. 395 */ 396 if (tp->t_rtt == 0) { 397 tp->t_rtt = 1; 398 tp->t_rtseq = startseq; 399 tcpstat.tcps_segstimed++; 400 } 401 } 402 403 /* 404 * Set retransmit timer if not currently set, 405 * and not doing an ack or a keep-alive probe. 406 * Initial value for retransmit timer is smoothed 407 * round-trip time + 2 * round-trip time variance. 408 * Initialize shift counter which is used for backoff 409 * of retransmit time. 410 */ 411 if (tp->t_timer[TCPT_REXMT] == 0 && 412 tp->snd_nxt != tp->snd_una) { 413 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 414 if (tp->t_timer[TCPT_PERSIST]) { 415 tp->t_timer[TCPT_PERSIST] = 0; 416 tp->t_rxtshift = 0; 417 } 418 } 419 } else 420 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 421 tp->snd_max = tp->snd_nxt + len; 422 423 /* 424 * Trace. 425 */ 426 if (so->so_options & SO_DEBUG) 427 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); 428 429 /* 430 * Fill in IP length and desired time to live and 431 * send to IP level. There should be a better way 432 * to handle ttl and tos; we could keep them in 433 * the template, but need a way to checksum without them. 434 */ 435 m->m_pkthdr.len = hdrlen + len; 436 ((struct ip *)ti)->ip_len = m->m_pkthdr.len; 437 ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ 438 ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ 439 #if BSD >= 43 440 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, 441 so->so_options & SO_DONTROUTE); 442 #else 443 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, 444 so->so_options & SO_DONTROUTE); 445 #endif 446 if (error) { 447 out: 448 if (error == ENOBUFS) { 449 tcp_quench(tp->t_inpcb); 450 return (0); 451 } 452 if ((error == EHOSTUNREACH || error == ENETDOWN) 453 && TCPS_HAVERCVDSYN(tp->t_state)) { 454 tp->t_softerror = error; 455 return (0); 456 } 457 return (error); 458 } 459 tcpstat.tcps_sndtotal++; 460 461 /* 462 * Data sent (as far as we can tell). 463 * If this advertises a larger window than any other segment, 464 * then remember the size of the advertised window. 465 * Any pending ACK has now been sent. 466 */ 467 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 468 tp->rcv_adv = tp->rcv_nxt + win; 469 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 470 if (sendalot) 471 goto again; 472 return (0); 473 } 474 475 tcp_setpersist(tp) 476 register struct tcpcb *tp; 477 { 478 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 479 480 if (tp->t_timer[TCPT_REXMT]) 481 panic("tcp_output REXMT"); 482 /* 483 * Start/restart persistance timer. 484 */ 485 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 486 t * tcp_backoff[tp->t_rxtshift], 487 TCPTV_PERSMIN, TCPTV_PERSMAX); 488 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 489 tp->t_rxtshift++; 490 } 491