123191Smckusick /* 2*44380Skarels * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. 332788Sbostic * All rights reserved. 423191Smckusick * 532788Sbostic * Redistribution and use in source and binary forms are permitted 634855Sbostic * provided that the above copyright notice and this paragraph are 734855Sbostic * duplicated in all such forms and that any documentation, 834855Sbostic * advertising materials, and other materials related to such 934855Sbostic * distribution and use acknowledge that the software was developed 1034855Sbostic * by the University of California, Berkeley. The name of the 1134855Sbostic * University may not be used to endorse or promote products derived 1234855Sbostic * from this software without specific prior written permission. 1334855Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1434855Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1534855Sbostic * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1632788Sbostic * 17*44380Skarels * @(#)tcp_output.c 7.20 (Berkeley) 06/28/90 1823191Smckusick */ 194677Swnj 2017063Sbloom #include "param.h" 2117063Sbloom #include "systm.h" 2237317Skarels #include "malloc.h" 2317063Sbloom #include "mbuf.h" 2417063Sbloom #include "protosw.h" 2517063Sbloom #include "socket.h" 2617063Sbloom #include "socketvar.h" 2717063Sbloom #include "errno.h" 2810895Ssam 2910895Ssam #include "../net/route.h" 3010895Ssam 3117063Sbloom #include "in.h" 3217063Sbloom #include "in_systm.h" 3317063Sbloom #include "ip.h" 3440685Skarels #include "in_pcb.h" 3517063Sbloom #include "ip_var.h" 3617063Sbloom #include "tcp.h" 375088Swnj #define TCPOUTFLAGS 3817063Sbloom #include "tcp_fsm.h" 3917063Sbloom #include "tcp_seq.h" 4017063Sbloom #include "tcp_timer.h" 4117063Sbloom #include "tcp_var.h" 4217063Sbloom #include "tcpip.h" 4317063Sbloom #include "tcp_debug.h" 444677Swnj 45*44380Skarels #ifdef notyet 46*44380Skarels extern struct mbuf *m_copypack(); 47*44380Skarels #endif 48*44380Skarels 494678Swnj /* 508314Sroot * Initial options. 515441Swnj */ 525441Swnj u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; 535441Swnj 545441Swnj /* 555245Sroot * Tcp output routine: figure out what should be sent and send it. 564678Swnj */ 575075Swnj tcp_output(tp) 585075Swnj register struct tcpcb *tp; 594678Swnj { 605075Swnj register struct socket *so = tp->t_inpcb->inp_socket; 6137317Skarels register long len, win; 6225940Skarels int off, flags, error; 635075Swnj register struct mbuf *m; 645075Swnj register struct tcpiphdr *ti; 655441Swnj u_char *opt; 66*44380Skarels unsigned optlen, hdrlen; 6725940Skarels int idle, sendalot; 684678Swnj 695075Swnj /* 706279Swnj * Determine length of data that should be transmitted, 715088Swnj * and flags that will be used. 725088Swnj * If there is some data or critical controls (SYN, RST) 735088Swnj * to send, then transmit; otherwise, investigate further. 745075Swnj */ 7525940Skarels idle = (tp->snd_max == tp->snd_una); 76*44380Skarels if (idle && tp->t_idle >= tp->t_rxtcur) 77*44380Skarels /* 78*44380Skarels * We have been idle for "a while" and no acks are 79*44380Skarels * expected to clock out any data we send -- 80*44380Skarels * slow start to get ack "clock" running again. 81*44380Skarels */ 82*44380Skarels tp->snd_cwnd = tp->t_maxseg; 837125Swnj again: 847125Swnj sendalot = 0; 855075Swnj off = tp->snd_nxt - tp->snd_una; 8637317Skarels win = min(tp->snd_wnd, tp->snd_cwnd); 8726834Skarels 8821116Skarels /* 8921116Skarels * If in persist timeout with window of 0, send 1 byte. 9025940Skarels * Otherwise, if window is small but nonzero 9125940Skarels * and timer expired, we will send what we can 9225940Skarels * and go to transmit state. 9321116Skarels */ 9421116Skarels if (tp->t_force) { 9526834Skarels if (win == 0) 9621116Skarels win = 1; 9721116Skarels else { 9821116Skarels tp->t_timer[TCPT_PERSIST] = 0; 9921116Skarels tp->t_rxtshift = 0; 10021116Skarels } 10121116Skarels } 10225940Skarels 103*44380Skarels flags = tcp_outflags[tp->t_state]; 10437317Skarels len = min(so->so_snd.sb_cc, win) - off; 10525940Skarels 10626834Skarels if (len < 0) { 10726834Skarels /* 10827048Skarels * If FIN has been sent but not acked, 10927048Skarels * but we haven't been called to retransmit, 11032785Skarels * len will be -1. Otherwise, window shrank 11132785Skarels * after we sent into it. If window shrank to 0, 11232785Skarels * cancel pending retransmit and pull snd_nxt 11332785Skarels * back to (closed) window. We will enter persist 11432785Skarels * state below. If the window didn't close completely, 11527048Skarels * just wait for an ACK. 11626834Skarels */ 11732785Skarels len = 0; 11832785Skarels if (win == 0) { 11927067Skarels tp->t_timer[TCPT_REXMT] = 0; 12027067Skarels tp->snd_nxt = tp->snd_una; 12132785Skarels } 12226834Skarels } 12327067Skarels if (len > tp->t_maxseg) { 12427067Skarels len = tp->t_maxseg; 12531442Skarels sendalot = 1; 12627067Skarels } 12729795Skarels if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 12829795Skarels flags &= ~TH_FIN; 129*44380Skarels 13026834Skarels win = sbspace(&so->so_rcv); 13126834Skarels 13225940Skarels /* 13317318Skarels * Sender silly window avoidance. If connection is idle 13417318Skarels * and can send all data, a maximum segment, 13517318Skarels * at least a maximum default-size segment do it, 1366279Swnj * or are forced, do it; otherwise don't bother. 13725261Skarels * If peer's buffer is tiny, then send 13825261Skarels * when window is at least half open. 13921116Skarels * If retransmitting (possibly after persist timer forced us 14021116Skarels * to send into a small window), then must resend. 1416279Swnj */ 1426279Swnj if (len) { 14331725Skarels if (len == tp->t_maxseg) 1446279Swnj goto send; 14525940Skarels if ((idle || tp->t_flags & TF_NODELAY) && 14625940Skarels len + off >= so->so_snd.sb_cc) 1476279Swnj goto send; 1486279Swnj if (tp->t_force) 1496279Swnj goto send; 15025261Skarels if (len >= tp->max_sndwnd / 2) 15125261Skarels goto send; 15221116Skarels if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 15321116Skarels goto send; 15426834Skarels } 1556279Swnj 1565441Swnj /* 15725940Skarels * Compare available window to amount of window 15825940Skarels * known to peer (as advertised window less 15932785Skarels * next expected input). If the difference is at least two 16040685Skarels * max size segments, or at least 50% of the maximum possible 16132785Skarels * window, then want to send a window update to peer. 1625075Swnj */ 16332034Skarels if (win > 0) { 16432034Skarels int adv = win - (tp->rcv_adv - tp->rcv_nxt); 1654678Swnj 166*44380Skarels /* this was: XXX 167*44380Skarels * if (so->so_rcv.sb_cc == 0 && adv >= 2 * tp->t_maxseg) 168*44380Skarels */ 16940685Skarels if (adv >= 2 * tp->t_maxseg) 17032785Skarels goto send; 17140685Skarels if (2 * adv >= so->so_rcv.sb_hiwat) 17232034Skarels goto send; 17332034Skarels } 17432034Skarels 1755075Swnj /* 176*44380Skarels * Send if we owe peer an ACK. 177*44380Skarels */ 178*44380Skarels if (tp->t_flags & TF_ACKNOW) 179*44380Skarels goto send; 180*44380Skarels if (flags & (TH_SYN|TH_RST)) 181*44380Skarels goto send; 182*44380Skarels if (SEQ_GT(tp->snd_up, tp->snd_una)) 183*44380Skarels goto send; 184*44380Skarels /* 185*44380Skarels * If our state indicates that FIN should be sent 186*44380Skarels * and we have not yet done so, or we're retransmitting the FIN, 187*44380Skarels * then we need to send. 188*44380Skarels */ 189*44380Skarels if (flags & TH_FIN && 190*44380Skarels ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 191*44380Skarels goto send; 192*44380Skarels 193*44380Skarels /* 1947125Swnj * TCP window updates are not reliable, rather a polling protocol 1957125Swnj * using ``persist'' packets is used to insure receipt of window 1967125Swnj * updates. The three ``states'' for the output side are: 1977125Swnj * idle not doing retransmits or persists 19825940Skarels * persisting to move a small or zero window 1997125Swnj * (re)transmitting and thereby not persisting 2007125Swnj * 2017125Swnj * tp->t_timer[TCPT_PERSIST] 2027125Swnj * is set when we are in persist state. 2037125Swnj * tp->t_force 2047125Swnj * is set when we are called to send a persist packet. 2057125Swnj * tp->t_timer[TCPT_REXMT] 2067125Swnj * is set when we are retransmitting 2077125Swnj * The output side is idle when both timers are zero. 2087125Swnj * 20921116Skarels * If send window is too small, there is data to transmit, and no 21021116Skarels * retransmit or persist is pending, then go to persist state. 21121116Skarels * If nothing happens soon, send when timer expires: 21221116Skarels * if window is nonzero, transmit what we can, 21321116Skarels * otherwise force out a byte. 2147125Swnj */ 21521116Skarels if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 21621116Skarels tp->t_timer[TCPT_PERSIST] == 0) { 2177125Swnj tp->t_rxtshift = 0; 2187125Swnj tcp_setpersist(tp); 2197125Swnj } 2207125Swnj 2217125Swnj /* 2225075Swnj * No reason to send a segment, just return. 2235075Swnj */ 2245110Swnj return (0); 2254678Swnj 2265075Swnj send: 2275075Swnj /* 228*44380Skarels * Before ESTABLISHED, force sending of initial options 229*44380Skarels * unless TCP set not to do any options. 230*44380Skarels * NOTE: we assume that the IP/TCP header plus TCP options 231*44380Skarels * always fit in a single mbuf, leaving room for a maximum 232*44380Skarels * link header, i.e. 233*44380Skarels * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 234*44380Skarels */ 235*44380Skarels optlen = 0; 236*44380Skarels hdrlen = sizeof (struct tcpiphdr); 237*44380Skarels if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { 238*44380Skarels opt = tcp_initopt; 239*44380Skarels optlen = sizeof (tcp_initopt); 240*44380Skarels hdrlen += sizeof (tcp_initopt); 241*44380Skarels *(u_short *)(opt + 2) = htons((u_short) tcp_mss(tp, 0)); 242*44380Skarels #ifdef DIAGNOSTIC 243*44380Skarels if (max_linkhdr + hdrlen > MHLEN) 244*44380Skarels panic("tcphdr too big"); 245*44380Skarels #endif 246*44380Skarels } 247*44380Skarels 248*44380Skarels /* 2495075Swnj * Grab a header mbuf, attaching a copy of data to 2505075Swnj * be transmitted, and initialize the header from 2515075Swnj * the template for sends on this connection. 2525075Swnj */ 2535075Swnj if (len) { 25431442Skarels if (tp->t_force && len == 1) 25531442Skarels tcpstat.tcps_sndprobe++; 25631442Skarels else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 25731442Skarels tcpstat.tcps_sndrexmitpack++; 25831442Skarels tcpstat.tcps_sndrexmitbyte += len; 25931442Skarels } else { 26031442Skarels tcpstat.tcps_sndpack++; 26131442Skarels tcpstat.tcps_sndbyte += len; 26231442Skarels } 263*44380Skarels #ifdef notyet 264*44380Skarels if ((m = m_copypack(so->so_snd.sb_mb, off, 265*44380Skarels (int)len, max_linkhdr + hdrlen)) == 0) { 266*44380Skarels error = ENOBUFS; 267*44380Skarels goto out; 268*44380Skarels } 269*44380Skarels /* 270*44380Skarels * m_copypack left space for our hdr; use it. 271*44380Skarels */ 272*44380Skarels m->m_len += hdrlen; 273*44380Skarels m->m_data -= hdrlen; 274*44380Skarels #else 275*44380Skarels MGETHDR(m, M_DONTWAIT, MT_HEADER); 276*44380Skarels if (m == NULL) { 277*44380Skarels error = ENOBUFS; 278*44380Skarels goto out; 279*44380Skarels } 280*44380Skarels m->m_data += max_linkhdr; 281*44380Skarels m->m_len = hdrlen; 282*44380Skarels if (len <= MHLEN - hdrlen - max_linkhdr) { 28337317Skarels m_copydata(so->so_snd.sb_mb, off, (int) len, 284*44380Skarels mtod(m, caddr_t) + hdrlen); 28537317Skarels m->m_len += len; 28637317Skarels } else { 28737317Skarels m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 28837317Skarels if (m->m_next == 0) 28937317Skarels len = 0; 29037317Skarels } 291*44380Skarels #endif 292*44380Skarels /* 293*44380Skarels * If we're sending everything we've got, set PUSH. 294*44380Skarels * (This will keep happy those implementations which only 295*44380Skarels * give data to the user when a buffer fills or 296*44380Skarels * a PUSH comes in.) 297*44380Skarels */ 298*44380Skarels if (off + len == so->so_snd.sb_cc) 299*44380Skarels flags |= TH_PUSH; 300*44380Skarels } else { 301*44380Skarels if (tp->t_flags & TF_ACKNOW) 302*44380Skarels tcpstat.tcps_sndacks++; 303*44380Skarels else if (flags & (TH_SYN|TH_FIN|TH_RST)) 304*44380Skarels tcpstat.tcps_sndctrl++; 305*44380Skarels else if (SEQ_GT(tp->snd_up, tp->snd_una)) 306*44380Skarels tcpstat.tcps_sndurg++; 307*44380Skarels else 308*44380Skarels tcpstat.tcps_sndwinup++; 30931442Skarels 310*44380Skarels MGETHDR(m, M_DONTWAIT, MT_HEADER); 311*44380Skarels if (m == NULL) { 312*44380Skarels error = ENOBUFS; 313*44380Skarels goto out; 314*44380Skarels } 315*44380Skarels m->m_data += max_linkhdr; 316*44380Skarels m->m_len = hdrlen; 317*44380Skarels } 318*44380Skarels m->m_pkthdr.rcvif = (struct ifnet *)0; 319*44380Skarels ti = mtod(m, struct tcpiphdr *); 3205075Swnj if (tp->t_template == 0) 3215075Swnj panic("tcp_output"); 3225110Swnj bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); 3235075Swnj 3245075Swnj /* 3255075Swnj * Fill in fields, remembering maximum advertised 3265075Swnj * window for use in delaying messages about window sizes. 32727067Skarels * If resending a FIN, be sure not to use a new sequence number. 3285075Swnj */ 32931725Skarels if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 33031725Skarels tp->snd_nxt == tp->snd_max) 33127067Skarels tp->snd_nxt--; 33225940Skarels ti->ti_seq = htonl(tp->snd_nxt); 33325940Skarels ti->ti_ack = htonl(tp->rcv_nxt); 334*44380Skarels if (optlen) { 335*44380Skarels bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); 3365441Swnj ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; 3375088Swnj } 3385088Swnj ti->ti_flags = flags; 33925940Skarels /* 34025940Skarels * Calculate receive window. Don't shrink window, 34125940Skarels * but avoid silly window syndrome. 34225940Skarels */ 34333783Skarels if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 34425940Skarels win = 0; 345*44380Skarels if (win > TCP_MAXWIN) 346*44380Skarels win = TCP_MAXWIN; 34736777Skarels if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 34836777Skarels win = (long)(tp->rcv_adv - tp->rcv_nxt); 34925940Skarels ti->ti_win = htons((u_short)win); 3505088Swnj if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 35126387Skarels ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 3525075Swnj ti->ti_flags |= TH_URG; 3535075Swnj } else 3545075Swnj /* 3555075Swnj * If no urgent pointer to send, then we pull 3565075Swnj * the urgent pointer to the left edge of the send window 3575075Swnj * so that it doesn't drift into the send window on sequence 3585075Swnj * number wraparound. 3595075Swnj */ 3605088Swnj tp->snd_up = tp->snd_una; /* drag it along */ 3615075Swnj 3625075Swnj /* 3635075Swnj * Put TCP length in extended header, and then 3645075Swnj * checksum extended header and data. 3655075Swnj */ 36625940Skarels if (len + optlen) 367*44380Skarels ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + 36825940Skarels optlen + len)); 369*44380Skarels ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); 3705075Swnj 3715075Swnj /* 3727125Swnj * In transmit state, time the transmission and arrange for 37321116Skarels * the retransmit. In persist state, just set snd_max. 3745088Swnj */ 37521116Skarels if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 37631758Skarels tcp_seq startseq = tp->snd_nxt; 37731758Skarels 3787125Swnj /* 3797146Swnj * Advance snd_nxt over sequence space of this segment. 3807125Swnj */ 381*44380Skarels if (flags & (TH_SYN|TH_FIN)) { 382*44380Skarels if (flags & TH_SYN) 383*44380Skarels tp->snd_nxt++; 384*44380Skarels if (flags & TH_FIN) { 385*44380Skarels tp->snd_nxt++; 386*44380Skarels tp->t_flags |= TF_SENTFIN; 387*44380Skarels } 38827067Skarels } 3897125Swnj tp->snd_nxt += len; 39031758Skarels if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 3917149Swnj tp->snd_max = tp->snd_nxt; 39231758Skarels /* 39331758Skarels * Time this transmission if not a retransmission and 39431758Skarels * not currently timing anything. 39531758Skarels */ 39631758Skarels if (tp->t_rtt == 0) { 39731758Skarels tp->t_rtt = 1; 39831758Skarels tp->t_rtseq = startseq; 39931758Skarels tcpstat.tcps_segstimed++; 40031758Skarels } 40131758Skarels } 4025088Swnj 4037125Swnj /* 40421116Skarels * Set retransmit timer if not currently set, 40526443Skarels * and not doing an ack or a keep-alive probe. 40631726Skarels * Initial value for retransmit timer is smoothed 40731726Skarels * round-trip time + 2 * round-trip time variance. 40826834Skarels * Initialize shift counter which is used for backoff 40926834Skarels * of retransmit time. 4107125Swnj */ 4117125Swnj if (tp->t_timer[TCPT_REXMT] == 0 && 4127125Swnj tp->snd_nxt != tp->snd_una) { 41332034Skarels tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 41432034Skarels if (tp->t_timer[TCPT_PERSIST]) { 41532034Skarels tp->t_timer[TCPT_PERSIST] = 0; 41632034Skarels tp->t_rxtshift = 0; 41732034Skarels } 4187125Swnj } 41926443Skarels } else 42025940Skarels if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 42125940Skarels tp->snd_max = tp->snd_nxt + len; 4225163Swnj 4235163Swnj /* 4245268Sroot * Trace. 4255268Sroot */ 4267146Swnj if (so->so_options & SO_DEBUG) 4275268Sroot tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); 4285268Sroot 4295268Sroot /* 4305075Swnj * Fill in IP length and desired time to live and 431*44380Skarels * send to IP level. There should be a better way 432*44380Skarels * to handle ttl and tos; we could keep them in 433*44380Skarels * the template, but need a way to checksum without them. 4345075Swnj */ 435*44380Skarels m->m_pkthdr.len = hdrlen + len; 436*44380Skarels ((struct ip *)ti)->ip_len = m->m_pkthdr.len; 437*44380Skarels ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ 438*44380Skarels ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ 439*44380Skarels #if BSD >= 43 44026059Skarels error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, 44126059Skarels so->so_options & SO_DONTROUTE); 442*44380Skarels #else 443*44380Skarels error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, 444*44380Skarels so->so_options & SO_DONTROUTE); 445*44380Skarels #endif 44633446Skarels if (error) { 447*44380Skarels out: 44833446Skarels if (error == ENOBUFS) { 44933446Skarels tcp_quench(tp->t_inpcb); 45033446Skarels return (0); 45133446Skarels } 452*44380Skarels if ((error == EHOSTUNREACH || error == ENETDOWN) 453*44380Skarels && TCPS_HAVERCVDSYN(tp->t_state)) { 454*44380Skarels tp->t_softerror = error; 455*44380Skarels return (0); 456*44380Skarels } 4576505Ssam return (error); 45833446Skarels } 45931442Skarels tcpstat.tcps_sndtotal++; 4605075Swnj 4615075Swnj /* 4625075Swnj * Data sent (as far as we can tell). 4635075Swnj * If this advertises a larger window than any other segment, 4645245Sroot * then remember the size of the advertised window. 46525940Skarels * Any pending ACK has now been sent. 4665075Swnj */ 4675252Sroot if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 4685075Swnj tp->rcv_adv = tp->rcv_nxt + win; 4695088Swnj tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 47025940Skarels if (sendalot) 4717125Swnj goto again; 4726505Ssam return (0); 4734677Swnj } 4747125Swnj 4757125Swnj tcp_setpersist(tp) 4767125Swnj register struct tcpcb *tp; 4777125Swnj { 47831726Skarels register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 4797125Swnj 4807125Swnj if (tp->t_timer[TCPT_REXMT]) 4817125Swnj panic("tcp_output REXMT"); 4827125Swnj /* 4837125Swnj * Start/restart persistance timer. 4847125Swnj */ 4857125Swnj TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 48631726Skarels t * tcp_backoff[tp->t_rxtshift], 48731725Skarels TCPTV_PERSMIN, TCPTV_PERSMAX); 48831725Skarels if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 48931725Skarels tp->t_rxtshift++; 4907125Swnj } 491