123191Smckusick /* 244380Skarels * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. 332788Sbostic * All rights reserved. 423191Smckusick * 544486Sbostic * %sccs.include.redist.c% 632788Sbostic * 7*45169Skarels * @(#)tcp_output.c 7.22 (Berkeley) 08/31/90 823191Smckusick */ 94677Swnj 1017063Sbloom #include "param.h" 1117063Sbloom #include "systm.h" 1237317Skarels #include "malloc.h" 1317063Sbloom #include "mbuf.h" 1417063Sbloom #include "protosw.h" 1517063Sbloom #include "socket.h" 1617063Sbloom #include "socketvar.h" 1717063Sbloom #include "errno.h" 1810895Ssam 1910895Ssam #include "../net/route.h" 2010895Ssam 2117063Sbloom #include "in.h" 2217063Sbloom #include "in_systm.h" 2317063Sbloom #include "ip.h" 2440685Skarels #include "in_pcb.h" 2517063Sbloom #include "ip_var.h" 2617063Sbloom #include "tcp.h" 275088Swnj #define TCPOUTFLAGS 2817063Sbloom #include "tcp_fsm.h" 2917063Sbloom #include "tcp_seq.h" 3017063Sbloom #include "tcp_timer.h" 3117063Sbloom #include "tcp_var.h" 3217063Sbloom #include "tcpip.h" 3317063Sbloom #include "tcp_debug.h" 344677Swnj 3544380Skarels #ifdef notyet 3644380Skarels extern struct mbuf *m_copypack(); 3744380Skarels #endif 3844380Skarels 394678Swnj /* 408314Sroot * Initial options. 415441Swnj */ 425441Swnj u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; 435441Swnj 445441Swnj /* 455245Sroot * Tcp output routine: figure out what should be sent and send it. 464678Swnj */ 475075Swnj tcp_output(tp) 485075Swnj register struct tcpcb *tp; 494678Swnj { 505075Swnj register struct socket *so = tp->t_inpcb->inp_socket; 5137317Skarels register long len, win; 5225940Skarels int off, flags, error; 535075Swnj register struct mbuf *m; 545075Swnj register struct tcpiphdr *ti; 555441Swnj u_char *opt; 5644380Skarels unsigned optlen, hdrlen; 5725940Skarels int idle, sendalot; 584678Swnj 595075Swnj /* 606279Swnj * Determine length of data that should be transmitted, 615088Swnj * and flags that will be used. 625088Swnj * If there is some data or critical controls (SYN, RST) 635088Swnj * to send, then transmit; otherwise, investigate further. 645075Swnj */ 6525940Skarels idle = (tp->snd_max == tp->snd_una); 6644380Skarels if (idle && tp->t_idle >= tp->t_rxtcur) 6744380Skarels /* 6844380Skarels * We have been idle for "a while" and no acks are 6944380Skarels * expected to clock out any data we send -- 7044380Skarels * slow start to get ack "clock" running again. 7144380Skarels */ 7244380Skarels tp->snd_cwnd = tp->t_maxseg; 737125Swnj again: 747125Swnj sendalot = 0; 755075Swnj off = tp->snd_nxt - tp->snd_una; 7637317Skarels win = min(tp->snd_wnd, tp->snd_cwnd); 7726834Skarels 7821116Skarels /* 7921116Skarels * If in persist timeout with window of 0, send 1 byte. 8025940Skarels * Otherwise, if window is small but nonzero 8125940Skarels * and timer expired, we will send what we can 8225940Skarels * and go to transmit state. 8321116Skarels */ 8421116Skarels if (tp->t_force) { 8526834Skarels if (win == 0) 8621116Skarels win = 1; 8721116Skarels else { 8821116Skarels tp->t_timer[TCPT_PERSIST] = 0; 8921116Skarels tp->t_rxtshift = 0; 9021116Skarels } 9121116Skarels } 9225940Skarels 9344380Skarels flags = tcp_outflags[tp->t_state]; 9437317Skarels len = min(so->so_snd.sb_cc, win) - off; 9525940Skarels 9626834Skarels if (len < 0) { 9726834Skarels /* 9827048Skarels * If FIN has been sent but not acked, 9927048Skarels * but we haven't been called to retransmit, 10032785Skarels * len will be -1. Otherwise, window shrank 10132785Skarels * after we sent into it. If window shrank to 0, 10232785Skarels * cancel pending retransmit and pull snd_nxt 10332785Skarels * back to (closed) window. We will enter persist 10432785Skarels * state below. If the window didn't close completely, 10527048Skarels * just wait for an ACK. 10626834Skarels */ 10732785Skarels len = 0; 10832785Skarels if (win == 0) { 10927067Skarels tp->t_timer[TCPT_REXMT] = 0; 11027067Skarels tp->snd_nxt = tp->snd_una; 11132785Skarels } 11226834Skarels } 11327067Skarels if (len > tp->t_maxseg) { 11427067Skarels len = tp->t_maxseg; 11531442Skarels sendalot = 1; 11627067Skarels } 11729795Skarels if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 11829795Skarels flags &= ~TH_FIN; 11944380Skarels 12026834Skarels win = sbspace(&so->so_rcv); 12126834Skarels 12225940Skarels /* 12317318Skarels * Sender silly window avoidance. If connection is idle 12417318Skarels * and can send all data, a maximum segment, 12517318Skarels * at least a maximum default-size segment do it, 1266279Swnj * or are forced, do it; otherwise don't bother. 12725261Skarels * If peer's buffer is tiny, then send 12825261Skarels * when window is at least half open. 12921116Skarels * If retransmitting (possibly after persist timer forced us 13021116Skarels * to send into a small window), then must resend. 1316279Swnj */ 1326279Swnj if (len) { 13331725Skarels if (len == tp->t_maxseg) 1346279Swnj goto send; 13525940Skarels if ((idle || tp->t_flags & TF_NODELAY) && 13625940Skarels len + off >= so->so_snd.sb_cc) 1376279Swnj goto send; 1386279Swnj if (tp->t_force) 1396279Swnj goto send; 14025261Skarels if (len >= tp->max_sndwnd / 2) 14125261Skarels goto send; 14221116Skarels if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 14321116Skarels goto send; 14426834Skarels } 1456279Swnj 1465441Swnj /* 14725940Skarels * Compare available window to amount of window 14825940Skarels * known to peer (as advertised window less 14932785Skarels * next expected input). If the difference is at least two 15040685Skarels * max size segments, or at least 50% of the maximum possible 15132785Skarels * window, then want to send a window update to peer. 1525075Swnj */ 15332034Skarels if (win > 0) { 154*45169Skarels long adv = win - (tp->rcv_adv - tp->rcv_nxt); 1554678Swnj 156*45169Skarels if (adv >= (long) (2 * tp->t_maxseg)) 15732785Skarels goto send; 158*45169Skarels if (2 * adv >= (long) so->so_rcv.sb_hiwat) 15932034Skarels goto send; 16032034Skarels } 16132034Skarels 1625075Swnj /* 16344380Skarels * Send if we owe peer an ACK. 16444380Skarels */ 16544380Skarels if (tp->t_flags & TF_ACKNOW) 16644380Skarels goto send; 16744380Skarels if (flags & (TH_SYN|TH_RST)) 16844380Skarels goto send; 16944380Skarels if (SEQ_GT(tp->snd_up, tp->snd_una)) 17044380Skarels goto send; 17144380Skarels /* 17244380Skarels * If our state indicates that FIN should be sent 17344380Skarels * and we have not yet done so, or we're retransmitting the FIN, 17444380Skarels * then we need to send. 17544380Skarels */ 17644380Skarels if (flags & TH_FIN && 17744380Skarels ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 17844380Skarels goto send; 17944380Skarels 18044380Skarels /* 1817125Swnj * TCP window updates are not reliable, rather a polling protocol 1827125Swnj * using ``persist'' packets is used to insure receipt of window 1837125Swnj * updates. The three ``states'' for the output side are: 1847125Swnj * idle not doing retransmits or persists 18525940Skarels * persisting to move a small or zero window 1867125Swnj * (re)transmitting and thereby not persisting 1877125Swnj * 1887125Swnj * tp->t_timer[TCPT_PERSIST] 1897125Swnj * is set when we are in persist state. 1907125Swnj * tp->t_force 1917125Swnj * is set when we are called to send a persist packet. 1927125Swnj * tp->t_timer[TCPT_REXMT] 1937125Swnj * is set when we are retransmitting 1947125Swnj * The output side is idle when both timers are zero. 1957125Swnj * 19621116Skarels * If send window is too small, there is data to transmit, and no 19721116Skarels * retransmit or persist is pending, then go to persist state. 19821116Skarels * If nothing happens soon, send when timer expires: 19921116Skarels * if window is nonzero, transmit what we can, 20021116Skarels * otherwise force out a byte. 2017125Swnj */ 20221116Skarels if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 20321116Skarels tp->t_timer[TCPT_PERSIST] == 0) { 2047125Swnj tp->t_rxtshift = 0; 2057125Swnj tcp_setpersist(tp); 2067125Swnj } 2077125Swnj 2087125Swnj /* 2095075Swnj * No reason to send a segment, just return. 2105075Swnj */ 2115110Swnj return (0); 2124678Swnj 2135075Swnj send: 2145075Swnj /* 21544380Skarels * Before ESTABLISHED, force sending of initial options 21644380Skarels * unless TCP set not to do any options. 21744380Skarels * NOTE: we assume that the IP/TCP header plus TCP options 21844380Skarels * always fit in a single mbuf, leaving room for a maximum 21944380Skarels * link header, i.e. 22044380Skarels * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 22144380Skarels */ 22244380Skarels optlen = 0; 22344380Skarels hdrlen = sizeof (struct tcpiphdr); 22444380Skarels if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { 22544380Skarels opt = tcp_initopt; 22644380Skarels optlen = sizeof (tcp_initopt); 22744380Skarels hdrlen += sizeof (tcp_initopt); 22844380Skarels *(u_short *)(opt + 2) = htons((u_short) tcp_mss(tp, 0)); 22944380Skarels #ifdef DIAGNOSTIC 23044380Skarels if (max_linkhdr + hdrlen > MHLEN) 23144380Skarels panic("tcphdr too big"); 23244380Skarels #endif 23344380Skarels } 23444380Skarels 23544380Skarels /* 2365075Swnj * Grab a header mbuf, attaching a copy of data to 2375075Swnj * be transmitted, and initialize the header from 2385075Swnj * the template for sends on this connection. 2395075Swnj */ 2405075Swnj if (len) { 24131442Skarels if (tp->t_force && len == 1) 24231442Skarels tcpstat.tcps_sndprobe++; 24331442Skarels else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 24431442Skarels tcpstat.tcps_sndrexmitpack++; 24531442Skarels tcpstat.tcps_sndrexmitbyte += len; 24631442Skarels } else { 24731442Skarels tcpstat.tcps_sndpack++; 24831442Skarels tcpstat.tcps_sndbyte += len; 24931442Skarels } 25044380Skarels #ifdef notyet 25144380Skarels if ((m = m_copypack(so->so_snd.sb_mb, off, 25244380Skarels (int)len, max_linkhdr + hdrlen)) == 0) { 25344380Skarels error = ENOBUFS; 25444380Skarels goto out; 25544380Skarels } 25644380Skarels /* 25744380Skarels * m_copypack left space for our hdr; use it. 25844380Skarels */ 25944380Skarels m->m_len += hdrlen; 26044380Skarels m->m_data -= hdrlen; 26144380Skarels #else 26244380Skarels MGETHDR(m, M_DONTWAIT, MT_HEADER); 26344380Skarels if (m == NULL) { 26444380Skarels error = ENOBUFS; 26544380Skarels goto out; 26644380Skarels } 26744380Skarels m->m_data += max_linkhdr; 26844380Skarels m->m_len = hdrlen; 26944380Skarels if (len <= MHLEN - hdrlen - max_linkhdr) { 27037317Skarels m_copydata(so->so_snd.sb_mb, off, (int) len, 27144380Skarels mtod(m, caddr_t) + hdrlen); 27237317Skarels m->m_len += len; 27337317Skarels } else { 27437317Skarels m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 27537317Skarels if (m->m_next == 0) 27637317Skarels len = 0; 27737317Skarels } 27844380Skarels #endif 27944380Skarels /* 28044380Skarels * If we're sending everything we've got, set PUSH. 28144380Skarels * (This will keep happy those implementations which only 28244380Skarels * give data to the user when a buffer fills or 28344380Skarels * a PUSH comes in.) 28444380Skarels */ 28544380Skarels if (off + len == so->so_snd.sb_cc) 28644380Skarels flags |= TH_PUSH; 28744380Skarels } else { 28844380Skarels if (tp->t_flags & TF_ACKNOW) 28944380Skarels tcpstat.tcps_sndacks++; 29044380Skarels else if (flags & (TH_SYN|TH_FIN|TH_RST)) 29144380Skarels tcpstat.tcps_sndctrl++; 29244380Skarels else if (SEQ_GT(tp->snd_up, tp->snd_una)) 29344380Skarels tcpstat.tcps_sndurg++; 29444380Skarels else 29544380Skarels tcpstat.tcps_sndwinup++; 29631442Skarels 29744380Skarels MGETHDR(m, M_DONTWAIT, MT_HEADER); 29844380Skarels if (m == NULL) { 29944380Skarels error = ENOBUFS; 30044380Skarels goto out; 30144380Skarels } 30244380Skarels m->m_data += max_linkhdr; 30344380Skarels m->m_len = hdrlen; 30444380Skarels } 30544380Skarels m->m_pkthdr.rcvif = (struct ifnet *)0; 30644380Skarels ti = mtod(m, struct tcpiphdr *); 3075075Swnj if (tp->t_template == 0) 3085075Swnj panic("tcp_output"); 3095110Swnj bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); 3105075Swnj 3115075Swnj /* 3125075Swnj * Fill in fields, remembering maximum advertised 3135075Swnj * window for use in delaying messages about window sizes. 31427067Skarels * If resending a FIN, be sure not to use a new sequence number. 3155075Swnj */ 31631725Skarels if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 31731725Skarels tp->snd_nxt == tp->snd_max) 31827067Skarels tp->snd_nxt--; 31925940Skarels ti->ti_seq = htonl(tp->snd_nxt); 32025940Skarels ti->ti_ack = htonl(tp->rcv_nxt); 32144380Skarels if (optlen) { 32244380Skarels bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); 3235441Swnj ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; 3245088Swnj } 3255088Swnj ti->ti_flags = flags; 32625940Skarels /* 32725940Skarels * Calculate receive window. Don't shrink window, 32825940Skarels * but avoid silly window syndrome. 32925940Skarels */ 33033783Skarels if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 33125940Skarels win = 0; 33244380Skarels if (win > TCP_MAXWIN) 33344380Skarels win = TCP_MAXWIN; 33436777Skarels if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 33536777Skarels win = (long)(tp->rcv_adv - tp->rcv_nxt); 33625940Skarels ti->ti_win = htons((u_short)win); 3375088Swnj if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 33826387Skarels ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 3395075Swnj ti->ti_flags |= TH_URG; 3405075Swnj } else 3415075Swnj /* 3425075Swnj * If no urgent pointer to send, then we pull 3435075Swnj * the urgent pointer to the left edge of the send window 3445075Swnj * so that it doesn't drift into the send window on sequence 3455075Swnj * number wraparound. 3465075Swnj */ 3475088Swnj tp->snd_up = tp->snd_una; /* drag it along */ 3485075Swnj 3495075Swnj /* 3505075Swnj * Put TCP length in extended header, and then 3515075Swnj * checksum extended header and data. 3525075Swnj */ 35325940Skarels if (len + optlen) 35444380Skarels ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + 35525940Skarels optlen + len)); 35644380Skarels ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); 3575075Swnj 3585075Swnj /* 3597125Swnj * In transmit state, time the transmission and arrange for 36021116Skarels * the retransmit. In persist state, just set snd_max. 3615088Swnj */ 36221116Skarels if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 36331758Skarels tcp_seq startseq = tp->snd_nxt; 36431758Skarels 3657125Swnj /* 3667146Swnj * Advance snd_nxt over sequence space of this segment. 3677125Swnj */ 36844380Skarels if (flags & (TH_SYN|TH_FIN)) { 36944380Skarels if (flags & TH_SYN) 37044380Skarels tp->snd_nxt++; 37144380Skarels if (flags & TH_FIN) { 37244380Skarels tp->snd_nxt++; 37344380Skarels tp->t_flags |= TF_SENTFIN; 37444380Skarels } 37527067Skarels } 3767125Swnj tp->snd_nxt += len; 37731758Skarels if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 3787149Swnj tp->snd_max = tp->snd_nxt; 37931758Skarels /* 38031758Skarels * Time this transmission if not a retransmission and 38131758Skarels * not currently timing anything. 38231758Skarels */ 38331758Skarels if (tp->t_rtt == 0) { 38431758Skarels tp->t_rtt = 1; 38531758Skarels tp->t_rtseq = startseq; 38631758Skarels tcpstat.tcps_segstimed++; 38731758Skarels } 38831758Skarels } 3895088Swnj 3907125Swnj /* 39121116Skarels * Set retransmit timer if not currently set, 39226443Skarels * and not doing an ack or a keep-alive probe. 39331726Skarels * Initial value for retransmit timer is smoothed 39431726Skarels * round-trip time + 2 * round-trip time variance. 39526834Skarels * Initialize shift counter which is used for backoff 39626834Skarels * of retransmit time. 3977125Swnj */ 3987125Swnj if (tp->t_timer[TCPT_REXMT] == 0 && 3997125Swnj tp->snd_nxt != tp->snd_una) { 40032034Skarels tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 40132034Skarels if (tp->t_timer[TCPT_PERSIST]) { 40232034Skarels tp->t_timer[TCPT_PERSIST] = 0; 40332034Skarels tp->t_rxtshift = 0; 40432034Skarels } 4057125Swnj } 40626443Skarels } else 40725940Skarels if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 40825940Skarels tp->snd_max = tp->snd_nxt + len; 4095163Swnj 4105163Swnj /* 4115268Sroot * Trace. 4125268Sroot */ 4137146Swnj if (so->so_options & SO_DEBUG) 4145268Sroot tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); 4155268Sroot 4165268Sroot /* 4175075Swnj * Fill in IP length and desired time to live and 41844380Skarels * send to IP level. There should be a better way 41944380Skarels * to handle ttl and tos; we could keep them in 42044380Skarels * the template, but need a way to checksum without them. 4215075Swnj */ 42244380Skarels m->m_pkthdr.len = hdrlen + len; 42344380Skarels ((struct ip *)ti)->ip_len = m->m_pkthdr.len; 42444380Skarels ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ 42544380Skarels ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ 42644380Skarels #if BSD >= 43 42726059Skarels error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, 42826059Skarels so->so_options & SO_DONTROUTE); 42944380Skarels #else 43044380Skarels error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, 43144380Skarels so->so_options & SO_DONTROUTE); 43244380Skarels #endif 43333446Skarels if (error) { 43444380Skarels out: 43533446Skarels if (error == ENOBUFS) { 43633446Skarels tcp_quench(tp->t_inpcb); 43733446Skarels return (0); 43833446Skarels } 43944380Skarels if ((error == EHOSTUNREACH || error == ENETDOWN) 44044380Skarels && TCPS_HAVERCVDSYN(tp->t_state)) { 44144380Skarels tp->t_softerror = error; 44244380Skarels return (0); 44344380Skarels } 4446505Ssam return (error); 44533446Skarels } 44631442Skarels tcpstat.tcps_sndtotal++; 4475075Swnj 4485075Swnj /* 4495075Swnj * Data sent (as far as we can tell). 4505075Swnj * If this advertises a larger window than any other segment, 4515245Sroot * then remember the size of the advertised window. 45225940Skarels * Any pending ACK has now been sent. 4535075Swnj */ 4545252Sroot if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 4555075Swnj tp->rcv_adv = tp->rcv_nxt + win; 4565088Swnj tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 45725940Skarels if (sendalot) 4587125Swnj goto again; 4596505Ssam return (0); 4604677Swnj } 4617125Swnj 4627125Swnj tcp_setpersist(tp) 4637125Swnj register struct tcpcb *tp; 4647125Swnj { 46531726Skarels register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 4667125Swnj 4677125Swnj if (tp->t_timer[TCPT_REXMT]) 4687125Swnj panic("tcp_output REXMT"); 4697125Swnj /* 4707125Swnj * Start/restart persistance timer. 4717125Swnj */ 4727125Swnj TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 47331726Skarels t * tcp_backoff[tp->t_rxtshift], 47431725Skarels TCPTV_PERSMIN, TCPTV_PERSMAX); 47531725Skarels if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 47631725Skarels tp->t_rxtshift++; 4777125Swnj } 478