123191Smckusick /* 244380Skarels * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. 332788Sbostic * All rights reserved. 423191Smckusick * 544486Sbostic * %sccs.include.redist.c% 632788Sbostic * 7*56531Sbostic * @(#)tcp_output.c 7.24 (Berkeley) 10/11/92 823191Smckusick */ 94677Swnj 10*56531Sbostic #include <sys/param.h> 11*56531Sbostic #include <sys/systm.h> 12*56531Sbostic #include <sys/malloc.h> 13*56531Sbostic #include <sys/mbuf.h> 14*56531Sbostic #include <sys/protosw.h> 15*56531Sbostic #include <sys/socket.h> 16*56531Sbostic #include <sys/socketvar.h> 17*56531Sbostic #include <sys/errno.h> 1810895Ssam 19*56531Sbostic #include <net/route.h> 2010895Ssam 21*56531Sbostic #include <netinet/in.h> 22*56531Sbostic #include <netinet/in_systm.h> 23*56531Sbostic #include <netinet/ip.h> 24*56531Sbostic #include <netinet/in_pcb.h> 25*56531Sbostic #include <netinet/ip_var.h> 26*56531Sbostic #include <netinet/tcp.h> 275088Swnj #define TCPOUTFLAGS 28*56531Sbostic #include <netinet/tcp_fsm.h> 29*56531Sbostic #include <netinet/tcp_seq.h> 30*56531Sbostic #include <netinet/tcp_timer.h> 31*56531Sbostic #include <netinet/tcp_var.h> 32*56531Sbostic #include <netinet/tcpip.h> 33*56531Sbostic #include <netinet/tcp_debug.h> 344677Swnj 3544380Skarels #ifdef notyet 3644380Skarels extern struct mbuf *m_copypack(); 3744380Skarels #endif 3844380Skarels 394678Swnj /* 408314Sroot * Initial options. 415441Swnj */ 425441Swnj u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; 435441Swnj 445441Swnj /* 455245Sroot * Tcp output routine: figure out what should be sent and send it. 464678Swnj */ 475075Swnj tcp_output(tp) 485075Swnj register struct tcpcb *tp; 494678Swnj { 505075Swnj register struct socket *so = tp->t_inpcb->inp_socket; 5137317Skarels register long len, win; 5225940Skarels int off, flags, error; 535075Swnj register struct mbuf *m; 545075Swnj register struct tcpiphdr *ti; 555441Swnj u_char *opt; 5644380Skarels unsigned optlen, hdrlen; 5725940Skarels int idle, sendalot; 584678Swnj 595075Swnj /* 606279Swnj * Determine length of data that should be transmitted, 615088Swnj * and flags that will be used. 625088Swnj * If there is some data or critical controls (SYN, RST) 635088Swnj * to send, then transmit; otherwise, investigate further. 645075Swnj */ 6525940Skarels idle = (tp->snd_max == tp->snd_una); 6644380Skarels if (idle && tp->t_idle >= tp->t_rxtcur) 6744380Skarels /* 6844380Skarels * We have been idle for "a while" and no acks are 6944380Skarels * expected to clock out any data we send -- 7044380Skarels * slow start to get ack "clock" running again. 7144380Skarels */ 7244380Skarels tp->snd_cwnd = tp->t_maxseg; 737125Swnj again: 747125Swnj sendalot = 0; 755075Swnj off = tp->snd_nxt - tp->snd_una; 7637317Skarels win = min(tp->snd_wnd, tp->snd_cwnd); 7726834Skarels 7821116Skarels /* 7921116Skarels * If in persist timeout with window of 0, send 1 byte. 8025940Skarels * Otherwise, if window is small but nonzero 8125940Skarels * and timer expired, we will send what we can 8225940Skarels * and go to transmit state. 8321116Skarels */ 8421116Skarels if (tp->t_force) { 8526834Skarels if (win == 0) 8621116Skarels win = 1; 8721116Skarels else { 8821116Skarels tp->t_timer[TCPT_PERSIST] = 0; 8921116Skarels tp->t_rxtshift = 0; 9021116Skarels } 9121116Skarels } 9225940Skarels 9344380Skarels flags = tcp_outflags[tp->t_state]; 9437317Skarels len = min(so->so_snd.sb_cc, win) - off; 9525940Skarels 9626834Skarels if (len < 0) { 9726834Skarels /* 9827048Skarels * If FIN has been sent but not acked, 9927048Skarels * but we haven't been called to retransmit, 10032785Skarels * len will be -1. Otherwise, window shrank 10132785Skarels * after we sent into it. If window shrank to 0, 10232785Skarels * cancel pending retransmit and pull snd_nxt 10332785Skarels * back to (closed) window. We will enter persist 10432785Skarels * state below. If the window didn't close completely, 10527048Skarels * just wait for an ACK. 10626834Skarels */ 10732785Skarels len = 0; 10832785Skarels if (win == 0) { 10927067Skarels tp->t_timer[TCPT_REXMT] = 0; 11027067Skarels tp->snd_nxt = tp->snd_una; 11132785Skarels } 11226834Skarels } 11327067Skarels if (len > tp->t_maxseg) { 11427067Skarels len = tp->t_maxseg; 11531442Skarels sendalot = 1; 11627067Skarels } 11729795Skarels if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 11829795Skarels flags &= ~TH_FIN; 11944380Skarels 12026834Skarels win = sbspace(&so->so_rcv); 12126834Skarels 12225940Skarels /* 12317318Skarels * Sender silly window avoidance. If connection is idle 12417318Skarels * and can send all data, a maximum segment, 12517318Skarels * at least a maximum default-size segment do it, 1266279Swnj * or are forced, do it; otherwise don't bother. 12725261Skarels * If peer's buffer is tiny, then send 12825261Skarels * when window is at least half open. 12921116Skarels * If retransmitting (possibly after persist timer forced us 13021116Skarels * to send into a small window), then must resend. 1316279Swnj */ 1326279Swnj if (len) { 13331725Skarels if (len == tp->t_maxseg) 1346279Swnj goto send; 13525940Skarels if ((idle || tp->t_flags & TF_NODELAY) && 13625940Skarels len + off >= so->so_snd.sb_cc) 1376279Swnj goto send; 1386279Swnj if (tp->t_force) 1396279Swnj goto send; 14025261Skarels if (len >= tp->max_sndwnd / 2) 14125261Skarels goto send; 14221116Skarels if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 14321116Skarels goto send; 14426834Skarels } 1456279Swnj 1465441Swnj /* 14725940Skarels * Compare available window to amount of window 14825940Skarels * known to peer (as advertised window less 14932785Skarels * next expected input). If the difference is at least two 15040685Skarels * max size segments, or at least 50% of the maximum possible 15132785Skarels * window, then want to send a window update to peer. 1525075Swnj */ 15332034Skarels if (win > 0) { 15445169Skarels long adv = win - (tp->rcv_adv - tp->rcv_nxt); 1554678Swnj 15645169Skarels if (adv >= (long) (2 * tp->t_maxseg)) 15732785Skarels goto send; 15845169Skarels if (2 * adv >= (long) so->so_rcv.sb_hiwat) 15932034Skarels goto send; 16032034Skarels } 16132034Skarels 1625075Swnj /* 16344380Skarels * Send if we owe peer an ACK. 16444380Skarels */ 16544380Skarels if (tp->t_flags & TF_ACKNOW) 16644380Skarels goto send; 16744380Skarels if (flags & (TH_SYN|TH_RST)) 16844380Skarels goto send; 16944380Skarels if (SEQ_GT(tp->snd_up, tp->snd_una)) 17044380Skarels goto send; 17144380Skarels /* 17244380Skarels * If our state indicates that FIN should be sent 17344380Skarels * and we have not yet done so, or we're retransmitting the FIN, 17444380Skarels * then we need to send. 17544380Skarels */ 17644380Skarels if (flags & TH_FIN && 17744380Skarels ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 17844380Skarels goto send; 17944380Skarels 18044380Skarels /* 1817125Swnj * TCP window updates are not reliable, rather a polling protocol 1827125Swnj * using ``persist'' packets is used to insure receipt of window 1837125Swnj * updates. The three ``states'' for the output side are: 1847125Swnj * idle not doing retransmits or persists 18525940Skarels * persisting to move a small or zero window 1867125Swnj * (re)transmitting and thereby not persisting 1877125Swnj * 1887125Swnj * tp->t_timer[TCPT_PERSIST] 1897125Swnj * is set when we are in persist state. 1907125Swnj * tp->t_force 1917125Swnj * is set when we are called to send a persist packet. 1927125Swnj * tp->t_timer[TCPT_REXMT] 1937125Swnj * is set when we are retransmitting 1947125Swnj * The output side is idle when both timers are zero. 1957125Swnj * 19621116Skarels * If send window is too small, there is data to transmit, and no 19721116Skarels * retransmit or persist is pending, then go to persist state. 19821116Skarels * If nothing happens soon, send when timer expires: 19921116Skarels * if window is nonzero, transmit what we can, 20021116Skarels * otherwise force out a byte. 2017125Swnj */ 20221116Skarels if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 20321116Skarels tp->t_timer[TCPT_PERSIST] == 0) { 2047125Swnj tp->t_rxtshift = 0; 2057125Swnj tcp_setpersist(tp); 2067125Swnj } 2077125Swnj 2087125Swnj /* 2095075Swnj * No reason to send a segment, just return. 2105075Swnj */ 2115110Swnj return (0); 2124678Swnj 2135075Swnj send: 2145075Swnj /* 21544380Skarels * Before ESTABLISHED, force sending of initial options 21644380Skarels * unless TCP set not to do any options. 21744380Skarels * NOTE: we assume that the IP/TCP header plus TCP options 21844380Skarels * always fit in a single mbuf, leaving room for a maximum 21944380Skarels * link header, i.e. 22044380Skarels * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 22144380Skarels */ 22244380Skarels optlen = 0; 22344380Skarels hdrlen = sizeof (struct tcpiphdr); 22444380Skarels if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { 22552515Storek u_short mss; 22652515Storek 22744380Skarels opt = tcp_initopt; 22844380Skarels optlen = sizeof (tcp_initopt); 22944380Skarels hdrlen += sizeof (tcp_initopt); 23052515Storek mss = htons((u_short) tcp_mss(tp, 0)); 23152515Storek bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); 23244380Skarels #ifdef DIAGNOSTIC 23344380Skarels if (max_linkhdr + hdrlen > MHLEN) 23444380Skarels panic("tcphdr too big"); 23544380Skarels #endif 23644380Skarels } 23744380Skarels 23844380Skarels /* 2395075Swnj * Grab a header mbuf, attaching a copy of data to 2405075Swnj * be transmitted, and initialize the header from 2415075Swnj * the template for sends on this connection. 2425075Swnj */ 2435075Swnj if (len) { 24431442Skarels if (tp->t_force && len == 1) 24531442Skarels tcpstat.tcps_sndprobe++; 24631442Skarels else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 24731442Skarels tcpstat.tcps_sndrexmitpack++; 24831442Skarels tcpstat.tcps_sndrexmitbyte += len; 24931442Skarels } else { 25031442Skarels tcpstat.tcps_sndpack++; 25131442Skarels tcpstat.tcps_sndbyte += len; 25231442Skarels } 25344380Skarels #ifdef notyet 25444380Skarels if ((m = m_copypack(so->so_snd.sb_mb, off, 25544380Skarels (int)len, max_linkhdr + hdrlen)) == 0) { 25644380Skarels error = ENOBUFS; 25744380Skarels goto out; 25844380Skarels } 25944380Skarels /* 26044380Skarels * m_copypack left space for our hdr; use it. 26144380Skarels */ 26244380Skarels m->m_len += hdrlen; 26344380Skarels m->m_data -= hdrlen; 26444380Skarels #else 26544380Skarels MGETHDR(m, M_DONTWAIT, MT_HEADER); 26644380Skarels if (m == NULL) { 26744380Skarels error = ENOBUFS; 26844380Skarels goto out; 26944380Skarels } 27044380Skarels m->m_data += max_linkhdr; 27144380Skarels m->m_len = hdrlen; 27244380Skarels if (len <= MHLEN - hdrlen - max_linkhdr) { 27337317Skarels m_copydata(so->so_snd.sb_mb, off, (int) len, 27444380Skarels mtod(m, caddr_t) + hdrlen); 27537317Skarels m->m_len += len; 27637317Skarels } else { 27737317Skarels m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 27837317Skarels if (m->m_next == 0) 27937317Skarels len = 0; 28037317Skarels } 28144380Skarels #endif 28244380Skarels /* 28344380Skarels * If we're sending everything we've got, set PUSH. 28444380Skarels * (This will keep happy those implementations which only 28544380Skarels * give data to the user when a buffer fills or 28644380Skarels * a PUSH comes in.) 28744380Skarels */ 28844380Skarels if (off + len == so->so_snd.sb_cc) 28944380Skarels flags |= TH_PUSH; 29044380Skarels } else { 29144380Skarels if (tp->t_flags & TF_ACKNOW) 29244380Skarels tcpstat.tcps_sndacks++; 29344380Skarels else if (flags & (TH_SYN|TH_FIN|TH_RST)) 29444380Skarels tcpstat.tcps_sndctrl++; 29544380Skarels else if (SEQ_GT(tp->snd_up, tp->snd_una)) 29644380Skarels tcpstat.tcps_sndurg++; 29744380Skarels else 29844380Skarels tcpstat.tcps_sndwinup++; 29931442Skarels 30044380Skarels MGETHDR(m, M_DONTWAIT, MT_HEADER); 30144380Skarels if (m == NULL) { 30244380Skarels error = ENOBUFS; 30344380Skarels goto out; 30444380Skarels } 30544380Skarels m->m_data += max_linkhdr; 30644380Skarels m->m_len = hdrlen; 30744380Skarels } 30844380Skarels m->m_pkthdr.rcvif = (struct ifnet *)0; 30944380Skarels ti = mtod(m, struct tcpiphdr *); 3105075Swnj if (tp->t_template == 0) 3115075Swnj panic("tcp_output"); 3125110Swnj bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); 3135075Swnj 3145075Swnj /* 3155075Swnj * Fill in fields, remembering maximum advertised 3165075Swnj * window for use in delaying messages about window sizes. 31727067Skarels * If resending a FIN, be sure not to use a new sequence number. 3185075Swnj */ 31931725Skarels if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 32031725Skarels tp->snd_nxt == tp->snd_max) 32127067Skarels tp->snd_nxt--; 32225940Skarels ti->ti_seq = htonl(tp->snd_nxt); 32325940Skarels ti->ti_ack = htonl(tp->rcv_nxt); 32444380Skarels if (optlen) { 32544380Skarels bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); 3265441Swnj ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; 3275088Swnj } 3285088Swnj ti->ti_flags = flags; 32925940Skarels /* 33025940Skarels * Calculate receive window. Don't shrink window, 33125940Skarels * but avoid silly window syndrome. 33225940Skarels */ 33333783Skarels if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 33425940Skarels win = 0; 33544380Skarels if (win > TCP_MAXWIN) 33644380Skarels win = TCP_MAXWIN; 33736777Skarels if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 33836777Skarels win = (long)(tp->rcv_adv - tp->rcv_nxt); 33925940Skarels ti->ti_win = htons((u_short)win); 3405088Swnj if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 34126387Skarels ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 3425075Swnj ti->ti_flags |= TH_URG; 3435075Swnj } else 3445075Swnj /* 3455075Swnj * If no urgent pointer to send, then we pull 3465075Swnj * the urgent pointer to the left edge of the send window 3475075Swnj * so that it doesn't drift into the send window on sequence 3485075Swnj * number wraparound. 3495075Swnj */ 3505088Swnj tp->snd_up = tp->snd_una; /* drag it along */ 3515075Swnj 3525075Swnj /* 3535075Swnj * Put TCP length in extended header, and then 3545075Swnj * checksum extended header and data. 3555075Swnj */ 35625940Skarels if (len + optlen) 35744380Skarels ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + 35825940Skarels optlen + len)); 35944380Skarels ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); 3605075Swnj 3615075Swnj /* 3627125Swnj * In transmit state, time the transmission and arrange for 36321116Skarels * the retransmit. In persist state, just set snd_max. 3645088Swnj */ 36521116Skarels if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 36631758Skarels tcp_seq startseq = tp->snd_nxt; 36731758Skarels 3687125Swnj /* 3697146Swnj * Advance snd_nxt over sequence space of this segment. 3707125Swnj */ 37144380Skarels if (flags & (TH_SYN|TH_FIN)) { 37244380Skarels if (flags & TH_SYN) 37344380Skarels tp->snd_nxt++; 37444380Skarels if (flags & TH_FIN) { 37544380Skarels tp->snd_nxt++; 37644380Skarels tp->t_flags |= TF_SENTFIN; 37744380Skarels } 37827067Skarels } 3797125Swnj tp->snd_nxt += len; 38031758Skarels if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 3817149Swnj tp->snd_max = tp->snd_nxt; 38231758Skarels /* 38331758Skarels * Time this transmission if not a retransmission and 38431758Skarels * not currently timing anything. 38531758Skarels */ 38631758Skarels if (tp->t_rtt == 0) { 38731758Skarels tp->t_rtt = 1; 38831758Skarels tp->t_rtseq = startseq; 38931758Skarels tcpstat.tcps_segstimed++; 39031758Skarels } 39131758Skarels } 3925088Swnj 3937125Swnj /* 39421116Skarels * Set retransmit timer if not currently set, 39526443Skarels * and not doing an ack or a keep-alive probe. 39631726Skarels * Initial value for retransmit timer is smoothed 39731726Skarels * round-trip time + 2 * round-trip time variance. 39826834Skarels * Initialize shift counter which is used for backoff 39926834Skarels * of retransmit time. 4007125Swnj */ 4017125Swnj if (tp->t_timer[TCPT_REXMT] == 0 && 4027125Swnj tp->snd_nxt != tp->snd_una) { 40332034Skarels tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 40432034Skarels if (tp->t_timer[TCPT_PERSIST]) { 40532034Skarels tp->t_timer[TCPT_PERSIST] = 0; 40632034Skarels tp->t_rxtshift = 0; 40732034Skarels } 4087125Swnj } 40926443Skarels } else 41025940Skarels if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 41125940Skarels tp->snd_max = tp->snd_nxt + len; 4125163Swnj 4135163Swnj /* 4145268Sroot * Trace. 4155268Sroot */ 4167146Swnj if (so->so_options & SO_DEBUG) 4175268Sroot tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); 4185268Sroot 4195268Sroot /* 4205075Swnj * Fill in IP length and desired time to live and 42144380Skarels * send to IP level. There should be a better way 42244380Skarels * to handle ttl and tos; we could keep them in 42344380Skarels * the template, but need a way to checksum without them. 4245075Swnj */ 42544380Skarels m->m_pkthdr.len = hdrlen + len; 42644380Skarels ((struct ip *)ti)->ip_len = m->m_pkthdr.len; 42744380Skarels ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ 42844380Skarels ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ 42944380Skarels #if BSD >= 43 43026059Skarels error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, 43126059Skarels so->so_options & SO_DONTROUTE); 43244380Skarels #else 43344380Skarels error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, 43444380Skarels so->so_options & SO_DONTROUTE); 43544380Skarels #endif 43633446Skarels if (error) { 43744380Skarels out: 43833446Skarels if (error == ENOBUFS) { 43933446Skarels tcp_quench(tp->t_inpcb); 44033446Skarels return (0); 44133446Skarels } 44244380Skarels if ((error == EHOSTUNREACH || error == ENETDOWN) 44344380Skarels && TCPS_HAVERCVDSYN(tp->t_state)) { 44444380Skarels tp->t_softerror = error; 44544380Skarels return (0); 44644380Skarels } 4476505Ssam return (error); 44833446Skarels } 44931442Skarels tcpstat.tcps_sndtotal++; 4505075Swnj 4515075Swnj /* 4525075Swnj * Data sent (as far as we can tell). 4535075Swnj * If this advertises a larger window than any other segment, 4545245Sroot * then remember the size of the advertised window. 45525940Skarels * Any pending ACK has now been sent. 4565075Swnj */ 4575252Sroot if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 4585075Swnj tp->rcv_adv = tp->rcv_nxt + win; 4595088Swnj tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 46025940Skarels if (sendalot) 4617125Swnj goto again; 4626505Ssam return (0); 4634677Swnj } 4647125Swnj 4657125Swnj tcp_setpersist(tp) 4667125Swnj register struct tcpcb *tp; 4677125Swnj { 46831726Skarels register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 4697125Swnj 4707125Swnj if (tp->t_timer[TCPT_REXMT]) 4717125Swnj panic("tcp_output REXMT"); 4727125Swnj /* 4737125Swnj * Start/restart persistance timer. 4747125Swnj */ 4757125Swnj TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 47631726Skarels t * tcp_backoff[tp->t_rxtshift], 47731725Skarels TCPTV_PERSMIN, TCPTV_PERSMAX); 47831725Skarels if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 47931725Skarels tp->t_rxtshift++; 4807125Swnj } 481