xref: /netbsd-src/sys/netinet/tcp_output.c (revision 3b01aba77a7a698587faaae455bbfe740923c1f5)
1 /*	$NetBSD: tcp_output.c,v 1.70 2001/07/31 02:25:22 thorpej Exp $	*/
2 
3 /*
4 %%% portions-copyright-nrl-95
5 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
7 Reserved. All rights under this copyright have been assigned to the US
8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
10 software.
11 You should have received a copy of the license with this software. If you
12 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
13 
14 */
15 
16 /*
17  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
18  * All rights reserved.
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  * 1. Redistributions of source code must retain the above copyright
24  *    notice, this list of conditions and the following disclaimer.
25  * 2. Redistributions in binary form must reproduce the above copyright
26  *    notice, this list of conditions and the following disclaimer in the
27  *    documentation and/or other materials provided with the distribution.
28  * 3. Neither the name of the project nor the names of its contributors
29  *    may be used to endorse or promote products derived from this software
30  *    without specific prior written permission.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42  * SUCH DAMAGE.
43  */
44 
45 /*-
46  * Copyright (c) 1997, 1998, 2001 The NetBSD Foundation, Inc.
47  * All rights reserved.
48  *
49  * This code is derived from software contributed to The NetBSD Foundation
50  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
51  * Facility, NASA Ames Research Center.
52  *
53  * Redistribution and use in source and binary forms, with or without
54  * modification, are permitted provided that the following conditions
55  * are met:
56  * 1. Redistributions of source code must retain the above copyright
57  *    notice, this list of conditions and the following disclaimer.
58  * 2. Redistributions in binary form must reproduce the above copyright
59  *    notice, this list of conditions and the following disclaimer in the
60  *    documentation and/or other materials provided with the distribution.
61  * 3. All advertising materials mentioning features or use of this software
62  *    must display the following acknowledgement:
63  *	This product includes software developed by the NetBSD
64  *	Foundation, Inc. and its contributors.
65  * 4. Neither the name of The NetBSD Foundation nor the names of its
66  *    contributors may be used to endorse or promote products derived
67  *    from this software without specific prior written permission.
68  *
69  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
70  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
71  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
72  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
73  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
74  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
75  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
76  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
77  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
78  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
79  * POSSIBILITY OF SUCH DAMAGE.
80  */
81 
82 /*
83  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
84  *	The Regents of the University of California.  All rights reserved.
85  *
86  * Redistribution and use in source and binary forms, with or without
87  * modification, are permitted provided that the following conditions
88  * are met:
89  * 1. Redistributions of source code must retain the above copyright
90  *    notice, this list of conditions and the following disclaimer.
91  * 2. Redistributions in binary form must reproduce the above copyright
92  *    notice, this list of conditions and the following disclaimer in the
93  *    documentation and/or other materials provided with the distribution.
94  * 3. All advertising materials mentioning features or use of this software
95  *    must display the following acknowledgement:
96  *	This product includes software developed by the University of
97  *	California, Berkeley and its contributors.
98  * 4. Neither the name of the University nor the names of its contributors
99  *    may be used to endorse or promote products derived from this software
100  *    without specific prior written permission.
101  *
102  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
103  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
104  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
105  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
106  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
107  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
108  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
109  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
110  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
111  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
112  * SUCH DAMAGE.
113  *
114  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
115  */
116 
117 #include "opt_inet.h"
118 #include "opt_ipsec.h"
119 #include "opt_tcp_debug.h"
120 
121 #include <sys/param.h>
122 #include <sys/systm.h>
123 #include <sys/malloc.h>
124 #include <sys/mbuf.h>
125 #include <sys/protosw.h>
126 #include <sys/socket.h>
127 #include <sys/socketvar.h>
128 #include <sys/errno.h>
129 #include <sys/domain.h>
130 
131 #include <net/if.h>
132 #include <net/route.h>
133 
134 #include <netinet/in.h>
135 #include <netinet/in_systm.h>
136 #include <netinet/ip.h>
137 #include <netinet/in_pcb.h>
138 #include <netinet/ip_var.h>
139 
140 #ifdef INET6
141 #ifndef INET
142 #include <netinet/in.h>
143 #endif
144 #include <netinet/ip6.h>
145 #include <netinet6/in6_pcb.h>
146 #include <netinet6/ip6_var.h>
147 #endif
148 
149 #include <netinet/tcp.h>
150 #define	TCPOUTFLAGS
151 #include <netinet/tcp_fsm.h>
152 #include <netinet/tcp_seq.h>
153 #include <netinet/tcp_timer.h>
154 #include <netinet/tcp_var.h>
155 #include <netinet/tcpip.h>
156 #include <netinet/tcp_debug.h>
157 
158 #ifdef notyet
159 extern struct mbuf *m_copypack();
160 #endif
161 
162 #define MAX_TCPOPTLEN	32	/* max # bytes that go in options */
163 
164 /*
165  * Knob to enable Congestion Window Monitoring, and control the
166  * burst size it allows.  Default burst is 4 packets, per
167  * the Internet draft.
168  */
169 int	tcp_cwm = 0;	/* nonzero: tcp_output() applies the CWM cap to snd_cwnd */
170 int	tcp_cwm_burstsize = 4;	/* allowed burst, in segments (scaled by txsegsize) */
171 
/*
 * tcp_segsize: compute the transmit and receive segment sizes for tp.
 *
 * The working size starts from tcp_mssdflt.  If the PCB attached to tp
 * has a route, the size is instead derived from the route's rmx_mtu
 * metric when that is nonzero, or from the interface MTU when the
 * interface is a loopback, when (IPv4) ip_mtudisc is set or the foreign
 * address is local, or when the connection is native IPv6.  In each of
 * those cases the IP header and struct tcphdr sizes are subtracted.
 * TCP option space (tcp_optlen()), IP/IPv6 option space and, with IPSEC,
 * the IPsec header size are then subtracted as well.
 *
 * If the address family is not handled or no route is available, the
 * size is tcp_mssdflt and none of the option/IPsec adjustments are made.
 *
 * Outputs:
 *	*txsegsizep	min(tp->t_peermss, size)
 *	*rxsegsizep	min(tp->t_ourmss, size); an estimate, see below
 *
 * Side effects: when the transmit size differs from tp->t_segsz,
 * tp->t_segsz is updated; if it shrank, tp->snd_cwnd and
 * tp->snd_ssthresh are rescaled to the new segment size first.
 */
172 static
173 #ifndef GPROF
174 __inline
175 #endif
176 void
177 tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep)
178 {
179 #ifdef INET
180 	struct inpcb *inp = tp->t_inpcb;
181 #endif
182 #ifdef INET6
183 	struct in6pcb *in6p = tp->t_in6pcb;
184 #endif
185 	struct rtentry *rt;
186 	struct ifnet *ifp;
187 	int size;
188 	int iphlen;
189 
190 #ifdef DIAGNOSTIC
191 	if (tp->t_inpcb && tp->t_in6pcb)
192 		panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
193 #endif
	/*
	 * Select the IP header size by address family.  An unhandled
	 * family skips straight to "out" with the default MSS.
	 */
194 	switch (tp->t_family) {
195 #ifdef INET
196 	case AF_INET:
197 		iphlen = sizeof(struct ip);
198 		break;
199 #endif
200 #ifdef INET6
201 	case AF_INET6:
202 		iphlen = sizeof(struct ip6_hdr);
203 		break;
204 #endif
205 	default:
206 		size = tcp_mssdflt;
207 		goto out;
208 	}
209 
	/*
	 * Fetch the route through whichever PCB is attached.  Without
	 * a route there is no MTU to consult, so use the default MSS.
	 */
210 	rt = NULL;
211 #ifdef INET
212 	if (inp)
213 		rt = in_pcbrtentry(inp);
214 #endif
215 #ifdef INET6
216 	if (in6p)
217 		rt = in6_pcbrtentry(in6p);
218 #endif
219 	if (rt == NULL) {
220 		size = tcp_mssdflt;
221 		goto out;
222 	}
223 
224 	ifp = rt->rt_ifp;
225 
	/*
	 * A nonzero route MTU metric takes precedence.  Otherwise the
	 * interface MTU is used only in the cases tested below; if none
	 * of them applies, size stays at tcp_mssdflt.
	 */
226 	size = tcp_mssdflt;
227 	if (rt->rt_rmx.rmx_mtu != 0)
228 		size = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
229 	else if (ifp->if_flags & IFF_LOOPBACK)
230 		size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
231 #ifdef INET
232 	else if (inp && ip_mtudisc)
233 		size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
234 	else if (inp && in_localaddr(inp->inp_faddr))
235 		size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
236 #endif
237 #ifdef INET6
238 	else if (in6p) {
239 #ifdef INET
240 		if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
241 			/* mapped addr case */
242 			struct in_addr d;
243 			bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
244 			if (ip_mtudisc || in_localaddr(d))
245 				size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
246 		} else
247 #endif
248 		{
249 			/*
250 			 * for IPv6, path MTU discovery is always turned on,
251 			 * or the node must use packet size <= 1280.
252 			 */
253 			size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
254 		}
255 	}
256 #endif
	/* Leave room for the TCP options reported by tcp_optlen(). */
257 	size -= tcp_optlen(tp);
258 	/*
259 	 * XXX tp->t_ourmss should have the right size, but without this code
260 	 * fragmentation will occur... need more investigation
261 	 */
262 #ifdef INET
263 	if (inp) {
264 #ifdef IPSEC
265 		size -= ipsec4_hdrsiz_tcp(tp);
266 #endif
267 		size -= ip_optlen(inp);
268 	}
269 #endif
270 #ifdef INET6
271 #ifdef INET
	/* IPv4 carried on an IPv6 PCB: only the IPsec size is subtracted. */
272 	if (in6p && tp->t_family == AF_INET) {
273 #ifdef IPSEC
274 		size -= ipsec4_hdrsiz_tcp(tp);
275 #endif
276 		/* XXX size -= ip_optlen(in6p); */
277 	} else
278 #endif
279 	if (in6p && tp->t_family == AF_INET6) {
280 #ifdef IPSEC
281 		size -= ipsec6_hdrsiz_tcp(tp);
282 #endif
283 		size -= ip6_optlen(in6p);
284 	}
285 #endif
286 
287  out:
288 	/*
289 	 * *rxsegsizep holds *estimated* inbound segment size (estimation
290 	 * assumes that path MTU is the same for both ways).  this is only
291 	 * for silly window avoidance, do not use the value for other purposes.
292 	 *
293 	 * ipseclen is subtracted from both sides, this may not be right.
294 	 * I'm not quite sure about this (could someone comment).
295 	 */
296 	*txsegsizep = min(tp->t_peermss, size);
297 	*rxsegsizep = min(tp->t_ourmss, size);
298 
299 	if (*txsegsizep != tp->t_segsz) {
300 		/*
301 		 * If the new segment size is larger, we don't want to
302 		 * mess up the congestion window, but if it is smaller
303 		 * we'll have to reduce the congestion window to ensure
304 		 * that we don't get into trouble with initial windows
305 		 * and the rest.  In any case, if the segment size
306 		 * has changed, chances are the path has, too, and
307 		 * our congestion window will be different.
308 		 */
309 		if (*txsegsizep < tp->t_segsz) {
			/*
			 * Keep the same count of whole segments, expressed
			 * in the new size, but never less than one segment.
			 */
310 			tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
311 					   * *txsegsizep, *txsegsizep);
312 			tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
313 						* *txsegsizep, *txsegsizep);
314 		}
315 		tp->t_segsz = *txsegsizep;
316 	}
317 }
318 
/*
 * tcp_build_datapkt: build the mbuf chain for a segment that carries data.
 *
 * Parameters:
 *	tp	connection; t_force, snd_nxt and snd_max select which
 *		tcpstat counters are charged
 *	so	socket whose send buffer (so_snd.sb_mb) supplies the data
 *	off	byte offset into the send buffer at which to start copying
 *	len	number of data bytes to attach
 *	hdrlen	bytes to reserve at the front of the mbuf for the headers
 *	mp	out: receives the new chain on success
 *
 * The first mbuf has max_linkhdr bytes skipped at the front and hdrlen
 * bytes accounted in m_len; the header contents are not filled in here.
 * The data is copied directly behind the header space when it fits in
 * the mbuf's trailing space, otherwise a copy made by m_copy() is
 * chained on m_next.
 *
 * Returns 0 on success, or ENOBUFS if an mbuf, cluster or data copy
 * could not be obtained; *mp is not written on failure.  The tcpstat
 * counters are updated before any allocation, so they are charged even
 * when ENOBUFS is returned.
 */
319 static
320 #ifndef GPROF
321 __inline
322 #endif
323 int
324 tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
325     long len, int hdrlen, struct mbuf **mp)
326 {
327 	struct mbuf *m;
328 
	/*
	 * Classify the segment for statistics: a forced one-byte send is
	 * a window probe, snd_nxt behind snd_max is a retransmission,
	 * anything else is new data.
	 */
329 	if (tp->t_force && len == 1)
330 		tcpstat.tcps_sndprobe++;
331 	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
332 		tcpstat.tcps_sndrexmitpack++;
333 		tcpstat.tcps_sndrexmitbyte += len;
334 	} else {
335 		tcpstat.tcps_sndpack++;
336 		tcpstat.tcps_sndbyte += len;
337 	}
338 #ifdef notyet
339 	if ((m = m_copypack(so->so_snd.sb_mb, off,
340 	    (int)len, max_linkhdr + hdrlen)) == 0)
341 		return (ENOBUFS);
342 	/*
343 	 * m_copypack left space for our hdr; use it.
344 	 */
345 	m->m_len += hdrlen;
346 	m->m_data -= hdrlen;
347 #else
	/*
	 * Ask for a cluster when the headers alone do not fit in a plain
	 * header mbuf, or when headers plus data fit within MCLBYTES.
	 * If the cluster was not attached (M_EXT clear), give up.
	 */
348 	MGETHDR(m, M_DONTWAIT, MT_HEADER);
349 	if (m != NULL &&
350 	    (max_linkhdr + hdrlen > MHLEN ||
351 	     max_linkhdr + hdrlen + len <= MCLBYTES)) {
352 		MCLGET(m, M_DONTWAIT);
353 		if ((m->m_flags & M_EXT) == 0) {
354 			m_freem(m);
355 			m = NULL;
356 		}
357 	}
358 	if (m == NULL)
359 		return (ENOBUFS);
	/* Skip link-header space; the mbuf so far holds only header room. */
360 	m->m_data += max_linkhdr;
361 	m->m_len = hdrlen;
	/*
	 * Copy the data in place behind the headers if it fits; otherwise
	 * chain a separate copy and free everything if that copy fails.
	 */
362 	if (len <= M_TRAILINGSPACE(m)) {
363 		m_copydata(so->so_snd.sb_mb, off, (int) len,
364 		    mtod(m, caddr_t) + hdrlen);
365 		m->m_len += len;
366 	} else {
367 		m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
368 		if (m->m_next == NULL) {
369 			m_freem(m);
370 			return (ENOBUFS);
371 		}
372 	}
373 #endif
374 
375 	*mp = m;
376 	return (0);
377 }
378 
379 /*
380  * Tcp output routine: figure out what should be sent and send it.
381  */
382 int
383 tcp_output(tp)
384 	struct tcpcb *tp;
385 {
386 	struct socket *so;
387 	struct route *ro;
388 	long len, win;
389 	int off, flags, error;
390 	struct mbuf *m;
391 	struct ip *ip;
392 #ifdef INET6
393 	struct ip6_hdr *ip6;
394 #endif
395 	struct tcphdr *th;
396 	u_char opt[MAX_TCPOPTLEN];
397 	unsigned optlen, hdrlen;
398 	int idle, sendalot, txsegsize, rxsegsize;
399 	int maxburst = TCP_MAXBURST;
400 	int af;		/* address family on the wire */
401 	int iphdrlen;
402 
403 #ifdef DIAGNOSTIC
404 	if (tp->t_inpcb && tp->t_in6pcb)
405 		panic("tcp_output: both t_inpcb and t_in6pcb are set");
406 #endif
407 	so = NULL;
408 	ro = NULL;
409 	if (tp->t_inpcb) {
410 		so = tp->t_inpcb->inp_socket;
411 		ro = &tp->t_inpcb->inp_route;
412 	}
413 #ifdef INET6
414 	else if (tp->t_in6pcb) {
415 		so = tp->t_in6pcb->in6p_socket;
416 		ro = (struct route *)&tp->t_in6pcb->in6p_route;
417 	}
418 #endif
419 
420 	switch (af = tp->t_family) {
421 #ifdef INET
422 	case AF_INET:
423 		if (tp->t_inpcb)
424 			break;
425 #ifdef INET6
426 		/* mapped addr case */
427 		if (tp->t_in6pcb)
428 			break;
429 #endif
430 		return EINVAL;
431 #endif
432 #ifdef INET6
433 	case AF_INET6:
434 		if (tp->t_in6pcb)
435 			break;
436 		return EINVAL;
437 #endif
438 	default:
439 		return EAFNOSUPPORT;
440 	}
441 
442 	tcp_segsize(tp, &txsegsize, &rxsegsize);
443 
444 	idle = (tp->snd_max == tp->snd_una);
445 
446 	/*
447 	 * Restart Window computation.  From draft-floyd-incr-init-win-03:
448 	 *
449 	 *	Optionally, a TCP MAY set the restart window to the
450 	 *	minimum of the value used for the initial window and
451 	 *	the current value of cwnd (in other words, using a
452 	 *	larger value for the restart window should never increase
453 	 *	the size of cwnd).
454 	 */
455 	if (tcp_cwm) {
456 		/*
457 		 * Hughes/Touch/Heidemann Congestion Window Monitoring.
458 		 * Count the number of packets currently pending
459 		 * acknowledgement, and limit our congestion window
460 		 * to a pre-determined allowed burst size plus that count.
461 		 * This prevents bursting once all pending packets have
462 		 * been acknowledged (i.e. transmission is idle).
463 		 *
464 		 * XXX Link this to Initial Window?
465 		 */
466 		tp->snd_cwnd = min(tp->snd_cwnd,
467 		    (tcp_cwm_burstsize * txsegsize) +
468 		    (tp->snd_nxt - tp->snd_una));
469 	} else {
470 		if (idle && tp->t_idle >= tp->t_rxtcur) {
471 			/*
472 			 * We have been idle for "a while" and no acks are
473 			 * expected to clock out any data we send --
474 			 * slow start to get ack "clock" running again.
475 			 */
476 			tp->snd_cwnd = min(tp->snd_cwnd,
477 			    TCP_INITIAL_WINDOW(tcp_init_win, txsegsize));
478 		}
479 	}
480 
481 again:
482 	/*
483 	 * Determine length of data that should be transmitted, and
484 	 * flags that should be used.  If there is some data or critical
485 	 * controls (SYN, RST) to send, then transmit; otherwise,
486 	 * investigate further.
487 	 */
488 	sendalot = 0;
489 	off = tp->snd_nxt - tp->snd_una;
490 	win = min(tp->snd_wnd, tp->snd_cwnd);
491 
492 	flags = tcp_outflags[tp->t_state];
493 	/*
494 	 * If in persist timeout with window of 0, send 1 byte.
495 	 * Otherwise, if window is small but nonzero
496 	 * and timer expired, we will send what we can
497 	 * and go to transmit state.
498 	 */
499 	if (tp->t_force) {
500 		if (win == 0) {
501 			/*
502 			 * If we still have some data to send, then
503 			 * clear the FIN bit.  Usually this would
504 			 * happen below when it realizes that we
505 			 * aren't sending all the data.  However,
506 			 * if we have exactly 1 byte of unsent data,
507 			 * then it won't clear the FIN bit below,
508 			 * and if we are in persist state, we wind
509 			 * up sending the packet without recording
510 			 * that we sent the FIN bit.
511 			 *
512 			 * We can't just blindly clear the FIN bit,
513 			 * because if we don't have any more data
514 			 * to send then the probe will be the FIN
515 			 * itself.
516 			 */
517 			if (off < so->so_snd.sb_cc)
518 				flags &= ~TH_FIN;
519 			win = 1;
520 		} else {
521 			TCP_TIMER_DISARM(tp, TCPT_PERSIST);
522 			tp->t_rxtshift = 0;
523 		}
524 	}
525 
526 	if (win < so->so_snd.sb_cc) {
527 		len = win - off;
528 		flags &= ~TH_FIN;
529 	} else
530 		len = so->so_snd.sb_cc - off;
531 
532 	if (len < 0) {
533 		/*
534 		 * If FIN has been sent but not acked,
535 		 * but we haven't been called to retransmit,
536 		 * len will be -1.  Otherwise, window shrank
537 		 * after we sent into it.  If window shrank to 0,
538 		 * cancel pending retransmit, pull snd_nxt back
539 		 * to (closed) window, and set the persist timer
540 		 * if it isn't already going.  If the window didn't
541 		 * close completely, just wait for an ACK.
542 		 *
543 		 * If we have a pending FIN, either it has already been
544 		 * transmitted or it is outside the window, so drop it.
545 		 * If the FIN has been transmitted, but this is not a
546 		 * retransmission, then len must be -1.  Therefore we also
547 		 * prevent here the sending of `gratuitous FINs'.  This
548 		 * eliminates the need to check for that case below (e.g.
549 		 * to back up snd_nxt before the FIN so that the sequence
550 		 * number is correct).
551 		 */
552 		len = 0;
553 		flags &= ~TH_FIN;
554 		if (win == 0) {
555 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
556 			tp->t_rxtshift = 0;
557 			tp->snd_nxt = tp->snd_una;
558 			if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
559 				tcp_setpersist(tp);
560 		}
561 	}
562 	if (len > txsegsize) {
563 		len = txsegsize;
564 		flags &= ~TH_FIN;
565 		sendalot = 1;
566 	}
567 
568 	win = sbspace(&so->so_rcv);
569 
570 	/*
571 	 * Sender silly window avoidance.  If connection is idle
572 	 * and can send all data, a maximum segment,
573 	 * at least a maximum default-size segment do it,
574 	 * or are forced, do it; otherwise don't bother.
575 	 * If peer's buffer is tiny, then send
576 	 * when window is at least half open.
577 	 * If retransmitting (possibly after persist timer forced us
578 	 * to send into a small window), then must resend.
579 	 */
580 	if (len) {
581 		if (len == txsegsize)
582 			goto send;
583 		if ((so->so_state & SS_MORETOCOME) == 0 &&
584 		    ((idle || tp->t_flags & TF_NODELAY) &&
585 		     len + off >= so->so_snd.sb_cc))
586 			goto send;
587 		if (tp->t_force)
588 			goto send;
589 		if (len >= tp->max_sndwnd / 2)
590 			goto send;
591 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
592 			goto send;
593 	}
594 
595 	/*
596 	 * Compare available window to amount of window known to peer
597 	 * (as advertised window less next expected input).  If the
598 	 * difference is at least twice the size of the largest segment
599 	 * we expect to receive (i.e. two segments) or at least 50% of
600 	 * the maximum possible window, then want to send a window update
601 	 * to peer.
602 	 */
603 	if (win > 0) {
604 		/*
605 		 * "adv" is the amount we can increase the window,
606 		 * taking into account that we are limited by
607 		 * TCP_MAXWIN << tp->rcv_scale.
608 		 */
609 		long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
610 			(tp->rcv_adv - tp->rcv_nxt);
611 
612 		if (adv >= (long) (2 * rxsegsize))
613 			goto send;
614 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
615 			goto send;
616 	}
617 
618 	/*
619 	 * Send if we owe peer an ACK.
620 	 */
621 	if (tp->t_flags & TF_ACKNOW)
622 		goto send;
623 	if (flags & (TH_SYN|TH_FIN|TH_RST))
624 		goto send;
625 	if (SEQ_GT(tp->snd_up, tp->snd_una))
626 		goto send;
627 
628 	/*
629 	 * TCP window updates are not reliable, rather a polling protocol
630 	 * using ``persist'' packets is used to insure receipt of window
631 	 * updates.  The three ``states'' for the output side are:
632 	 *	idle			not doing retransmits or persists
633 	 *	persisting		to move a small or zero window
634 	 *	(re)transmitting	and thereby not persisting
635 	 *
636 	 * tp->t_timer[TCPT_PERSIST]
637 	 *	is set when we are in persist state.
638 	 * tp->t_force
639 	 *	is set when we are called to send a persist packet.
640 	 * tp->t_timer[TCPT_REXMT]
641 	 *	is set when we are retransmitting
642 	 * The output side is idle when both timers are zero.
643 	 *
644 	 * If send window is too small, there is data to transmit, and no
645 	 * retransmit or persist is pending, then go to persist state.
646 	 * If nothing happens soon, send when timer expires:
647 	 * if window is nonzero, transmit what we can,
648 	 * otherwise force out a byte.
649 	 */
650 	if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
651 	    TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
652 		tp->t_rxtshift = 0;
653 		tcp_setpersist(tp);
654 	}
655 
656 	/*
657 	 * No reason to send a segment, just return.
658 	 */
659 	return (0);
660 
661 send:
662 	/*
663 	 * Before ESTABLISHED, force sending of initial options
664 	 * unless TCP set not to do any options.
665 	 * NOTE: we assume that the IP/TCP header plus TCP options
666 	 * always fit in a single mbuf, leaving room for a maximum
667 	 * link header, i.e.
668 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
669 	 */
670 	optlen = 0;
671 	switch (af) {
672 #ifdef INET
673 	case AF_INET:
674 		iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
675 		break;
676 #endif
677 #ifdef INET6
678 	case AF_INET6:
679 		iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
680 		break;
681 #endif
682 	default:	/*pacify gcc*/
683 		iphdrlen = 0;
684 		break;
685 	}
686 	hdrlen = iphdrlen;
687 	if (flags & TH_SYN) {
688 		struct rtentry *rt;
689 
690 		rt = NULL;
691 #ifdef INET
692 		if (tp->t_inpcb)
693 			rt = in_pcbrtentry(tp->t_inpcb);
694 #endif
695 #ifdef INET6
696 		if (tp->t_in6pcb)
697 			rt = in6_pcbrtentry(tp->t_in6pcb);
698 #endif
699 
700 		tp->snd_nxt = tp->iss;
701 		tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ?
702 						    rt->rt_ifp : NULL, af);
703 		if ((tp->t_flags & TF_NOOPT) == 0) {
704 			opt[0] = TCPOPT_MAXSEG;
705 			opt[1] = 4;
706 			opt[2] = (tp->t_ourmss >> 8) & 0xff;
707 			opt[3] = tp->t_ourmss & 0xff;
708 			optlen = 4;
709 
710 			if ((tp->t_flags & TF_REQ_SCALE) &&
711 			    ((flags & TH_ACK) == 0 ||
712 			    (tp->t_flags & TF_RCVD_SCALE))) {
713 				*((u_int32_t *) (opt + optlen)) = htonl(
714 					TCPOPT_NOP << 24 |
715 					TCPOPT_WINDOW << 16 |
716 					TCPOLEN_WINDOW << 8 |
717 					tp->request_r_scale);
718 				optlen += 4;
719 			}
720 		}
721  	}
722 
723  	/*
724 	 * Send a timestamp and echo-reply if this is a SYN and our side
725 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
726 	 * and our peer have sent timestamps in our SYN's.
727  	 */
728  	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
729  	     (flags & TH_RST) == 0 &&
730  	    ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
731 	     (tp->t_flags & TF_RCVD_TSTMP))) {
732 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
733 
734  		/* Form timestamp option as shown in appendix A of RFC 1323. */
735  		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
736  		*lp++ = htonl(TCP_TIMESTAMP(tp));
737  		*lp   = htonl(tp->ts_recent);
738  		optlen += TCPOLEN_TSTAMP_APPA;
739  	}
740 
741  	hdrlen += optlen;
742 
743 #ifdef DIAGNOSTIC
744 	if (len > txsegsize)
745 		panic("tcp data to be sent is larger than segment");
746  	if (max_linkhdr + hdrlen > MCLBYTES)
747 		panic("tcphdr too big");
748 #endif
749 
750 	/*
751 	 * Grab a header mbuf, attaching a copy of data to
752 	 * be transmitted, and initialize the header from
753 	 * the template for sends on this connection.
754 	 */
755 	if (len) {
756 		error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
757 		if (error)
758 			goto out;
759 		/*
760 		 * If we're sending everything we've got, set PUSH.
761 		 * (This will keep happy those implementations which only
762 		 * give data to the user when a buffer fills or
763 		 * a PUSH comes in.)
764 		 */
765 		if (off + len == so->so_snd.sb_cc)
766 			flags |= TH_PUSH;
767 	} else {
768 		if (tp->t_flags & TF_ACKNOW)
769 			tcpstat.tcps_sndacks++;
770 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
771 			tcpstat.tcps_sndctrl++;
772 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
773 			tcpstat.tcps_sndurg++;
774 		else
775 			tcpstat.tcps_sndwinup++;
776 
777 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
778 		if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
779 			MCLGET(m, M_DONTWAIT);
780 			if ((m->m_flags & M_EXT) == 0) {
781 				m_freem(m);
782 				m = NULL;
783 			}
784 		}
785 		if (m == NULL) {
786 			error = ENOBUFS;
787 			goto out;
788 		}
789 		m->m_data += max_linkhdr;
790 		m->m_len = hdrlen;
791 	}
792 	m->m_pkthdr.rcvif = (struct ifnet *)0;
793 	switch (af) {
794 #ifdef INET
795 	case AF_INET:
796 		ip = mtod(m, struct ip *);
797 #ifdef INET6
798 		ip6 = NULL;
799 #endif
800 		th = (struct tcphdr *)(ip + 1);
801 		break;
802 #endif
803 #ifdef INET6
804 	case AF_INET6:
805 		ip = NULL;
806 		ip6 = mtod(m, struct ip6_hdr *);
807 		th = (struct tcphdr *)(ip6 + 1);
808 		break;
809 #endif
810 	default:	/*pacify gcc*/
811 		ip = NULL;
812 #ifdef INET6
813 		ip6 = NULL;
814 #endif
815 		th = NULL;
816 		break;
817 	}
818 	if (tp->t_template == 0)
819 		panic("tcp_output");
820 	if (tp->t_template->m_len < iphdrlen)
821 		panic("tcp_output");
822 	bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen);
823 
824 	/*
825 	 * If we are doing retransmissions, then snd_nxt will
826 	 * not reflect the first unsent octet.  For ACK only
827 	 * packets, we do not want the sequence number of the
828 	 * retransmitted packet, we want the sequence number
829 	 * of the next unsent octet.  So, if there is no data
830 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
831 	 * when filling in ti_seq.  But if we are in persist
832 	 * state, snd_max might reflect one byte beyond the
833 	 * right edge of the window, so use snd_nxt in that
834 	 * case, since we know we aren't doing a retransmission.
835 	 * (retransmit and persist are mutually exclusive...)
836 	 */
837 	if (len || (flags & (TH_SYN|TH_FIN)) ||
838 	    TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
839 		th->th_seq = htonl(tp->snd_nxt);
840 	else
841 		th->th_seq = htonl(tp->snd_max);
842 	th->th_ack = htonl(tp->rcv_nxt);
843 	if (optlen) {
844 		bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
845 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
846 	}
847 	th->th_flags = flags;
848 	/*
849 	 * Calculate receive window.  Don't shrink window,
850 	 * but avoid silly window syndrome.
851 	 */
852 	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
853 		win = 0;
854 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
855 		win = (long)TCP_MAXWIN << tp->rcv_scale;
856 	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
857 		win = (long)(tp->rcv_adv - tp->rcv_nxt);
858 	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
859 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
860 		u_int32_t urp = tp->snd_up - tp->snd_nxt;
861 		if (urp > IP_MAXPACKET)
862 			urp = IP_MAXPACKET;
863 		th->th_urp = htons((u_int16_t)urp);
864 		th->th_flags |= TH_URG;
865 	} else
866 		/*
867 		 * If no urgent pointer to send, then we pull
868 		 * the urgent pointer to the left edge of the send window
869 		 * so that it doesn't drift into the send window on sequence
870 		 * number wraparound.
871 		 */
872 		tp->snd_up = tp->snd_una;		/* drag it along */
873 
874 	/*
875 	 * Set ourselves up to be checksummed just before the packet
876 	 * hits the wire.
877 	 */
878 	switch (af) {
879 #ifdef INET
880 	case AF_INET:
881 		m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
882 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
883 		if (len + optlen) {
884 			/* Fixup the pseudo-header checksum. */
885 			/* XXXJRT Not IP Jumbogram safe. */
886 			th->th_sum = in_cksum_addword(th->th_sum,
887 			    htons((u_int16_t) (len + optlen)));
888 		}
889 		break;
890 #endif
891 #ifdef INET6
892 	case AF_INET6:
893 		/*
894 		 * XXX Actually delaying the checksum is Hard
895 		 * XXX (well, maybe not for Itojun, but it is
896 		 * XXX for me), but we can still take advantage
897 		 * XXX of the cached pseudo-header checksum.
898 		 */
899 		/* equals to hdrlen + len */
900 		m->m_pkthdr.len = sizeof(struct ip6_hdr)
901 			+ sizeof(struct tcphdr) + optlen + len;
902 #ifdef notyet
903 		m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
904 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
905 #endif
906 		if (len + optlen) {
907 			/* Fixup the pseudo-header checksum. */
908 			/* XXXJRT: Not IPv6 Jumbogram safe. */
909 			th->th_sum = in_cksum_addword(th->th_sum,
910 			    htons((u_int16_t) (len + optlen)));
911 		}
912 #ifndef notyet
913 		th->th_sum = in6_cksum(m, 0, sizeof(struct ip6_hdr),
914 		    sizeof(struct tcphdr) + optlen + len);
915 #endif
916 		break;
917 #endif
918 	}
919 
920 	/*
921 	 * In transmit state, time the transmission and arrange for
922 	 * the retransmit.  In persist state, just set snd_max.
923 	 */
924 	if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
925 		tcp_seq startseq = tp->snd_nxt;
926 
927 		/*
928 		 * Advance snd_nxt over sequence space of this segment.
929 		 * There are no states in which we send both a SYN and a FIN,
930 		 * so we collapse the tests for these flags.
931 		 */
932 		if (flags & (TH_SYN|TH_FIN))
933 			tp->snd_nxt++;
934 		tp->snd_nxt += len;
935 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
936 			tp->snd_max = tp->snd_nxt;
937 			/*
938 			 * Time this transmission if not a retransmission and
939 			 * not currently timing anything.
940 			 */
941 			if (tp->t_rtt == 0) {
942 				tp->t_rtt = 1;
943 				tp->t_rtseq = startseq;
944 				tcpstat.tcps_segstimed++;
945 			}
946 		}
947 
948 		/*
949 		 * Set retransmit timer if not currently set,
950 		 * and not doing an ack or a keep-alive probe.
951 		 * Initial value for retransmit timer is smoothed
952 		 * round-trip time + 2 * round-trip time variance.
953 		 * Initialize shift counter which is used for backoff
954 		 * of retransmit time.
955 		 */
956 		if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
957 		    tp->snd_nxt != tp->snd_una) {
958 			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
959 			if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
960 				TCP_TIMER_DISARM(tp, TCPT_PERSIST);
961 				tp->t_rxtshift = 0;
962 			}
963 		}
964 	} else
965 		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
966 			tp->snd_max = tp->snd_nxt + len;
967 
968 #ifdef TCP_DEBUG
969 	/*
970 	 * Trace.
971 	 */
972 	if (so->so_options & SO_DEBUG) {
973 		/*
974 		 * need to recover version # field, which was overwritten
975 		 * on ip_cksum computation.
976 		 */
977 		struct ip *sip;
978 		sip = mtod(m, struct ip *);
979 		switch (af) {
980 #ifdef INET
981 		case AF_INET:
982 			sip->ip_v = 4;
983 			break;
984 #endif
985 #ifdef INET6
986 		case AF_INET6:
987 			sip->ip_v = 6;
988 			break;
989 #endif
990 		}
991 		tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
992 	}
993 #endif
994 
995 	/*
996 	 * Fill in IP length and desired time to live and
997 	 * send to IP level.  There should be a better way
998 	 * to handle ttl and tos; we could keep them in
999 	 * the template, but need a way to checksum without them.
1000 	 */
1001 	m->m_pkthdr.len = hdrlen + len;
1002 
1003 	switch (af) {
1004 #ifdef INET
1005 	case AF_INET:
1006 		ip->ip_len = m->m_pkthdr.len;
1007 		if (tp->t_inpcb) {
1008 			ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
1009 			ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
1010 		}
1011 #ifdef INET6
1012 		else if (tp->t_in6pcb) {
1013 			ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/
1014 			ip->ip_tos = 0;	/*XXX*/
1015 		}
1016 #endif
1017 		break;
1018 #endif
1019 #ifdef INET6
1020 	case AF_INET6:
1021 		ip6->ip6_nxt = IPPROTO_TCP;
1022 		if (tp->t_in6pcb) {
1023 			/*
1024 			 * we separately set hoplimit for every segment, since
1025 			 * the user might want to change the value via
1026 			 * setsockopt. Also, desired default hop limit might
1027 			 * be changed via Neighbor Discovery.
1028 			 */
1029 			ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb,
1030 				ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
1031 		}
1032 		/* ip6->ip6_flow = ??? */
1033 		/* ip6_plen will be filled in ip6_output(). */
1034 		break;
1035 #endif
1036 	}
1037 
1038 #ifdef IPSEC
1039 	if (ipsec_setsocket(m, so) != 0) {
1040 		m_freem(m);
1041 		error = ENOBUFS;
1042 		goto out;
1043 	}
1044 #endif /*IPSEC*/
1045 
1046 	switch (af) {
1047 #ifdef INET
1048 	case AF_INET:
1049 	    {
1050 		struct mbuf *opts;
1051 
1052 		if (tp->t_inpcb)
1053 			opts = tp->t_inpcb->inp_options;
1054 		else
1055 			opts = NULL;
1056 		error = ip_output(m, opts, ro,
1057 			(ip_mtudisc ? IP_MTUDISC : 0) |
1058 			(so->so_options & SO_DONTROUTE),
1059 			0);
1060 		break;
1061 	    }
1062 #endif
1063 #ifdef INET6
1064 	case AF_INET6:
1065 	    {
1066 		struct ip6_pktopts *opts;
1067 
1068 		if (tp->t_in6pcb)
1069 			opts = tp->t_in6pcb->in6p_outputopts;
1070 		else
1071 			opts = NULL;
1072 		error = ip6_output(m, opts, (struct route_in6 *)ro,
1073 			so->so_options & SO_DONTROUTE, 0, NULL);
1074 		break;
1075 	    }
1076 #endif
1077 	default:
1078 		error = EAFNOSUPPORT;
1079 		break;
1080 	}
1081 	if (error) {
1082 out:
1083 		if (error == ENOBUFS) {
1084 			tcpstat.tcps_selfquench++;
1085 #ifdef INET
1086 			if (tp->t_inpcb)
1087 				tcp_quench(tp->t_inpcb, 0);
1088 #endif
1089 #ifdef INET6
1090 			if (tp->t_in6pcb)
1091 				tcp6_quench(tp->t_in6pcb, 0);
1092 #endif
1093 			return (0);
1094 		}
1095 		if ((error == EHOSTUNREACH || error == ENETDOWN)
1096 		    && TCPS_HAVERCVDSYN(tp->t_state)) {
1097 			tp->t_softerror = error;
1098 			return (0);
1099 		}
1100 		return (error);
1101 	}
1102 	tcpstat.tcps_sndtotal++;
1103 	if (tp->t_flags & TF_DELACK)
1104 		tcpstat.tcps_delack++;
1105 
1106 	/*
1107 	 * Data sent (as far as we can tell).
1108 	 * If this advertises a larger window than any other segment,
1109 	 * then remember the size of the advertised window.
1110 	 * Any pending ACK has now been sent.
1111 	 */
1112 	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1113 		tp->rcv_adv = tp->rcv_nxt + win;
1114 	tp->last_ack_sent = tp->rcv_nxt;
1115 	tp->t_flags &= ~TF_ACKNOW;
1116 	TCP_CLEAR_DELACK(tp);
1117 #ifdef DIAGNOSTIC
1118 	if (maxburst < 0)
1119 		printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
1120 #endif
1121 	if (sendalot && (!tcp_do_newreno || --maxburst))
1122 		goto again;
1123 	return (0);
1124 }
1125 
1126 void
1127 tcp_setpersist(tp)
1128 	struct tcpcb *tp;
1129 {
1130 	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
1131 	int nticks;
1132 
1133 	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
1134 		panic("tcp_output REXMT");
1135 	/*
1136 	 * Start/restart persistance timer.
1137 	 */
1138 	if (t < tp->t_rttmin)
1139 		t = tp->t_rttmin;
1140 	TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
1141 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
1142 	TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
1143 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1144 		tp->t_rxtshift++;
1145 }
1146