xref: /openbsd-src/sys/netinet/tcp_output.c (revision 39b94094a22f24a827039933d795fcc2bc0bbe6c)
1 /*	$OpenBSD: tcp_output.c,v 1.33 2000/09/20 17:00:22 provos Exp $	*/
2 /*	$NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)tcp_output.c	8.3 (Berkeley) 12/30/93
37  */
38 
39 /*
40 %%% portions-copyright-nrl-95
41 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
43 Reserved. All rights under this copyright have been assigned to the US
44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
46 software.
47 You should have received a copy of the license with this software. If you
48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
49 */
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/protosw.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/errno.h>
59 #include <sys/domain.h>
60 
61 #include <net/route.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/tcp.h>
69 #define	TCPOUTFLAGS
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/tcp_debug.h>
76 
77 #ifdef TUBA
78 #include <netiso/iso.h>
79 #include <netiso/tuba_table.h>
80 #endif
81 
82 #ifdef INET6
83 #include <netinet6/tcpipv6.h>
84 #endif /* INET6 */
85 
86 #ifdef TCP_SIGNATURE
87 #include <sys/md5k.h>
88 #endif /* TCP_SIGNATURE */
89 
90 #ifdef notyet
91 extern struct mbuf *m_copypack();
92 #endif
93 
94 #ifdef TCP_SACK
95 extern int tcprexmtthresh;
96 #endif
97 
98 #ifdef TCP_SACK
99 #ifdef TCP_SACK_DEBUG
100 void
101 tcp_print_holes(tp)
102 struct tcpcb *tp;
103 {
104 	struct sackhole *p = tp->snd_holes;
105 	if (p == 0)
106 		return;
107 	printf("Hole report: start--end dups rxmit\n");
108 	while (p) {
109 		printf("%x--%x d %d r %x\n",  p->start, p->end, p->dups,
110                     p->rxmit);
111 		p = p->next;
112 	}
113 	printf("\n");
114 }
115 #endif /* TCP_SACK_DEBUG */
116 
117 /*
118  * Returns pointer to a sackhole if there are any pending retransmissions;
119  * NULL otherwise.
120  */
121 struct sackhole *
122 tcp_sack_output(tp)
123 register struct tcpcb *tp;
124 {
125 	struct sackhole *p;
126 	if (tp->sack_disable)
127 		return 0;
128 	p = tp->snd_holes;
129 	while (p) {
130 #ifndef TCP_FACK
131 		if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
132 #else
133 		/* In FACK, if p->dups is less than tcprexmtthresh, but
134 		 * snd_fack advances more than tcprextmtthresh * tp->t_maxseg,
135 		 * tcp_input() will try fast retransmit. This forces output.
136 		 */
137 		if ((p->dups >= tcprexmtthresh ||
138 		     tp->t_dupacks == tcprexmtthresh) &&
139 		    SEQ_LT(p->rxmit, p->end)) {
140 #endif /* TCP_FACK */
141 			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
142 				p = p->next;
143 				continue;
144 			}
145 #ifdef TCP_SACK_DEBUG
146 			if (p)
147 				tcp_print_holes(tp);
148 #endif
149 			return p;
150 		}
151         	p = p->next;
152 	}
153 	return 0;
154 }
155 
156 /*
157  * After a timeout, the SACK list may be rebuilt.  This SACK information
158  * should be used to avoid retransmitting SACKed data.  This function
159  * traverses the SACK list to see if snd_nxt should be moved forward.
160  */
161 void
162 tcp_sack_adjust(tp)
163 	struct tcpcb *tp;
164 {
165 	struct sackhole *cur = tp->snd_holes;
166 	if (cur == 0)
167 		return; /* No holes */
168 	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
169 		return; /* We're already beyond any SACKed blocks */
170 	/*
171 	 * Two cases for which we want to advance snd_nxt:
172 	 * i) snd_nxt lies between end of one hole and beginning of another
173 	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
174 	 */
175 	while (cur->next) {
176 		if (SEQ_LT(tp->snd_nxt, cur->end))
177 			return;
178 		if (SEQ_GEQ(tp->snd_nxt, cur->next->start))
179 			cur = cur->next;
180 		else {
181 			tp->snd_nxt = cur->next->start;
182 			return;
183 		}
184 	}
185 	if (SEQ_LT(tp->snd_nxt, cur->end))
186 		return;
187 	tp->snd_nxt = tp->rcv_lastsack;
188 	return;
189 }
190 #endif /* TCP_SACK */
191 
192 /*
193  * Tcp output routine: figure out what should be sent and send it.
194  */
195 int
196 tcp_output(tp)
197 	register struct tcpcb *tp;
198 {
199 	register struct socket *so = tp->t_inpcb->inp_socket;
200 	register long len, win;
201 	int off, flags, error;
202 	register struct mbuf *m;
203 	register struct tcphdr *th;
204 	u_char opt[MAX_TCPOPTLEN];
205 	unsigned int optlen, hdrlen;
206 	int idle, sendalot = 0;
207 #ifdef TCP_SACK
208 	int i, sack_rxmit = 0;
209 	struct sackhole *p;
210 #endif
211 #if defined(TCP_SACK)
212 	int maxburst = TCP_MAXBURST;
213 #endif
214 #ifdef TCP_SIGNATURE
215 	unsigned int sigoff;
216 #endif /* TCP_SIGNATURE */
217 
218 #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
219 	if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE))
220 		return (EINVAL);
221 #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */
222 
223 	/*
224 	 * Determine length of data that should be transmitted,
225 	 * and flags that will be used.
226 	 * If there is some data or critical controls (SYN, RST)
227 	 * to send, then transmit; otherwise, investigate further.
228 	 */
229 	idle = (tp->snd_max == tp->snd_una);
230 	if (idle && tp->t_idle >= tp->t_rxtcur)
231 		/*
232 		 * We have been idle for "a while" and no acks are
233 		 * expected to clock out any data we send --
234 		 * slow start to get ack "clock" running again.
235 		 */
236 		tp->snd_cwnd = tp->t_maxseg;
237 again:
238 #ifdef TCP_SACK
239 	/*
240 	 * If we've recently taken a timeout, snd_max will be greater than
241 	 * snd_nxt.  There may be SACK information that allows us to avoid
242 	 * resending already delivered data.  Adjust snd_nxt accordingly.
243 	 */
244 	if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
245 		tcp_sack_adjust(tp);
246 #endif
247 	off = tp->snd_nxt - tp->snd_una;
248 #if defined(TCP_SACK) && defined(TCP_FACK)
249 	/* Normally, sendable data is limited by off < tp->snd_cwnd.
250 	 * But in FACK, sendable data is limited by snd_awnd < snd_cwnd,
251 	 * regardless of offset.
252 	 */
253 	if (!tp->sack_disable && (tp->t_dupacks > tcprexmtthresh))
254 		win = tp->snd_wnd;
255 	else
256 #endif
257 	win = ulmin(tp->snd_wnd, tp->snd_cwnd);
258 
259 	flags = tcp_outflags[tp->t_state];
260 
261 #ifdef TCP_SACK
262 	/*
263 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
264 	 * to send out new data (when sendalot is 1), bypass this function.
265 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
266 	 * we're replacing a (future) new transmission with a retransmission
267 	 * now, and we previously incremented snd_cwnd in tcp_input().
268 	 */
269 	if (!tp->sack_disable && !sendalot) {
270 		if (tp->t_dupacks >= tcprexmtthresh &&
271 		    (p = tcp_sack_output(tp))) {
272 			off = p->rxmit - tp->snd_una;
273 			sack_rxmit = 1;
274 #if 0
275 			/* Coalesce holes into a single retransmission */
276 #endif
277 			len = min(tp->t_maxseg, p->end - p->rxmit);
278 #ifndef TCP_FACK
279 			/* in FACK, hold snd_cwnd constant during recovery */
280 			if (SEQ_LT(tp->snd_una, tp->snd_last))
281 				tp->snd_cwnd -= tp->t_maxseg;
282 #endif
283     		}
284 	}
285 #endif /* TCP_SACK */
286 
287 	sendalot = 0;
288 	/*
289 	 * If in persist timeout with window of 0, send 1 byte.
290 	 * Otherwise, if window is small but nonzero
291 	 * and timer expired, we will send what we can
292 	 * and go to transmit state.
293 	 */
294 	if (tp->t_force) {
295 		if (win == 0) {
296 			/*
297 			 * If we still have some data to send, then
298 			 * clear the FIN bit.  Usually this would
299 			 * happen below when it realizes that we
300 			 * aren't sending all the data.  However,
301 			 * if we have exactly 1 byte of unset data,
302 			 * then it won't clear the FIN bit below,
303 			 * and if we are in persist state, we wind
304 			 * up sending the packet without recording
305 			 * that we sent the FIN bit.
306 			 *
307 			 * We can't just blindly clear the FIN bit,
308 			 * because if we don't have any more data
309 			 * to send then the probe will be the FIN
310 			 * itself.
311 			 */
312 			if (off < so->so_snd.sb_cc)
313 				flags &= ~TH_FIN;
314 			win = 1;
315 		} else {
316 			tp->t_timer[TCPT_PERSIST] = 0;
317 			tp->t_rxtshift = 0;
318 		}
319 	}
320 
321 #ifdef TCP_SACK
322 	if (!sack_rxmit) {
323 #endif
324 	len = ulmin(so->so_snd.sb_cc, win) - off;
325 
326 #if defined(TCP_SACK) && defined(TCP_FACK)
327 	/*
328 	 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and
329 	 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
330 	 * do not send data (like zero window conditions)
331 	 */
332 	if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
333 	    (tp->snd_awnd >= tp->snd_cwnd))
334 		len = 0;
335 #endif /* TCP_FACK */
336 #ifdef TCP_SACK
337 	}
338 #endif
339 
340 	if (len < 0) {
341 		/*
342 		 * If FIN has been sent but not acked,
343 		 * but we haven't been called to retransmit,
344 		 * len will be -1.  Otherwise, window shrank
345 		 * after we sent into it.  If window shrank to 0,
346 		 * cancel pending retransmit and pull snd_nxt
347 		 * back to (closed) window.  We will enter persist
348 		 * state below.  If the window didn't close completely,
349 		 * just wait for an ACK.
350 		 */
351 		len = 0;
352 		if (win == 0) {
353 			tp->t_timer[TCPT_REXMT] = 0;
354 			tp->snd_nxt = tp->snd_una;
355 		}
356 	}
357 	if (len > tp->t_maxseg) {
358 		len = tp->t_maxseg;
359 		sendalot = 1;
360 	}
361 	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
362 		flags &= ~TH_FIN;
363 
364 	win = sbspace(&so->so_rcv);
365 
366 	/*
367 	 * Sender silly window avoidance.  If connection is idle
368 	 * and can send all data, a maximum segment,
369 	 * at least a maximum default-size segment do it,
370 	 * or are forced, do it; otherwise don't bother.
371 	 * If peer's buffer is tiny, then send
372 	 * when window is at least half open.
373 	 * If retransmitting (possibly after persist timer forced us
374 	 * to send into a small window), then must resend.
375 	 */
376 	if (len) {
377 		if (len == tp->t_maxseg)
378 			goto send;
379 		if ((idle || tp->t_flags & TF_NODELAY) &&
380 		    len + off >= so->so_snd.sb_cc)
381 			goto send;
382 		if (tp->t_force)
383 			goto send;
384 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
385 			goto send;
386 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
387 			goto send;
388 #ifdef TCP_SACK
389 		if (sack_rxmit)
390 			goto send;
391 #endif
392 	}
393 
394 	/*
395 	 * Compare available window to amount of window
396 	 * known to peer (as advertised window less
397 	 * next expected input).  If the difference is at least two
398 	 * max size segments, or at least 50% of the maximum possible
399 	 * window, then want to send a window update to peer.
400 	 */
401 	if (win > 0) {
402 		/*
403 		 * "adv" is the amount we can increase the window,
404 		 * taking into account that we are limited by
405 		 * TCP_MAXWIN << tp->rcv_scale.
406 		 */
407 		long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) -
408 			(tp->rcv_adv - tp->rcv_nxt);
409 
410 		if (adv >= (long) (2 * tp->t_maxseg))
411 			goto send;
412 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
413 			goto send;
414 	}
415 
416 	/*
417 	 * Send if we owe peer an ACK.
418 	 */
419 	if (tp->t_flags & TF_ACKNOW)
420 		goto send;
421 	if (flags & (TH_SYN|TH_RST))
422 		goto send;
423 	if (SEQ_GT(tp->snd_up, tp->snd_una))
424 		goto send;
425 	/*
426 	 * If our state indicates that FIN should be sent
427 	 * and we have not yet done so, or we're retransmitting the FIN,
428 	 * then we need to send.
429 	 */
430 	if (flags & TH_FIN &&
431 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
432 		goto send;
433 #ifdef TCP_SACK
434 	/*
435 	 * In SACK, it is possible for tcp_output to fail to send a segment
436 	 * after the retransmission timer has been turned off.  Make sure
437 	 * that the retransmission timer is set.
438 	 */
439 	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
440 	    tp->t_timer[TCPT_REXMT] == 0 &&
441 	    tp->t_timer[TCPT_PERSIST] == 0) {
442 		tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
443 		return (0);
444 	}
445 #endif /* TCP_SACK */
446 
447 	/*
448 	 * TCP window updates are not reliable, rather a polling protocol
449 	 * using ``persist'' packets is used to insure receipt of window
450 	 * updates.  The three ``states'' for the output side are:
451 	 *	idle			not doing retransmits or persists
452 	 *	persisting		to move a small or zero window
453 	 *	(re)transmitting	and thereby not persisting
454 	 *
455 	 * tp->t_timer[TCPT_PERSIST]
456 	 *	is set when we are in persist state.
457 	 * tp->t_force
458 	 *	is set when we are called to send a persist packet.
459 	 * tp->t_timer[TCPT_REXMT]
460 	 *	is set when we are retransmitting
461 	 * The output side is idle when both timers are zero.
462 	 *
463 	 * If send window is too small, there is data to transmit, and no
464 	 * retransmit or persist is pending, then go to persist state.
465 	 * If nothing happens soon, send when timer expires:
466 	 * if window is nonzero, transmit what we can,
467 	 * otherwise force out a byte.
468 	 */
469 	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
470 	    tp->t_timer[TCPT_PERSIST] == 0) {
471 		tp->t_rxtshift = 0;
472 		tcp_setpersist(tp);
473 	}
474 
475 	/*
476 	 * No reason to send a segment, just return.
477 	 */
478 	return (0);
479 
480 send:
481 	/*
482 	 * Before ESTABLISHED, force sending of initial options
483 	 * unless TCP set not to do any options.
484 	 * NOTE: we assume that the IP/TCP header plus TCP options
485 	 * always fit in a single mbuf, leaving room for a maximum
486 	 * link header, i.e.
487 	 *	max_linkhdr + sizeof(network header) + sizeof(struct tcphdr +
488 	 * 		optlen <= MHLEN
489 	 */
490 	optlen = 0;
491 
492 	switch (tp->pf) {
493 	case 0:	/*default to PF_INET*/
494 #ifdef INET
495 	case PF_INET:
496 		hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
497 		break;
498 #endif /* INET */
499 #ifdef INET6
500 	case PF_INET6:
501 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
502 		break;
503 #endif /* INET6 */
504 	default:
505 		return (EPFNOSUPPORT);
506 	}
507 
508 	if (flags & TH_SYN) {
509 		tp->snd_nxt = tp->iss;
510 		if ((tp->t_flags & TF_NOOPT) == 0) {
511 			u_int16_t mss;
512 
513 			opt[0] = TCPOPT_MAXSEG;
514 			opt[1] = 4;
515 			mss = htons((u_int16_t) tcp_mss(tp, flags & TH_ACK ?
516 							tp->t_maxopd : 0));
517 			bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss));
518 			optlen = 4;
519 
520 			if (flags & TH_ACK)
521 				tcp_mss_update(tp);
522 #ifdef TCP_SACK
523 			/*
524 			 * If this is the first SYN of connection (not a SYN
525 			 * ACK), include SACK_PERMIT_HDR option.  If this is a
526 			 * SYN ACK, include SACK_PERMIT_HDR option if peer has
527 			 * already done so.
528 			 */
529 			if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
530 			    (tp->t_flags & TF_SACK_PERMIT))) {
531 				*((u_int32_t *) (opt + optlen)) =
532 				    htonl(TCPOPT_SACK_PERMIT_HDR);
533 				optlen += 4;
534 			}
535 #endif
536 
537 			if ((tp->t_flags & TF_REQ_SCALE) &&
538 			    ((flags & TH_ACK) == 0 ||
539 			    (tp->t_flags & TF_RCVD_SCALE))) {
540 				*((u_int32_t *) (opt + optlen)) = htonl(
541 					TCPOPT_NOP << 24 |
542 					TCPOPT_WINDOW << 16 |
543 					TCPOLEN_WINDOW << 8 |
544 					tp->request_r_scale);
545 				optlen += 4;
546 			}
547 		}
548 	}
549 
550 	/*
551 	 * Send a timestamp and echo-reply if this is a SYN and our side
552 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
553 	 * and our peer have sent timestamps in our SYN's.
554 	 */
555 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
556 	     (flags & TH_RST) == 0 &&
557 	    ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
558 	     (tp->t_flags & TF_RCVD_TSTMP))) {
559 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
560 
561 		/* Form timestamp option as shown in appendix A of RFC 1323. */
562 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
563 		*lp++ = htonl(tcp_now);
564 		*lp   = htonl(tp->ts_recent);
565 		optlen += TCPOLEN_TSTAMP_APPA;
566 	}
567 
568 #ifdef TCP_SIGNATURE
569 	if (tp->t_flags & TF_SIGNATURE) {
570 		u_int8_t *bp = (u_int8_t *)(opt + optlen);
571 
572 		/* Send signature option */
573 		*(bp++) = TCPOPT_SIGNATURE;
574 		*(bp++) = TCPOLEN_SIGNATURE;
575 		sigoff = optlen + 2;
576 
577 		{
578 			unsigned int i;
579 
580 			for (i = 0; i < 16; i++)
581 				*(bp++) = 0;
582 		}
583 
584 		optlen += TCPOLEN_SIGNATURE;
585 
586 		/* Pad options list to the next 32 bit boundary and
587 		 * terminate it.
588 		 */
589 		*bp++ = TCPOPT_NOP;
590 		*bp++ = TCPOPT_EOL;
591 		optlen += 2;
592 	}
593 #endif /* TCP_SIGNATURE */
594 
595 #ifdef TCP_SACK
596 	/*
597 	 * Send SACKs if necessary.  This should be the last option processed.
598 	 * Only as many SACKs are sent as are permitted by the maximum options
599 	 * size.  No more than three SACKs are sent.
600 	 */
601 	if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
602 	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
603 	    tp->rcv_numsacks) {
604 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
605 		u_int32_t *olp = lp++;
606 		int count = 0;  /* actual number of SACKs inserted */
607 		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
608 
609 		maxsack = min(maxsack, TCP_MAX_SACK);
610 		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
611 			struct sackblk sack = tp->sackblks[i];
612 			if (sack.start == 0 && sack.end == 0)
613 				continue;
614 			*lp++ = htonl(sack.start);
615 			*lp++ = htonl(sack.end);
616 			count++;
617 		}
618 		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
619 		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
620 	}
621 #endif /* TCP_SACK */
622 
623 #ifdef DIAGNOSTIC
624 	if (optlen > MAX_TCPOPTLEN)
625 		panic("tcp_output: options too long");
626 #endif /* DIAGNOSTIC */
627 
628 	hdrlen += optlen;
629 
630 	/*
631 	 * Adjust data length if insertion of options will
632 	 * bump the packet length beyond the t_maxopd length.
633 	 */
634 	if (len > tp->t_maxopd - optlen) {
635 		len = tp->t_maxopd - optlen;
636 		sendalot = 1;
637 		flags &= ~TH_FIN;
638 	 }
639 
640 #ifdef DIAGNOSTIC
641 	if (max_linkhdr + hdrlen > MCLBYTES)
642 		panic("tcphdr too big");
643 #endif
644 
645 	/*
646 	 * Grab a header mbuf, attaching a copy of data to
647 	 * be transmitted, and initialize the header from
648 	 * the template for sends on this connection.
649 	 */
650 	if (len) {
651 		if (tp->t_force && len == 1)
652 			tcpstat.tcps_sndprobe++;
653 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
654 			tcpstat.tcps_sndrexmitpack++;
655 			tcpstat.tcps_sndrexmitbyte += len;
656 		} else {
657 			tcpstat.tcps_sndpack++;
658 			tcpstat.tcps_sndbyte += len;
659 		}
660 #ifdef notyet
661 		if ((m = m_copypack(so->so_snd.sb_mb, off,
662 		    (int)len, max_linkhdr + hdrlen)) == 0) {
663 			error = ENOBUFS;
664 			goto out;
665 		}
666 		/*
667 		 * m_copypack left space for our hdr; use it.
668 		 */
669 		m->m_len += hdrlen;
670 		m->m_data -= hdrlen;
671 #else
672 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
673 		if (m != NULL) {
674 			MCLGET(m, M_DONTWAIT);
675 			if ((m->m_flags & M_EXT) == 0) {
676 				m_freem(m);
677 				m = NULL;
678 			}
679 		}
680 		if (m == NULL) {
681 			error = ENOBUFS;
682 			goto out;
683 		}
684 		m->m_data += max_linkhdr;
685 		m->m_len = hdrlen;
686 		if (len <= MCLBYTES - hdrlen - max_linkhdr) {
687 			m_copydata(so->so_snd.sb_mb, off, (int) len,
688 			    mtod(m, caddr_t) + hdrlen);
689 			m->m_len += len;
690 		} else {
691 			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
692 			if (m->m_next == 0) {
693 				(void) m_free(m);
694 				error = ENOBUFS;
695 				goto out;
696 			}
697 		}
698 #endif
699 		/*
700 		 * If we're sending everything we've got, set PUSH.
701 		 * (This will keep happy those implementations which only
702 		 * give data to the user when a buffer fills or
703 		 * a PUSH comes in.)
704 		 */
705 		if (off + len == so->so_snd.sb_cc)
706 			flags |= TH_PUSH;
707 	} else {
708 		if (tp->t_flags & TF_ACKNOW)
709 			tcpstat.tcps_sndacks++;
710 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
711 			tcpstat.tcps_sndctrl++;
712 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
713 			tcpstat.tcps_sndurg++;
714 		else
715 			tcpstat.tcps_sndwinup++;
716 
717 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
718 		if (m != NULL) {
719 			MCLGET(m, M_DONTWAIT);
720 			if ((m->m_flags & M_EXT) == 0) {
721 				m_freem(m);
722 				m = NULL;
723 			}
724 		}
725 		if (m == NULL) {
726 			error = ENOBUFS;
727 			goto out;
728 		}
729 		m->m_data += max_linkhdr;
730 		m->m_len = hdrlen;
731 	}
732 	m->m_pkthdr.rcvif = (struct ifnet *)0;
733 
734 	if (!tp->t_template)
735 		panic("tcp_output");
736 #ifdef DIAGNOSTIC
737 	if (tp->t_template->m_len != hdrlen - optlen)
738 		panic("tcp_output: template len != hdrlen - optlen");
739 #endif /* DIAGNOSTIC */
740 	bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t),
741 		tp->t_template->m_len);
742 	th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len -
743 		sizeof(struct tcphdr));
744 
745 	/*
746 	 * Fill in fields, remembering maximum advertised
747 	 * window for use in delaying messages about window sizes.
748 	 * If resending a FIN, be sure not to use a new sequence number.
749 	 */
750 	if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
751 	    (tp->snd_nxt == tp->snd_max))
752 		tp->snd_nxt--;
753 	/*
754 	 * If we are doing retransmissions, then snd_nxt will
755 	 * not reflect the first unsent octet.  For ACK only
756 	 * packets, we do not want the sequence number of the
757 	 * retransmitted packet, we want the sequence number
758 	 * of the next unsent octet.  So, if there is no data
759 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
760 	 * when filling in ti_seq.  But if we are in persist
761 	 * state, snd_max might reflect one byte beyond the
762 	 * right edge of the window, so use snd_nxt in that
763 	 * case, since we know we aren't doing a retransmission.
764 	 * (retransmit and persist are mutually exclusive...)
765 	 */
766 	if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
767 		th->th_seq = htonl(tp->snd_nxt);
768 	else
769 		th->th_seq = htonl(tp->snd_max);
770 
771 #ifdef TCP_SACK
772 	if (sack_rxmit) {
773 		/*
774 		 * If sendalot was turned on (due to option stuffing), turn it
775 		 * off. Properly set th_seq field.  Advance the ret'x pointer
776 		 * by len.
777 		 */
778 		if (sendalot)
779 			sendalot = 0;
780 		th->th_seq = htonl(p->rxmit);
781 		p->rxmit += len;
782 #if defined(TCP_SACK) && defined(TCP_FACK)
783 		tp->retran_data += len;
784 #endif /* TCP_FACK */
785 	}
786 #endif /* TCP_SACK */
787 
788 	th->th_ack = htonl(tp->rcv_nxt);
789 	if (optlen) {
790 		bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
791 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
792 	}
793 	th->th_flags = flags;
794 
795 	/*
796 	 * Calculate receive window.  Don't shrink window,
797 	 * but avoid silly window syndrome.
798 	 */
799 	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
800 		win = 0;
801 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
802 		win = (long)TCP_MAXWIN << tp->rcv_scale;
803 	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
804 		win = (long)(tp->rcv_adv - tp->rcv_nxt);
805 	if (flags & TH_RST)
806 		win = 0;
807 	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
808 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
809 		u_int32_t urp = tp->snd_up - tp->snd_nxt;
810 		if (urp > IP_MAXPACKET)
811 			urp = IP_MAXPACKET;
812 		th->th_urp = htons((u_int16_t)urp);
813 		th->th_flags |= TH_URG;
814 	} else
815 		/*
816 		 * If no urgent pointer to send, then we pull
817 		 * the urgent pointer to the left edge of the send window
818 		 * so that it doesn't drift into the send window on sequence
819 		 * number wraparound.
820 		 */
821 		tp->snd_up = tp->snd_una;		/* drag it along */
822 
823 	/* Put TCP length in pseudo-header */
824 	switch (tp->pf) {
825 	case 0:	/*default to PF_INET*/
826 #ifdef INET
827 	case AF_INET:
828 		if (len + optlen)
829 			mtod(m, struct ipovly *)->ih_len = htons((u_int16_t)(
830 				sizeof (struct tcphdr) + optlen + len));
831 		break;
832 #endif /* INET */
833 #ifdef INET6
834 	case AF_INET6:
835 		break;
836 #endif /* INET6 */
837 	}
838 
839 #ifdef TCP_SIGNATURE
840 	if (tp->t_flags & TF_SIGNATURE) {
841 		MD5_CTX ctx;
842 		union sockaddr_union sa;
843 		struct tdb *tdb;
844 
845 		bzero(&sa, sizeof(union sockaddr_union));
846 
847 		switch (tp->pf) {
848 		case 0:	/*default to PF_INET*/
849 #ifdef INET
850 		case AF_INET:
851 			sa.sa.sa_len = sizeof(struct sockaddr_in);
852 			sa.sa.sa_family = AF_INET;
853 			sa.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
854 			break;
855 #endif /* INET */
856 #ifdef INET6
857 		case AF_INET6:
858 			sa.sa.sa_len = sizeof(struct sockaddr_in6);
859 			sa.sa.sa_family = AF_INET6;
860 			sa.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
861 			break;
862 #endif /* INET6 */
863 		}
864 
865 		/* XXX gettdb() should really be called at spltdb().      */
866 		/* XXX this is splsoftnet(), currently they are the same. */
867 		tdb = gettdb(0, &sa, IPPROTO_TCP);
868 		if (tdb == NULL)
869 			return (EPERM);
870 
871 		MD5Init(&ctx);
872 
873 		switch (tp->pf) {
874 		case 0:	/*default to PF_INET*/
875 #ifdef INET
876 		case AF_INET:
877 			{
878 				struct ippseudo ippseudo;
879 				struct ipovly *ipovly;
880 
881 				ipovly = mtod(m, struct ipovly *);
882 
883 				ippseudo.ippseudo_src = ipovly->ih_src;
884 				ippseudo.ippseudo_dst = ipovly->ih_dst;
885 				ippseudo.ippseudo_pad = 0;
886 				ippseudo.ippseudo_p   = IPPROTO_TCP;
887 				ippseudo.ippseudo_len = ipovly->ih_len;
888 				MD5Update(&ctx, (char *)&ippseudo,
889 					sizeof(struct ippseudo));
890 				MD5Update(&ctx, mtod(m, caddr_t) +
891 					sizeof(struct ip),
892 					sizeof(struct tcphdr));
893 			}
894 			break;
895 #endif /* INET */
896 #ifdef INET6
897 		case AF_INET6:
898 			{
899 				static int printed = 0;
900 
901 				if (!printed) {
902 					printf("error: TCP MD5 support for "
903 						"IPv6 not yet implemented.\n");
904 					printed = 1;
905 				}
906 			}
907 			break;
908 #endif /* INET6 */
909 		}
910 
911 		if (len && m_apply(m, hdrlen, len, tcp_signature_apply,
912 				(caddr_t)&ctx))
913 			return (EINVAL);
914 
915 		MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen);
916 		MD5Final(mtod(m, caddr_t) + hdrlen - optlen + sigoff, &ctx);
917 	}
918 #endif /* TCP_SIGNATURE */
919 
920 	/*
921 	 * Put TCP length in extended header, and then
922 	 * checksum extended header and data.
923 	 */
924 	switch (tp->pf) {
925 	case 0:	/*default to PF_INET*/
926 #ifdef INET
927 	case AF_INET:
928 		th->th_sum = in_cksum(m, (int)(hdrlen + len));
929 		break;
930 #endif /* INET */
931 #ifdef INET6
932 	case AF_INET6:
933 		m->m_pkthdr.len = hdrlen + len;
934   		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
935 			hdrlen - sizeof(struct ip6_hdr) + len);
936 		break;
937 #endif /* INET6 */
938 	}
939 
940 	/*
941 	 * In transmit state, time the transmission and arrange for
942 	 * the retransmit.  In persist state, just set snd_max.
943 	 */
944 	if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
945 		tcp_seq startseq = tp->snd_nxt;
946 
947 		/*
948 		 * Advance snd_nxt over sequence space of this segment.
949 		 */
950 		if (flags & (TH_SYN|TH_FIN)) {
951 			if (flags & TH_SYN)
952 				tp->snd_nxt++;
953 			if (flags & TH_FIN) {
954 				tp->snd_nxt++;
955 				tp->t_flags |= TF_SENTFIN;
956 			}
957 		}
958 #ifdef TCP_SACK
959 		if (!tp->sack_disable) {
960 			if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
961 				goto timer;
962 			}
963 		}
964 #endif
965 		tp->snd_nxt += len;
966 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
967 			tp->snd_max = tp->snd_nxt;
968 			/*
969 			 * Time this transmission if not a retransmission and
970 			 * not currently timing anything.
971 			 */
972 			if (tp->t_rtt == 0) {
973 				tp->t_rtt = 1;
974 				tp->t_rtseq = startseq;
975 				tcpstat.tcps_segstimed++;
976 			}
977 		}
978 
979 		/*
980 		 * Set retransmit timer if not currently set,
981 		 * and not doing an ack or a keep-alive probe.
982 		 * Initial value for retransmit timer is smoothed
983 		 * round-trip time + 2 * round-trip time variance.
984 		 * Initialize shift counter which is used for backoff
985 		 * of retransmit time.
986 		 */
987 #ifdef TCP_SACK
988  timer:
989 		if (!tp->sack_disable && sack_rxmit &&
990 		    tp->t_timer[TCPT_REXMT] == 0 &&
991 		    tp->snd_nxt != tp->snd_max) {
992 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
993 			if (tp->t_timer[TCPT_PERSIST]) {
994 				tp->t_timer[TCPT_PERSIST] = 0;
995 				tp->t_rxtshift = 0;
996 			}
997 		}
998 #endif
999 
1000 		if (tp->t_timer[TCPT_REXMT] == 0 &&
1001 		    tp->snd_nxt != tp->snd_una) {
1002 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1003 			if (tp->t_timer[TCPT_PERSIST]) {
1004 				tp->t_timer[TCPT_PERSIST] = 0;
1005 				tp->t_rxtshift = 0;
1006 			}
1007 		}
1008 	} else
1009 		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
1010 			tp->snd_max = tp->snd_nxt + len;
1011 
1012 	/*
1013 	 * Trace.
1014 	 */
1015 	if (so->so_options & SO_DEBUG)
1016 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0,
1017 			len);
1018 
1019 	/*
1020 	 * Fill in IP length and desired time to live and
1021 	 * send to IP level.  There should be a better way
1022 	 * to handle ttl and tos; we could keep them in
1023 	 * the template, but need a way to checksum without them.
1024 	 */
1025 	m->m_pkthdr.len = hdrlen + len;
1026 
1027 	switch (tp->pf) {
1028 	case 0:	/*default to PF_INET*/
1029 #ifdef INET
1030 	case AF_INET:
1031 		{
1032 			struct ip *ip;
1033 
1034 			ip = mtod(m, struct ip *);
1035 			ip->ip_len = m->m_pkthdr.len;
1036 			ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
1037 			ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
1038 		}
1039 		error = ip_output(m, tp->t_inpcb->inp_options,
1040 			&tp->t_inpcb->inp_route,
1041 			(ip_mtudisc ? IP_MTUDISC : 0) |
1042 				  (so->so_options & SO_DONTROUTE),
1043 			0, tp->t_inpcb);
1044 		break;
1045 #endif /* INET */
1046 #ifdef INET6
1047 	case AF_INET6:
1048 		{
1049 			struct ip6_hdr *ipv6;
1050 
1051 			ipv6 = mtod(m, struct ip6_hdr *);
1052 			ipv6->ip6_plen = m->m_pkthdr.len -
1053 				sizeof(struct ip6_hdr);
1054 			ipv6->ip6_nxt = IPPROTO_TCP;
1055 			ipv6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
1056 		}
1057 		error = ip6_output(m, tp->t_inpcb->inp_outputopts6,
1058 			  &tp->t_inpcb->inp_route6,
1059 			  (so->so_options & SO_DONTROUTE), NULL, NULL);
1060 		break;
1061 #endif /* INET6 */
1062 #ifdef TUBA
1063 	case AF_ISO:
1064 		if (tp->t_tuba_pcb)
1065 			error = tuba_output(m, tp);
1066 		break;
1067 #endif /* TUBA */
1068 	}
1069 
1070 #if defined(TCP_SACK) && defined(TCP_FACK)
1071 	/* Update snd_awnd to reflect the new data that was sent.  */
1072 	tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
1073 		tp->retran_data;
1074 #endif /* defined(TCP_SACK) && defined(TCP_FACK) */
1075 
1076 	if (error) {
1077 out:
1078 		if (error == ENOBUFS) {
1079 			tcp_quench(tp->t_inpcb, 0);
1080 			return (0);
1081 		}
1082 		if ((error == EHOSTUNREACH || error == ENETDOWN)
1083 		    && TCPS_HAVERCVDSYN(tp->t_state)) {
1084 			tp->t_softerror = error;
1085 			return (0);
1086 		}
1087 		return (error);
1088 	}
1089 	tcpstat.tcps_sndtotal++;
1090 
1091 	/*
1092 	 * Data sent (as far as we can tell).
1093 	 * If this advertises a larger window than any other segment,
1094 	 * then remember the size of the advertised window.
1095 	 * Any pending ACK has now been sent.
1096 	 */
1097 	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1098 		tp->rcv_adv = tp->rcv_nxt + win;
1099 	tp->last_ack_sent = tp->rcv_nxt;
1100 	tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1101 #if defined(TCP_SACK)
1102 	if (sendalot && --maxburst)
1103 #else
1104 	if (sendalot)
1105 #endif
1106 		goto again;
1107 	return (0);
1108 }
1109 
1110 void
1111 tcp_setpersist(tp)
1112 	register struct tcpcb *tp;
1113 {
1114 	register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1115 
1116 	if (tp->t_timer[TCPT_REXMT])
1117 		panic("tcp_output REXMT");
1118 	/*
1119 	 * Start/restart persistance timer.
1120 	 */
1121 	if (t < tp->t_rttmin)
1122 		t = tp->t_rttmin;
1123 	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1124 	    t * tcp_backoff[tp->t_rxtshift],
1125 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
1126 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1127 		tp->t_rxtshift++;
1128 }
1129