xref: /openbsd-src/sys/netinet/tcp_output.c (revision b9de2beaa8a944cfbbd81ed84b47568ee4d25b2b)
1 /*	$OpenBSD: tcp_output.c,v 1.21 1999/07/06 20:17:53 cmetz Exp $	*/
2 /*	$NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)tcp_output.c	8.3 (Berkeley) 12/30/93
37  */
38 
39 /*
40 %%% portions-copyright-nrl-95
41 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
43 Reserved. All rights under this copyright have been assigned to the US
44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
46 software.
47 You should have received a copy of the license with this software. If you
48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
49 */
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/protosw.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/errno.h>
59 #include <sys/domain.h>
60 
61 #include <net/route.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/tcp.h>
69 #define	TCPOUTFLAGS
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/tcp_debug.h>
76 
77 #ifdef TUBA
78 #include <netiso/iso.h>
79 #include <netiso/tuba_table.h>
80 #endif
81 
82 #ifdef INET6
83 #include <netinet6/tcpipv6.h>
84 #endif /* INET6 */
85 
86 #ifdef TCP_SIGNATURE
87 #include <sys/md5k.h>
88 #endif /* TCP_SIGNATURE */
89 
90 #ifdef notyet
91 extern struct mbuf *m_copypack();
92 #endif
93 
94 #ifdef TCP_SACK
95 extern int tcprexmtthresh;
96 #endif
97 
98 #ifdef TCP_SACK
99 #ifdef TCP_SACK_DEBUG
100 void
101 tcp_print_holes(tp)
102 struct tcpcb *tp;
103 {
104 	struct sackhole *p = tp->snd_holes;
105 	if (p == 0)
106 		return;
107 	printf("Hole report: start--end dups rxmit\n");
108 	while (p) {
109 		printf("%x--%x d %d r %x\n",  p->start, p->end, p->dups,
110                     p->rxmit);
111 		p = p->next;
112 	}
113 	printf("\n");
114 }
115 #endif /* TCP_SACK_DEBUG */
116 
117 /*
118  * Returns pointer to a sackhole if there are any pending retransmissions;
119  * NULL otherwise.
120  */
121 struct sackhole *
122 tcp_sack_output(tp)
123 register struct tcpcb *tp;
124 {
125 	struct sackhole *p;
126 	if (tp->sack_disable)
127 		return 0;
128 	p = tp->snd_holes;
129 	while (p) {
130 		if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
131 			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
132 				p = p->next;
133 				continue;
134 			}
135 #ifdef TCP_SACK_DEBUG
136 			if (p)
137 				tcp_print_holes(tp);
138 #endif
139 			return p;
140 		}
141         	p = p->next;
142 	}
143 	return 0;
144 }
145 
146 /*
147  * After a timeout, the SACK list may be rebuilt.  This SACK information
148  * should be used to avoid retransmitting SACKed data.  This function
149  * traverses the SACK list to see if snd_nxt should be moved forward.
150  */
151 void
152 tcp_sack_adjust(tp)
153 	struct tcpcb *tp;
154 {
155 	int i;
156 
157 	for (i = 0; i < tp->rcv_numsacks; i++) {
158 		if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].start))
159 			break;
160 		if (SEQ_LEQ(tp->sackblks[i].end, tp->snd_nxt))
161 			continue;
162 		if (tp->sackblks[i].start == 0 && tp->sackblks[i].end == 0)
163 			continue;
164 		/* snd_nxt must be in middle of block of SACKed data */
165 		tp->snd_nxt = tp->sackblks[i].end;
166 		break;
167 	}
168 }
169 #endif /* TCP_SACK */
170 
171 /*
172  * Tcp output routine: figure out what should be sent and send it.
173  */
174 int
175 tcp_output(tp)
176 	register struct tcpcb *tp;
177 {
178 	register struct socket *so = tp->t_inpcb->inp_socket;
179 	register long len, win;
180 	int off, flags, error;
181 	register struct mbuf *m;
182 	register struct tcphdr *th;
183 	u_char opt[MAX_TCPOPTLEN];
184 	unsigned int optlen, hdrlen;
185 	int idle, sendalot;
186 #ifdef TCP_SACK
187 	int i, sack_rxmit = 0;
188 	struct sackhole *p;
189 #endif
190 #if defined(TCP_SACK) || defined(TCP_NEWRENO)
191 	int maxburst = TCP_MAXBURST;
192 #endif
193 #ifdef TCP_SIGNATURE
194 	unsigned int sigoff;
195 #endif /* TCP_SIGNATURE */
196 
197 #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
198 	if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE))
199 		return (EINVAL);
200 #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */
201 
202 	/*
203 	 * Determine length of data that should be transmitted,
204 	 * and flags that will be used.
205 	 * If there is some data or critical controls (SYN, RST)
206 	 * to send, then transmit; otherwise, investigate further.
207 	 */
208 	idle = (tp->snd_max == tp->snd_una);
209 	if (idle && tp->t_idle >= tp->t_rxtcur)
210 		/*
211 		 * We have been idle for "a while" and no acks are
212 		 * expected to clock out any data we send --
213 		 * slow start to get ack "clock" running again.
214 		 */
215 		tp->snd_cwnd = tp->t_maxseg;
216 again:
217 	sendalot = 0;
218 #ifdef TCP_SACK
219 	/*
220 	 * If we've recently taken a timeout, snd_max will be greater than
221 	 * snd_nxt.  There may be SACK information that allows us to avoid
222 	 * resending already delivered data.  Adjust snd_nxt accordingly.
223 	 */
224 	if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
225 		tcp_sack_adjust(tp);
226 #endif
227 	off = tp->snd_nxt - tp->snd_una;
228 	win = ulmin(tp->snd_wnd, tp->snd_cwnd);
229 
230 	flags = tcp_outflags[tp->t_state];
231 	/*
232 	 * If in persist timeout with window of 0, send 1 byte.
233 	 * Otherwise, if window is small but nonzero
234 	 * and timer expired, we will send what we can
235 	 * and go to transmit state.
236 	 */
237 
238 #ifdef TCP_SACK
239 	/*
240 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
241 	 * to send out new data (when sendalot is 1), bypass this function.
242 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
243 	 * we're replacing a (future) new transmission with a retransmission
244 	 * now, and we previously incremented snd_cwnd in tcp_input().
245 	 */
246 	if (!tp->sack_disable && !sendalot) {
247 		if ((p = tcp_sack_output(tp))) {
248 			off = p->rxmit - tp->snd_una;
249 			sack_rxmit = 1;
250 #if 0
251 			/* Coalesce holes into a single retransmission */
252 #endif
253 			len = min(tp->t_maxseg, p->end - p->rxmit);
254 #ifndef TCP_FACK
255 			/* in FACK, hold snd_cwnd constant during recovery */
256 			if (SEQ_LT(tp->snd_una, tp->snd_last))
257 				tp->snd_cwnd -= tp->t_maxseg;
258 #endif
259     		}
260 	}
261 #endif /* TCP_SACK */
262 
263 	if (tp->t_force) {
264 		if (win == 0) {
265 			/*
266 			 * If we still have some data to send, then
267 			 * clear the FIN bit.  Usually this would
268 			 * happen below when it realizes that we
269 			 * aren't sending all the data.  However,
270 			 * if we have exactly 1 byte of unset data,
271 			 * then it won't clear the FIN bit below,
272 			 * and if we are in persist state, we wind
273 			 * up sending the packet without recording
274 			 * that we sent the FIN bit.
275 			 *
276 			 * We can't just blindly clear the FIN bit,
277 			 * because if we don't have any more data
278 			 * to send then the probe will be the FIN
279 			 * itself.
280 			 */
281 			if (off < so->so_snd.sb_cc)
282 				flags &= ~TH_FIN;
283 			win = 1;
284 		} else {
285 			tp->t_timer[TCPT_PERSIST] = 0;
286 			tp->t_rxtshift = 0;
287 		}
288 	}
289 
290 #ifdef TCP_SACK
291 	if (!sack_rxmit) {
292 #endif
293 	len = ulmin(so->so_snd.sb_cc, win) - off;
294 
295 #if defined(TCP_SACK) && defined(TCP_FACK)
296 	/*
297 	 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and
298 	 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
299 	 * do not send data (like zero window conditions)
300 	 */
301 	if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
302 	    (tp->snd_awnd >= tp->snd_cwnd))
303 		len = 0;
304 #endif /* TCP_FACK */
305 #ifdef TCP_SACK
306 	}
307 #endif
308 
309 	if (len < 0) {
310 		/*
311 		 * If FIN has been sent but not acked,
312 		 * but we haven't been called to retransmit,
313 		 * len will be -1.  Otherwise, window shrank
314 		 * after we sent into it.  If window shrank to 0,
315 		 * cancel pending retransmit and pull snd_nxt
316 		 * back to (closed) window.  We will enter persist
317 		 * state below.  If the window didn't close completely,
318 		 * just wait for an ACK.
319 		 */
320 		len = 0;
321 		if (win == 0) {
322 			tp->t_timer[TCPT_REXMT] = 0;
323 			tp->snd_nxt = tp->snd_una;
324 		}
325 	}
326 	if (len > tp->t_maxseg) {
327 		len = tp->t_maxseg;
328 		sendalot = 1;
329 	}
330 	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
331 		flags &= ~TH_FIN;
332 
333 	win = sbspace(&so->so_rcv);
334 
335 	/*
336 	 * Sender silly window avoidance.  If connection is idle
337 	 * and can send all data, a maximum segment,
338 	 * at least a maximum default-size segment do it,
339 	 * or are forced, do it; otherwise don't bother.
340 	 * If peer's buffer is tiny, then send
341 	 * when window is at least half open.
342 	 * If retransmitting (possibly after persist timer forced us
343 	 * to send into a small window), then must resend.
344 	 */
345 	if (len) {
346 		if (len == tp->t_maxseg)
347 			goto send;
348 		if ((idle || tp->t_flags & TF_NODELAY) &&
349 		    len + off >= so->so_snd.sb_cc)
350 			goto send;
351 		if (tp->t_force)
352 			goto send;
353 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
354 			goto send;
355 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
356 			goto send;
357 #ifdef TCP_SACK
358 		if (sack_rxmit)
359 			goto send;
360 #endif
361 	}
362 
363 	/*
364 	 * Compare available window to amount of window
365 	 * known to peer (as advertised window less
366 	 * next expected input).  If the difference is at least two
367 	 * max size segments, or at least 50% of the maximum possible
368 	 * window, then want to send a window update to peer.
369 	 */
370 	if (win > 0) {
371 		/*
372 		 * "adv" is the amount we can increase the window,
373 		 * taking into account that we are limited by
374 		 * TCP_MAXWIN << tp->rcv_scale.
375 		 */
376 		long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) -
377 			(tp->rcv_adv - tp->rcv_nxt);
378 
379 		if (adv >= (long) (2 * tp->t_maxseg))
380 			goto send;
381 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
382 			goto send;
383 	}
384 
385 	/*
386 	 * Send if we owe peer an ACK.
387 	 */
388 	if (tp->t_flags & TF_ACKNOW)
389 		goto send;
390 	if (flags & (TH_SYN|TH_RST))
391 		goto send;
392 	if (SEQ_GT(tp->snd_up, tp->snd_una))
393 		goto send;
394 	/*
395 	 * If our state indicates that FIN should be sent
396 	 * and we have not yet done so, or we're retransmitting the FIN,
397 	 * then we need to send.
398 	 */
399 	if (flags & TH_FIN &&
400 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
401 		goto send;
402 
403 	/*
404 	 * TCP window updates are not reliable, rather a polling protocol
405 	 * using ``persist'' packets is used to insure receipt of window
406 	 * updates.  The three ``states'' for the output side are:
407 	 *	idle			not doing retransmits or persists
408 	 *	persisting		to move a small or zero window
409 	 *	(re)transmitting	and thereby not persisting
410 	 *
411 	 * tp->t_timer[TCPT_PERSIST]
412 	 *	is set when we are in persist state.
413 	 * tp->t_force
414 	 *	is set when we are called to send a persist packet.
415 	 * tp->t_timer[TCPT_REXMT]
416 	 *	is set when we are retransmitting
417 	 * The output side is idle when both timers are zero.
418 	 *
419 	 * If send window is too small, there is data to transmit, and no
420 	 * retransmit or persist is pending, then go to persist state.
421 	 * If nothing happens soon, send when timer expires:
422 	 * if window is nonzero, transmit what we can,
423 	 * otherwise force out a byte.
424 	 */
425 	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
426 	    tp->t_timer[TCPT_PERSIST] == 0) {
427 		tp->t_rxtshift = 0;
428 		tcp_setpersist(tp);
429 	}
430 
431 	/*
432 	 * No reason to send a segment, just return.
433 	 */
434 	return (0);
435 
436 send:
437 	/*
438 	 * Before ESTABLISHED, force sending of initial options
439 	 * unless TCP set not to do any options.
440 	 * NOTE: we assume that the IP/TCP header plus TCP options
441 	 * always fit in a single mbuf, leaving room for a maximum
442 	 * link header, i.e.
443 	 *	max_linkhdr + sizeof(network header) + sizeof(struct tcphdr) +
444 	 *		optlen <= MHLEN
445 	 */
446 	optlen = 0;
447 
448 #if defined(INET) && defined(INET6)
449 	switch (tp->pf) {
450 #else /* defined(INET) && defined(INET6) */
451 	switch (0) {
452 #endif /* defined(INET) && defined(INET6) */
453 	case 0:		/* If tp->pf is 0, then assume IPv4 unless not avail */
454 #ifdef INET
455 	case PF_INET:
456 		hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
457 		break;
458 #endif /* INET */
459 #ifdef INET6
460 	case PF_INET6:
461 		hdrlen = sizeof(struct ipv6) + sizeof(struct tcphdr);
462 		break;
463 #endif /* INET6 */
464 	default:
465 		return (EPFNOSUPPORT);
466 	}
467 
468 	if (flags & TH_SYN) {
469 		tp->snd_nxt = tp->iss;
470 		if ((tp->t_flags & TF_NOOPT) == 0) {
471 			u_int16_t mss;
472 
473 			opt[0] = TCPOPT_MAXSEG;
474 			opt[1] = 4;
475 			mss = htons((u_int16_t) tcp_mss(tp, 0));
476 			bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss));
477 			optlen = 4;
478 #ifdef TCP_SACK
479 			/*
480 			 * If this is the first SYN of connection (not a SYN
481 			 * ACK), include SACK_PERMIT_HDR option.  If this is a
482 			 * SYN ACK, include SACK_PERMIT_HDR option if peer has
483 			 * already done so.
484 			 */
485 			if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
486 			    (tp->t_flags & TF_SACK_PERMIT))) {
487 				*((u_int32_t *) (opt + optlen)) =
488 				    htonl(TCPOPT_SACK_PERMIT_HDR);
489 				optlen += 4;
490 			}
491 #endif
492 
493 			if ((tp->t_flags & TF_REQ_SCALE) &&
494 			    ((flags & TH_ACK) == 0 ||
495 			    (tp->t_flags & TF_RCVD_SCALE))) {
496 				*((u_int32_t *) (opt + optlen)) = htonl(
497 					TCPOPT_NOP << 24 |
498 					TCPOPT_WINDOW << 16 |
499 					TCPOLEN_WINDOW << 8 |
500 					tp->request_r_scale);
501 				optlen += 4;
502 			}
503 		}
504 	}
505 
506 	/*
507 	 * Send a timestamp and echo-reply if this is a SYN and our side
508 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
509 	 * and our peer have sent timestamps in our SYN's.
510 	 */
511 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
512 	     (flags & TH_RST) == 0 &&
513 	    ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
514 	     (tp->t_flags & TF_RCVD_TSTMP))) {
515 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
516 
517 		/* Form timestamp option as shown in appendix A of RFC 1323. */
518 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
519 		*lp++ = htonl(tcp_now);
520 		*lp   = htonl(tp->ts_recent);
521 		optlen += TCPOLEN_TSTAMP_APPA;
522 	}
523 
524 #ifdef TCP_SIGNATURE
525 	if (tp->t_flags & TF_SIGNATURE) {
526 		u_int8_t *bp = (u_int8_t *)(opt + optlen);
527 
528 		/* Send signature option */
529 		*(bp++) = TCPOPT_SIGNATURE;
530 		*(bp++) = TCPOLEN_SIGNATURE;
531 		sigoff = optlen + 2;
532 
533 		{
534 			unsigned int i;
535 
536 			for (i = 0; i < 16; i++)
537 				*(bp++) = 0;
538 		}
539 
540 		optlen += TCPOLEN_SIGNATURE;
541 
542 		/* Pad options list to the next 32 bit boundary and
543 		 * terminate it.
544 		 */
545 		*bp++ = TCPOPT_NOP;
546 		*bp++ = TCPOPT_EOL;
547 		optlen += 2;
548 	}
549 #endif /* TCP_SIGNATURE */
550 
551 #ifdef TCP_SACK
552 	/*
553 	 * Send SACKs if necessary.  This should be the last option processed.
554 	 * Only as many SACKs are sent as are permitted by the maximum options
555 	 * size.  No more than three SACKs are sent.
556 	 */
557 	if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
558 	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
559 	    tp->rcv_numsacks) {
560 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
561 		u_int32_t *olp = lp++;
562 		int count = 0;  /* actual number of SACKs inserted */
563 		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
564 
565 		maxsack = min(maxsack, TCP_MAX_SACK);
566 		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
567 			struct sackblk sack = tp->sackblks[i];
568 			if (sack.start == 0 && sack.end == 0)
569 				continue;
570 			*lp++ = htonl(sack.start);
571 			*lp++ = htonl(sack.end);
572 			count++;
573 		}
574 		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
575 		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
576 	}
577 #endif /* TCP_SACK */
578 
579 #ifdef DIAGNOSTIC
580 	if (optlen > MAX_TCPOPTLEN)
581 		panic("tcp_output: options too long");
582 #endif /* DIAGNOSTIC */
583 
584 	hdrlen += optlen;
585 
586 	/*
587 	 * Adjust data length if insertion of options will
588 	 * bump the packet length beyond the t_maxopd length.
589 	 */
590 	if (len > tp->t_maxopd - optlen) {
591 		len = tp->t_maxopd - optlen;
592 		sendalot = 1;
593 		flags &= ~TH_FIN;
594 	 }
595 
596 #ifdef DIAGNOSTIC
597 	if (max_linkhdr + hdrlen > MHLEN)
598 		panic("tcphdr too big");
599 #endif
600 
601 	/*
602 	 * Grab a header mbuf, attaching a copy of data to
603 	 * be transmitted, and initialize the header from
604 	 * the template for sends on this connection.
605 	 */
606 	if (len) {
607 		if (tp->t_force && len == 1)
608 			tcpstat.tcps_sndprobe++;
609 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
610 			tcpstat.tcps_sndrexmitpack++;
611 			tcpstat.tcps_sndrexmitbyte += len;
612 		} else {
613 			tcpstat.tcps_sndpack++;
614 			tcpstat.tcps_sndbyte += len;
615 		}
616 #ifdef notyet
617 		if ((m = m_copypack(so->so_snd.sb_mb, off,
618 		    (int)len, max_linkhdr + hdrlen)) == 0) {
619 			error = ENOBUFS;
620 			goto out;
621 		}
622 		/*
623 		 * m_copypack left space for our hdr; use it.
624 		 */
625 		m->m_len += hdrlen;
626 		m->m_data -= hdrlen;
627 #else
628 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
629 		if (m == NULL) {
630 			error = ENOBUFS;
631 			goto out;
632 		}
633 		m->m_data += max_linkhdr;
634 		m->m_len = hdrlen;
635 		if (len <= MHLEN - hdrlen - max_linkhdr) {
636 			m_copydata(so->so_snd.sb_mb, off, (int) len,
637 			    mtod(m, caddr_t) + hdrlen);
638 			m->m_len += len;
639 		} else {
640 			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
641 			if (m->m_next == 0) {
642 				(void) m_free(m);
643 				error = ENOBUFS;
644 				goto out;
645 			}
646 		}
647 #endif
648 		/*
649 		 * If we're sending everything we've got, set PUSH.
650 		 * (This will keep happy those implementations which only
651 		 * give data to the user when a buffer fills or
652 		 * a PUSH comes in.)
653 		 */
654 		if (off + len == so->so_snd.sb_cc)
655 			flags |= TH_PUSH;
656 	} else {
657 		if (tp->t_flags & TF_ACKNOW)
658 			tcpstat.tcps_sndacks++;
659 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
660 			tcpstat.tcps_sndctrl++;
661 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
662 			tcpstat.tcps_sndurg++;
663 		else
664 			tcpstat.tcps_sndwinup++;
665 
666 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
667 		if (m == NULL) {
668 			error = ENOBUFS;
669 			goto out;
670 		}
671 		m->m_data += max_linkhdr;
672 		m->m_len = hdrlen;
673 	}
674 	m->m_pkthdr.rcvif = (struct ifnet *)0;
675 
676 	if (!tp->t_template)
677 		panic("tcp_output");
678 #ifdef DIAGNOSTIC
679 	if (tp->t_template->m_len != hdrlen - optlen)
680 		panic("tcp_output: template len != hdrlen - optlen");
681 #endif /* DIAGNOSTIC */
682 	bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t),
683 		tp->t_template->m_len);
684 	th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len -
685 		sizeof(struct tcphdr));
686 
687 	/*
688 	 * Fill in fields, remembering maximum advertised
689 	 * window for use in delaying messages about window sizes.
690 	 * If resending a FIN, be sure not to use a new sequence number.
691 	 */
692 	if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
693 	    (tp->snd_nxt == tp->snd_max))
694 		tp->snd_nxt--;
695 	/*
696 	 * If we are doing retransmissions, then snd_nxt will
697 	 * not reflect the first unsent octet.  For ACK only
698 	 * packets, we do not want the sequence number of the
699 	 * retransmitted packet, we want the sequence number
700 	 * of the next unsent octet.  So, if there is no data
701 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
702 	 * when filling in ti_seq.  But if we are in persist
703 	 * state, snd_max might reflect one byte beyond the
704 	 * right edge of the window, so use snd_nxt in that
705 	 * case, since we know we aren't doing a retransmission.
706 	 * (retransmit and persist are mutually exclusive...)
707 	 */
708 	if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
709 		th->th_seq = htonl(tp->snd_nxt);
710 	else
711 		th->th_seq = htonl(tp->snd_max);
712 
713 #ifdef TCP_SACK
714 	if (sack_rxmit) {
715 		/*
716 		 * If sendalot was turned on (due to option stuffing), turn it
717 		 * off. Properly set th_seq field.  Advance the ret'x pointer
718 		 * by len.
719 		 */
720 		if (sendalot)
721 			sendalot = 0;
722 		th->th_seq = htonl(p->rxmit);
723 		p->rxmit += len;
724 #if defined(TCP_SACK) && defined(TCP_FACK)
725 		tp->retran_data += len;
726 #endif /* TCP_FACK */
727 	}
728 #endif /* TCP_SACK */
729 
730 	th->th_ack = htonl(tp->rcv_nxt);
731 	if (optlen) {
732 		bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
733 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
734 	}
735 	th->th_flags = flags;
736 
737 	/*
738 	 * Calculate receive window.  Don't shrink window,
739 	 * but avoid silly window syndrome.
740 	 */
741 	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
742 		win = 0;
743 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
744 		win = (long)TCP_MAXWIN << tp->rcv_scale;
745 	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
746 		win = (long)(tp->rcv_adv - tp->rcv_nxt);
747 	if (flags & TH_RST)
748 		win = 0;
749 	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
750 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
751 		u_int32_t urp = tp->snd_up - tp->snd_nxt;
752 		if (urp > IP_MAXPACKET)
753 			urp = IP_MAXPACKET;
754 		th->th_urp = htons((u_int16_t)urp);
755 		th->th_flags |= TH_URG;
756 	} else
757 		/*
758 		 * If no urgent pointer to send, then we pull
759 		 * the urgent pointer to the left edge of the send window
760 		 * so that it doesn't drift into the send window on sequence
761 		 * number wraparound.
762 		 */
763 		tp->snd_up = tp->snd_una;		/* drag it along */
764 
765 	/* Put TCP length in pseudo-header */
766 #if defined(INET) && defined(INET6)
767 	switch (tp->pf) {
768 #else /* defined(INET) && defined(INET6) */
769 	switch (0) {
770 #endif /* defined(INET) && defined(INET6) */
771 	case 0:
772 #ifdef INET
773 	case AF_INET:
774 		if (len + optlen)
775 			mtod(m, struct ipovly *)->ih_len = htons((u_int16_t)(
776 				sizeof (struct tcphdr) + optlen + len));
777 		break;
778 #endif /* INET */
779 #ifdef INET6
780 	case AF_INET6:
781 		break;
782 #endif /* INET6 */
783 	}
784 
785 #ifdef TCP_SIGNATURE
786 	if (tp->t_flags & TF_SIGNATURE) {
787 		MD5_CTX ctx;
788 		union sockaddr_union sa;
789 		struct tdb *tdb;
790 
791 		memset(&sa, 0, sizeof(union sockaddr_union));
792 
793 #if defined(INET) && defined(INET6)
794 		switch(tp->pf) {
795 #else /* defined(INET) && defined(INET6) */
796 		switch (0) {
797 #endif /* defined(INET) && defined(INET6) */
798 		case 0:
799 #ifdef INET
800 		case AF_INET:
801 			sa.sa.sa_len = sizeof(struct sockaddr_in);
802 			sa.sa.sa_family = AF_INET;
803 			sa.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
804 			break;
805 #endif /* INET */
806 #ifdef INET6
807 		case AF_INET6:
808 			sa.sa.sa_len = sizeof(struct sockaddr_in6);
809 			sa.sa.sa_family = AF_INET6;
810 			sa.sin6.sin6_addr = mtod(m, struct ipv6 *)->ipv6_dst;
811 			break;
812 #endif /* INET6 */
813 		}
814 
815 		tdb = gettdb(0, &sa, IPPROTO_TCP);
816 		if (tdb == NULL)
817 			return (EPERM);
818 
819 		MD5Init(&ctx);
820 
821 #if defined(INET) && defined(INET6)
822 		switch(tp->pf) {
823 #else /* defined(INET) && defined(INET6) */
824 		switch (0) {
825 #endif /* defined(INET) && defined(INET6) */
826 		case 0:
827 #ifdef INET
828 		case AF_INET:
829 			{
830 				struct ippseudo ippseudo;
831 				struct ipovly *ipovly;
832 
833 				ipovly = mtod(m, struct ipovly *);
834 
835 				ippseudo.ippseudo_src = ipovly->ih_src;
836 				ippseudo.ippseudo_dst = ipovly->ih_dst;
837 				ippseudo.ippseudo_pad = 0;
838 				ippseudo.ippseudo_p   = IPPROTO_TCP;
839 				ippseudo.ippseudo_len = ipovly->ih_len;
840 				MD5Update(&ctx, (char *)&ippseudo,
841 					sizeof(struct ippseudo));
842 				MD5Update(&ctx, mtod(m, caddr_t) +
843 					sizeof(struct ip),
844 					sizeof(struct tcphdr));
845 			}
846 			break;
847 #endif /* INET */
848 #ifdef INET6
849 		case AF_INET6:
850 			{
851 				static int printed = 0;
852 
853 				if (!printed) {
854 					printf("error: TCP MD5 support for "
855 						"IPv6 not yet implemented.\n");
856 					printed = 1;
857 				}
858 			}
859 			break;
860 #endif /* INET6 */
861 		}
862 
863 		if (len && m_apply(m, hdrlen, len, tcp_signature_apply,
864 				(caddr_t)&ctx))
865 			return (EINVAL);
866 
867 		MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen);
868 		MD5Final(mtod(m, caddr_t) + hdrlen - optlen + sigoff, &ctx);
869 	}
870 #endif /* TCP_SIGNATURE */
871 
872 	/*
873 	 * Put TCP length in extended header, and then
874 	 * checksum extended header and data.
875 	 */
876 #if defined(INET) && defined(INET6)
877 	switch (tp->pf) {
878 #else /* defined(INET) && defined(INET6) */
879 	switch (0) {
880 #endif /* defined(INET) && defined(INET6) */
881 	case 0:
882 #ifdef INET
883 	case AF_INET:
884 		th->th_sum = in_cksum(m, (int)(hdrlen + len));
885 		break;
886 #endif /* INET */
887 #ifdef INET6
888 	case AF_INET6:
889   		th->th_sum = in6_cksum(m, IPPROTO_TCP, hdrlen + len,
890 			sizeof(struct ipv6));
891 		break;
892 #endif /* INET6 */
893 	}
894 
895 	/*
896 	 * In transmit state, time the transmission and arrange for
897 	 * the retransmit.  In persist state, just set snd_max.
898 	 */
899 	if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
900 		tcp_seq startseq = tp->snd_nxt;
901 
902 		/*
903 		 * Advance snd_nxt over sequence space of this segment.
904 		 */
905 		if (flags & (TH_SYN|TH_FIN)) {
906 			if (flags & TH_SYN)
907 				tp->snd_nxt++;
908 			if (flags & TH_FIN) {
909 				tp->snd_nxt++;
910 				tp->t_flags |= TF_SENTFIN;
911 			}
912 		}
913 #ifdef TCP_SACK
914 		if (!tp->sack_disable) {
915 			if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
916 				goto timer;
917 			}
918 		}
919 #endif
920 		tp->snd_nxt += len;
921 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
922 			tp->snd_max = tp->snd_nxt;
923 			/*
924 			 * Time this transmission if not a retransmission and
925 			 * not currently timing anything.
926 			 */
927 			if (tp->t_rtt == 0) {
928 				tp->t_rtt = 1;
929 				tp->t_rtseq = startseq;
930 				tcpstat.tcps_segstimed++;
931 			}
932 		}
933 
934 		/*
935 		 * Set retransmit timer if not currently set,
936 		 * and not doing an ack or a keep-alive probe.
937 		 * Initial value for retransmit timer is smoothed
938 		 * round-trip time + 2 * round-trip time variance.
939 		 * Initialize shift counter which is used for backoff
940 		 * of retransmit time.
941 		 */
942 #ifdef TCP_SACK
943  timer:
944 		if (!tp->sack_disable && sack_rxmit &&
945 		    tp->t_timer[TCPT_REXMT] == 0 &&
946 		    tp->snd_nxt != tp->snd_max) {
947 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
948 			if (tp->t_timer[TCPT_PERSIST]) {
949 				tp->t_timer[TCPT_PERSIST] = 0;
950 				tp->t_rxtshift = 0;
951 			}
952 		}
953 #endif
954 
955 		if (tp->t_timer[TCPT_REXMT] == 0 &&
956 		    tp->snd_nxt != tp->snd_una) {
957 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
958 			if (tp->t_timer[TCPT_PERSIST]) {
959 				tp->t_timer[TCPT_PERSIST] = 0;
960 				tp->t_rxtshift = 0;
961 			}
962 		}
963 	} else
964 		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
965 			tp->snd_max = tp->snd_nxt + len;
966 
967 	/*
968 	 * Trace.
969 	 */
970 	if (so->so_options & SO_DEBUG)
971 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0,
972 			len);
973 
974 	/*
975 	 * Fill in IP length and desired time to live and
976 	 * send to IP level.  There should be a better way
977 	 * to handle ttl and tos; we could keep them in
978 	 * the template, but need a way to checksum without them.
979 	 */
980 	m->m_pkthdr.len = hdrlen + len;
981 
982 #if defined(INET) && defined(INET6)
983 	switch (tp->pf) {
984 #else /* defined(INET) && defined(INET6) */
985 	switch (0) {
986 #endif /* defined(INET) && defined(INET6) */
987 	case 0:
988 #ifdef INET
989 	case AF_INET:
990 		{
991 			struct ip *ip;
992 
993 			ip = mtod(m, struct ip *);
994 			ip->ip_len = m->m_pkthdr.len;
995 			ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
996 			ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
997 		}
998 
999 		error = ip_output(m, tp->t_inpcb->inp_options,
1000 			&tp->t_inpcb->inp_route, so->so_options & SO_DONTROUTE,
1001 			0, tp->t_inpcb);
1002 		break;
1003 #endif /* INET */
1004 #ifdef INET6
1005 	case AF_INET6:
1006 		{
1007 			struct ipv6 *ipv6;
1008 
1009 			ipv6->ipv6_length = m->m_pkthdr.len -
1010 				sizeof(struct ipv6);
1011 			ipv6->ipv6_nexthdr = IPPROTO_TCP;
1012 		}
1013 
1014 		error = ipv6_output(m, &tp->t_inpcb->inp_route6,
1015 			(so->so_options & SO_DONTROUTE), NULL, NULL,
1016 			tp->t_inpcb->inp_socket);
1017 		break;
1018 #endif /* INET6 */
1019 #ifdef TUBA
1020 	case AF_ISO:
1021 		if (tp->t_tuba_pcb)
1022 			error = tuba_output(m, tp);
1023 		break;
1024 #endif /* TUBA */
1025 	}
1026 
1027 #if defined(TCP_SACK) && defined(TCP_FACK)
1028 	/* Update snd_awnd to reflect the new data that was sent.  */
1029 	tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
1030 		tp->retran_data;
1031 #endif /* defined(TCP_SACK) && defined(TCP_FACK) */
1032 
1033 	if (error) {
1034 out:
1035 		if (error == ENOBUFS) {
1036 			tcp_quench(tp->t_inpcb, 0);
1037 			return (0);
1038 		}
1039 		if ((error == EHOSTUNREACH || error == ENETDOWN)
1040 		    && TCPS_HAVERCVDSYN(tp->t_state)) {
1041 			tp->t_softerror = error;
1042 			return (0);
1043 		}
1044 		return (error);
1045 	}
1046 	tcpstat.tcps_sndtotal++;
1047 
1048 	/*
1049 	 * Data sent (as far as we can tell).
1050 	 * If this advertises a larger window than any other segment,
1051 	 * then remember the size of the advertised window.
1052 	 * Any pending ACK has now been sent.
1053 	 */
1054 	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1055 		tp->rcv_adv = tp->rcv_nxt + win;
1056 	tp->last_ack_sent = tp->rcv_nxt;
1057 	tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1058 #if defined(TCP_SACK) || defined(TCP_NEWRENO)
1059 	if (sendalot && --maxburst)
1060 #else
1061 	if (sendalot)
1062 #endif
1063 		goto again;
1064 	return (0);
1065 }
1066 
1067 void
1068 tcp_setpersist(tp)
1069 	register struct tcpcb *tp;
1070 {
1071 	register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1072 
1073 	if (tp->t_timer[TCPT_REXMT])
1074 		panic("tcp_output REXMT");
1075 	/*
1076 	 * Start/restart persistance timer.
1077 	 */
1078 	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1079 	    t * tcp_backoff[tp->t_rxtshift],
1080 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
1081 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1082 		tp->t_rxtshift++;
1083 }
1084