xref: /openbsd-src/sys/netinet/tcp_output.c (revision 287546ea80ee896bda0c88b8a8c85a1dc6ff37f9)
1 /*	$OpenBSD: tcp_output.c,v 1.25 1999/12/08 06:50:20 itojun Exp $	*/
2 /*	$NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)tcp_output.c	8.3 (Berkeley) 12/30/93
37  */
38 
39 /*
40 %%% portions-copyright-nrl-95
41 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
43 Reserved. All rights under this copyright have been assigned to the US
44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
46 software.
47 You should have received a copy of the license with this software. If you
48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
49 */
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/protosw.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/errno.h>
59 #include <sys/domain.h>
60 
61 #include <net/route.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/tcp.h>
69 #define	TCPOUTFLAGS
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/tcp_debug.h>
76 
77 #ifdef TUBA
78 #include <netiso/iso.h>
79 #include <netiso/tuba_table.h>
80 #endif
81 
82 #ifdef INET6
83 #include <netinet6/tcpipv6.h>
84 #endif /* INET6 */
85 
86 #ifdef TCP_SIGNATURE
87 #include <sys/md5k.h>
88 #endif /* TCP_SIGNATURE */
89 
90 #ifdef notyet
91 extern struct mbuf *m_copypack();
92 #endif
93 
94 #ifdef TCP_SACK
95 extern int tcprexmtthresh;
96 #endif
97 
98 #ifdef TCP_SACK
99 #ifdef TCP_SACK_DEBUG
100 void
101 tcp_print_holes(tp)
102 struct tcpcb *tp;
103 {
104 	struct sackhole *p = tp->snd_holes;
105 	if (p == 0)
106 		return;
107 	printf("Hole report: start--end dups rxmit\n");
108 	while (p) {
109 		printf("%x--%x d %d r %x\n",  p->start, p->end, p->dups,
110                     p->rxmit);
111 		p = p->next;
112 	}
113 	printf("\n");
114 }
115 #endif /* TCP_SACK_DEBUG */
116 
117 /*
118  * Returns pointer to a sackhole if there are any pending retransmissions;
119  * NULL otherwise.
120  */
121 struct sackhole *
122 tcp_sack_output(tp)
123 register struct tcpcb *tp;
124 {
125 	struct sackhole *p;
126 	if (tp->sack_disable)
127 		return 0;
128 	p = tp->snd_holes;
129 	while (p) {
130 		if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
131 			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
132 				p = p->next;
133 				continue;
134 			}
135 #ifdef TCP_SACK_DEBUG
136 			if (p)
137 				tcp_print_holes(tp);
138 #endif
139 			return p;
140 		}
141         	p = p->next;
142 	}
143 	return 0;
144 }
145 
146 /*
147  * After a timeout, the SACK list may be rebuilt.  This SACK information
148  * should be used to avoid retransmitting SACKed data.  This function
149  * traverses the SACK list to see if snd_nxt should be moved forward.
150  */
151 void
152 tcp_sack_adjust(tp)
153 	struct tcpcb *tp;
154 {
155 	int i;
156 
157 	for (i = 0; i < tp->rcv_numsacks; i++) {
158 		if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].start))
159 			break;
160 		if (SEQ_LEQ(tp->sackblks[i].end, tp->snd_nxt))
161 			continue;
162 		if (tp->sackblks[i].start == 0 && tp->sackblks[i].end == 0)
163 			continue;
164 		/* snd_nxt must be in middle of block of SACKed data */
165 		tp->snd_nxt = tp->sackblks[i].end;
166 		break;
167 	}
168 }
169 #endif /* TCP_SACK */
170 
171 /*
172  * Tcp output routine: figure out what should be sent and send it.
173  */
174 int
175 tcp_output(tp)
176 	register struct tcpcb *tp;
177 {
178 	register struct socket *so = tp->t_inpcb->inp_socket;
179 	register long len, win;
180 	int off, flags, error;
181 	register struct mbuf *m;
182 	register struct tcphdr *th;
183 	u_char opt[MAX_TCPOPTLEN];
184 	unsigned int optlen, hdrlen;
185 	int idle, sendalot;
186 #ifdef TCP_SACK
187 	int i, sack_rxmit = 0;
188 	struct sackhole *p;
189 #endif
190 #if defined(TCP_SACK) || defined(TCP_NEWRENO)
191 	int maxburst = TCP_MAXBURST;
192 #endif
193 #ifdef TCP_SIGNATURE
194 	unsigned int sigoff;
195 #endif /* TCP_SIGNATURE */
196 
197 #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
198 	if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE))
199 		return (EINVAL);
200 #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */
201 
202 	/*
203 	 * Determine length of data that should be transmitted,
204 	 * and flags that will be used.
205 	 * If there is some data or critical controls (SYN, RST)
206 	 * to send, then transmit; otherwise, investigate further.
207 	 */
208 	idle = (tp->snd_max == tp->snd_una);
209 	if (idle && tp->t_idle >= tp->t_rxtcur)
210 		/*
211 		 * We have been idle for "a while" and no acks are
212 		 * expected to clock out any data we send --
213 		 * slow start to get ack "clock" running again.
214 		 */
215 		tp->snd_cwnd = tp->t_maxseg;
216 again:
217 	sendalot = 0;
218 #ifdef TCP_SACK
219 	/*
220 	 * If we've recently taken a timeout, snd_max will be greater than
221 	 * snd_nxt.  There may be SACK information that allows us to avoid
222 	 * resending already delivered data.  Adjust snd_nxt accordingly.
223 	 */
224 	if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
225 		tcp_sack_adjust(tp);
226 #endif
227 	off = tp->snd_nxt - tp->snd_una;
228 	win = ulmin(tp->snd_wnd, tp->snd_cwnd);
229 
230 	flags = tcp_outflags[tp->t_state];
231 	/*
232 	 * If in persist timeout with window of 0, send 1 byte.
233 	 * Otherwise, if window is small but nonzero
234 	 * and timer expired, we will send what we can
235 	 * and go to transmit state.
236 	 */
237 
238 #ifdef TCP_SACK
239 	/*
240 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
241 	 * to send out new data (when sendalot is 1), bypass this function.
242 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
243 	 * we're replacing a (future) new transmission with a retransmission
244 	 * now, and we previously incremented snd_cwnd in tcp_input().
245 	 */
246 	if (!tp->sack_disable && !sendalot) {
247 		if ((p = tcp_sack_output(tp))) {
248 			off = p->rxmit - tp->snd_una;
249 			sack_rxmit = 1;
250 #if 0
251 			/* Coalesce holes into a single retransmission */
252 #endif
253 			len = min(tp->t_maxseg, p->end - p->rxmit);
254 #ifndef TCP_FACK
255 			/* in FACK, hold snd_cwnd constant during recovery */
256 			if (SEQ_LT(tp->snd_una, tp->snd_last))
257 				tp->snd_cwnd -= tp->t_maxseg;
258 #endif
259     		}
260 	}
261 #endif /* TCP_SACK */
262 
263 	if (tp->t_force) {
264 		if (win == 0) {
265 			/*
266 			 * If we still have some data to send, then
267 			 * clear the FIN bit.  Usually this would
268 			 * happen below when it realizes that we
269 			 * aren't sending all the data.  However,
270 			 * if we have exactly 1 byte of unset data,
271 			 * then it won't clear the FIN bit below,
272 			 * and if we are in persist state, we wind
273 			 * up sending the packet without recording
274 			 * that we sent the FIN bit.
275 			 *
276 			 * We can't just blindly clear the FIN bit,
277 			 * because if we don't have any more data
278 			 * to send then the probe will be the FIN
279 			 * itself.
280 			 */
281 			if (off < so->so_snd.sb_cc)
282 				flags &= ~TH_FIN;
283 			win = 1;
284 		} else {
285 			tp->t_timer[TCPT_PERSIST] = 0;
286 			tp->t_rxtshift = 0;
287 		}
288 	}
289 
290 #ifdef TCP_SACK
291 	if (!sack_rxmit) {
292 #endif
293 	len = ulmin(so->so_snd.sb_cc, win) - off;
294 
295 #if defined(TCP_SACK) && defined(TCP_FACK)
296 	/*
297 	 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and
298 	 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
299 	 * do not send data (like zero window conditions)
300 	 */
301 	if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
302 	    (tp->snd_awnd >= tp->snd_cwnd))
303 		len = 0;
304 #endif /* TCP_FACK */
305 #ifdef TCP_SACK
306 	}
307 #endif
308 
309 	if (len < 0) {
310 		/*
311 		 * If FIN has been sent but not acked,
312 		 * but we haven't been called to retransmit,
313 		 * len will be -1.  Otherwise, window shrank
314 		 * after we sent into it.  If window shrank to 0,
315 		 * cancel pending retransmit and pull snd_nxt
316 		 * back to (closed) window.  We will enter persist
317 		 * state below.  If the window didn't close completely,
318 		 * just wait for an ACK.
319 		 */
320 		len = 0;
321 		if (win == 0) {
322 			tp->t_timer[TCPT_REXMT] = 0;
323 			tp->snd_nxt = tp->snd_una;
324 		}
325 	}
326 	if (len > tp->t_maxseg) {
327 		len = tp->t_maxseg;
328 		sendalot = 1;
329 	}
330 	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
331 		flags &= ~TH_FIN;
332 
333 	win = sbspace(&so->so_rcv);
334 
335 	/*
336 	 * Sender silly window avoidance.  If connection is idle
337 	 * and can send all data, a maximum segment,
338 	 * at least a maximum default-size segment do it,
339 	 * or are forced, do it; otherwise don't bother.
340 	 * If peer's buffer is tiny, then send
341 	 * when window is at least half open.
342 	 * If retransmitting (possibly after persist timer forced us
343 	 * to send into a small window), then must resend.
344 	 */
345 	if (len) {
346 		if (len == tp->t_maxseg)
347 			goto send;
348 		if ((idle || tp->t_flags & TF_NODELAY) &&
349 		    len + off >= so->so_snd.sb_cc)
350 			goto send;
351 		if (tp->t_force)
352 			goto send;
353 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
354 			goto send;
355 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
356 			goto send;
357 #ifdef TCP_SACK
358 		if (sack_rxmit)
359 			goto send;
360 #endif
361 	}
362 
363 	/*
364 	 * Compare available window to amount of window
365 	 * known to peer (as advertised window less
366 	 * next expected input).  If the difference is at least two
367 	 * max size segments, or at least 50% of the maximum possible
368 	 * window, then want to send a window update to peer.
369 	 */
370 	if (win > 0) {
371 		/*
372 		 * "adv" is the amount we can increase the window,
373 		 * taking into account that we are limited by
374 		 * TCP_MAXWIN << tp->rcv_scale.
375 		 */
376 		long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) -
377 			(tp->rcv_adv - tp->rcv_nxt);
378 
379 		if (adv >= (long) (2 * tp->t_maxseg))
380 			goto send;
381 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
382 			goto send;
383 	}
384 
385 	/*
386 	 * Send if we owe peer an ACK.
387 	 */
388 	if (tp->t_flags & TF_ACKNOW)
389 		goto send;
390 	if (flags & (TH_SYN|TH_RST))
391 		goto send;
392 	if (SEQ_GT(tp->snd_up, tp->snd_una))
393 		goto send;
394 	/*
395 	 * If our state indicates that FIN should be sent
396 	 * and we have not yet done so, or we're retransmitting the FIN,
397 	 * then we need to send.
398 	 */
399 	if (flags & TH_FIN &&
400 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
401 		goto send;
402 
403 	/*
404 	 * TCP window updates are not reliable, rather a polling protocol
405 	 * using ``persist'' packets is used to insure receipt of window
406 	 * updates.  The three ``states'' for the output side are:
407 	 *	idle			not doing retransmits or persists
408 	 *	persisting		to move a small or zero window
409 	 *	(re)transmitting	and thereby not persisting
410 	 *
411 	 * tp->t_timer[TCPT_PERSIST]
412 	 *	is set when we are in persist state.
413 	 * tp->t_force
414 	 *	is set when we are called to send a persist packet.
415 	 * tp->t_timer[TCPT_REXMT]
416 	 *	is set when we are retransmitting
417 	 * The output side is idle when both timers are zero.
418 	 *
419 	 * If send window is too small, there is data to transmit, and no
420 	 * retransmit or persist is pending, then go to persist state.
421 	 * If nothing happens soon, send when timer expires:
422 	 * if window is nonzero, transmit what we can,
423 	 * otherwise force out a byte.
424 	 */
425 	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
426 	    tp->t_timer[TCPT_PERSIST] == 0) {
427 		tp->t_rxtshift = 0;
428 		tcp_setpersist(tp);
429 	}
430 
431 	/*
432 	 * No reason to send a segment, just return.
433 	 */
434 	return (0);
435 
436 send:
437 	/*
438 	 * Before ESTABLISHED, force sending of initial options
439 	 * unless TCP set not to do any options.
440 	 * NOTE: we assume that the IP/TCP header plus TCP options
441 	 * always fit in a single mbuf, leaving room for a maximum
442 	 * link header, i.e.
443 	 *	max_linkhdr + sizeof(network header) + sizeof(struct tcphdr +
444 	 * 		optlen <= MHLEN
445 	 */
446 	optlen = 0;
447 
448 	switch (tp->pf) {
449 	case 0:	/*default to PF_INET*/
450 #ifdef INET
451 	case PF_INET:
452 		hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
453 		break;
454 #endif /* INET */
455 #ifdef INET6
456 	case PF_INET6:
457 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
458 		break;
459 #endif /* INET6 */
460 	default:
461 		return (EPFNOSUPPORT);
462 	}
463 
464 	if (flags & TH_SYN) {
465 		tp->snd_nxt = tp->iss;
466 		if ((tp->t_flags & TF_NOOPT) == 0) {
467 			u_int16_t mss;
468 
469 			opt[0] = TCPOPT_MAXSEG;
470 			opt[1] = 4;
471 			mss = htons((u_int16_t) tcp_mss(tp, 0));
472 			bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss));
473 			optlen = 4;
474 #ifdef TCP_SACK
475 			/*
476 			 * If this is the first SYN of connection (not a SYN
477 			 * ACK), include SACK_PERMIT_HDR option.  If this is a
478 			 * SYN ACK, include SACK_PERMIT_HDR option if peer has
479 			 * already done so.
480 			 */
481 			if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
482 			    (tp->t_flags & TF_SACK_PERMIT))) {
483 				*((u_int32_t *) (opt + optlen)) =
484 				    htonl(TCPOPT_SACK_PERMIT_HDR);
485 				optlen += 4;
486 			}
487 #endif
488 
489 			if ((tp->t_flags & TF_REQ_SCALE) &&
490 			    ((flags & TH_ACK) == 0 ||
491 			    (tp->t_flags & TF_RCVD_SCALE))) {
492 				*((u_int32_t *) (opt + optlen)) = htonl(
493 					TCPOPT_NOP << 24 |
494 					TCPOPT_WINDOW << 16 |
495 					TCPOLEN_WINDOW << 8 |
496 					tp->request_r_scale);
497 				optlen += 4;
498 			}
499 		}
500 	}
501 
502 	/*
503 	 * Send a timestamp and echo-reply if this is a SYN and our side
504 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
505 	 * and our peer have sent timestamps in our SYN's.
506 	 */
507 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
508 	     (flags & TH_RST) == 0 &&
509 	    ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
510 	     (tp->t_flags & TF_RCVD_TSTMP))) {
511 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
512 
513 		/* Form timestamp option as shown in appendix A of RFC 1323. */
514 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
515 		*lp++ = htonl(tcp_now);
516 		*lp   = htonl(tp->ts_recent);
517 		optlen += TCPOLEN_TSTAMP_APPA;
518 	}
519 
520 #ifdef TCP_SIGNATURE
521 	if (tp->t_flags & TF_SIGNATURE) {
522 		u_int8_t *bp = (u_int8_t *)(opt + optlen);
523 
524 		/* Send signature option */
525 		*(bp++) = TCPOPT_SIGNATURE;
526 		*(bp++) = TCPOLEN_SIGNATURE;
527 		sigoff = optlen + 2;
528 
529 		{
530 			unsigned int i;
531 
532 			for (i = 0; i < 16; i++)
533 				*(bp++) = 0;
534 		}
535 
536 		optlen += TCPOLEN_SIGNATURE;
537 
538 		/* Pad options list to the next 32 bit boundary and
539 		 * terminate it.
540 		 */
541 		*bp++ = TCPOPT_NOP;
542 		*bp++ = TCPOPT_EOL;
543 		optlen += 2;
544 	}
545 #endif /* TCP_SIGNATURE */
546 
547 #ifdef TCP_SACK
548 	/*
549 	 * Send SACKs if necessary.  This should be the last option processed.
550 	 * Only as many SACKs are sent as are permitted by the maximum options
551 	 * size.  No more than three SACKs are sent.
552 	 */
553 	if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
554 	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
555 	    tp->rcv_numsacks) {
556 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
557 		u_int32_t *olp = lp++;
558 		int count = 0;  /* actual number of SACKs inserted */
559 		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
560 
561 		maxsack = min(maxsack, TCP_MAX_SACK);
562 		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
563 			struct sackblk sack = tp->sackblks[i];
564 			if (sack.start == 0 && sack.end == 0)
565 				continue;
566 			*lp++ = htonl(sack.start);
567 			*lp++ = htonl(sack.end);
568 			count++;
569 		}
570 		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
571 		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
572 	}
573 #endif /* TCP_SACK */
574 
575 #ifdef DIAGNOSTIC
576 	if (optlen > MAX_TCPOPTLEN)
577 		panic("tcp_output: options too long");
578 #endif /* DIAGNOSTIC */
579 
580 	hdrlen += optlen;
581 
582 	/*
583 	 * Adjust data length if insertion of options will
584 	 * bump the packet length beyond the t_maxopd length.
585 	 */
586 	if (len > tp->t_maxopd - optlen) {
587 		len = tp->t_maxopd - optlen;
588 		sendalot = 1;
589 		flags &= ~TH_FIN;
590 	 }
591 
592 #ifdef DIAGNOSTIC
593 	if (max_linkhdr + hdrlen > MHLEN)
594 		panic("tcphdr too big");
595 #endif
596 
597 	/*
598 	 * Grab a header mbuf, attaching a copy of data to
599 	 * be transmitted, and initialize the header from
600 	 * the template for sends on this connection.
601 	 */
602 	if (len) {
603 		if (tp->t_force && len == 1)
604 			tcpstat.tcps_sndprobe++;
605 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
606 			tcpstat.tcps_sndrexmitpack++;
607 			tcpstat.tcps_sndrexmitbyte += len;
608 		} else {
609 			tcpstat.tcps_sndpack++;
610 			tcpstat.tcps_sndbyte += len;
611 		}
612 #ifdef notyet
613 		if ((m = m_copypack(so->so_snd.sb_mb, off,
614 		    (int)len, max_linkhdr + hdrlen)) == 0) {
615 			error = ENOBUFS;
616 			goto out;
617 		}
618 		/*
619 		 * m_copypack left space for our hdr; use it.
620 		 */
621 		m->m_len += hdrlen;
622 		m->m_data -= hdrlen;
623 #else
624 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
625 		if (m != NULL) {
626 			MCLGET(m, M_DONTWAIT);
627 			if ((m->m_flags & M_EXT) == 0) {
628 				m_freem(m);
629 				m = NULL;
630 			}
631 		}
632 		if (m == NULL) {
633 			error = ENOBUFS;
634 			goto out;
635 		}
636 		m->m_data += max_linkhdr;
637 		m->m_len = hdrlen;
638 		if (len <= MCLBYTES - hdrlen - max_linkhdr) {
639 			m_copydata(so->so_snd.sb_mb, off, (int) len,
640 			    mtod(m, caddr_t) + hdrlen);
641 			m->m_len += len;
642 		} else {
643 			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
644 			if (m->m_next == 0) {
645 				(void) m_free(m);
646 				error = ENOBUFS;
647 				goto out;
648 			}
649 		}
650 #endif
651 		/*
652 		 * If we're sending everything we've got, set PUSH.
653 		 * (This will keep happy those implementations which only
654 		 * give data to the user when a buffer fills or
655 		 * a PUSH comes in.)
656 		 */
657 		if (off + len == so->so_snd.sb_cc)
658 			flags |= TH_PUSH;
659 	} else {
660 		if (tp->t_flags & TF_ACKNOW)
661 			tcpstat.tcps_sndacks++;
662 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
663 			tcpstat.tcps_sndctrl++;
664 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
665 			tcpstat.tcps_sndurg++;
666 		else
667 			tcpstat.tcps_sndwinup++;
668 
669 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
670 		if (m != NULL) {
671 			MCLGET(m, M_DONTWAIT);
672 			if ((m->m_flags & M_EXT) == 0) {
673 				m_freem(m);
674 				m = NULL;
675 			}
676 		}
677 		if (m == NULL) {
678 			error = ENOBUFS;
679 			goto out;
680 		}
681 		m->m_data += max_linkhdr;
682 		m->m_len = hdrlen;
683 	}
684 	m->m_pkthdr.rcvif = (struct ifnet *)0;
685 
686 	if (!tp->t_template)
687 		panic("tcp_output");
688 #ifdef DIAGNOSTIC
689 	if (tp->t_template->m_len != hdrlen - optlen)
690 		panic("tcp_output: template len != hdrlen - optlen");
691 #endif /* DIAGNOSTIC */
692 	bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t),
693 		tp->t_template->m_len);
694 	th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len -
695 		sizeof(struct tcphdr));
696 
697 	/*
698 	 * Fill in fields, remembering maximum advertised
699 	 * window for use in delaying messages about window sizes.
700 	 * If resending a FIN, be sure not to use a new sequence number.
701 	 */
702 	if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
703 	    (tp->snd_nxt == tp->snd_max))
704 		tp->snd_nxt--;
705 	/*
706 	 * If we are doing retransmissions, then snd_nxt will
707 	 * not reflect the first unsent octet.  For ACK only
708 	 * packets, we do not want the sequence number of the
709 	 * retransmitted packet, we want the sequence number
710 	 * of the next unsent octet.  So, if there is no data
711 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
712 	 * when filling in ti_seq.  But if we are in persist
713 	 * state, snd_max might reflect one byte beyond the
714 	 * right edge of the window, so use snd_nxt in that
715 	 * case, since we know we aren't doing a retransmission.
716 	 * (retransmit and persist are mutually exclusive...)
717 	 */
718 	if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
719 		th->th_seq = htonl(tp->snd_nxt);
720 	else
721 		th->th_seq = htonl(tp->snd_max);
722 
723 #ifdef TCP_SACK
724 	if (sack_rxmit) {
725 		/*
726 		 * If sendalot was turned on (due to option stuffing), turn it
727 		 * off. Properly set th_seq field.  Advance the ret'x pointer
728 		 * by len.
729 		 */
730 		if (sendalot)
731 			sendalot = 0;
732 		th->th_seq = htonl(p->rxmit);
733 		p->rxmit += len;
734 #if defined(TCP_SACK) && defined(TCP_FACK)
735 		tp->retran_data += len;
736 #endif /* TCP_FACK */
737 	}
738 #endif /* TCP_SACK */
739 
740 	th->th_ack = htonl(tp->rcv_nxt);
741 	if (optlen) {
742 		bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
743 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
744 	}
745 	th->th_flags = flags;
746 
747 	/*
748 	 * Calculate receive window.  Don't shrink window,
749 	 * but avoid silly window syndrome.
750 	 */
751 	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
752 		win = 0;
753 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
754 		win = (long)TCP_MAXWIN << tp->rcv_scale;
755 	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
756 		win = (long)(tp->rcv_adv - tp->rcv_nxt);
757 	if (flags & TH_RST)
758 		win = 0;
759 	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
760 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
761 		u_int32_t urp = tp->snd_up - tp->snd_nxt;
762 		if (urp > IP_MAXPACKET)
763 			urp = IP_MAXPACKET;
764 		th->th_urp = htons((u_int16_t)urp);
765 		th->th_flags |= TH_URG;
766 	} else
767 		/*
768 		 * If no urgent pointer to send, then we pull
769 		 * the urgent pointer to the left edge of the send window
770 		 * so that it doesn't drift into the send window on sequence
771 		 * number wraparound.
772 		 */
773 		tp->snd_up = tp->snd_una;		/* drag it along */
774 
775 	/* Put TCP length in pseudo-header */
776 	switch (tp->pf) {
777 	case 0:	/*default to PF_INET*/
778 #ifdef INET
779 	case AF_INET:
780 		if (len + optlen)
781 			mtod(m, struct ipovly *)->ih_len = htons((u_int16_t)(
782 				sizeof (struct tcphdr) + optlen + len));
783 		break;
784 #endif /* INET */
785 #ifdef INET6
786 	case AF_INET6:
787 		break;
788 #endif /* INET6 */
789 	}
790 
791 #ifdef TCP_SIGNATURE
792 	if (tp->t_flags & TF_SIGNATURE) {
793 		MD5_CTX ctx;
794 		union sockaddr_union sa;
795 		struct tdb *tdb;
796 
797 		memset(&sa, 0, sizeof(union sockaddr_union));
798 
799 		switch (tp->pf) {
800 		case 0:	/*default to PF_INET*/
801 #ifdef INET
802 		case AF_INET:
803 			sa.sa.sa_len = sizeof(struct sockaddr_in);
804 			sa.sa.sa_family = AF_INET;
805 			sa.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
806 			break;
807 #endif /* INET */
808 #ifdef INET6
809 		case AF_INET6:
810 			sa.sa.sa_len = sizeof(struct sockaddr_in6);
811 			sa.sa.sa_family = AF_INET6;
812 			sa.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
813 			break;
814 #endif /* INET6 */
815 		}
816 
817 		/* XXX gettdb() should really be called at spltdb().      */
818 		/* XXX this is splsoftnet(), currently they are the same. */
819 		tdb = gettdb(0, &sa, IPPROTO_TCP);
820 		if (tdb == NULL)
821 			return (EPERM);
822 
823 		MD5Init(&ctx);
824 
825 		switch (tp->pf) {
826 		case 0:	/*default to PF_INET*/
827 #ifdef INET
828 		case AF_INET:
829 			{
830 				struct ippseudo ippseudo;
831 				struct ipovly *ipovly;
832 
833 				ipovly = mtod(m, struct ipovly *);
834 
835 				ippseudo.ippseudo_src = ipovly->ih_src;
836 				ippseudo.ippseudo_dst = ipovly->ih_dst;
837 				ippseudo.ippseudo_pad = 0;
838 				ippseudo.ippseudo_p   = IPPROTO_TCP;
839 				ippseudo.ippseudo_len = ipovly->ih_len;
840 				MD5Update(&ctx, (char *)&ippseudo,
841 					sizeof(struct ippseudo));
842 				MD5Update(&ctx, mtod(m, caddr_t) +
843 					sizeof(struct ip),
844 					sizeof(struct tcphdr));
845 			}
846 			break;
847 #endif /* INET */
848 #ifdef INET6
849 		case AF_INET6:
850 			{
851 				static int printed = 0;
852 
853 				if (!printed) {
854 					printf("error: TCP MD5 support for "
855 						"IPv6 not yet implemented.\n");
856 					printed = 1;
857 				}
858 			}
859 			break;
860 #endif /* INET6 */
861 		}
862 
863 		if (len && m_apply(m, hdrlen, len, tcp_signature_apply,
864 				(caddr_t)&ctx))
865 			return (EINVAL);
866 
867 		MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen);
868 		MD5Final(mtod(m, caddr_t) + hdrlen - optlen + sigoff, &ctx);
869 	}
870 #endif /* TCP_SIGNATURE */
871 
872 	/*
873 	 * Put TCP length in extended header, and then
874 	 * checksum extended header and data.
875 	 */
876 	switch (tp->pf) {
877 	case 0:	/*default to PF_INET*/
878 #ifdef INET
879 	case AF_INET:
880 		th->th_sum = in_cksum(m, (int)(hdrlen + len));
881 		break;
882 #endif /* INET */
883 #ifdef INET6
884 	case AF_INET6:
885 		m->m_pkthdr.len = hdrlen + len;
886   		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
887 			hdrlen - sizeof(struct ip6_hdr) + len);
888 		break;
889 #endif /* INET6 */
890 	}
891 
892 	/*
893 	 * In transmit state, time the transmission and arrange for
894 	 * the retransmit.  In persist state, just set snd_max.
895 	 */
896 	if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
897 		tcp_seq startseq = tp->snd_nxt;
898 
899 		/*
900 		 * Advance snd_nxt over sequence space of this segment.
901 		 */
902 		if (flags & (TH_SYN|TH_FIN)) {
903 			if (flags & TH_SYN)
904 				tp->snd_nxt++;
905 			if (flags & TH_FIN) {
906 				tp->snd_nxt++;
907 				tp->t_flags |= TF_SENTFIN;
908 			}
909 		}
910 #ifdef TCP_SACK
911 		if (!tp->sack_disable) {
912 			if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
913 				goto timer;
914 			}
915 		}
916 #endif
917 		tp->snd_nxt += len;
918 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
919 			tp->snd_max = tp->snd_nxt;
920 			/*
921 			 * Time this transmission if not a retransmission and
922 			 * not currently timing anything.
923 			 */
924 			if (tp->t_rtt == 0) {
925 				tp->t_rtt = 1;
926 				tp->t_rtseq = startseq;
927 				tcpstat.tcps_segstimed++;
928 			}
929 		}
930 
931 		/*
932 		 * Set retransmit timer if not currently set,
933 		 * and not doing an ack or a keep-alive probe.
934 		 * Initial value for retransmit timer is smoothed
935 		 * round-trip time + 2 * round-trip time variance.
936 		 * Initialize shift counter which is used for backoff
937 		 * of retransmit time.
938 		 */
939 #ifdef TCP_SACK
940  timer:
941 		if (!tp->sack_disable && sack_rxmit &&
942 		    tp->t_timer[TCPT_REXMT] == 0 &&
943 		    tp->snd_nxt != tp->snd_max) {
944 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
945 			if (tp->t_timer[TCPT_PERSIST]) {
946 				tp->t_timer[TCPT_PERSIST] = 0;
947 				tp->t_rxtshift = 0;
948 			}
949 		}
950 #endif
951 
952 		if (tp->t_timer[TCPT_REXMT] == 0 &&
953 		    tp->snd_nxt != tp->snd_una) {
954 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
955 			if (tp->t_timer[TCPT_PERSIST]) {
956 				tp->t_timer[TCPT_PERSIST] = 0;
957 				tp->t_rxtshift = 0;
958 			}
959 		}
960 	} else
961 		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
962 			tp->snd_max = tp->snd_nxt + len;
963 
964 	/*
965 	 * Trace.
966 	 */
967 	if (so->so_options & SO_DEBUG)
968 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0,
969 			len);
970 
971 	/*
972 	 * Fill in IP length and desired time to live and
973 	 * send to IP level.  There should be a better way
974 	 * to handle ttl and tos; we could keep them in
975 	 * the template, but need a way to checksum without them.
976 	 */
977 	m->m_pkthdr.len = hdrlen + len;
978 
979 	switch (tp->pf) {
980 	case 0:	/*default to PF_INET*/
981 #ifdef INET
982 	case AF_INET:
983 		{
984 			struct ip *ip;
985 
986 			ip = mtod(m, struct ip *);
987 			ip->ip_len = m->m_pkthdr.len;
988 			ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
989 			ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
990 		}
991 		error = ip_output(m, tp->t_inpcb->inp_options,
992 			&tp->t_inpcb->inp_route, so->so_options & SO_DONTROUTE,
993 			0, tp->t_inpcb);
994 		break;
995 #endif /* INET */
996 #ifdef INET6
997 	case AF_INET6:
998 		{
999 			struct ip6_hdr *ipv6;
1000 
1001 			ipv6 = mtod(m, struct ip6_hdr *);
1002 			ipv6->ip6_plen = m->m_pkthdr.len -
1003 				sizeof(struct ip6_hdr);
1004 			ipv6->ip6_nxt = IPPROTO_TCP;
1005 			ipv6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
1006 		}
1007 		error = ip6_output(m, tp->t_inpcb->inp_outputopts6,
1008 			  &tp->t_inpcb->inp_route6,
1009 			  (so->so_options & SO_DONTROUTE), NULL, NULL);
1010 		break;
1011 #endif /* INET6 */
1012 #ifdef TUBA
1013 	case AF_ISO:
1014 		if (tp->t_tuba_pcb)
1015 			error = tuba_output(m, tp);
1016 		break;
1017 #endif /* TUBA */
1018 	}
1019 
1020 #if defined(TCP_SACK) && defined(TCP_FACK)
1021 	/* Update snd_awnd to reflect the new data that was sent.  */
1022 	tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
1023 		tp->retran_data;
1024 #endif /* defined(TCP_SACK) && defined(TCP_FACK) */
1025 
1026 	if (error) {
1027 out:
1028 		if (error == ENOBUFS) {
1029 			tcp_quench(tp->t_inpcb, 0);
1030 			return (0);
1031 		}
1032 		if ((error == EHOSTUNREACH || error == ENETDOWN)
1033 		    && TCPS_HAVERCVDSYN(tp->t_state)) {
1034 			tp->t_softerror = error;
1035 			return (0);
1036 		}
1037 		return (error);
1038 	}
1039 	tcpstat.tcps_sndtotal++;
1040 
1041 	/*
1042 	 * Data sent (as far as we can tell).
1043 	 * If this advertises a larger window than any other segment,
1044 	 * then remember the size of the advertised window.
1045 	 * Any pending ACK has now been sent.
1046 	 */
1047 	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1048 		tp->rcv_adv = tp->rcv_nxt + win;
1049 	tp->last_ack_sent = tp->rcv_nxt;
1050 	tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
1051 #if defined(TCP_SACK) || defined(TCP_NEWRENO)
1052 	if (sendalot && --maxburst)
1053 #else
1054 	if (sendalot)
1055 #endif
1056 		goto again;
1057 	return (0);
1058 }
1059 
1060 void
1061 tcp_setpersist(tp)
1062 	register struct tcpcb *tp;
1063 {
1064 	register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1065 
1066 	if (tp->t_timer[TCPT_REXMT])
1067 		panic("tcp_output REXMT");
1068 	/*
1069 	 * Start/restart persistance timer.
1070 	 */
1071 	if (t < tp->t_rttmin)
1072 		t = tp->t_rttmin;
1073 	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
1074 	    t * tcp_backoff[tp->t_rxtshift],
1075 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
1076 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1077 		tp->t_rxtshift++;
1078 }
1079