1 /* $NetBSD: tcp_output.c,v 1.142 2006/03/25 13:34:35 seanb Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 34 * 35 * NRL grants permission for redistribution and use in source and binary 36 * forms, with or without modification, of the software and documentation 37 * created at NRL provided that the following conditions are met: 38 * 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgements: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * This product includes software developed at the Information 49 * Technology Division, US Naval Research Laboratory. 50 * 4. Neither the name of the NRL nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 65 * 66 * The views and conclusions contained in the software and documentation 67 * are those of the authors and should not be interpreted as representing 68 * official policies, either expressed or implied, of the US Naval 69 * Research Laboratory (NRL). 
70 */ 71 72 /*- 73 * Copyright (c) 1997, 1998, 2001, 2005 The NetBSD Foundation, Inc. 74 * All rights reserved. 75 * 76 * This code is derived from software contributed to The NetBSD Foundation 77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 78 * Facility, NASA Ames Research Center. 79 * This code is derived from software contributed to The NetBSD Foundation 80 * by Charles M. Hannum. 81 * 82 * Redistribution and use in source and binary forms, with or without 83 * modification, are permitted provided that the following conditions 84 * are met: 85 * 1. Redistributions of source code must retain the above copyright 86 * notice, this list of conditions and the following disclaimer. 87 * 2. Redistributions in binary form must reproduce the above copyright 88 * notice, this list of conditions and the following disclaimer in the 89 * documentation and/or other materials provided with the distribution. 90 * 3. All advertising materials mentioning features or use of this software 91 * must display the following acknowledgement: 92 * This product includes software developed by the NetBSD 93 * Foundation, Inc. and its contributors. 94 * 4. Neither the name of The NetBSD Foundation nor the names of its 95 * contributors may be used to endorse or promote products derived 96 * from this software without specific prior written permission. 97 * 98 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 99 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 100 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 101 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 102 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 103 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 104 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 105 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 106 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 107 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 108 * POSSIBILITY OF SUCH DAMAGE. 109 */ 110 111 /* 112 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 113 * The Regents of the University of California. All rights reserved. 114 * 115 * Redistribution and use in source and binary forms, with or without 116 * modification, are permitted provided that the following conditions 117 * are met: 118 * 1. Redistributions of source code must retain the above copyright 119 * notice, this list of conditions and the following disclaimer. 120 * 2. Redistributions in binary form must reproduce the above copyright 121 * notice, this list of conditions and the following disclaimer in the 122 * documentation and/or other materials provided with the distribution. 123 * 3. Neither the name of the University nor the names of its contributors 124 * may be used to endorse or promote products derived from this software 125 * without specific prior written permission. 126 * 127 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 128 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 129 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 130 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 131 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 132 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 133 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 134 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 135 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 136 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 137 * SUCH DAMAGE. 138 * 139 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 140 */ 141 142 #include <sys/cdefs.h> 143 __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.142 2006/03/25 13:34:35 seanb Exp $"); 144 145 #include "opt_inet.h" 146 #include "opt_ipsec.h" 147 #include "opt_tcp_debug.h" 148 149 #include <sys/param.h> 150 #include <sys/systm.h> 151 #include <sys/malloc.h> 152 #include <sys/mbuf.h> 153 #include <sys/protosw.h> 154 #include <sys/socket.h> 155 #include <sys/socketvar.h> 156 #include <sys/errno.h> 157 #include <sys/domain.h> 158 #include <sys/kernel.h> 159 #ifdef TCP_SIGNATURE 160 #include <sys/md5.h> 161 #endif 162 163 #include <net/if.h> 164 #include <net/route.h> 165 166 #include <netinet/in.h> 167 #include <netinet/in_systm.h> 168 #include <netinet/ip.h> 169 #include <netinet/in_pcb.h> 170 #include <netinet/ip_var.h> 171 172 #ifdef INET6 173 #ifndef INET 174 #include <netinet/in.h> 175 #endif 176 #include <netinet/ip6.h> 177 #include <netinet6/in6_var.h> 178 #include <netinet6/ip6_var.h> 179 #include <netinet6/in6_pcb.h> 180 #include <netinet6/nd6.h> 181 #endif 182 183 #ifdef FAST_IPSEC 184 #include <netipsec/ipsec.h> 185 #include <netipsec/key.h> 186 #endif /* FAST_IPSEC*/ 187 #ifdef IPSEC 188 #include <netinet6/ipsec.h> 189 #endif 190 191 #include <netinet/tcp.h> 192 #define TCPOUTFLAGS 193 #include <netinet/tcp_fsm.h> 194 #include <netinet/tcp_seq.h> 195 #include <netinet/tcp_timer.h> 196 #include <netinet/tcp_var.h> 197 #include 
<netinet/tcpip.h>
#include <netinet/tcp_debug.h>
#include <netinet/in_offload.h>

#ifdef IPSEC
#include <netkey/key.h>
#endif

#ifdef notyet
extern struct mbuf *m_copypack();
#endif

/*
 * Knob to enable Congestion Window Monitoring, and control
 * the burst size it allows.  Default burst is 4 packets, per
 * the Internet draft.
 */
int	tcp_cwm = 0;
int	tcp_cwm_burstsize = 4;

/*
 * Optional event counters for the output path.  When the kernel is
 * built with TCP_OUTPUT_COUNTERS, TCP_OUTPUT_COUNTER_INCR() bumps the
 * ev_count member of the given counter; otherwise it expands to nothing.
 */
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_output_bigheader;
extern struct evcnt tcp_output_predict_hit;
extern struct evcnt tcp_output_predict_miss;
extern struct evcnt tcp_output_copysmall;
extern struct evcnt tcp_output_copybig;
extern struct evcnt tcp_output_refbig;

#define	TCP_OUTPUT_COUNTER_INCR(ev)	(ev)->ev_count++
#else

#define	TCP_OUTPUT_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_OUTPUT_COUNTERS */

/*
 * tcp_segsize: compute the transmit and receive segment sizes for the
 * connection described by tp.
 *
 *	tp		connection control block; read, and updated as
 *			described below.
 *	txsegsizep	out: segment size to use when sending.
 *	rxsegsizep	out: estimated size of inbound segments.
 *
 * Returns 0 on success, or EMSGSIZE when the size left after
 * subtracting option and IPsec overhead is negative.  On the EMSGSIZE
 * path neither output parameter is written.
 *
 * Side effects: when the transmit size differs from tp->t_segsz,
 * tp->t_segsz is updated; if the new size is smaller, tp->snd_cwnd and
 * tp->snd_ssthresh are rescaled to the new segment size as well.
 */
static
#ifndef GPROF
inline
#endif
int
tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep)
{
#ifdef INET
	struct inpcb *inp = tp->t_inpcb;
#endif
#ifdef INET6
	struct in6pcb *in6p = tp->t_in6pcb;
#endif
	struct socket *so = NULL;
	struct rtentry *rt;
	struct ifnet *ifp;
	int size;
	int hdrlen;
	int optlen;

#ifdef DIAGNOSTIC
	if (tp->t_inpcb && tp->t_in6pcb)
		panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
#endif
	/*
	 * Fixed header overhead (IP or IPv6 header plus TCP header) for
	 * the connection's address family.  An unhandled family falls
	 * back to the default MSS; hdrlen is not used on that path.
	 */
	switch (tp->t_family) {
#ifdef INET
	case AF_INET:
		hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		break;
#endif
	default:
		size = tcp_mssdflt;
		goto out;
	}

	/* Look up the route and owning socket through whichever PCB is set. */
	rt = NULL;
#ifdef INET
	if (inp) {
		rt = in_pcbrtentry(inp);
		so = inp->inp_socket;
	}
#endif
#ifdef INET6
	if (in6p) {
		rt = in6_pcbrtentry(in6p);
		so = in6p->in6p_socket;
	}
#endif
	/* Without a route there is no MTU to consult: use the default MSS. */
	if (rt == NULL) {
		size = tcp_mssdflt;
		goto out;
	}

	ifp = rt->rt_ifp;

	/*
	 * Pick the base size.  In order of preference:
	 *  - the route's MTU, when path MTU discovery is enabled for this
	 *    connection and the route records a non-zero MTU;
	 *  - the interface MTU for loopback interfaces;
	 *  - the interface MTU for IPv4 when path MTU discovery is on or
	 *    in_localaddr() accepts the peer address;
	 *  - the per-case IPv6 rules below.
	 * If none of the branches assigns, size stays at tcp_mssdflt.
	 */
	size = tcp_mssdflt;
	if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
#ifdef INET6
		if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
			/*
			 * RFC2460 section 5, last paragraph: if path MTU is
			 * smaller than 1280, use 1280 as packet size and
			 * attach fragment header.
			 */
			size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
		} else
			size = rt->rt_rmx.rmx_mtu - hdrlen;
#else
		size = rt->rt_rmx.rmx_mtu - hdrlen;
#endif
	} else if (ifp->if_flags & IFF_LOOPBACK)
		size = ifp->if_mtu - hdrlen;
#ifdef INET
	else if (inp && tp->t_mtudisc)
		size = ifp->if_mtu - hdrlen;
	else if (inp && in_localaddr(inp->inp_faddr))
		size = ifp->if_mtu - hdrlen;
#endif
#ifdef INET6
	else if (in6p) {
#ifdef INET
		if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
			/*
			 * mapped addr case: take the last 32 bits of the
			 * IPv6 foreign address as an IPv4 address and apply
			 * the same tests as the IPv4 branches above.
			 */
			struct in_addr d;
			bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
			if (tp->t_mtudisc || in_localaddr(d))
				size = ifp->if_mtu - hdrlen;
		} else
#endif
		{
			/*
			 * for IPv6, path MTU discovery is always turned on,
			 * or the node must use packet size <= 1280.
			 */
			size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU;
			size -= hdrlen;
		}
	}
#endif
 out:
	/*
	 * Now we must make room for whatever extra TCP/IP options are in
	 * the packet.
	 */
	optlen = tcp_optlen(tp);

	/*
	 * XXX tp->t_ourmss should have the right size, but without this code
	 * fragmentation will occur... need more investigation
	 *
	 * From here on optlen accumulates IPsec header size and IP-level
	 * option length in addition to the TCP option length.
	 */
#ifdef INET
	if (inp) {
#if defined(IPSEC) || defined(FAST_IPSEC)
		if (! IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND))
			optlen += ipsec4_hdrsiz_tcp(tp);
#endif
		optlen += ip_optlen(inp);
	}
#endif
#ifdef INET6
#ifdef INET
	/* IPv4 carried over an IPv6 PCB: only IPsec overhead is added. */
	if (in6p && tp->t_family == AF_INET) {
#if defined(IPSEC) || defined(FAST_IPSEC)
		if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
			optlen += ipsec4_hdrsiz_tcp(tp);
#endif
		/* XXX size -= ip_optlen(in6p); */
	} else
#endif
	if (in6p && tp->t_family == AF_INET6) {
		/* Note: this IPsec adjustment is built only under IPSEC. */
#ifdef IPSEC
		if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
			optlen += ipsec6_hdrsiz_tcp(tp);
#endif
		optlen += ip6_optlen(in6p);
	}
#endif
	size -= optlen;

	/* there may not be any room for data if mtu is too small */
	if (size < 0)
		return (EMSGSIZE);

	/*
	 * *rxsegsizep holds *estimated* inbound segment size (estimation
	 * assumes that path MTU is the same for both ways).  this is only
	 * for silly window avoidance, do not use the value for other purposes.
	 *
	 * ipseclen is subtracted from both sides, this may not be right.
	 * I'm not quite sure about this (could someone comment).
	 *
	 * NOTE(review): if min() compares its arguments as unsigned, a
	 * t_peermss/t_ourmss smaller than optlen would wrap and `size'
	 * would be selected -- confirm against the min() definition.
	 */
	*txsegsizep = min(tp->t_peermss - optlen, size);
	/*
	 * Never send more than half a buffer full.  This ensures that we can
	 * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
	 * therefore acks will never be delayed unless we run out of data to
	 * transmit.
	 */
	if (so)
		*txsegsizep = min(so->so_snd.sb_hiwat >> 1, *txsegsizep);
	*rxsegsizep = min(tp->t_ourmss - optlen, size);

	if (*txsegsizep != tp->t_segsz) {
		/*
		 * If the new segment size is larger, we don't want to
		 * mess up the congestion window, but if it is smaller
		 * we'll have to reduce the congestion window to ensure
		 * that we don't get into trouble with initial windows
		 * and the rest.  In any case, if the segment size
		 * has changed, chances are the path has, too, and
		 * our congestion window will be different.
		 */
		if (*txsegsizep < tp->t_segsz) {
			/*
			 * Keep the same number of whole segments in each
			 * window, expressed in the new segment size, but
			 * never less than one new-size segment.
			 */
			tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
					   * *txsegsizep, *txsegsizep);
			tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
						* *txsegsizep, *txsegsizep);
		}
		tp->t_segsz = *txsegsizep;
	}

	return (0);
}

/*
 * tcp_build_datapkt: allocate the mbuf (chain) for an outgoing data
 * segment and fill in its payload from the socket send buffer.
 *
 *	tp	connection control block; its t_lastm/t_inoff/t_lastoff/
 *		t_lastlen send-position cache is read and updated.
 *	so	socket whose so_snd buffer supplies the data.
 *	off	offset of the first payload byte within so_snd.
 *	len	number of payload bytes.
 *	hdrlen	bytes to reserve for headers ahead of the payload.
 *	mp	out: the resulting mbuf chain; written only on success.
 *
 * Returns 0 on success or ENOBUFS if an mbuf, a cluster, or the data
 * copy could not be allocated.  The tcpstat send counters are updated
 * before any allocation, so they are bumped on the ENOBUFS paths too.
 */
static
#ifndef GPROF
inline
#endif
int
tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
    long len, int hdrlen, struct mbuf **mp)
{
	struct mbuf *m, *m0;

	/*
	 * Classify the segment for statistics: a forced one-byte send is a
	 * probe, a send below snd_max is a retransmission, anything else
	 * is new data.
	 */
	if (tp->t_force && len == 1)
		tcpstat.tcps_sndprobe++;
	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
		tcpstat.tcps_sndrexmitpack++;
		tcpstat.tcps_sndrexmitbyte += len;
	} else {
		tcpstat.tcps_sndpack++;
		tcpstat.tcps_sndbyte += len;
	}
#ifdef notyet
	/* Alternative implementation; built only when `notyet' is defined. */
	if ((m = m_copypack(so->so_snd.sb_mb, off,
	    (int)len, max_linkhdr + hdrlen)) == 0)
		return (ENOBUFS);
	/*
	 * m_copypack left space for our hdr; use it.
	 */
	m->m_len += hdrlen;
	m->m_data -= hdrlen;
#else
	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	if (__predict_false(m == NULL))
		return (ENOBUFS);
	MCLAIM(m, &tcp_tx_mowner);

	/*
	 * XXX Because other code assumes headers will fit in
	 * XXX one header mbuf.
	 *
	 * (This code should almost *never* be run.)
	 *
	 * If link header plus protocol headers exceed MHLEN, attach a
	 * cluster; give up if the cluster could not be attached.
	 */
	if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
		TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return (ENOBUFS);
		}
	}

	/* Skip max_linkhdr bytes, then account for hdrlen bytes of headers. */
	m->m_data += max_linkhdr;
	m->m_len = hdrlen;

	/*
	 * To avoid traversing the whole sb_mb chain for correct
	 * data to send, remember last sent mbuf, its offset and
	 * the sent size.  When called the next time, see if the
	 * data to send is directly following the previous transfer.
	 * This is important for large TCP windows.
	 */
	if (off == 0 || tp->t_lastm == NULL ||
	    (tp->t_lastoff + tp->t_lastlen) != off) {
		TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
		/*
		 * Either a new packet or a retransmit.
		 * Start from the beginning.
		 */
		tp->t_lastm = so->so_snd.sb_mb;
		tp->t_inoff = off;
	} else {
		/* Contiguous with the previous send: advance from there. */
		TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
		tp->t_inoff += tp->t_lastlen;
	}

	/*
	 * Traverse forward to next packet: walk the chain until t_inoff
	 * falls inside t_lastm, so that (t_lastm, t_inoff) names the first
	 * byte to send.  Running off the end of the chain panics.
	 */
	while (tp->t_inoff > 0) {
		if (tp->t_lastm == NULL)
			panic("tp->t_lastm == NULL");
		if (tp->t_inoff < tp->t_lastm->m_len)
			break;
		tp->t_inoff -= tp->t_lastm->m_len;
		tp->t_lastm = tp->t_lastm->m_next;
	}

	/*
	 * Record this transfer for the next call, then switch `off' from a
	 * send-buffer offset to an offset relative to m0.
	 */
	tp->t_lastoff = off;
	tp->t_lastlen = len;
	m0 = tp->t_lastm;
	off = tp->t_inoff;

	if (len <= M_TRAILINGSPACE(m)) {
		/* Payload fits behind the headers: copy it into m itself. */
		m_copydata(m0, off, (int) len, mtod(m, caddr_t) + hdrlen);
		m->m_len += len;
		TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
	} else {
		/* Otherwise chain an m_copy() of the data after the header. */
		m->m_next = m_copy(m0, off, (int) len);
		if (m->m_next == NULL) {
			m_freem(m);
			return (ENOBUFS);
		}
#ifdef TCP_OUTPUT_COUNTERS
		/* Count by whether the first copied mbuf has M_EXT set. */
		if (m->m_next->m_flags & M_EXT)
			TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
		else
			TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
#endif /* TCP_OUTPUT_COUNTERS */
	}
#endif

	*mp = m;
	return (0);
}

/*
 * Tcp output routine: figure out what should be sent and send it.
536 */ 537 int 538 tcp_output(struct tcpcb *tp) 539 { 540 struct socket *so; 541 struct route *ro; 542 long len, win; 543 int off, flags, error; 544 struct mbuf *m; 545 struct ip *ip; 546 #ifdef INET6 547 struct ip6_hdr *ip6; 548 #endif 549 struct tcphdr *th; 550 u_char opt[MAX_TCPOPTLEN]; 551 unsigned optlen, hdrlen, packetlen; 552 unsigned int sack_numblks; 553 int idle, sendalot, txsegsize, rxsegsize; 554 int txsegsize_nosack; 555 int maxburst = TCP_MAXBURST; 556 int af; /* address family on the wire */ 557 int iphdrlen; 558 int has_tso, use_tso; 559 int sack_rxmit; 560 int sack_bytes_rxmt; 561 struct sackhole *p; 562 #ifdef TCP_SIGNATURE 563 int sigoff = 0; 564 #endif 565 566 #ifdef DIAGNOSTIC 567 if (tp->t_inpcb && tp->t_in6pcb) 568 panic("tcp_output: both t_inpcb and t_in6pcb are set"); 569 #endif 570 so = NULL; 571 ro = NULL; 572 if (tp->t_inpcb) { 573 so = tp->t_inpcb->inp_socket; 574 ro = &tp->t_inpcb->inp_route; 575 } 576 #ifdef INET6 577 else if (tp->t_in6pcb) { 578 so = tp->t_in6pcb->in6p_socket; 579 ro = (struct route *)&tp->t_in6pcb->in6p_route; 580 } 581 #endif 582 583 switch (af = tp->t_family) { 584 #ifdef INET 585 case AF_INET: 586 if (tp->t_inpcb) 587 break; 588 #ifdef INET6 589 /* mapped addr case */ 590 if (tp->t_in6pcb) 591 break; 592 #endif 593 return (EINVAL); 594 #endif 595 #ifdef INET6 596 case AF_INET6: 597 if (tp->t_in6pcb) 598 break; 599 return (EINVAL); 600 #endif 601 default: 602 return (EAFNOSUPPORT); 603 } 604 605 if (tcp_segsize(tp, &txsegsize, &rxsegsize)) 606 return (EMSGSIZE); 607 608 idle = (tp->snd_max == tp->snd_una); 609 610 /* 611 * Determine if we can use TCP segmentation offload: 612 * - If we're using IPv4 613 * - If there is not an IPsec policy that prevents it 614 * - If the interface can do it 615 */ 616 has_tso = tp->t_inpcb != NULL && 617 #if defined(IPSEC) || defined(FAST_IPSEC) 618 IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp, 619 IPSEC_DIR_OUTBOUND) && 620 #endif 621 tp->t_inpcb->inp_route.ro_rt != NULL && 622 
(tp->t_inpcb->inp_route.ro_rt->rt_ifp->if_capenable & 623 IFCAP_TSOv4) != 0; 624 625 /* 626 * Restart Window computation. From draft-floyd-incr-init-win-03: 627 * 628 * Optionally, a TCP MAY set the restart window to the 629 * minimum of the value used for the initial window and 630 * the current value of cwnd (in other words, using a 631 * larger value for the restart window should never increase 632 * the size of cwnd). 633 */ 634 if (tcp_cwm) { 635 /* 636 * Hughes/Touch/Heidemann Congestion Window Monitoring. 637 * Count the number of packets currently pending 638 * acknowledgement, and limit our congestion window 639 * to a pre-determined allowed burst size plus that count. 640 * This prevents bursting once all pending packets have 641 * been acknowledged (i.e. transmission is idle). 642 * 643 * XXX Link this to Initial Window? 644 */ 645 tp->snd_cwnd = min(tp->snd_cwnd, 646 (tcp_cwm_burstsize * txsegsize) + 647 (tp->snd_nxt - tp->snd_una)); 648 } else { 649 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) { 650 /* 651 * We have been idle for "a while" and no acks are 652 * expected to clock out any data we send -- 653 * slow start to get ack "clock" running again. 654 */ 655 int ss = tcp_init_win; 656 #ifdef INET 657 if (tp->t_inpcb && 658 in_localaddr(tp->t_inpcb->inp_faddr)) 659 ss = tcp_init_win_local; 660 #endif 661 #ifdef INET6 662 if (tp->t_in6pcb && 663 in6_localaddr(&tp->t_in6pcb->in6p_faddr)) 664 ss = tcp_init_win_local; 665 #endif 666 tp->snd_cwnd = min(tp->snd_cwnd, 667 TCP_INITIAL_WINDOW(ss, txsegsize)); 668 } 669 } 670 671 txsegsize_nosack = txsegsize; 672 again: 673 use_tso = has_tso; 674 TCP_REASS_LOCK(tp); 675 sack_numblks = tcp_sack_numblks(tp); 676 if (sack_numblks) { 677 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { 678 /* don't duplicate D-SACK. 
*/ 679 use_tso = 0; 680 } 681 txsegsize = txsegsize_nosack - TCP_SACK_OPTLEN(sack_numblks); 682 } else { 683 txsegsize = txsegsize_nosack; 684 } 685 686 /* 687 * Determine length of data that should be transmitted, and 688 * flags that should be used. If there is some data or critical 689 * controls (SYN, RST) to send, then transmit; otherwise, 690 * investigate further. 691 * 692 * Readjust SACK information to avoid resending duplicate data. 693 */ 694 if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max)) 695 tcp_sack_adjust(tp); 696 sendalot = 0; 697 off = tp->snd_nxt - tp->snd_una; 698 win = min(tp->snd_wnd, tp->snd_cwnd); 699 700 flags = tcp_outflags[tp->t_state]; 701 702 /* 703 * Send any SACK-generated retransmissions. If we're explicitly trying 704 * to send out new data (when sendalot is 1), bypass this function. 705 * If we retransmit in fast recovery mode, decrement snd_cwnd, since 706 * we're replacing a (future) new transmission with a retransmission 707 * now, and we previously incremented snd_cwnd in tcp_input(). 708 */ 709 /* 710 * Still in sack recovery , reset rxmit flag to zero. 711 */ 712 sack_rxmit = 0; 713 sack_bytes_rxmt = 0; 714 len = 0; 715 p = NULL; 716 do { 717 long cwin; 718 if (!TCP_SACK_ENABLED(tp)) 719 break; 720 if (tp->t_partialacks < 0) 721 break; 722 p = tcp_sack_output(tp, &sack_bytes_rxmt); 723 if (p == NULL) 724 break; 725 726 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; 727 if (cwin < 0) 728 cwin = 0; 729 /* Do not retransmit SACK segments beyond snd_recover */ 730 if (SEQ_GT(p->end, tp->snd_recover)) { 731 /* 732 * (At least) part of sack hole extends beyond 733 * snd_recover. Check to see if we can rexmit data 734 * for this hole. 735 */ 736 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { 737 /* 738 * Can't rexmit any more data for this hole. 739 * That data will be rexmitted in the next 740 * sack recovery episode, when snd_recover 741 * moves past p->rxmit. 
742 */ 743 p = NULL; 744 break; 745 } 746 /* Can rexmit part of the current hole */ 747 len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); 748 } else 749 len = ((long)ulmin(cwin, p->end - p->rxmit)); 750 off = p->rxmit - tp->snd_una; 751 if (off + len > so->so_snd.sb_cc) { 752 /* 1 for TH_FIN */ 753 KASSERT(off + len == so->so_snd.sb_cc + 1); 754 KASSERT(p->rxmit + len == tp->snd_max); 755 len = so->so_snd.sb_cc - off; 756 } 757 if (len > 0) { 758 sack_rxmit = 1; 759 sendalot = 1; 760 } 761 } while (/*CONSTCOND*/0); 762 763 /* 764 * If in persist timeout with window of 0, send 1 byte. 765 * Otherwise, if window is small but nonzero 766 * and timer expired, we will send what we can 767 * and go to transmit state. 768 */ 769 if (tp->t_force) { 770 if (win == 0) { 771 /* 772 * If we still have some data to send, then 773 * clear the FIN bit. Usually this would 774 * happen below when it realizes that we 775 * aren't sending all the data. However, 776 * if we have exactly 1 byte of unset data, 777 * then it won't clear the FIN bit below, 778 * and if we are in persist state, we wind 779 * up sending the packet without recording 780 * that we sent the FIN bit. 781 * 782 * We can't just blindly clear the FIN bit, 783 * because if we don't have any more data 784 * to send then the probe will be the FIN 785 * itself. 786 */ 787 if (off < so->so_snd.sb_cc) 788 flags &= ~TH_FIN; 789 win = 1; 790 } else { 791 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 792 tp->t_rxtshift = 0; 793 } 794 } 795 796 if (!TCP_SACK_ENABLED(tp)) { 797 if (win < so->so_snd.sb_cc) { 798 len = win - off; 799 flags &= ~TH_FIN; 800 } else 801 len = so->so_snd.sb_cc - off; 802 } else if (sack_rxmit == 0) { 803 if (sack_bytes_rxmt != 0) { 804 long cwin; 805 806 /* 807 * We are inside of a SACK recovery episode and are 808 * sending new data, having retransmitted all the 809 * data possible in the scoreboard. 
810 */ 811 if (tp->snd_wnd < so->so_snd.sb_cc) { 812 len = tp->snd_wnd - off; 813 flags &= ~TH_FIN; 814 } else { 815 len = so->so_snd.sb_cc - off; 816 } 817 818 /* 819 * From FreeBSD: 820 * Don't remove this (len > 0) check ! 821 * We explicitly check for len > 0 here (although it 822 * isn't really necessary), to work around a gcc 823 * optimization issue - to force gcc to compute 824 * len above. Without this check, the computation 825 * of len is bungled by the optimizer. 826 */ 827 if (len > 0) { 828 cwin = tp->snd_cwnd - 829 (tp->snd_nxt - tp->sack_newdata) - 830 sack_bytes_rxmt; 831 if (cwin < 0) 832 cwin = 0; 833 if (cwin < len) { 834 len = cwin; 835 flags &= ~TH_FIN; 836 } 837 } 838 } else if (win < so->so_snd.sb_cc) { 839 len = win - off; 840 flags &= ~TH_FIN; 841 } else 842 len = so->so_snd.sb_cc - off; 843 } 844 845 if (len < 0) { 846 /* 847 * If FIN has been sent but not acked, 848 * but we haven't been called to retransmit, 849 * len will be -1. Otherwise, window shrank 850 * after we sent into it. If window shrank to 0, 851 * cancel pending retransmit, pull snd_nxt back 852 * to (closed) window, and set the persist timer 853 * if it isn't already going. If the window didn't 854 * close completely, just wait for an ACK. 855 * 856 * If we have a pending FIN, either it has already been 857 * transmitted or it is outside the window, so drop it. 858 * If the FIN has been transmitted, but this is not a 859 * retransmission, then len must be -1. Therefore we also 860 * prevent here the sending of `gratuitous FINs'. This 861 * eliminates the need to check for that case below (e.g. 862 * to back up snd_nxt before the FIN so that the sequence 863 * number is correct). 
864 */ 865 len = 0; 866 flags &= ~TH_FIN; 867 if (win == 0) { 868 TCP_TIMER_DISARM(tp, TCPT_REXMT); 869 tp->t_rxtshift = 0; 870 tp->snd_nxt = tp->snd_una; 871 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 872 tcp_setpersist(tp); 873 } 874 } 875 if (len > txsegsize) { 876 if (use_tso) { 877 /* 878 * Truncate TSO transfers to IP_MAXPACKET, and make 879 * sure that we send equal size transfers down the 880 * stack (rather than big-small-big-small-...). 881 */ 882 len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize; 883 if (len <= txsegsize) { 884 use_tso = 0; 885 } 886 } else 887 len = txsegsize; 888 flags &= ~TH_FIN; 889 sendalot = 1; 890 } else 891 use_tso = 0; 892 if (sack_rxmit) { 893 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) 894 flags &= ~TH_FIN; 895 } 896 897 win = sbspace(&so->so_rcv); 898 899 /* 900 * Sender silly window avoidance. If connection is idle 901 * and can send all data, a maximum segment, 902 * at least a maximum default-size segment do it, 903 * or are forced, do it; otherwise don't bother. 904 * If peer's buffer is tiny, then send 905 * when window is at least half open. 906 * If retransmitting (possibly after persist timer forced us 907 * to send into a small window), then must resend. 908 */ 909 if (len) { 910 if (len >= txsegsize) 911 goto send; 912 if ((so->so_state & SS_MORETOCOME) == 0 && 913 ((idle || tp->t_flags & TF_NODELAY) && 914 len + off >= so->so_snd.sb_cc)) 915 goto send; 916 if (tp->t_force) 917 goto send; 918 if (len >= tp->max_sndwnd / 2) 919 goto send; 920 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 921 goto send; 922 if (sack_rxmit) 923 goto send; 924 } 925 926 /* 927 * Compare available window to amount of window known to peer 928 * (as advertised window less next expected input). If the 929 * difference is at least twice the size of the largest segment 930 * we expect to receive (i.e. two segments) or at least 50% of 931 * the maximum possible window, then want to send a window update 932 * to peer. 
933 */ 934 if (win > 0) { 935 /* 936 * "adv" is the amount we can increase the window, 937 * taking into account that we are limited by 938 * TCP_MAXWIN << tp->rcv_scale. 939 */ 940 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - 941 (tp->rcv_adv - tp->rcv_nxt); 942 943 if (adv >= (long) (2 * rxsegsize)) 944 goto send; 945 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 946 goto send; 947 } 948 949 /* 950 * Send if we owe peer an ACK. 951 */ 952 if (tp->t_flags & TF_ACKNOW) 953 goto send; 954 if (flags & (TH_SYN|TH_FIN|TH_RST)) 955 goto send; 956 if (SEQ_GT(tp->snd_up, tp->snd_una)) 957 goto send; 958 /* 959 * In SACK, it is possible for tcp_output to fail to send a segment 960 * after the retransmission timer has been turned off. Make sure 961 * that the retransmission timer is set. 962 */ 963 if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) && 964 !TCP_TIMER_ISARMED(tp, TCPT_REXMT) && 965 !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { 966 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 967 goto just_return; 968 } 969 970 /* 971 * TCP window updates are not reliable, rather a polling protocol 972 * using ``persist'' packets is used to insure receipt of window 973 * updates. The three ``states'' for the output side are: 974 * idle not doing retransmits or persists 975 * persisting to move a small or zero window 976 * (re)transmitting and thereby not persisting 977 * 978 * tp->t_timer[TCPT_PERSIST] 979 * is set when we are in persist state. 980 * tp->t_force 981 * is set when we are called to send a persist packet. 982 * tp->t_timer[TCPT_REXMT] 983 * is set when we are retransmitting 984 * The output side is idle when both timers are zero. 985 * 986 * If send window is too small, there is data to transmit, and no 987 * retransmit or persist is pending, then go to persist state. 988 * If nothing happens soon, send when timer expires: 989 * if window is nonzero, transmit what we can, 990 * otherwise force out a byte. 
991 */ 992 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 993 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 994 tp->t_rxtshift = 0; 995 tcp_setpersist(tp); 996 } 997 998 /* 999 * No reason to send a segment, just return. 1000 */ 1001 just_return: 1002 TCP_REASS_UNLOCK(tp); 1003 return (0); 1004 1005 send: 1006 /* 1007 * Before ESTABLISHED, force sending of initial options 1008 * unless TCP set not to do any options. 1009 * NOTE: we assume that the IP/TCP header plus TCP options 1010 * always fit in a single mbuf, leaving room for a maximum 1011 * link header, i.e. 1012 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES 1013 */ 1014 optlen = 0; 1015 switch (af) { 1016 #ifdef INET 1017 case AF_INET: 1018 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 1019 break; 1020 #endif 1021 #ifdef INET6 1022 case AF_INET6: 1023 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1024 break; 1025 #endif 1026 default: /*pacify gcc*/ 1027 iphdrlen = 0; 1028 break; 1029 } 1030 hdrlen = iphdrlen; 1031 if (flags & TH_SYN) { 1032 struct rtentry *rt; 1033 1034 rt = NULL; 1035 #ifdef INET 1036 if (tp->t_inpcb) 1037 rt = in_pcbrtentry(tp->t_inpcb); 1038 #endif 1039 #ifdef INET6 1040 if (tp->t_in6pcb) 1041 rt = in6_pcbrtentry(tp->t_in6pcb); 1042 #endif 1043 1044 tp->snd_nxt = tp->iss; 1045 tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ? 
1046 rt->rt_ifp : NULL, af); 1047 if ((tp->t_flags & TF_NOOPT) == 0) { 1048 opt[0] = TCPOPT_MAXSEG; 1049 opt[1] = 4; 1050 opt[2] = (tp->t_ourmss >> 8) & 0xff; 1051 opt[3] = tp->t_ourmss & 0xff; 1052 optlen = 4; 1053 1054 if ((tp->t_flags & TF_REQ_SCALE) && 1055 ((flags & TH_ACK) == 0 || 1056 (tp->t_flags & TF_RCVD_SCALE))) { 1057 *((u_int32_t *) (opt + optlen)) = htonl( 1058 TCPOPT_NOP << 24 | 1059 TCPOPT_WINDOW << 16 | 1060 TCPOLEN_WINDOW << 8 | 1061 tp->request_r_scale); 1062 optlen += 4; 1063 } 1064 if (tcp_do_sack) { 1065 u_int8_t *cp = (u_int8_t *)(opt + optlen); 1066 1067 cp[0] = TCPOPT_SACK_PERMITTED; 1068 cp[1] = 2; 1069 cp[2] = TCPOPT_NOP; 1070 cp[3] = TCPOPT_NOP; 1071 optlen += 4; 1072 } 1073 } 1074 } 1075 1076 /* 1077 * Send a timestamp and echo-reply if this is a SYN and our side 1078 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 1079 * and our peer have sent timestamps in our SYN's. 1080 */ 1081 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 1082 (flags & TH_RST) == 0 && 1083 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 1084 (tp->t_flags & TF_RCVD_TSTMP))) { 1085 u_int32_t *lp = (u_int32_t *)(opt + optlen); 1086 1087 /* Form timestamp option as shown in appendix A of RFC 1323. */ 1088 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 1089 *lp++ = htonl(TCP_TIMESTAMP(tp)); 1090 *lp = htonl(tp->ts_recent); 1091 optlen += TCPOLEN_TSTAMP_APPA; 1092 } 1093 1094 /* 1095 * Tack on the SACK block if it is necessary. 
1096 */ 1097 if (sack_numblks) { 1098 int sack_len; 1099 u_char *bp = (u_char *)(opt + optlen); 1100 u_int32_t *lp = (u_int32_t *)(bp + 4); 1101 struct ipqent *tiqe; 1102 1103 sack_len = sack_numblks * 8 + 2; 1104 bp[0] = TCPOPT_NOP; 1105 bp[1] = TCPOPT_NOP; 1106 bp[2] = TCPOPT_SACK; 1107 bp[3] = sack_len; 1108 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { 1109 sack_numblks--; 1110 *lp++ = htonl(tp->rcv_dsack_block.left); 1111 *lp++ = htonl(tp->rcv_dsack_block.right); 1112 tp->rcv_sack_flags &= ~TCPSACK_HAVED; 1113 } 1114 for (tiqe = TAILQ_FIRST(&tp->timeq); 1115 sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) { 1116 KASSERT(tiqe != NULL); 1117 sack_numblks--; 1118 *lp++ = htonl(tiqe->ipqe_seq); 1119 *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len + 1120 ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0)); 1121 } 1122 optlen += sack_len + 2; 1123 } 1124 TCP_REASS_UNLOCK(tp); 1125 1126 #ifdef TCP_SIGNATURE 1127 #if defined(INET6) && defined(FAST_IPSEC) 1128 if (tp->t_family == AF_INET) 1129 #endif 1130 if (tp->t_flags & TF_SIGNATURE) { 1131 u_char *bp; 1132 /* 1133 * Initialize TCP-MD5 option (RFC2385) 1134 */ 1135 bp = (u_char *)opt + optlen; 1136 *bp++ = TCPOPT_SIGNATURE; 1137 *bp++ = TCPOLEN_SIGNATURE; 1138 sigoff = optlen + 2; 1139 bzero(bp, TCP_SIGLEN); 1140 bp += TCP_SIGLEN; 1141 optlen += TCPOLEN_SIGNATURE; 1142 /* 1143 * Terminate options list and maintain 32-bit alignment. 
1144 */ 1145 *bp++ = TCPOPT_NOP; 1146 *bp++ = TCPOPT_EOL; 1147 optlen += 2; 1148 } 1149 #endif /* TCP_SIGNATURE */ 1150 1151 hdrlen += optlen; 1152 1153 #ifdef DIAGNOSTIC 1154 if (!use_tso && len > txsegsize) 1155 panic("tcp data to be sent is larger than segment"); 1156 else if (use_tso && len > IP_MAXPACKET) 1157 panic("tcp data to be sent is larger than max TSO size"); 1158 if (max_linkhdr + hdrlen > MCLBYTES) 1159 panic("tcphdr too big"); 1160 #endif 1161 1162 /* 1163 * Grab a header mbuf, attaching a copy of data to 1164 * be transmitted, and initialize the header from 1165 * the template for sends on this connection. 1166 */ 1167 if (len) { 1168 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m); 1169 if (error) 1170 goto out; 1171 /* 1172 * If we're sending everything we've got, set PUSH. 1173 * (This will keep happy those implementations which only 1174 * give data to the user when a buffer fills or 1175 * a PUSH comes in.) 1176 */ 1177 if (off + len == so->so_snd.sb_cc) 1178 flags |= TH_PUSH; 1179 } else { 1180 if (tp->t_flags & TF_ACKNOW) 1181 tcpstat.tcps_sndacks++; 1182 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 1183 tcpstat.tcps_sndctrl++; 1184 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 1185 tcpstat.tcps_sndurg++; 1186 else 1187 tcpstat.tcps_sndwinup++; 1188 1189 MGETHDR(m, M_DONTWAIT, MT_HEADER); 1190 if (m != NULL && max_linkhdr + hdrlen > MHLEN) { 1191 MCLGET(m, M_DONTWAIT); 1192 if ((m->m_flags & M_EXT) == 0) { 1193 m_freem(m); 1194 m = NULL; 1195 } 1196 } 1197 if (m == NULL) { 1198 error = ENOBUFS; 1199 goto out; 1200 } 1201 MCLAIM(m, &tcp_tx_mowner); 1202 m->m_data += max_linkhdr; 1203 m->m_len = hdrlen; 1204 } 1205 m->m_pkthdr.rcvif = (struct ifnet *)0; 1206 switch (af) { 1207 #ifdef INET 1208 case AF_INET: 1209 ip = mtod(m, struct ip *); 1210 #ifdef INET6 1211 ip6 = NULL; 1212 #endif 1213 th = (struct tcphdr *)(ip + 1); 1214 break; 1215 #endif 1216 #ifdef INET6 1217 case AF_INET6: 1218 ip = NULL; 1219 ip6 = mtod(m, struct ip6_hdr *); 1220 th 
= (struct tcphdr *)(ip6 + 1); 1221 break; 1222 #endif 1223 default: /*pacify gcc*/ 1224 ip = NULL; 1225 #ifdef INET6 1226 ip6 = NULL; 1227 #endif 1228 th = NULL; 1229 break; 1230 } 1231 if (tp->t_template == 0) 1232 panic("tcp_output"); 1233 if (tp->t_template->m_len < iphdrlen) 1234 panic("tcp_output"); 1235 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen); 1236 1237 /* 1238 * If we are doing retransmissions, then snd_nxt will 1239 * not reflect the first unsent octet. For ACK only 1240 * packets, we do not want the sequence number of the 1241 * retransmitted packet, we want the sequence number 1242 * of the next unsent octet. So, if there is no data 1243 * (and no SYN or FIN), use snd_max instead of snd_nxt 1244 * when filling in ti_seq. But if we are in persist 1245 * state, snd_max might reflect one byte beyond the 1246 * right edge of the window, so use snd_nxt in that 1247 * case, since we know we aren't doing a retransmission. 1248 * (retransmit and persist are mutually exclusive...) 1249 */ 1250 if (TCP_SACK_ENABLED(tp) && sack_rxmit) { 1251 th->th_seq = htonl(p->rxmit); 1252 p->rxmit += len; 1253 } else { 1254 if (len || (flags & (TH_SYN|TH_FIN)) || 1255 TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) 1256 th->th_seq = htonl(tp->snd_nxt); 1257 else 1258 th->th_seq = htonl(tp->snd_max); 1259 } 1260 th->th_ack = htonl(tp->rcv_nxt); 1261 if (optlen) { 1262 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); 1263 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 1264 } 1265 th->th_flags = flags; 1266 /* 1267 * Calculate receive window. Don't shrink window, 1268 * but avoid silly window syndrome. 
1269 */ 1270 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize) 1271 win = 0; 1272 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 1273 win = (long)TCP_MAXWIN << tp->rcv_scale; 1274 if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt)) 1275 win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt); 1276 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 1277 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 1278 u_int32_t urp = tp->snd_up - tp->snd_nxt; 1279 if (urp > IP_MAXPACKET) 1280 urp = IP_MAXPACKET; 1281 th->th_urp = htons((u_int16_t)urp); 1282 th->th_flags |= TH_URG; 1283 } else 1284 /* 1285 * If no urgent pointer to send, then we pull 1286 * the urgent pointer to the left edge of the send window 1287 * so that it doesn't drift into the send window on sequence 1288 * number wraparound. 1289 */ 1290 tp->snd_up = tp->snd_una; /* drag it along */ 1291 1292 #ifdef TCP_SIGNATURE 1293 #if defined(INET6) && defined(FAST_IPSEC) 1294 if (tp->t_family == AF_INET) /* XXX */ 1295 #endif 1296 if (sigoff && (tp->t_flags & TF_SIGNATURE)) { 1297 struct secasvar *sav; 1298 u_int8_t *sigp; 1299 1300 sav = tcp_signature_getsav(m, th); 1301 1302 if (sav == NULL) { 1303 if (m) 1304 m_freem(m); 1305 return (EPERM); 1306 } 1307 1308 m->m_pkthdr.len = hdrlen + len; 1309 sigp = (caddr_t)th + sizeof(*th) + sigoff; 1310 tcp_signature(m, th, (caddr_t)th - mtod(m, caddr_t), sav, sigp); 1311 1312 key_sa_recordxfer(sav, m); 1313 #ifdef FAST_IPSEC 1314 KEY_FREESAV(&sav); 1315 #else 1316 key_freesav(sav); 1317 #endif 1318 } 1319 #endif 1320 1321 /* 1322 * Set ourselves up to be checksummed just before the packet 1323 * hits the wire. 1324 */ 1325 switch (af) { 1326 #ifdef INET 1327 case AF_INET: 1328 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1329 if (use_tso) { 1330 m->m_pkthdr.segsz = txsegsize; 1331 m->m_pkthdr.csum_flags = M_CSUM_TSOv4; 1332 } else { 1333 m->m_pkthdr.csum_flags = M_CSUM_TCPv4; 1334 if (len + optlen) { 1335 /* Fixup the pseudo-header checksum. 
*/ 1336 /* XXXJRT Not IP Jumbogram safe. */ 1337 th->th_sum = in_cksum_addword(th->th_sum, 1338 htons((u_int16_t) (len + optlen))); 1339 } 1340 } 1341 break; 1342 #endif 1343 #ifdef INET6 1344 case AF_INET6: 1345 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1346 m->m_pkthdr.csum_flags = M_CSUM_TCPv6; 1347 if (len + optlen) { 1348 /* Fixup the pseudo-header checksum. */ 1349 /* XXXJRT: Not IPv6 Jumbogram safe. */ 1350 th->th_sum = in_cksum_addword(th->th_sum, 1351 htons((u_int16_t) (len + optlen))); 1352 } 1353 break; 1354 #endif 1355 } 1356 1357 /* 1358 * In transmit state, time the transmission and arrange for 1359 * the retransmit. In persist state, just set snd_max. 1360 */ 1361 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 1362 tcp_seq startseq = tp->snd_nxt; 1363 1364 /* 1365 * Advance snd_nxt over sequence space of this segment. 1366 * There are no states in which we send both a SYN and a FIN, 1367 * so we collapse the tests for these flags. 1368 */ 1369 if (flags & (TH_SYN|TH_FIN)) 1370 tp->snd_nxt++; 1371 if (sack_rxmit) 1372 goto timer; 1373 tp->snd_nxt += len; 1374 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 1375 tp->snd_max = tp->snd_nxt; 1376 /* 1377 * Time this transmission if not a retransmission and 1378 * not currently timing anything. 1379 */ 1380 if (tp->t_rtttime == 0) { 1381 tp->t_rtttime = tcp_now; 1382 tp->t_rtseq = startseq; 1383 tcpstat.tcps_segstimed++; 1384 } 1385 } 1386 1387 /* 1388 * Set retransmit timer if not currently set, 1389 * and not doing an ack or a keep-alive probe. 1390 * Initial value for retransmit timer is smoothed 1391 * round-trip time + 2 * round-trip time variance. 1392 * Initialize shift counter which is used for backoff 1393 * of retransmit time. 
1394 */ 1395 timer: 1396 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 1397 ((sack_rxmit && tp->snd_nxt != tp->snd_max) || 1398 tp->snd_nxt != tp->snd_una)) { 1399 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { 1400 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 1401 tp->t_rxtshift = 0; 1402 } 1403 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1404 } 1405 } else 1406 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 1407 tp->snd_max = tp->snd_nxt + len; 1408 1409 #ifdef TCP_DEBUG 1410 /* 1411 * Trace. 1412 */ 1413 if (so->so_options & SO_DEBUG) 1414 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0); 1415 #endif 1416 1417 /* 1418 * Fill in IP length and desired time to live and 1419 * send to IP level. There should be a better way 1420 * to handle ttl and tos; we could keep them in 1421 * the template, but need a way to checksum without them. 1422 */ 1423 m->m_pkthdr.len = hdrlen + len; 1424 1425 switch (af) { 1426 #ifdef INET 1427 case AF_INET: 1428 ip->ip_len = htons(m->m_pkthdr.len); 1429 packetlen = m->m_pkthdr.len; 1430 if (tp->t_inpcb) { 1431 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; 1432 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; 1433 } 1434 #ifdef INET6 1435 else if (tp->t_in6pcb) { 1436 ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/ 1437 ip->ip_tos = 0; /*XXX*/ 1438 } 1439 #endif 1440 break; 1441 #endif 1442 #ifdef INET6 1443 case AF_INET6: 1444 packetlen = m->m_pkthdr.len; 1445 ip6->ip6_nxt = IPPROTO_TCP; 1446 if (tp->t_in6pcb) { 1447 /* 1448 * we separately set hoplimit for every segment, since 1449 * the user might want to change the value via 1450 * setsockopt. Also, desired default hop limit might 1451 * be changed via Neighbor Discovery. 1452 */ 1453 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb, 1454 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL); 1455 } 1456 /* ip6->ip6_flow = ??? */ 1457 /* ip6_plen will be filled in ip6_output(). 
*/ 1458 break; 1459 #endif 1460 default: /*pacify gcc*/ 1461 packetlen = 0; 1462 break; 1463 } 1464 1465 switch (af) { 1466 #ifdef INET 1467 case AF_INET: 1468 { 1469 struct mbuf *opts; 1470 1471 if (tp->t_inpcb) 1472 opts = tp->t_inpcb->inp_options; 1473 else 1474 opts = NULL; 1475 error = ip_output(m, opts, ro, 1476 (tp->t_mtudisc ? IP_MTUDISC : 0) | 1477 (so->so_options & SO_DONTROUTE), 1478 (struct ip_moptions *)0, so); 1479 break; 1480 } 1481 #endif 1482 #ifdef INET6 1483 case AF_INET6: 1484 { 1485 struct ip6_pktopts *opts; 1486 1487 if (tp->t_in6pcb) 1488 opts = tp->t_in6pcb->in6p_outputopts; 1489 else 1490 opts = NULL; 1491 error = ip6_output(m, opts, (struct route_in6 *)ro, 1492 so->so_options & SO_DONTROUTE, 1493 (struct ip6_moptions *)0, so, NULL); 1494 break; 1495 } 1496 #endif 1497 default: 1498 error = EAFNOSUPPORT; 1499 break; 1500 } 1501 if (error) { 1502 out: 1503 if (error == ENOBUFS) { 1504 tcpstat.tcps_selfquench++; 1505 #ifdef INET 1506 if (tp->t_inpcb) 1507 tcp_quench(tp->t_inpcb, 0); 1508 #endif 1509 #ifdef INET6 1510 if (tp->t_in6pcb) 1511 tcp6_quench(tp->t_in6pcb, 0); 1512 #endif 1513 error = 0; 1514 } else if ((error == EHOSTUNREACH || error == ENETDOWN) && 1515 TCPS_HAVERCVDSYN(tp->t_state)) { 1516 tp->t_softerror = error; 1517 error = 0; 1518 } 1519 1520 /* Back out the seqence number advance. */ 1521 if (sack_rxmit) 1522 p->rxmit -= len; 1523 1524 /* Restart the delayed ACK timer, if necessary. */ 1525 if (tp->t_flags & TF_DELACK) 1526 TCP_RESTART_DELACK(tp); 1527 1528 return (error); 1529 } 1530 1531 if (packetlen > tp->t_pmtud_mtu_sent) 1532 tp->t_pmtud_mtu_sent = packetlen; 1533 1534 tcpstat.tcps_sndtotal++; 1535 if (tp->t_flags & TF_DELACK) 1536 tcpstat.tcps_delack++; 1537 1538 /* 1539 * Data sent (as far as we can tell). 1540 * If this advertises a larger window than any other segment, 1541 * then remember the size of the advertised window. 1542 * Any pending ACK has now been sent. 
1543 */ 1544 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 1545 tp->rcv_adv = tp->rcv_nxt + win; 1546 tp->last_ack_sent = tp->rcv_nxt; 1547 tp->t_flags &= ~TF_ACKNOW; 1548 TCP_CLEAR_DELACK(tp); 1549 #ifdef DIAGNOSTIC 1550 if (maxburst < 0) 1551 printf("tcp_output: maxburst exceeded by %d\n", -maxburst); 1552 #endif 1553 if (sendalot && (!tcp_do_newreno || --maxburst)) 1554 goto again; 1555 return (0); 1556 } 1557 1558 void 1559 tcp_setpersist(struct tcpcb *tp) 1560 { 1561 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2); 1562 int nticks; 1563 1564 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT)) 1565 panic("tcp_output REXMT"); 1566 /* 1567 * Start/restart persistance timer. 1568 */ 1569 if (t < tp->t_rttmin) 1570 t = tp->t_rttmin; 1571 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift], 1572 TCPTV_PERSMIN, TCPTV_PERSMAX); 1573 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks); 1574 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1575 tp->t_rxtshift++; 1576 } 1577 1578 #if defined(INET) 1579 /* 1580 * tcp4_segment: handle M_CSUM_TSOv4 by software. 1581 * 1582 * => always consume m. 1583 * => call output_func with output_arg for each segments. 
#if defined(INET)
/*
 * tcp4_segment: carry out an M_CSUM_TSOv4 request in software.
 *
 * The packet in `m' is cut into segments of m_pkthdr.segsz payload bytes.
 * Each segment is prefixed with its own copy of the IP and TCP headers,
 * in which ip_len, ip_id, th_seq and both checksums are rewritten, and is
 * then passed to output_func together with output_arg.
 *
 * => always consumes m.
 * => calls output_func with output_arg once per segment; a segment is
 *    not touched again here after that call, whatever it returns.
 * => returns 0, ENOMEM when an mbuf operation fails, or the first
 *    non-zero value returned by output_func.
 */

int
tcp4_segment(struct mbuf *m, int (*output_func)(void *, struct mbuf *),
    void *output_arg)
{
	struct mbuf *hdr = NULL;	/* detached headers, once split off */
	struct mbuf *rest;
	struct ip *iph;
	struct tcphdr *th;
	uint32_t tcpseq;
	uint16_t ipid;
	int iphlen, thlen, hlen;
	int len, mss;
	int error = 0;

	KASSERT((m->m_flags & M_PKTHDR) != 0);
	KASSERT((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0);

	/* Checksums are filled in below, so clear the request flags. */
	m->m_pkthdr.csum_flags = 0;
	len = m->m_pkthdr.len;
	KASSERT(len >= sizeof(*iph) + sizeof(*th));

	/* The fixed IP header must be contiguous before ip_hl is read. */
	if (m->m_len < sizeof(*iph)) {
		m = m_pullup(m, sizeof(*iph));
		if (m == NULL)
			goto nomem;
	}
	iph = mtod(m, struct ip *);
	iphlen = iph->ip_hl << 2;
	KASSERT(iph->ip_v == IPVERSION);
	KASSERT(iphlen >= sizeof(*iph));
	KASSERT(iph->ip_p == IPPROTO_TCP);
	ipid = ntohs(iph->ip_id);

	/* Likewise for the fixed TCP header (th_seq and th_off). */
	hlen = iphlen + sizeof(*th);
	if (m->m_len < hlen) {
		m = m_pullup(m, hlen);
		if (m == NULL)
			goto nomem;
	}
	th = (struct tcphdr *)(mtod(m, char *) + iphlen);
	tcpseq = ntohl(th->th_seq);
	thlen = th->th_off << 2;
	hlen = iphlen + thlen;		/* from here on: IP + full TCP header */

	mss = m->m_pkthdr.segsz;
	KASSERT(mss != 0);
	KASSERT(len > hlen);

	/* Detach the payload: headers stay in hdr, data continues as m. */
	rest = m_split(m, hlen, M_NOWAIT);
	if (rest == NULL)
		goto nomem;
	hdr = m;
	m = rest;
	len -= hlen;
	KASSERT(len % mss == 0);

	/*
	 * One pass per segment.  The sequence number, IP id and remaining
	 * length only advance after a segment was handed off successfully.
	 */
	for (; len > 0; len -= mss, tcpseq += mss, ipid++) {
		struct mbuf *n;

		/* Private copy of the headers for this segment. */
		n = m_dup(hdr, 0, hlen, M_NOWAIT);
		if (n == NULL)
			goto nomem;
		KASSERT(n->m_len == hlen); /* XXX */

		/* Take the next mss bytes of payload off the front of m. */
		rest = m_split(m, mss, M_NOWAIT);
		if (rest == NULL) {
			m_freem(n);
			goto nomem;
		}
		m_cat(n, m);
		m = rest;

		KASSERT(n->m_len >= hlen); /* XXX */

		n->m_pkthdr.len = hlen + mss;

		/* Per-segment IP fields and IP header checksum. */
		iph = mtod(n, struct ip *);
		KASSERT(iph->ip_v == IPVERSION);
		iph->ip_len = htons(n->m_pkthdr.len);
		iph->ip_id = htons(ipid);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum(n, iphlen);

		/* Per-segment sequence number and TCP checksum. */
		th = (struct tcphdr *)(mtod(n, char *) + iphlen);
		th->th_seq = htonl(tcpseq);
		th->th_sum = 0;
		th->th_sum = in4_cksum(n, IPPROTO_TCP, iphlen, thlen + mss);

		error = output_func(output_arg, n);
		if (error)
			break;
	}

quit:
	/* Release whatever was not handed to output_func. */
	if (hdr != NULL)
		m_freem(hdr);
	if (m != NULL)
		m_freem(m);

	return error;

nomem:
	error = ENOMEM;
	goto quit;
}
#endif /* defined(INET) */