1*0Sstevel@tonic-gate /* 2*0Sstevel@tonic-gate * CDDL HEADER START 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*0Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*0Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*0Sstevel@tonic-gate * with the License. 8*0Sstevel@tonic-gate * 9*0Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*0Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*0Sstevel@tonic-gate * See the License for the specific language governing permissions 12*0Sstevel@tonic-gate * and limitations under the License. 13*0Sstevel@tonic-gate * 14*0Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*0Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*0Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*0Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*0Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*0Sstevel@tonic-gate * 20*0Sstevel@tonic-gate * CDDL HEADER END 21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate /* 23*0Sstevel@tonic-gate * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24*0Sstevel@tonic-gate * Use is subject to license terms. 25*0Sstevel@tonic-gate * 26*0Sstevel@tonic-gate * tcp.c, Code implementing the TCP protocol. 
27*0Sstevel@tonic-gate */ 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 30*0Sstevel@tonic-gate 31*0Sstevel@tonic-gate #include <sys/types.h> 32*0Sstevel@tonic-gate #include <socket_impl.h> 33*0Sstevel@tonic-gate #include <socket_inet.h> 34*0Sstevel@tonic-gate #include <sys/sysmacros.h> 35*0Sstevel@tonic-gate #include <sys/promif.h> 36*0Sstevel@tonic-gate #include <sys/socket.h> 37*0Sstevel@tonic-gate #include <netinet/in_systm.h> 38*0Sstevel@tonic-gate #include <netinet/in.h> 39*0Sstevel@tonic-gate #include <netinet/ip.h> 40*0Sstevel@tonic-gate #include <netinet/tcp.h> 41*0Sstevel@tonic-gate #include <net/if_types.h> 42*0Sstevel@tonic-gate #include <sys/salib.h> 43*0Sstevel@tonic-gate 44*0Sstevel@tonic-gate #include "ipv4.h" 45*0Sstevel@tonic-gate #include "ipv4_impl.h" 46*0Sstevel@tonic-gate #include "mac.h" 47*0Sstevel@tonic-gate #include "mac_impl.h" 48*0Sstevel@tonic-gate #include "v4_sum_impl.h" 49*0Sstevel@tonic-gate #include <sys/bootdebug.h> 50*0Sstevel@tonic-gate #include "tcp_inet.h" 51*0Sstevel@tonic-gate #include "tcp_sack.h" 52*0Sstevel@tonic-gate #include <inet/common.h> 53*0Sstevel@tonic-gate #include <inet/mib2.h> 54*0Sstevel@tonic-gate 55*0Sstevel@tonic-gate /* 56*0Sstevel@tonic-gate * We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes. 57*0Sstevel@tonic-gate */ 58*0Sstevel@tonic-gate #undef BUMP_MIB 59*0Sstevel@tonic-gate #define BUMP_MIB(x) (x)++ 60*0Sstevel@tonic-gate 61*0Sstevel@tonic-gate #undef UPDATE_MIB 62*0Sstevel@tonic-gate #define UPDATE_MIB(x, y) x += y 63*0Sstevel@tonic-gate 64*0Sstevel@tonic-gate /* 65*0Sstevel@tonic-gate * MIB-2 stuff for SNMP 66*0Sstevel@tonic-gate */ 67*0Sstevel@tonic-gate mib2_tcp_t tcp_mib; /* SNMP fixed size info */ 68*0Sstevel@tonic-gate 69*0Sstevel@tonic-gate /* The TCP mib does not include the following errors. 
*/ 70*0Sstevel@tonic-gate static uint_t tcp_cksum_errors; 71*0Sstevel@tonic-gate static uint_t tcp_drops; 72*0Sstevel@tonic-gate 73*0Sstevel@tonic-gate /* Macros for timestamp comparisons */ 74*0Sstevel@tonic-gate #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 75*0Sstevel@tonic-gate #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 76*0Sstevel@tonic-gate 77*0Sstevel@tonic-gate /* 78*0Sstevel@tonic-gate * Parameters for TCP Initial Send Sequence number (ISS) generation. 79*0Sstevel@tonic-gate * The ISS is calculated by adding three components: a time component 80*0Sstevel@tonic-gate * which grows by 1 every 4096 nanoseconds (versus every 4 microseconds 81*0Sstevel@tonic-gate * suggested by RFC 793, page 27); 82*0Sstevel@tonic-gate * a per-connection component which grows by 125000 for every new connection; 83*0Sstevel@tonic-gate * and an "extra" component that grows by a random amount centered 84*0Sstevel@tonic-gate * approximately on 64000. This causes the the ISS generator to cycle every 85*0Sstevel@tonic-gate * 4.89 hours if no TCP connections are made, and faster if connections are 86*0Sstevel@tonic-gate * made. 87*0Sstevel@tonic-gate */ 88*0Sstevel@tonic-gate #define ISS_INCR 250000 89*0Sstevel@tonic-gate #define ISS_NSEC_SHT 0 90*0Sstevel@tonic-gate 91*0Sstevel@tonic-gate static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */ 92*0Sstevel@tonic-gate 93*0Sstevel@tonic-gate #define TCP_XMIT_LOWATER 4096 94*0Sstevel@tonic-gate #define TCP_XMIT_HIWATER 49152 95*0Sstevel@tonic-gate #define TCP_RECV_LOWATER 2048 96*0Sstevel@tonic-gate #define TCP_RECV_HIWATER 49152 97*0Sstevel@tonic-gate 98*0Sstevel@tonic-gate /* 99*0Sstevel@tonic-gate * PAWS needs a timer for 24 days. This is the number of ms in 24 days 100*0Sstevel@tonic-gate */ 101*0Sstevel@tonic-gate #define PAWS_TIMEOUT ((uint32_t)(24*24*60*60*1000)) 102*0Sstevel@tonic-gate 103*0Sstevel@tonic-gate /* 104*0Sstevel@tonic-gate * TCP options struct returned from tcp_parse_options. 
105*0Sstevel@tonic-gate */ 106*0Sstevel@tonic-gate typedef struct tcp_opt_s { 107*0Sstevel@tonic-gate uint32_t tcp_opt_mss; 108*0Sstevel@tonic-gate uint32_t tcp_opt_wscale; 109*0Sstevel@tonic-gate uint32_t tcp_opt_ts_val; 110*0Sstevel@tonic-gate uint32_t tcp_opt_ts_ecr; 111*0Sstevel@tonic-gate tcp_t *tcp; 112*0Sstevel@tonic-gate } tcp_opt_t; 113*0Sstevel@tonic-gate 114*0Sstevel@tonic-gate /* 115*0Sstevel@tonic-gate * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 116*0Sstevel@tonic-gate */ 117*0Sstevel@tonic-gate 118*0Sstevel@tonic-gate #ifdef _BIG_ENDIAN 119*0Sstevel@tonic-gate #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 120*0Sstevel@tonic-gate (TCPOPT_TSTAMP << 8) | 10) 121*0Sstevel@tonic-gate #else 122*0Sstevel@tonic-gate #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 123*0Sstevel@tonic-gate (TCPOPT_NOP << 8) | TCPOPT_NOP) 124*0Sstevel@tonic-gate #endif 125*0Sstevel@tonic-gate 126*0Sstevel@tonic-gate /* 127*0Sstevel@tonic-gate * Flags returned from tcp_parse_options. 
128*0Sstevel@tonic-gate */ 129*0Sstevel@tonic-gate #define TCP_OPT_MSS_PRESENT 1 130*0Sstevel@tonic-gate #define TCP_OPT_WSCALE_PRESENT 2 131*0Sstevel@tonic-gate #define TCP_OPT_TSTAMP_PRESENT 4 132*0Sstevel@tonic-gate #define TCP_OPT_SACK_OK_PRESENT 8 133*0Sstevel@tonic-gate #define TCP_OPT_SACK_PRESENT 16 134*0Sstevel@tonic-gate 135*0Sstevel@tonic-gate /* TCP option length */ 136*0Sstevel@tonic-gate #define TCPOPT_NOP_LEN 1 137*0Sstevel@tonic-gate #define TCPOPT_MAXSEG_LEN 4 138*0Sstevel@tonic-gate #define TCPOPT_WS_LEN 3 139*0Sstevel@tonic-gate #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) 140*0Sstevel@tonic-gate #define TCPOPT_TSTAMP_LEN 10 141*0Sstevel@tonic-gate #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) 142*0Sstevel@tonic-gate #define TCPOPT_SACK_OK_LEN 2 143*0Sstevel@tonic-gate #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) 144*0Sstevel@tonic-gate #define TCPOPT_REAL_SACK_LEN 4 145*0Sstevel@tonic-gate #define TCPOPT_MAX_SACK_LEN 36 146*0Sstevel@tonic-gate #define TCPOPT_HEADER_LEN 2 147*0Sstevel@tonic-gate 148*0Sstevel@tonic-gate /* TCP cwnd burst factor. 
*/ 149*0Sstevel@tonic-gate #define TCP_CWND_INFINITE 65535 150*0Sstevel@tonic-gate #define TCP_CWND_SS 3 151*0Sstevel@tonic-gate #define TCP_CWND_NORMAL 5 152*0Sstevel@tonic-gate 153*0Sstevel@tonic-gate /* Named Dispatch Parameter Management Structure */ 154*0Sstevel@tonic-gate typedef struct tcpparam_s { 155*0Sstevel@tonic-gate uint32_t tcp_param_min; 156*0Sstevel@tonic-gate uint32_t tcp_param_max; 157*0Sstevel@tonic-gate uint32_t tcp_param_val; 158*0Sstevel@tonic-gate char *tcp_param_name; 159*0Sstevel@tonic-gate } tcpparam_t; 160*0Sstevel@tonic-gate 161*0Sstevel@tonic-gate /* Max size IP datagram is 64k - 1 */ 162*0Sstevel@tonic-gate #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (struct ip) + \ 163*0Sstevel@tonic-gate sizeof (tcph_t))) 164*0Sstevel@tonic-gate 165*0Sstevel@tonic-gate /* Max of the above */ 166*0Sstevel@tonic-gate #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 167*0Sstevel@tonic-gate 168*0Sstevel@tonic-gate /* Largest TCP port number */ 169*0Sstevel@tonic-gate #define TCP_MAX_PORT (64 * 1024 - 1) 170*0Sstevel@tonic-gate 171*0Sstevel@tonic-gate /* Round up the value to the nearest mss. */ 172*0Sstevel@tonic-gate #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 173*0Sstevel@tonic-gate 174*0Sstevel@tonic-gate #define MS 1L 175*0Sstevel@tonic-gate #define SECONDS (1000 * MS) 176*0Sstevel@tonic-gate #define MINUTES (60 * SECONDS) 177*0Sstevel@tonic-gate #define HOURS (60 * MINUTES) 178*0Sstevel@tonic-gate #define DAYS (24 * HOURS) 179*0Sstevel@tonic-gate 180*0Sstevel@tonic-gate /* All NDD params in the core TCP became static variables. 
*/ 181*0Sstevel@tonic-gate static int tcp_time_wait_interval = 1 * MINUTES; 182*0Sstevel@tonic-gate static int tcp_conn_req_max_q = 128; 183*0Sstevel@tonic-gate static int tcp_conn_req_max_q0 = 1024; 184*0Sstevel@tonic-gate static int tcp_conn_req_min = 1; 185*0Sstevel@tonic-gate static int tcp_conn_grace_period = 0 * SECONDS; 186*0Sstevel@tonic-gate static int tcp_cwnd_max_ = 1024 * 1024; 187*0Sstevel@tonic-gate static int tcp_smallest_nonpriv_port = 1024; 188*0Sstevel@tonic-gate static int tcp_ip_abort_cinterval = 3 * MINUTES; 189*0Sstevel@tonic-gate static int tcp_ip_abort_linterval = 3 * MINUTES; 190*0Sstevel@tonic-gate static int tcp_ip_abort_interval = 8 * MINUTES; 191*0Sstevel@tonic-gate static int tcp_ip_notify_cinterval = 10 * SECONDS; 192*0Sstevel@tonic-gate static int tcp_ip_notify_interval = 10 * SECONDS; 193*0Sstevel@tonic-gate static int tcp_ipv4_ttl = 64; 194*0Sstevel@tonic-gate static int tcp_mss_def_ipv4 = 536; 195*0Sstevel@tonic-gate static int tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4; 196*0Sstevel@tonic-gate static int tcp_mss_min = 108; 197*0Sstevel@tonic-gate static int tcp_naglim_def = (4*1024)-1; 198*0Sstevel@tonic-gate static int tcp_rexmit_interval_initial = 3 * SECONDS; 199*0Sstevel@tonic-gate static int tcp_rexmit_interval_max = 60 * SECONDS; 200*0Sstevel@tonic-gate static int tcp_rexmit_interval_min = 400 * MS; 201*0Sstevel@tonic-gate static int tcp_dupack_fast_retransmit = 3; 202*0Sstevel@tonic-gate static int tcp_smallest_anon_port = 32 * 1024; 203*0Sstevel@tonic-gate static int tcp_largest_anon_port = TCP_MAX_PORT; 204*0Sstevel@tonic-gate static int tcp_xmit_lowat = TCP_XMIT_LOWATER; 205*0Sstevel@tonic-gate static int tcp_recv_hiwat_minmss = 4; 206*0Sstevel@tonic-gate static int tcp_fin_wait_2_flush_interval = 1 * MINUTES; 207*0Sstevel@tonic-gate static int tcp_max_buf = 1024 * 1024; 208*0Sstevel@tonic-gate static int tcp_wscale_always = 1; 209*0Sstevel@tonic-gate static int tcp_tstamp_always = 1; 210*0Sstevel@tonic-gate static int 
tcp_tstamp_if_wscale = 1; 211*0Sstevel@tonic-gate static int tcp_rexmit_interval_extra = 0; 212*0Sstevel@tonic-gate static int tcp_slow_start_after_idle = 2; 213*0Sstevel@tonic-gate static int tcp_slow_start_initial = 2; 214*0Sstevel@tonic-gate static int tcp_sack_permitted = 2; 215*0Sstevel@tonic-gate static int tcp_ecn_permitted = 2; 216*0Sstevel@tonic-gate 217*0Sstevel@tonic-gate /* Extra room to fit in headers. */ 218*0Sstevel@tonic-gate static uint_t tcp_wroff_xtra; 219*0Sstevel@tonic-gate 220*0Sstevel@tonic-gate /* Hint for next port to try. */ 221*0Sstevel@tonic-gate static in_port_t tcp_next_port_to_try = 32*1024; 222*0Sstevel@tonic-gate 223*0Sstevel@tonic-gate /* 224*0Sstevel@tonic-gate * Figure out the value of window scale opton. Note that the rwnd is 225*0Sstevel@tonic-gate * ASSUMED to be rounded up to the nearest MSS before the calculation. 226*0Sstevel@tonic-gate * We cannot find the scale value and then do a round up of tcp_rwnd 227*0Sstevel@tonic-gate * because the scale value may not be correct after that. 228*0Sstevel@tonic-gate */ 229*0Sstevel@tonic-gate #define SET_WS_VALUE(tcp) \ 230*0Sstevel@tonic-gate { \ 231*0Sstevel@tonic-gate int i; \ 232*0Sstevel@tonic-gate uint32_t rwnd = (tcp)->tcp_rwnd; \ 233*0Sstevel@tonic-gate for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; \ 234*0Sstevel@tonic-gate i++, rwnd >>= 1) \ 235*0Sstevel@tonic-gate ; \ 236*0Sstevel@tonic-gate (tcp)->tcp_rcv_ws = i; \ 237*0Sstevel@tonic-gate } 238*0Sstevel@tonic-gate 239*0Sstevel@tonic-gate /* 240*0Sstevel@tonic-gate * Set ECN capable transport (ECT) code point in IP header. 241*0Sstevel@tonic-gate * 242*0Sstevel@tonic-gate * Note that there are 2 ECT code points '01' and '10', which are called 243*0Sstevel@tonic-gate * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 244*0Sstevel@tonic-gate * point ECT(0) for TCP as described in RFC 2481. 
245*0Sstevel@tonic-gate */ 246*0Sstevel@tonic-gate #define SET_ECT(tcp, iph) \ 247*0Sstevel@tonic-gate if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 248*0Sstevel@tonic-gate /* We need to clear the code point first. */ \ 249*0Sstevel@tonic-gate ((struct ip *)(iph))->ip_tos &= 0xFC; \ 250*0Sstevel@tonic-gate ((struct ip *)(iph))->ip_tos |= IPH_ECN_ECT0; \ 251*0Sstevel@tonic-gate } 252*0Sstevel@tonic-gate 253*0Sstevel@tonic-gate /* 254*0Sstevel@tonic-gate * The format argument to pass to tcp_display(). 255*0Sstevel@tonic-gate * DISP_PORT_ONLY means that the returned string has only port info. 256*0Sstevel@tonic-gate * DISP_ADDR_AND_PORT means that the returned string also contains the 257*0Sstevel@tonic-gate * remote and local IP address. 258*0Sstevel@tonic-gate */ 259*0Sstevel@tonic-gate #define DISP_PORT_ONLY 1 260*0Sstevel@tonic-gate #define DISP_ADDR_AND_PORT 2 261*0Sstevel@tonic-gate 262*0Sstevel@tonic-gate /* 263*0Sstevel@tonic-gate * TCP reassembly macros. We hide starting and ending sequence numbers in 264*0Sstevel@tonic-gate * b_next and b_prev of messages on the reassembly queue. The messages are 265*0Sstevel@tonic-gate * chained using b_cont. These macros are used in tcp_reass() so we don't 266*0Sstevel@tonic-gate * have to see the ugly casts and assignments. 
267*0Sstevel@tonic-gate */ 268*0Sstevel@tonic-gate #define TCP_REASS_SEQ(mp) ((uint32_t)((mp)->b_next)) 269*0Sstevel@tonic-gate #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = (mblk_t *)(u)) 270*0Sstevel@tonic-gate #define TCP_REASS_END(mp) ((uint32_t)((mp)->b_prev)) 271*0Sstevel@tonic-gate #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = (mblk_t *)(u)) 272*0Sstevel@tonic-gate 273*0Sstevel@tonic-gate #define TCP_TIMER_RESTART(tcp, intvl) \ 274*0Sstevel@tonic-gate (tcp)->tcp_rto_timeout = prom_gettime() + intvl; \ 275*0Sstevel@tonic-gate (tcp)->tcp_timer_running = B_TRUE; 276*0Sstevel@tonic-gate 277*0Sstevel@tonic-gate static int tcp_accept_comm(tcp_t *, tcp_t *, mblk_t *, uint_t); 278*0Sstevel@tonic-gate static mblk_t *tcp_ack_mp(tcp_t *); 279*0Sstevel@tonic-gate static in_port_t tcp_bindi(in_port_t, in_addr_t *, boolean_t, boolean_t); 280*0Sstevel@tonic-gate static uint16_t tcp_cksum(uint16_t *, uint32_t); 281*0Sstevel@tonic-gate static void tcp_clean_death(int, tcp_t *, int err); 282*0Sstevel@tonic-gate static tcp_t *tcp_conn_request(tcp_t *, mblk_t *mp, uint_t, uint_t); 283*0Sstevel@tonic-gate static char *tcp_display(tcp_t *, char *, char); 284*0Sstevel@tonic-gate static int tcp_drain_input(tcp_t *, int, int); 285*0Sstevel@tonic-gate static void tcp_drain_needed(int, tcp_t *); 286*0Sstevel@tonic-gate static boolean_t tcp_drop_q0(tcp_t *); 287*0Sstevel@tonic-gate static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *); 288*0Sstevel@tonic-gate static int tcp_header_len(struct inetgram *); 289*0Sstevel@tonic-gate static in_port_t tcp_report_ports(uint16_t *, enum Ports); 290*0Sstevel@tonic-gate static int tcp_input(int); 291*0Sstevel@tonic-gate static void tcp_iss_init(tcp_t *); 292*0Sstevel@tonic-gate static tcp_t *tcp_lookup_ipv4(struct ip *, tcpha_t *, int, int *); 293*0Sstevel@tonic-gate static tcp_t *tcp_lookup_listener_ipv4(in_addr_t, in_port_t, int *); 294*0Sstevel@tonic-gate static int tcp_conn_check(tcp_t *); 295*0Sstevel@tonic-gate static int 
tcp_close(int); 296*0Sstevel@tonic-gate static void tcp_close_detached(tcp_t *); 297*0Sstevel@tonic-gate static void tcp_eager_cleanup(tcp_t *, boolean_t, int); 298*0Sstevel@tonic-gate static void tcp_eager_unlink(tcp_t *); 299*0Sstevel@tonic-gate static void tcp_free(tcp_t *); 300*0Sstevel@tonic-gate static int tcp_header_init_ipv4(tcp_t *); 301*0Sstevel@tonic-gate static void tcp_mss_set(tcp_t *, uint32_t); 302*0Sstevel@tonic-gate static int tcp_parse_options(tcph_t *, tcp_opt_t *); 303*0Sstevel@tonic-gate static boolean_t tcp_paws_check(tcp_t *, tcph_t *, tcp_opt_t *); 304*0Sstevel@tonic-gate static void tcp_process_options(tcp_t *, tcph_t *); 305*0Sstevel@tonic-gate static int tcp_random(void); 306*0Sstevel@tonic-gate static void tcp_random_init(void); 307*0Sstevel@tonic-gate static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); 308*0Sstevel@tonic-gate static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); 309*0Sstevel@tonic-gate static void tcp_rcv_drain(int sock_id, tcp_t *); 310*0Sstevel@tonic-gate static void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t); 311*0Sstevel@tonic-gate static void tcp_rput_data(tcp_t *, mblk_t *, int); 312*0Sstevel@tonic-gate static int tcp_rwnd_set(tcp_t *, uint32_t); 313*0Sstevel@tonic-gate static int32_t tcp_sack_rxmit(tcp_t *, int); 314*0Sstevel@tonic-gate static void tcp_set_cksum(mblk_t *); 315*0Sstevel@tonic-gate static void tcp_set_rto(tcp_t *, int32_t); 316*0Sstevel@tonic-gate static void tcp_ss_rexmit(tcp_t *, int); 317*0Sstevel@tonic-gate static int tcp_state_wait(int, tcp_t *, int); 318*0Sstevel@tonic-gate static void tcp_timer(tcp_t *, int); 319*0Sstevel@tonic-gate static void tcp_time_wait_append(tcp_t *); 320*0Sstevel@tonic-gate static void tcp_time_wait_collector(void); 321*0Sstevel@tonic-gate static void tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t, 322*0Sstevel@tonic-gate uint32_t, int, tcph_t *, int sock_id); 323*0Sstevel@tonic-gate static void tcp_time_wait_remove(tcp_t *); 324*0Sstevel@tonic-gate static 
in_port_t tcp_update_next_port(in_port_t); 325*0Sstevel@tonic-gate static int tcp_verify_cksum(mblk_t *); 326*0Sstevel@tonic-gate static void tcp_wput_data(tcp_t *, mblk_t *, int); 327*0Sstevel@tonic-gate static void tcp_xmit_ctl(char *, tcp_t *, mblk_t *, uint32_t, uint32_t, 328*0Sstevel@tonic-gate int, uint_t, int); 329*0Sstevel@tonic-gate static void tcp_xmit_early_reset(char *, int, mblk_t *, uint32_t, uint32_t, 330*0Sstevel@tonic-gate int, uint_t); 331*0Sstevel@tonic-gate static int tcp_xmit_end(tcp_t *, int); 332*0Sstevel@tonic-gate static void tcp_xmit_listeners_reset(int, mblk_t *, uint_t); 333*0Sstevel@tonic-gate static mblk_t *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *, 334*0Sstevel@tonic-gate mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t); 335*0Sstevel@tonic-gate static int tcp_init_values(tcp_t *, struct inetboot_socket *); 336*0Sstevel@tonic-gate 337*0Sstevel@tonic-gate #if DEBUG > 1 338*0Sstevel@tonic-gate #define TCP_DUMP_PACKET(str, mp) \ 339*0Sstevel@tonic-gate { \ 340*0Sstevel@tonic-gate int len = (mp)->b_wptr - (mp)->b_rptr; \ 341*0Sstevel@tonic-gate \ 342*0Sstevel@tonic-gate printf("%s: dump TCP(%d): \n", (str), len); \ 343*0Sstevel@tonic-gate hexdump((char *)(mp)->b_rptr, len); \ 344*0Sstevel@tonic-gate } 345*0Sstevel@tonic-gate #else 346*0Sstevel@tonic-gate #define TCP_DUMP_PACKET(str, mp) 347*0Sstevel@tonic-gate #endif 348*0Sstevel@tonic-gate 349*0Sstevel@tonic-gate #ifdef DEBUG 350*0Sstevel@tonic-gate #define DEBUG_1(str, arg) printf(str, (arg)) 351*0Sstevel@tonic-gate #define DEBUG_2(str, arg1, arg2) printf(str, (arg1), (arg2)) 352*0Sstevel@tonic-gate #define DEBUG_3(str, arg1, arg2, arg3) printf(str, (arg1), (arg2), (arg3)) 353*0Sstevel@tonic-gate #else 354*0Sstevel@tonic-gate #define DEBUG_1(str, arg) 355*0Sstevel@tonic-gate #define DEBUG_2(str, arg1, arg2) 356*0Sstevel@tonic-gate #define DEBUG_3(str, arg1, arg2, arg3) 357*0Sstevel@tonic-gate #endif 358*0Sstevel@tonic-gate 359*0Sstevel@tonic-gate /* Whether it is the first 
time TCP is used. */ 360*0Sstevel@tonic-gate static boolean_t tcp_initialized = B_FALSE; 361*0Sstevel@tonic-gate 362*0Sstevel@tonic-gate /* TCP time wait list. */ 363*0Sstevel@tonic-gate static tcp_t *tcp_time_wait_head; 364*0Sstevel@tonic-gate static tcp_t *tcp_time_wait_tail; 365*0Sstevel@tonic-gate static uint32_t tcp_cum_timewait; 366*0Sstevel@tonic-gate /* When the tcp_time_wait_collector is run. */ 367*0Sstevel@tonic-gate static uint32_t tcp_time_wait_runtime; 368*0Sstevel@tonic-gate 369*0Sstevel@tonic-gate #define TCP_RUN_TIME_WAIT_COLLECTOR() \ 370*0Sstevel@tonic-gate if (prom_gettime() > tcp_time_wait_runtime) \ 371*0Sstevel@tonic-gate tcp_time_wait_collector(); 372*0Sstevel@tonic-gate 373*0Sstevel@tonic-gate /* 374*0Sstevel@tonic-gate * Accept will return with an error if there is no connection coming in 375*0Sstevel@tonic-gate * after this (in ms). 376*0Sstevel@tonic-gate */ 377*0Sstevel@tonic-gate static int tcp_accept_timeout = 60000; 378*0Sstevel@tonic-gate 379*0Sstevel@tonic-gate /* 380*0Sstevel@tonic-gate * Initialize the TCP-specific parts of a socket. 381*0Sstevel@tonic-gate */ 382*0Sstevel@tonic-gate void 383*0Sstevel@tonic-gate tcp_socket_init(struct inetboot_socket *isp) 384*0Sstevel@tonic-gate { 385*0Sstevel@tonic-gate /* Do some initializations. */ 386*0Sstevel@tonic-gate if (!tcp_initialized) { 387*0Sstevel@tonic-gate tcp_random_init(); 388*0Sstevel@tonic-gate /* Extra head room for the MAC layer address. 
*/ 389*0Sstevel@tonic-gate if ((tcp_wroff_xtra = mac_get_hdr_len()) & 0x3) { 390*0Sstevel@tonic-gate tcp_wroff_xtra = (tcp_wroff_xtra & ~0x3) + 0x4; 391*0Sstevel@tonic-gate } 392*0Sstevel@tonic-gate /* Schedule the first time wait cleanup time */ 393*0Sstevel@tonic-gate tcp_time_wait_runtime = prom_gettime() + tcp_time_wait_interval; 394*0Sstevel@tonic-gate tcp_initialized = B_TRUE; 395*0Sstevel@tonic-gate } 396*0Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 397*0Sstevel@tonic-gate 398*0Sstevel@tonic-gate isp->proto = IPPROTO_TCP; 399*0Sstevel@tonic-gate isp->input[TRANSPORT_LVL] = tcp_input; 400*0Sstevel@tonic-gate /* Socket layer should call tcp_send() directly. */ 401*0Sstevel@tonic-gate isp->output[TRANSPORT_LVL] = NULL; 402*0Sstevel@tonic-gate isp->close[TRANSPORT_LVL] = tcp_close; 403*0Sstevel@tonic-gate isp->headerlen[TRANSPORT_LVL] = tcp_header_len; 404*0Sstevel@tonic-gate isp->ports = tcp_report_ports; 405*0Sstevel@tonic-gate if ((isp->pcb = bkmem_alloc(sizeof (tcp_t))) == NULL) { 406*0Sstevel@tonic-gate errno = ENOBUFS; 407*0Sstevel@tonic-gate return; 408*0Sstevel@tonic-gate } 409*0Sstevel@tonic-gate if ((errno = tcp_init_values((tcp_t *)isp->pcb, isp)) != 0) { 410*0Sstevel@tonic-gate bkmem_free(isp->pcb, sizeof (tcp_t)); 411*0Sstevel@tonic-gate return; 412*0Sstevel@tonic-gate } 413*0Sstevel@tonic-gate /* 414*0Sstevel@tonic-gate * This is set last because this field is used to determine if 415*0Sstevel@tonic-gate * a socket is in use or not. 416*0Sstevel@tonic-gate */ 417*0Sstevel@tonic-gate isp->type = INETBOOT_STREAM; 418*0Sstevel@tonic-gate } 419*0Sstevel@tonic-gate 420*0Sstevel@tonic-gate /* 421*0Sstevel@tonic-gate * Return the size of a TCP header including TCP option. 
422*0Sstevel@tonic-gate */ 423*0Sstevel@tonic-gate static int 424*0Sstevel@tonic-gate tcp_header_len(struct inetgram *igm) 425*0Sstevel@tonic-gate { 426*0Sstevel@tonic-gate mblk_t *pkt; 427*0Sstevel@tonic-gate int ipvers; 428*0Sstevel@tonic-gate 429*0Sstevel@tonic-gate /* Just returns the standard TCP header without option */ 430*0Sstevel@tonic-gate if (igm == NULL) 431*0Sstevel@tonic-gate return (sizeof (tcph_t)); 432*0Sstevel@tonic-gate 433*0Sstevel@tonic-gate if ((pkt = igm->igm_mp) == NULL) 434*0Sstevel@tonic-gate return (0); 435*0Sstevel@tonic-gate 436*0Sstevel@tonic-gate ipvers = ((struct ip *)pkt->b_rptr)->ip_v; 437*0Sstevel@tonic-gate if (ipvers == IPV4_VERSION) { 438*0Sstevel@tonic-gate return (TCP_HDR_LENGTH((tcph_t *)(pkt + IPH_HDR_LENGTH(pkt)))); 439*0Sstevel@tonic-gate } else { 440*0Sstevel@tonic-gate dprintf("tcp_header_len: non-IPv4 packet.\n"); 441*0Sstevel@tonic-gate return (0); 442*0Sstevel@tonic-gate } 443*0Sstevel@tonic-gate } 444*0Sstevel@tonic-gate 445*0Sstevel@tonic-gate /* 446*0Sstevel@tonic-gate * Return the requested port number in network order. 447*0Sstevel@tonic-gate */ 448*0Sstevel@tonic-gate static in_port_t 449*0Sstevel@tonic-gate tcp_report_ports(uint16_t *tcphp, enum Ports request) 450*0Sstevel@tonic-gate { 451*0Sstevel@tonic-gate if (request == SOURCE) 452*0Sstevel@tonic-gate return (*(uint16_t *)(((tcph_t *)tcphp)->th_lport)); 453*0Sstevel@tonic-gate return (*(uint16_t *)(((tcph_t *)tcphp)->th_fport)); 454*0Sstevel@tonic-gate } 455*0Sstevel@tonic-gate 456*0Sstevel@tonic-gate /* 457*0Sstevel@tonic-gate * Because inetboot is not interrupt driven, TCP can only poll. This 458*0Sstevel@tonic-gate * means that there can be packets stuck in the NIC buffer waiting to 459*0Sstevel@tonic-gate * be processed. Thus we need to drain them before, for example, sending 460*0Sstevel@tonic-gate * anything because an ACK may actually be stuck there. 
461*0Sstevel@tonic-gate * 462*0Sstevel@tonic-gate * The timeout arguments determine how long we should wait for draining. 463*0Sstevel@tonic-gate */ 464*0Sstevel@tonic-gate static int 465*0Sstevel@tonic-gate tcp_drain_input(tcp_t *tcp, int sock_id, int timeout) 466*0Sstevel@tonic-gate { 467*0Sstevel@tonic-gate struct inetgram *in_gram; 468*0Sstevel@tonic-gate struct inetgram *old_in_gram; 469*0Sstevel@tonic-gate int old_timeout; 470*0Sstevel@tonic-gate mblk_t *mp; 471*0Sstevel@tonic-gate int i; 472*0Sstevel@tonic-gate 473*0Sstevel@tonic-gate dprintf("tcp_drain_input(%d): %s\n", sock_id, 474*0Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 475*0Sstevel@tonic-gate 476*0Sstevel@tonic-gate /* 477*0Sstevel@tonic-gate * Since the driver uses the in_timeout value in the socket 478*0Sstevel@tonic-gate * structure to determine the timeout value, we need to save 479*0Sstevel@tonic-gate * the original one so that we can restore that after draining. 480*0Sstevel@tonic-gate */ 481*0Sstevel@tonic-gate old_timeout = sockets[sock_id].in_timeout; 482*0Sstevel@tonic-gate sockets[sock_id].in_timeout = timeout; 483*0Sstevel@tonic-gate 484*0Sstevel@tonic-gate /* 485*0Sstevel@tonic-gate * We do this because the input queue may have some user 486*0Sstevel@tonic-gate * data already. 
487*0Sstevel@tonic-gate */ 488*0Sstevel@tonic-gate old_in_gram = sockets[sock_id].inq; 489*0Sstevel@tonic-gate sockets[sock_id].inq = NULL; 490*0Sstevel@tonic-gate 491*0Sstevel@tonic-gate /* Go out and check the wire */ 492*0Sstevel@tonic-gate for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 493*0Sstevel@tonic-gate if (sockets[sock_id].input[i] != NULL) { 494*0Sstevel@tonic-gate if (sockets[sock_id].input[i](sock_id) < 0) { 495*0Sstevel@tonic-gate sockets[sock_id].in_timeout = old_timeout; 496*0Sstevel@tonic-gate if (sockets[sock_id].inq != NULL) 497*0Sstevel@tonic-gate nuke_grams(&sockets[sock_id].inq); 498*0Sstevel@tonic-gate sockets[sock_id].inq = old_in_gram; 499*0Sstevel@tonic-gate return (-1); 500*0Sstevel@tonic-gate } 501*0Sstevel@tonic-gate } 502*0Sstevel@tonic-gate } 503*0Sstevel@tonic-gate #if DEBUG 504*0Sstevel@tonic-gate printf("tcp_drain_input: done with checking packets\n"); 505*0Sstevel@tonic-gate #endif 506*0Sstevel@tonic-gate while ((in_gram = sockets[sock_id].inq) != NULL) { 507*0Sstevel@tonic-gate /* Remove unknown inetgrams from the head of inq. */ 508*0Sstevel@tonic-gate if (in_gram->igm_level != TRANSPORT_LVL) { 509*0Sstevel@tonic-gate #if DEBUG 510*0Sstevel@tonic-gate printf("tcp_drain_input: unexpected packet " 511*0Sstevel@tonic-gate "level %d frame found\n", in_gram->igm_level); 512*0Sstevel@tonic-gate #endif 513*0Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 514*0Sstevel@tonic-gate continue; 515*0Sstevel@tonic-gate } 516*0Sstevel@tonic-gate mp = in_gram->igm_mp; 517*0Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 518*0Sstevel@tonic-gate bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 519*0Sstevel@tonic-gate tcp_rput_data(tcp, mp, sock_id); 520*0Sstevel@tonic-gate sockets[sock_id].in_timeout = old_timeout; 521*0Sstevel@tonic-gate 522*0Sstevel@tonic-gate /* 523*0Sstevel@tonic-gate * The other side may have closed this connection or 524*0Sstevel@tonic-gate * RST us. 
But we need to continue to process other 525*0Sstevel@tonic-gate * packets in the socket's queue because they may be 526*0Sstevel@tonic-gate * belong to another TCP connections. 527*0Sstevel@tonic-gate */ 528*0Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 529*0Sstevel@tonic-gate tcp = NULL; 530*0Sstevel@tonic-gate } 531*0Sstevel@tonic-gate 532*0Sstevel@tonic-gate if (tcp == NULL || sockets[sock_id].pcb == NULL) { 533*0Sstevel@tonic-gate if (sockets[sock_id].so_error != 0) 534*0Sstevel@tonic-gate return (-1); 535*0Sstevel@tonic-gate else 536*0Sstevel@tonic-gate return (0); 537*0Sstevel@tonic-gate } 538*0Sstevel@tonic-gate #if DEBUG 539*0Sstevel@tonic-gate printf("tcp_drain_input: done with processing packets\n"); 540*0Sstevel@tonic-gate #endif 541*0Sstevel@tonic-gate sockets[sock_id].in_timeout = old_timeout; 542*0Sstevel@tonic-gate sockets[sock_id].inq = old_in_gram; 543*0Sstevel@tonic-gate 544*0Sstevel@tonic-gate /* 545*0Sstevel@tonic-gate * Data may have been received so indicate it is available 546*0Sstevel@tonic-gate */ 547*0Sstevel@tonic-gate tcp_drain_needed(sock_id, tcp); 548*0Sstevel@tonic-gate return (0); 549*0Sstevel@tonic-gate } 550*0Sstevel@tonic-gate 551*0Sstevel@tonic-gate /* 552*0Sstevel@tonic-gate * The receive entry point for upper layer to call to get data. Note 553*0Sstevel@tonic-gate * that this follows the current architecture that lower layer receive 554*0Sstevel@tonic-gate * routines have been called already. Thus if the inq of socket is 555*0Sstevel@tonic-gate * not NULL, the packets must be for us. 
556*0Sstevel@tonic-gate */ 557*0Sstevel@tonic-gate static int 558*0Sstevel@tonic-gate tcp_input(int sock_id) 559*0Sstevel@tonic-gate { 560*0Sstevel@tonic-gate struct inetgram *in_gram; 561*0Sstevel@tonic-gate mblk_t *mp; 562*0Sstevel@tonic-gate tcp_t *tcp; 563*0Sstevel@tonic-gate 564*0Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 565*0Sstevel@tonic-gate 566*0Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) 567*0Sstevel@tonic-gate return (-1); 568*0Sstevel@tonic-gate 569*0Sstevel@tonic-gate while ((in_gram = sockets[sock_id].inq) != NULL) { 570*0Sstevel@tonic-gate /* Remove unknown inetgrams from the head of inq. */ 571*0Sstevel@tonic-gate if (in_gram->igm_level != TRANSPORT_LVL) { 572*0Sstevel@tonic-gate #ifdef DEBUG 573*0Sstevel@tonic-gate printf("tcp_input: unexpected packet " 574*0Sstevel@tonic-gate "level %d frame found\n", in_gram->igm_level); 575*0Sstevel@tonic-gate #endif 576*0Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 577*0Sstevel@tonic-gate continue; 578*0Sstevel@tonic-gate } 579*0Sstevel@tonic-gate mp = in_gram->igm_mp; 580*0Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 581*0Sstevel@tonic-gate bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 582*0Sstevel@tonic-gate tcp_rput_data(tcp, mp, sock_id); 583*0Sstevel@tonic-gate /* The TCP may be gone because it gets a RST. */ 584*0Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 585*0Sstevel@tonic-gate return (-1); 586*0Sstevel@tonic-gate } 587*0Sstevel@tonic-gate 588*0Sstevel@tonic-gate /* Flush the receive list. */ 589*0Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) { 590*0Sstevel@tonic-gate tcp_rcv_drain(sock_id, tcp); 591*0Sstevel@tonic-gate } else { 592*0Sstevel@tonic-gate /* The other side has closed the connection, report this up. 
*/ 593*0Sstevel@tonic-gate if (tcp->tcp_state == TCPS_CLOSE_WAIT) { 594*0Sstevel@tonic-gate sockets[sock_id].so_state |= SS_CANTRCVMORE; 595*0Sstevel@tonic-gate return (0); 596*0Sstevel@tonic-gate } 597*0Sstevel@tonic-gate } 598*0Sstevel@tonic-gate return (0); 599*0Sstevel@tonic-gate } 600*0Sstevel@tonic-gate 601*0Sstevel@tonic-gate /* 602*0Sstevel@tonic-gate * The send entry point for upper layer to call to send data. In order 603*0Sstevel@tonic-gate * to minimize changes to the core TCP code, we need to put the 604*0Sstevel@tonic-gate * data into mblks. 605*0Sstevel@tonic-gate */ 606*0Sstevel@tonic-gate int 607*0Sstevel@tonic-gate tcp_send(int sock_id, tcp_t *tcp, const void *msg, int len) 608*0Sstevel@tonic-gate { 609*0Sstevel@tonic-gate mblk_t *mp; 610*0Sstevel@tonic-gate mblk_t *head = NULL; 611*0Sstevel@tonic-gate mblk_t *tail; 612*0Sstevel@tonic-gate int mss = tcp->tcp_mss; 613*0Sstevel@tonic-gate int cnt = 0; 614*0Sstevel@tonic-gate int win_size; 615*0Sstevel@tonic-gate char *buf = (char *)msg; 616*0Sstevel@tonic-gate 617*0Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 618*0Sstevel@tonic-gate 619*0Sstevel@tonic-gate /* We don't want to append 0 size mblk. */ 620*0Sstevel@tonic-gate if (len == 0) 621*0Sstevel@tonic-gate return (0); 622*0Sstevel@tonic-gate while (len > 0) { 623*0Sstevel@tonic-gate if (len < mss) { 624*0Sstevel@tonic-gate mss = len; 625*0Sstevel@tonic-gate } 626*0Sstevel@tonic-gate /* 627*0Sstevel@tonic-gate * If we cannot allocate more buffer, stop here and 628*0Sstevel@tonic-gate * the number of bytes buffered will be returned. 629*0Sstevel@tonic-gate * 630*0Sstevel@tonic-gate * Note that we follow the core TCP optimization that 631*0Sstevel@tonic-gate * each mblk contains only MSS bytes data. 
632*0Sstevel@tonic-gate */ 633*0Sstevel@tonic-gate if ((mp = allocb(mss + tcp->tcp_ip_hdr_len + 634*0Sstevel@tonic-gate TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0)) == NULL) { 635*0Sstevel@tonic-gate break; 636*0Sstevel@tonic-gate } 637*0Sstevel@tonic-gate mp->b_rptr += tcp->tcp_hdr_len + tcp_wroff_xtra; 638*0Sstevel@tonic-gate bcopy(buf, mp->b_rptr, mss); 639*0Sstevel@tonic-gate mp->b_wptr = mp->b_rptr + mss; 640*0Sstevel@tonic-gate buf += mss; 641*0Sstevel@tonic-gate cnt += mss; 642*0Sstevel@tonic-gate len -= mss; 643*0Sstevel@tonic-gate 644*0Sstevel@tonic-gate if (head == NULL) { 645*0Sstevel@tonic-gate head = mp; 646*0Sstevel@tonic-gate tail = mp; 647*0Sstevel@tonic-gate } else { 648*0Sstevel@tonic-gate tail->b_cont = mp; 649*0Sstevel@tonic-gate tail = mp; 650*0Sstevel@tonic-gate } 651*0Sstevel@tonic-gate } 652*0Sstevel@tonic-gate 653*0Sstevel@tonic-gate /* 654*0Sstevel@tonic-gate * Since inetboot is not interrupt driven, there may be 655*0Sstevel@tonic-gate * some ACKs in the MAC's buffer. Drain them first, 656*0Sstevel@tonic-gate * otherwise, we may not be able to send. 657*0Sstevel@tonic-gate * 658*0Sstevel@tonic-gate * We expect an ACK in two cases: 659*0Sstevel@tonic-gate * 660*0Sstevel@tonic-gate * 1) We have un-ACK'ed data. 661*0Sstevel@tonic-gate * 662*0Sstevel@tonic-gate * 2) All ACK's have been received and the sender's window has been 663*0Sstevel@tonic-gate * closed. We need an ACK back to open the window so that we can 664*0Sstevel@tonic-gate * send. In this case, call tcp_drain_input() if the window size is 665*0Sstevel@tonic-gate * less than 2 * MSS. 666*0Sstevel@tonic-gate */ 667*0Sstevel@tonic-gate 668*0Sstevel@tonic-gate /* window size = MIN(swnd, cwnd) - unacked bytes */ 669*0Sstevel@tonic-gate win_size = (tcp->tcp_swnd > tcp->tcp_cwnd) ? 
tcp->tcp_cwnd : 670*0Sstevel@tonic-gate tcp->tcp_swnd; 671*0Sstevel@tonic-gate win_size -= tcp->tcp_snxt; 672*0Sstevel@tonic-gate win_size += tcp->tcp_suna; 673*0Sstevel@tonic-gate if (win_size < (2 * tcp->tcp_mss)) 674*0Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) 675*0Sstevel@tonic-gate return (-1); 676*0Sstevel@tonic-gate 677*0Sstevel@tonic-gate tcp_wput_data(tcp, head, sock_id); 678*0Sstevel@tonic-gate return (cnt); 679*0Sstevel@tonic-gate } 680*0Sstevel@tonic-gate 681*0Sstevel@tonic-gate /* Free up all TCP related stuff */ 682*0Sstevel@tonic-gate static void 683*0Sstevel@tonic-gate tcp_free(tcp_t *tcp) 684*0Sstevel@tonic-gate { 685*0Sstevel@tonic-gate if (tcp->tcp_iphc != NULL) { 686*0Sstevel@tonic-gate bkmem_free((caddr_t)tcp->tcp_iphc, tcp->tcp_iphc_len); 687*0Sstevel@tonic-gate tcp->tcp_iphc = NULL; 688*0Sstevel@tonic-gate } 689*0Sstevel@tonic-gate if (tcp->tcp_xmit_head != NULL) { 690*0Sstevel@tonic-gate freemsg(tcp->tcp_xmit_head); 691*0Sstevel@tonic-gate tcp->tcp_xmit_head = NULL; 692*0Sstevel@tonic-gate } 693*0Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) { 694*0Sstevel@tonic-gate freemsg(tcp->tcp_rcv_list); 695*0Sstevel@tonic-gate tcp->tcp_rcv_list = NULL; 696*0Sstevel@tonic-gate } 697*0Sstevel@tonic-gate if (tcp->tcp_reass_head != NULL) { 698*0Sstevel@tonic-gate freemsg(tcp->tcp_reass_head); 699*0Sstevel@tonic-gate tcp->tcp_reass_head = NULL; 700*0Sstevel@tonic-gate } 701*0Sstevel@tonic-gate if (tcp->tcp_sack_info != NULL) { 702*0Sstevel@tonic-gate bkmem_free((caddr_t)tcp->tcp_sack_info, 703*0Sstevel@tonic-gate sizeof (tcp_sack_info_t)); 704*0Sstevel@tonic-gate tcp->tcp_sack_info = NULL; 705*0Sstevel@tonic-gate } 706*0Sstevel@tonic-gate } 707*0Sstevel@tonic-gate 708*0Sstevel@tonic-gate static void 709*0Sstevel@tonic-gate tcp_close_detached(tcp_t *tcp) 710*0Sstevel@tonic-gate { 711*0Sstevel@tonic-gate if (tcp->tcp_listener != NULL) 712*0Sstevel@tonic-gate tcp_eager_unlink(tcp); 713*0Sstevel@tonic-gate tcp_free(tcp); 
714*0Sstevel@tonic-gate bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 715*0Sstevel@tonic-gate } 716*0Sstevel@tonic-gate 717*0Sstevel@tonic-gate /* 718*0Sstevel@tonic-gate * If we are an eager connection hanging off a listener that hasn't 719*0Sstevel@tonic-gate * formally accepted the connection yet, get off his list and blow off 720*0Sstevel@tonic-gate * any data that we have accumulated. 721*0Sstevel@tonic-gate */ 722*0Sstevel@tonic-gate static void 723*0Sstevel@tonic-gate tcp_eager_unlink(tcp_t *tcp) 724*0Sstevel@tonic-gate { 725*0Sstevel@tonic-gate tcp_t *listener = tcp->tcp_listener; 726*0Sstevel@tonic-gate 727*0Sstevel@tonic-gate assert(listener != NULL); 728*0Sstevel@tonic-gate if (tcp->tcp_eager_next_q0 != NULL) { 729*0Sstevel@tonic-gate assert(tcp->tcp_eager_prev_q0 != NULL); 730*0Sstevel@tonic-gate 731*0Sstevel@tonic-gate /* Remove the eager tcp from q0 */ 732*0Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 733*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0; 734*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 735*0Sstevel@tonic-gate tcp->tcp_eager_next_q0; 736*0Sstevel@tonic-gate listener->tcp_conn_req_cnt_q0--; 737*0Sstevel@tonic-gate } else { 738*0Sstevel@tonic-gate tcp_t **tcpp = &listener->tcp_eager_next_q; 739*0Sstevel@tonic-gate tcp_t *prev = NULL; 740*0Sstevel@tonic-gate 741*0Sstevel@tonic-gate for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 742*0Sstevel@tonic-gate if (tcpp[0] == tcp) { 743*0Sstevel@tonic-gate if (listener->tcp_eager_last_q == tcp) { 744*0Sstevel@tonic-gate /* 745*0Sstevel@tonic-gate * If we are unlinking the last 746*0Sstevel@tonic-gate * element on the list, adjust 747*0Sstevel@tonic-gate * tail pointer. Set tail pointer 748*0Sstevel@tonic-gate * to nil when list is empty. 
749*0Sstevel@tonic-gate */ 750*0Sstevel@tonic-gate assert(tcp->tcp_eager_next_q == NULL); 751*0Sstevel@tonic-gate if (listener->tcp_eager_last_q == 752*0Sstevel@tonic-gate listener->tcp_eager_next_q) { 753*0Sstevel@tonic-gate listener->tcp_eager_last_q = 754*0Sstevel@tonic-gate NULL; 755*0Sstevel@tonic-gate } else { 756*0Sstevel@tonic-gate /* 757*0Sstevel@tonic-gate * We won't get here if there 758*0Sstevel@tonic-gate * is only one eager in the 759*0Sstevel@tonic-gate * list. 760*0Sstevel@tonic-gate */ 761*0Sstevel@tonic-gate assert(prev != NULL); 762*0Sstevel@tonic-gate listener->tcp_eager_last_q = 763*0Sstevel@tonic-gate prev; 764*0Sstevel@tonic-gate } 765*0Sstevel@tonic-gate } 766*0Sstevel@tonic-gate tcpp[0] = tcp->tcp_eager_next_q; 767*0Sstevel@tonic-gate tcp->tcp_eager_next_q = NULL; 768*0Sstevel@tonic-gate tcp->tcp_eager_last_q = NULL; 769*0Sstevel@tonic-gate listener->tcp_conn_req_cnt_q--; 770*0Sstevel@tonic-gate break; 771*0Sstevel@tonic-gate } 772*0Sstevel@tonic-gate prev = tcpp[0]; 773*0Sstevel@tonic-gate } 774*0Sstevel@tonic-gate } 775*0Sstevel@tonic-gate tcp->tcp_listener = NULL; 776*0Sstevel@tonic-gate } 777*0Sstevel@tonic-gate 778*0Sstevel@tonic-gate /* 779*0Sstevel@tonic-gate * Reset any eager connection hanging off this listener 780*0Sstevel@tonic-gate * and then reclaim it's resources. 
781*0Sstevel@tonic-gate */ 782*0Sstevel@tonic-gate static void 783*0Sstevel@tonic-gate tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only, int sock_id) 784*0Sstevel@tonic-gate { 785*0Sstevel@tonic-gate tcp_t *eager; 786*0Sstevel@tonic-gate 787*0Sstevel@tonic-gate if (!q0_only) { 788*0Sstevel@tonic-gate /* First cleanup q */ 789*0Sstevel@tonic-gate while ((eager = listener->tcp_eager_next_q) != NULL) { 790*0Sstevel@tonic-gate assert(listener->tcp_eager_last_q != NULL); 791*0Sstevel@tonic-gate tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 792*0Sstevel@tonic-gate eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, 793*0Sstevel@tonic-gate sock_id); 794*0Sstevel@tonic-gate tcp_close_detached(eager); 795*0Sstevel@tonic-gate } 796*0Sstevel@tonic-gate assert(listener->tcp_eager_last_q == NULL); 797*0Sstevel@tonic-gate } 798*0Sstevel@tonic-gate /* Then cleanup q0 */ 799*0Sstevel@tonic-gate while ((eager = listener->tcp_eager_next_q0) != listener) { 800*0Sstevel@tonic-gate tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 801*0Sstevel@tonic-gate eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, sock_id); 802*0Sstevel@tonic-gate tcp_close_detached(eager); 803*0Sstevel@tonic-gate } 804*0Sstevel@tonic-gate } 805*0Sstevel@tonic-gate 806*0Sstevel@tonic-gate /* 807*0Sstevel@tonic-gate * To handle the shutdown request. Called from shutdown() 808*0Sstevel@tonic-gate */ 809*0Sstevel@tonic-gate int 810*0Sstevel@tonic-gate tcp_shutdown(int sock_id) 811*0Sstevel@tonic-gate { 812*0Sstevel@tonic-gate tcp_t *tcp; 813*0Sstevel@tonic-gate 814*0Sstevel@tonic-gate DEBUG_1("tcp_shutdown: sock_id %x\n", sock_id); 815*0Sstevel@tonic-gate 816*0Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) { 817*0Sstevel@tonic-gate return (-1); 818*0Sstevel@tonic-gate } 819*0Sstevel@tonic-gate 820*0Sstevel@tonic-gate /* 821*0Sstevel@tonic-gate * Since inetboot is not interrupt driven, there may be 822*0Sstevel@tonic-gate * some ACKs in the MAC's buffer. 
Drain them first, 823*0Sstevel@tonic-gate * otherwise, we may not be able to send. 824*0Sstevel@tonic-gate */ 825*0Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) { 826*0Sstevel@tonic-gate /* 827*0Sstevel@tonic-gate * If we return now without freeing TCP, there will be 828*0Sstevel@tonic-gate * a memory leak. 829*0Sstevel@tonic-gate */ 830*0Sstevel@tonic-gate if (sockets[sock_id].pcb != NULL) 831*0Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, 0); 832*0Sstevel@tonic-gate return (-1); 833*0Sstevel@tonic-gate } 834*0Sstevel@tonic-gate 835*0Sstevel@tonic-gate DEBUG_1("tcp_shutdown: tcp_state %x\n", tcp->tcp_state); 836*0Sstevel@tonic-gate switch (tcp->tcp_state) { 837*0Sstevel@tonic-gate 838*0Sstevel@tonic-gate case TCPS_SYN_RCVD: 839*0Sstevel@tonic-gate /* 840*0Sstevel@tonic-gate * Shutdown during the connect 3-way handshake 841*0Sstevel@tonic-gate */ 842*0Sstevel@tonic-gate case TCPS_ESTABLISHED: 843*0Sstevel@tonic-gate /* 844*0Sstevel@tonic-gate * Transmit the FIN 845*0Sstevel@tonic-gate * wait for the FIN to be ACKed, 846*0Sstevel@tonic-gate * then remain in FIN_WAIT_2 847*0Sstevel@tonic-gate */ 848*0Sstevel@tonic-gate dprintf("tcp_shutdown: sending fin\n"); 849*0Sstevel@tonic-gate if (tcp_xmit_end(tcp, sock_id) == 0 && 850*0Sstevel@tonic-gate tcp_state_wait(sock_id, tcp, TCPS_FIN_WAIT_2) < 0) { 851*0Sstevel@tonic-gate /* During the wait, TCP may be gone... 
*/ 852*0Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 853*0Sstevel@tonic-gate return (-1); 854*0Sstevel@tonic-gate } 855*0Sstevel@tonic-gate dprintf("tcp_shutdown: done\n"); 856*0Sstevel@tonic-gate break; 857*0Sstevel@tonic-gate 858*0Sstevel@tonic-gate default: 859*0Sstevel@tonic-gate break; 860*0Sstevel@tonic-gate 861*0Sstevel@tonic-gate } 862*0Sstevel@tonic-gate return (0); 863*0Sstevel@tonic-gate } 864*0Sstevel@tonic-gate 865*0Sstevel@tonic-gate /* To handle closing of the socket */ 866*0Sstevel@tonic-gate static int 867*0Sstevel@tonic-gate tcp_close(int sock_id) 868*0Sstevel@tonic-gate { 869*0Sstevel@tonic-gate char *msg; 870*0Sstevel@tonic-gate tcp_t *tcp; 871*0Sstevel@tonic-gate int error = 0; 872*0Sstevel@tonic-gate 873*0Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) { 874*0Sstevel@tonic-gate return (-1); 875*0Sstevel@tonic-gate } 876*0Sstevel@tonic-gate 877*0Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 878*0Sstevel@tonic-gate 879*0Sstevel@tonic-gate /* 880*0Sstevel@tonic-gate * Since inetboot is not interrupt driven, there may be 881*0Sstevel@tonic-gate * some ACKs in the MAC's buffer. Drain them first, 882*0Sstevel@tonic-gate * otherwise, we may not be able to send. 883*0Sstevel@tonic-gate */ 884*0Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) { 885*0Sstevel@tonic-gate /* 886*0Sstevel@tonic-gate * If we return now without freeing TCP, there will be 887*0Sstevel@tonic-gate * a memory leak. 
888*0Sstevel@tonic-gate */ 889*0Sstevel@tonic-gate if (sockets[sock_id].pcb != NULL) 890*0Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, 0); 891*0Sstevel@tonic-gate return (-1); 892*0Sstevel@tonic-gate } 893*0Sstevel@tonic-gate 894*0Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 895*0Sstevel@tonic-gate /* Cleanup for listener */ 896*0Sstevel@tonic-gate tcp_eager_cleanup(tcp, 0, sock_id); 897*0Sstevel@tonic-gate } 898*0Sstevel@tonic-gate 899*0Sstevel@tonic-gate msg = NULL; 900*0Sstevel@tonic-gate switch (tcp->tcp_state) { 901*0Sstevel@tonic-gate case TCPS_CLOSED: 902*0Sstevel@tonic-gate case TCPS_IDLE: 903*0Sstevel@tonic-gate case TCPS_BOUND: 904*0Sstevel@tonic-gate case TCPS_LISTEN: 905*0Sstevel@tonic-gate break; 906*0Sstevel@tonic-gate case TCPS_SYN_SENT: 907*0Sstevel@tonic-gate msg = "tcp_close, during connect"; 908*0Sstevel@tonic-gate break; 909*0Sstevel@tonic-gate case TCPS_SYN_RCVD: 910*0Sstevel@tonic-gate /* 911*0Sstevel@tonic-gate * Close during the connect 3-way handshake 912*0Sstevel@tonic-gate * but here there may or may not be pending data 913*0Sstevel@tonic-gate * already on queue. Process almost same as in 914*0Sstevel@tonic-gate * the ESTABLISHED state. 915*0Sstevel@tonic-gate */ 916*0Sstevel@tonic-gate /* FALLTHRU */ 917*0Sstevel@tonic-gate default: 918*0Sstevel@tonic-gate /* 919*0Sstevel@tonic-gate * If SO_LINGER has set a zero linger time, abort the 920*0Sstevel@tonic-gate * connection with a reset. 921*0Sstevel@tonic-gate */ 922*0Sstevel@tonic-gate if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 923*0Sstevel@tonic-gate msg = "tcp_close, zero lingertime"; 924*0Sstevel@tonic-gate break; 925*0Sstevel@tonic-gate } 926*0Sstevel@tonic-gate 927*0Sstevel@tonic-gate /* 928*0Sstevel@tonic-gate * Abort connection if there is unread data queued. 
929*0Sstevel@tonic-gate */ 930*0Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL || 931*0Sstevel@tonic-gate tcp->tcp_reass_head != NULL) { 932*0Sstevel@tonic-gate msg = "tcp_close, unread data"; 933*0Sstevel@tonic-gate break; 934*0Sstevel@tonic-gate } 935*0Sstevel@tonic-gate if (tcp->tcp_state <= TCPS_LISTEN) 936*0Sstevel@tonic-gate break; 937*0Sstevel@tonic-gate 938*0Sstevel@tonic-gate /* 939*0Sstevel@tonic-gate * Transmit the FIN before detaching the tcp_t. 940*0Sstevel@tonic-gate * After tcp_detach returns this queue/perimeter 941*0Sstevel@tonic-gate * no longer owns the tcp_t thus others can modify it. 942*0Sstevel@tonic-gate * The TCP could be closed in tcp_state_wait called by 943*0Sstevel@tonic-gate * tcp_wput_data called by tcp_xmit_end. 944*0Sstevel@tonic-gate */ 945*0Sstevel@tonic-gate (void) tcp_xmit_end(tcp, sock_id); 946*0Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 947*0Sstevel@tonic-gate return (0); 948*0Sstevel@tonic-gate 949*0Sstevel@tonic-gate /* 950*0Sstevel@tonic-gate * If lingering on close then wait until the fin is acked, 951*0Sstevel@tonic-gate * the SO_LINGER time passes, or a reset is sent/received. 
952*0Sstevel@tonic-gate */ 953*0Sstevel@tonic-gate if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 954*0Sstevel@tonic-gate !(tcp->tcp_fin_acked) && 955*0Sstevel@tonic-gate tcp->tcp_state >= TCPS_ESTABLISHED) { 956*0Sstevel@tonic-gate uint32_t stoptime; /* in ms */ 957*0Sstevel@tonic-gate 958*0Sstevel@tonic-gate tcp->tcp_client_errno = 0; 959*0Sstevel@tonic-gate stoptime = prom_gettime() + 960*0Sstevel@tonic-gate (tcp->tcp_lingertime * 1000); 961*0Sstevel@tonic-gate while (!(tcp->tcp_fin_acked) && 962*0Sstevel@tonic-gate tcp->tcp_state >= TCPS_ESTABLISHED && 963*0Sstevel@tonic-gate tcp->tcp_client_errno == 0 && 964*0Sstevel@tonic-gate ((int32_t)(stoptime - prom_gettime()) > 0)) { 965*0Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) { 966*0Sstevel@tonic-gate if (sockets[sock_id].pcb != NULL) { 967*0Sstevel@tonic-gate tcp_clean_death(sock_id, 968*0Sstevel@tonic-gate tcp, 0); 969*0Sstevel@tonic-gate } 970*0Sstevel@tonic-gate return (-1); 971*0Sstevel@tonic-gate } 972*0Sstevel@tonic-gate } 973*0Sstevel@tonic-gate tcp->tcp_client_errno = 0; 974*0Sstevel@tonic-gate } 975*0Sstevel@tonic-gate if (tcp_state_wait(sock_id, tcp, TCPS_TIME_WAIT) < 0) { 976*0Sstevel@tonic-gate /* During the wait, TCP may be gone... */ 977*0Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 978*0Sstevel@tonic-gate return (0); 979*0Sstevel@tonic-gate msg = "tcp_close, couldn't detach"; 980*0Sstevel@tonic-gate } else { 981*0Sstevel@tonic-gate return (0); 982*0Sstevel@tonic-gate } 983*0Sstevel@tonic-gate break; 984*0Sstevel@tonic-gate } 985*0Sstevel@tonic-gate 986*0Sstevel@tonic-gate /* Something went wrong... 
Send a RST and report the error */ 987*0Sstevel@tonic-gate if (msg != NULL) { 988*0Sstevel@tonic-gate if (tcp->tcp_state == TCPS_ESTABLISHED || 989*0Sstevel@tonic-gate tcp->tcp_state == TCPS_CLOSE_WAIT) 990*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpEstabResets); 991*0Sstevel@tonic-gate if (tcp->tcp_state == TCPS_SYN_SENT || 992*0Sstevel@tonic-gate tcp->tcp_state == TCPS_SYN_RCVD) 993*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpAttemptFails); 994*0Sstevel@tonic-gate tcp_xmit_ctl(msg, tcp, NULL, tcp->tcp_snxt, 0, TH_RST, 0, 995*0Sstevel@tonic-gate sock_id); 996*0Sstevel@tonic-gate } 997*0Sstevel@tonic-gate 998*0Sstevel@tonic-gate tcp_free(tcp); 999*0Sstevel@tonic-gate bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 1000*0Sstevel@tonic-gate sockets[sock_id].pcb = NULL; 1001*0Sstevel@tonic-gate return (error); 1002*0Sstevel@tonic-gate } 1003*0Sstevel@tonic-gate 1004*0Sstevel@tonic-gate /* To make an endpoint a listener. */ 1005*0Sstevel@tonic-gate int 1006*0Sstevel@tonic-gate tcp_listen(int sock_id, int backlog) 1007*0Sstevel@tonic-gate { 1008*0Sstevel@tonic-gate tcp_t *tcp; 1009*0Sstevel@tonic-gate 1010*0Sstevel@tonic-gate if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1011*0Sstevel@tonic-gate errno = EINVAL; 1012*0Sstevel@tonic-gate return (-1); 1013*0Sstevel@tonic-gate } 1014*0Sstevel@tonic-gate /* We allow calling listen() multiple times to change the backlog. */ 1015*0Sstevel@tonic-gate if (tcp->tcp_state > TCPS_LISTEN || tcp->tcp_state < TCPS_BOUND) { 1016*0Sstevel@tonic-gate errno = EOPNOTSUPP; 1017*0Sstevel@tonic-gate return (-1); 1018*0Sstevel@tonic-gate } 1019*0Sstevel@tonic-gate /* The following initialization should only be done once. 
*/ 1020*0Sstevel@tonic-gate if (tcp->tcp_state != TCPS_LISTEN) { 1021*0Sstevel@tonic-gate tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 1022*0Sstevel@tonic-gate tcp->tcp_eager_next_q = NULL; 1023*0Sstevel@tonic-gate tcp->tcp_state = TCPS_LISTEN; 1024*0Sstevel@tonic-gate tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval; 1025*0Sstevel@tonic-gate } 1026*0Sstevel@tonic-gate if ((tcp->tcp_conn_req_max = backlog) > tcp_conn_req_max_q) { 1027*0Sstevel@tonic-gate tcp->tcp_conn_req_max = tcp_conn_req_max_q; 1028*0Sstevel@tonic-gate } 1029*0Sstevel@tonic-gate if (tcp->tcp_conn_req_max < tcp_conn_req_min) { 1030*0Sstevel@tonic-gate tcp->tcp_conn_req_max = tcp_conn_req_min; 1031*0Sstevel@tonic-gate } 1032*0Sstevel@tonic-gate return (0); 1033*0Sstevel@tonic-gate } 1034*0Sstevel@tonic-gate 1035*0Sstevel@tonic-gate /* To accept connections. */ 1036*0Sstevel@tonic-gate int 1037*0Sstevel@tonic-gate tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len) 1038*0Sstevel@tonic-gate { 1039*0Sstevel@tonic-gate tcp_t *listener; 1040*0Sstevel@tonic-gate tcp_t *eager; 1041*0Sstevel@tonic-gate int sd, new_sock_id; 1042*0Sstevel@tonic-gate struct sockaddr_in *new_addr = (struct sockaddr_in *)addr; 1043*0Sstevel@tonic-gate int timeout; 1044*0Sstevel@tonic-gate 1045*0Sstevel@tonic-gate /* Sanity check. 
*/ 1046*0Sstevel@tonic-gate if ((listener = (tcp_t *)(sockets[sock_id].pcb)) == NULL || 1047*0Sstevel@tonic-gate new_addr == NULL || addr_len == NULL || 1048*0Sstevel@tonic-gate *addr_len < sizeof (struct sockaddr_in) || 1049*0Sstevel@tonic-gate listener->tcp_state != TCPS_LISTEN) { 1050*0Sstevel@tonic-gate errno = EINVAL; 1051*0Sstevel@tonic-gate return (-1); 1052*0Sstevel@tonic-gate } 1053*0Sstevel@tonic-gate 1054*0Sstevel@tonic-gate if (sockets[sock_id].in_timeout > tcp_accept_timeout) 1055*0Sstevel@tonic-gate timeout = prom_gettime() + sockets[sock_id].in_timeout; 1056*0Sstevel@tonic-gate else 1057*0Sstevel@tonic-gate timeout = prom_gettime() + tcp_accept_timeout; 1058*0Sstevel@tonic-gate while (listener->tcp_eager_next_q == NULL && 1059*0Sstevel@tonic-gate timeout > prom_gettime()) { 1060*0Sstevel@tonic-gate #if DEBUG 1061*0Sstevel@tonic-gate printf("tcp_accept: Waiting in tcp_accept()\n"); 1062*0Sstevel@tonic-gate #endif 1063*0Sstevel@tonic-gate if (tcp_drain_input(listener, sock_id, 5) < 0) { 1064*0Sstevel@tonic-gate return (-1); 1065*0Sstevel@tonic-gate } 1066*0Sstevel@tonic-gate } 1067*0Sstevel@tonic-gate /* If there is an eager, don't timeout... */ 1068*0Sstevel@tonic-gate if (timeout <= prom_gettime() && listener->tcp_eager_next_q == NULL) { 1069*0Sstevel@tonic-gate #if DEBUG 1070*0Sstevel@tonic-gate printf("tcp_accept: timeout\n"); 1071*0Sstevel@tonic-gate #endif 1072*0Sstevel@tonic-gate errno = ETIMEDOUT; 1073*0Sstevel@tonic-gate return (-1); 1074*0Sstevel@tonic-gate } 1075*0Sstevel@tonic-gate #if DEBUG 1076*0Sstevel@tonic-gate printf("tcp_accept: got a connection\n"); 1077*0Sstevel@tonic-gate #endif 1078*0Sstevel@tonic-gate 1079*0Sstevel@tonic-gate /* Now create the socket for this new TCP. 
*/ 1080*0Sstevel@tonic-gate if ((sd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 1081*0Sstevel@tonic-gate return (-1); 1082*0Sstevel@tonic-gate } 1083*0Sstevel@tonic-gate if ((new_sock_id = so_check_fd(sd, &errno)) == -1) 1084*0Sstevel@tonic-gate /* This should not happen! */ 1085*0Sstevel@tonic-gate prom_panic("so_check_fd() fails in tcp_accept()"); 1086*0Sstevel@tonic-gate /* Free the TCP PCB in the original socket. */ 1087*0Sstevel@tonic-gate bkmem_free((caddr_t)(sockets[new_sock_id].pcb), sizeof (tcp_t)); 1088*0Sstevel@tonic-gate /* Dequeue the eager and attach it to the socket. */ 1089*0Sstevel@tonic-gate eager = listener->tcp_eager_next_q; 1090*0Sstevel@tonic-gate listener->tcp_eager_next_q = eager->tcp_eager_next_q; 1091*0Sstevel@tonic-gate if (listener->tcp_eager_last_q == eager) 1092*0Sstevel@tonic-gate listener->tcp_eager_last_q = NULL; 1093*0Sstevel@tonic-gate eager->tcp_eager_next_q = NULL; 1094*0Sstevel@tonic-gate sockets[new_sock_id].pcb = eager; 1095*0Sstevel@tonic-gate listener->tcp_conn_req_cnt_q--; 1096*0Sstevel@tonic-gate 1097*0Sstevel@tonic-gate /* Copy in the address info. */ 1098*0Sstevel@tonic-gate bcopy(&eager->tcp_remote, &new_addr->sin_addr.s_addr, 1099*0Sstevel@tonic-gate sizeof (in_addr_t)); 1100*0Sstevel@tonic-gate bcopy(&eager->tcp_fport, &new_addr->sin_port, sizeof (in_port_t)); 1101*0Sstevel@tonic-gate new_addr->sin_family = AF_INET; 1102*0Sstevel@tonic-gate 1103*0Sstevel@tonic-gate #ifdef DEBUG 1104*0Sstevel@tonic-gate printf("tcp_accept(), new sock_id: %d\n", sd); 1105*0Sstevel@tonic-gate #endif 1106*0Sstevel@tonic-gate return (sd); 1107*0Sstevel@tonic-gate } 1108*0Sstevel@tonic-gate 1109*0Sstevel@tonic-gate /* Update the next anonymous port to use. */ 1110*0Sstevel@tonic-gate static in_port_t 1111*0Sstevel@tonic-gate tcp_update_next_port(in_port_t port) 1112*0Sstevel@tonic-gate { 1113*0Sstevel@tonic-gate /* Don't allow the port to fall out of the anonymous port range. 
*/ 1114*0Sstevel@tonic-gate if (port < tcp_smallest_anon_port || port > tcp_largest_anon_port) 1115*0Sstevel@tonic-gate port = (in_port_t)tcp_smallest_anon_port; 1116*0Sstevel@tonic-gate 1117*0Sstevel@tonic-gate if (port < tcp_smallest_nonpriv_port) 1118*0Sstevel@tonic-gate port = (in_port_t)tcp_smallest_nonpriv_port; 1119*0Sstevel@tonic-gate return (port); 1120*0Sstevel@tonic-gate } 1121*0Sstevel@tonic-gate 1122*0Sstevel@tonic-gate /* To check whether a bind to a port is allowed. */ 1123*0Sstevel@tonic-gate static in_port_t 1124*0Sstevel@tonic-gate tcp_bindi(in_port_t port, in_addr_t *addr, boolean_t reuseaddr, 1125*0Sstevel@tonic-gate boolean_t bind_to_req_port_only) 1126*0Sstevel@tonic-gate { 1127*0Sstevel@tonic-gate int i, count; 1128*0Sstevel@tonic-gate tcp_t *tcp; 1129*0Sstevel@tonic-gate 1130*0Sstevel@tonic-gate count = tcp_largest_anon_port - tcp_smallest_anon_port; 1131*0Sstevel@tonic-gate try_again: 1132*0Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1133*0Sstevel@tonic-gate if (sockets[i].type != INETBOOT_STREAM || 1134*0Sstevel@tonic-gate ((tcp = (tcp_t *)sockets[i].pcb) == NULL) || 1135*0Sstevel@tonic-gate ntohs(tcp->tcp_lport) != port) { 1136*0Sstevel@tonic-gate continue; 1137*0Sstevel@tonic-gate } 1138*0Sstevel@tonic-gate /* 1139*0Sstevel@tonic-gate * Both TCPs have the same port. If SO_REUSEDADDR is 1140*0Sstevel@tonic-gate * set and the bound TCP has a state greater than 1141*0Sstevel@tonic-gate * TCPS_LISTEN, it is fine. 
1142*0Sstevel@tonic-gate */ 1143*0Sstevel@tonic-gate if (reuseaddr && tcp->tcp_state > TCPS_LISTEN) { 1144*0Sstevel@tonic-gate continue; 1145*0Sstevel@tonic-gate } 1146*0Sstevel@tonic-gate if (tcp->tcp_bound_source != INADDR_ANY && 1147*0Sstevel@tonic-gate *addr != INADDR_ANY && 1148*0Sstevel@tonic-gate tcp->tcp_bound_source != *addr) { 1149*0Sstevel@tonic-gate continue; 1150*0Sstevel@tonic-gate } 1151*0Sstevel@tonic-gate if (bind_to_req_port_only) { 1152*0Sstevel@tonic-gate return (0); 1153*0Sstevel@tonic-gate } 1154*0Sstevel@tonic-gate if (--count > 0) { 1155*0Sstevel@tonic-gate port = tcp_update_next_port(++port); 1156*0Sstevel@tonic-gate goto try_again; 1157*0Sstevel@tonic-gate } else { 1158*0Sstevel@tonic-gate return (0); 1159*0Sstevel@tonic-gate } 1160*0Sstevel@tonic-gate } 1161*0Sstevel@tonic-gate return (port); 1162*0Sstevel@tonic-gate } 1163*0Sstevel@tonic-gate 1164*0Sstevel@tonic-gate /* To handle the bind request. */ 1165*0Sstevel@tonic-gate int 1166*0Sstevel@tonic-gate tcp_bind(int sock_id) 1167*0Sstevel@tonic-gate { 1168*0Sstevel@tonic-gate tcp_t *tcp; 1169*0Sstevel@tonic-gate in_port_t requested_port, allocated_port; 1170*0Sstevel@tonic-gate boolean_t bind_to_req_port_only; 1171*0Sstevel@tonic-gate boolean_t reuseaddr; 1172*0Sstevel@tonic-gate 1173*0Sstevel@tonic-gate if ((tcp = (tcp_t *)sockets[sock_id].pcb) == NULL) { 1174*0Sstevel@tonic-gate errno = EINVAL; 1175*0Sstevel@tonic-gate return (-1); 1176*0Sstevel@tonic-gate } 1177*0Sstevel@tonic-gate 1178*0Sstevel@tonic-gate if (tcp->tcp_state >= TCPS_BOUND) { 1179*0Sstevel@tonic-gate /* We don't allow multiple bind(). */ 1180*0Sstevel@tonic-gate errno = EPROTO; 1181*0Sstevel@tonic-gate return (-1); 1182*0Sstevel@tonic-gate } 1183*0Sstevel@tonic-gate 1184*0Sstevel@tonic-gate requested_port = ntohs(sockets[sock_id].bind.sin_port); 1185*0Sstevel@tonic-gate 1186*0Sstevel@tonic-gate /* The bound source can be INADDR_ANY. 
*/ 1187*0Sstevel@tonic-gate tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1188*0Sstevel@tonic-gate 1189*0Sstevel@tonic-gate tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1190*0Sstevel@tonic-gate 1191*0Sstevel@tonic-gate /* Verify the port is available. */ 1192*0Sstevel@tonic-gate if (requested_port == 0) 1193*0Sstevel@tonic-gate bind_to_req_port_only = B_FALSE; 1194*0Sstevel@tonic-gate else /* T_BIND_REQ and requested_port != 0 */ 1195*0Sstevel@tonic-gate bind_to_req_port_only = B_TRUE; 1196*0Sstevel@tonic-gate 1197*0Sstevel@tonic-gate if (requested_port == 0) { 1198*0Sstevel@tonic-gate requested_port = tcp_update_next_port(++tcp_next_port_to_try); 1199*0Sstevel@tonic-gate } 1200*0Sstevel@tonic-gate reuseaddr = sockets[sock_id].so_opt & SO_REUSEADDR; 1201*0Sstevel@tonic-gate allocated_port = tcp_bindi(requested_port, &(tcp->tcp_bound_source), 1202*0Sstevel@tonic-gate reuseaddr, bind_to_req_port_only); 1203*0Sstevel@tonic-gate 1204*0Sstevel@tonic-gate if (allocated_port == 0) { 1205*0Sstevel@tonic-gate errno = EADDRINUSE; 1206*0Sstevel@tonic-gate return (-1); 1207*0Sstevel@tonic-gate } 1208*0Sstevel@tonic-gate tcp->tcp_lport = htons(allocated_port); 1209*0Sstevel@tonic-gate *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 1210*0Sstevel@tonic-gate sockets[sock_id].bind.sin_port = tcp->tcp_lport; 1211*0Sstevel@tonic-gate tcp->tcp_state = TCPS_BOUND; 1212*0Sstevel@tonic-gate return (0); 1213*0Sstevel@tonic-gate } 1214*0Sstevel@tonic-gate 1215*0Sstevel@tonic-gate /* 1216*0Sstevel@tonic-gate * Check for duplicate TCP connections. 
1217*0Sstevel@tonic-gate */ 1218*0Sstevel@tonic-gate static int 1219*0Sstevel@tonic-gate tcp_conn_check(tcp_t *tcp) 1220*0Sstevel@tonic-gate { 1221*0Sstevel@tonic-gate int i; 1222*0Sstevel@tonic-gate tcp_t *tmp_tcp; 1223*0Sstevel@tonic-gate 1224*0Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1225*0Sstevel@tonic-gate if (sockets[i].type != INETBOOT_STREAM) 1226*0Sstevel@tonic-gate continue; 1227*0Sstevel@tonic-gate /* Socket may not be closed but the TCP can be gone. */ 1228*0Sstevel@tonic-gate if ((tmp_tcp = (tcp_t *)sockets[i].pcb) == NULL) 1229*0Sstevel@tonic-gate continue; 1230*0Sstevel@tonic-gate /* We only care about TCP in states later than SYN_SENT. */ 1231*0Sstevel@tonic-gate if (tmp_tcp->tcp_state < TCPS_SYN_SENT) 1232*0Sstevel@tonic-gate continue; 1233*0Sstevel@tonic-gate if (tmp_tcp->tcp_lport != tcp->tcp_lport || 1234*0Sstevel@tonic-gate tmp_tcp->tcp_fport != tcp->tcp_fport || 1235*0Sstevel@tonic-gate tmp_tcp->tcp_bound_source != tcp->tcp_bound_source || 1236*0Sstevel@tonic-gate tmp_tcp->tcp_remote != tcp->tcp_remote) { 1237*0Sstevel@tonic-gate continue; 1238*0Sstevel@tonic-gate } else { 1239*0Sstevel@tonic-gate return (-1); 1240*0Sstevel@tonic-gate } 1241*0Sstevel@tonic-gate } 1242*0Sstevel@tonic-gate return (0); 1243*0Sstevel@tonic-gate } 1244*0Sstevel@tonic-gate 1245*0Sstevel@tonic-gate /* To handle a connect request. 
/*
 * To handle a connect request.
 *
 * Performs an active open for the STREAM socket sock_id: validates the
 * destination, binds a local address if necessary, fills in the IP/TCP
 * header template, sends the SYN, and blocks in tcp_state_wait() until
 * the connection reaches ESTABLISHED.  Returns 0 on success, -1 with
 * errno set on failure.
 */
int
tcp_connect(int sock_id)
{
	tcp_t *tcp;
	in_addr_t dstaddr;
	in_port_t dstport;
	tcph_t *tcph;
	int mss;
	mblk_t *syn_mp;

	if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) {
		errno = EINVAL;
		return (-1);
	}

	/* Reap expired TIME_WAIT connections before starting a new one. */
	TCP_RUN_TIME_WAIT_COLLECTOR();

	dstaddr = sockets[sock_id].remote.sin_addr.s_addr;
	dstport = sockets[sock_id].remote.sin_port;

	/*
	 * Check for attempt to connect to INADDR_ANY or non-unicast addrress.
	 * We don't have enough info to check for broadcast addr, except
	 * for the all 1 broadcast.
	 */
	if (dstaddr == INADDR_ANY || IN_CLASSD(ntohl(dstaddr)) ||
	    dstaddr == INADDR_BROADCAST) {
		/*
		 * SunOS 4.x and 4.3 BSD allow an application
		 * to connect a TCP socket to INADDR_ANY.
		 * When they do this, the kernel picks the
		 * address of one interface and uses it
		 * instead.  The kernel usually ends up
		 * picking the address of the loopback
		 * interface.  This is an undocumented feature.
		 * However, we provide the same thing here
		 * in order to have source and binary
		 * compatibility with SunOS 4.x.
		 * Update the T_CONN_REQ (sin/sin6) since it is used to
		 * generate the T_CONN_CON.
		 *
		 * Fail this for inetboot TCP.
		 */
		errno = EINVAL;
		return (-1);
	}

	/* It is not bound to any address yet... */
	if (tcp->tcp_bound_source == INADDR_ANY) {
		/* Ask the IP layer for our configured address. */
		ipv4_getipaddr(&(sockets[sock_id].bind.sin_addr));
		/* We don't have an address! */
		if (ntohl(sockets[sock_id].bind.sin_addr.s_addr) ==
		    INADDR_ANY) {
			errno = EPROTO;
			return (-1);
		}
		tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr;
		tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source;
	}

	/*
	 * Don't let an endpoint connect to itself.
	 */
	if (dstaddr == tcp->tcp_ipha->ip_src.s_addr &&
	    dstport == tcp->tcp_lport) {
		errno = EINVAL;
		return (-1);
	}

	/* Record the peer in both the header template and the pcb. */
	tcp->tcp_ipha->ip_dst.s_addr = dstaddr;
	tcp->tcp_remote = dstaddr;
	tcph = tcp->tcp_tcph;
	*(uint16_t *)tcph->th_fport = dstport;
	tcp->tcp_fport = dstport;

	/*
	 * Don't allow this connection to completely duplicate
	 * an existing connection.
	 */
	if (tcp_conn_check(tcp) < 0) {
		errno = EADDRINUSE;
		return (-1);
	}

	/*
	 * Just make sure our rwnd is at
	 * least tcp_recv_hiwat_mss * MSS
	 * large, and round up to the nearest
	 * MSS.
	 *
	 * We do the round up here because
	 * we need to get the interface
	 * MTU first before we can do the
	 * round up.
	 */
	mss = tcp->tcp_mss - tcp->tcp_hdr_len;
	tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
	    tcp_recv_hiwat_minmss * mss);
	tcp->tcp_rwnd_max = tcp->tcp_rwnd;
	SET_WS_VALUE(tcp);
	/* Advertised window is rwnd scaled down by our receive shift. */
	U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws),
	    tcp->tcp_tcph->th_win);
	if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always)
		tcp->tcp_snd_ws_ok = B_TRUE;

	/*
	 * Set tcp_snd_ts_ok to true
	 * so that tcp_xmit_mp will
	 * include the timestamp
	 * option in the SYN segment.
	 */
	if (tcp_tstamp_always ||
	    (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) {
		tcp->tcp_snd_ts_ok = B_TRUE;
	}

	/* Allocate SACK bookkeeping only if SACK will be offered. */
	if (tcp_sack_permitted == 2 ||
	    tcp->tcp_snd_sack_ok) {
		assert(tcp->tcp_sack_info == NULL);
		if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc(
		    sizeof (tcp_sack_info_t))) == NULL) {
			/* Degrade gracefully: connect without SACK. */
			tcp->tcp_snd_sack_ok = B_FALSE;
		} else {
			tcp->tcp_snd_sack_ok = B_TRUE;
		}
	}
	/*
	 * Should we use ECN?  Note that the current
	 * default value (SunOS 5.9) of tcp_ecn_permitted
	 * is 2.  The reason for doing this is that there
	 * are equipments out there that will drop ECN
	 * enabled IP packets.  Setting it to 1 avoids
	 * compatibility problems.
	 */
	if (tcp_ecn_permitted == 2)
		tcp->tcp_ecn_ok = B_TRUE;

	/* Pick the initial send sequence and arm the retransmit timer. */
	tcp_iss_init(tcp);
	TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
	tcp->tcp_active_open = B_TRUE;

	tcp->tcp_state = TCPS_SYN_SENT;
	syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, tcp->tcp_iss, B_FALSE,
	    NULL, B_FALSE);
	if (syn_mp != NULL) {
		int ret;

		/* Dump the packet when debugging. */
		TCP_DUMP_PACKET("tcp_connect", syn_mp);
		/* Send out the SYN packet. */
		ret = ipv4_tcp_output(sock_id, syn_mp);
		freeb(syn_mp);
		if (ret < 0) {
			return (-1);
		}
		/* tcp_state_wait() will finish the 3 way handshake. */
		return (tcp_state_wait(sock_id, tcp, TCPS_ESTABLISHED));
	} else {
		errno = ENOBUFS;
		return (-1);
	}
}
/*
 * Common accept code.  Called by tcp_conn_request.
 * cr_pkt is the SYN packet.
 *
 * Copies the listener's IP+TCP header template and tunables into the
 * newly allocated acceptor, fills in the peer's address/port from the
 * SYN, processes the SYN's TCP options, and sets the acceptor's receive
 * window.  Returns 0 on success or ENOMEM on allocation failure.
 */
static int
tcp_accept_comm(tcp_t *listener, tcp_t *acceptor, mblk_t *cr_pkt,
    uint_t ip_hdr_len)
{
	tcph_t *tcph;

#ifdef DEBUG
	printf("tcp_accept_comm #######################\n");
#endif

	/*
	 * When we get here, we know that the acceptor header template
	 * has already been initialized.
	 * However, it may not match the listener if the listener
	 * includes options...
	 * It may also not match the listener if the listener is v6 and
	 * and the acceptor is v4
	 */
	acceptor->tcp_lport = listener->tcp_lport;

	if (listener->tcp_ipversion == acceptor->tcp_ipversion) {
		if (acceptor->tcp_iphc_len != listener->tcp_iphc_len) {
			/*
			 * Listener had options of some sort; acceptor inherits.
			 * Free up the acceptor template and allocate one
			 * of the right size.
			 */
			bkmem_free(acceptor->tcp_iphc, acceptor->tcp_iphc_len);
			acceptor->tcp_iphc = bkmem_zalloc(
			    listener->tcp_iphc_len);
			if (acceptor->tcp_iphc == NULL) {
				/* Zero the length so nobody frees garbage. */
				acceptor->tcp_iphc_len = 0;
				return (ENOMEM);
			}
			acceptor->tcp_iphc_len = listener->tcp_iphc_len;
		}
		acceptor->tcp_hdr_len = listener->tcp_hdr_len;
		acceptor->tcp_ip_hdr_len = listener->tcp_ip_hdr_len;
		acceptor->tcp_tcp_hdr_len = listener->tcp_tcp_hdr_len;

		/*
		 * Copy the IP+TCP header template from listener to acceptor
		 */
		bcopy(listener->tcp_iphc, acceptor->tcp_iphc,
		    listener->tcp_hdr_len);
		/* Re-aim the template pointers at the new buffer. */
		acceptor->tcp_ipha = (struct ip *)acceptor->tcp_iphc;
		acceptor->tcp_tcph = (tcph_t *)(acceptor->tcp_iphc +
		    acceptor->tcp_ip_hdr_len);
	} else {
		prom_panic("tcp_accept_comm: version not equal");
	}

	/* Copy our new dest and fport from the connection request packet */
	if (acceptor->tcp_ipversion == IPV4_VERSION) {
		struct ip *ipha;

		ipha = (struct ip *)cr_pkt->b_rptr;
		/* Peer's source becomes our destination and vice versa. */
		acceptor->tcp_ipha->ip_dst = ipha->ip_src;
		acceptor->tcp_remote = ipha->ip_src.s_addr;
		acceptor->tcp_ipha->ip_src = ipha->ip_dst;
		acceptor->tcp_bound_source = ipha->ip_dst.s_addr;
		tcph = (tcph_t *)&cr_pkt->b_rptr[ip_hdr_len];
	} else {
		prom_panic("tcp_accept_comm: not IPv4");
	}
	/* Ports are copied byte-wise; they stay in network order. */
	bcopy(tcph->th_lport, acceptor->tcp_tcph->th_fport, sizeof (in_port_t));
	bcopy(acceptor->tcp_tcph->th_fport, &acceptor->tcp_fport,
	    sizeof (in_port_t));
	/*
	 * For an all-port proxy listener, the local port is determined by
	 * the port number field in the SYN packet.
	 */
	if (listener->tcp_lport == 0) {
		acceptor->tcp_lport = *(in_port_t *)tcph->th_fport;
		bcopy(tcph->th_fport, acceptor->tcp_tcph->th_lport,
		    sizeof (in_port_t));
	}
	/* Inherit various TCP parameters from the listener */
	acceptor->tcp_naglim = listener->tcp_naglim;
	acceptor->tcp_first_timer_threshold =
	    listener->tcp_first_timer_threshold;
	acceptor->tcp_second_timer_threshold =
	    listener->tcp_second_timer_threshold;

	acceptor->tcp_first_ctimer_threshold =
	    listener->tcp_first_ctimer_threshold;
	acceptor->tcp_second_ctimer_threshold =
	    listener->tcp_second_ctimer_threshold;

	acceptor->tcp_xmit_hiwater = listener->tcp_xmit_hiwater;

	acceptor->tcp_state = TCPS_LISTEN;
	tcp_iss_init(acceptor);

	/* Process all TCP options. */
	tcp_process_options(acceptor, tcph);

	/* Is the other end ECN capable? */
	if (tcp_ecn_permitted >= 1 &&
	    (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
		acceptor->tcp_ecn_ok = B_TRUE;
	}

	/*
	 * listener->tcp_rq->q_hiwat should be the default window size or a
	 * window size changed via SO_RCVBUF option.  First round up the
	 * acceptor's tcp_rwnd to the nearest MSS.  Then find out the window
	 * scale option value if needed.  Call tcp_rwnd_set() to finish the
	 * setting.
	 *
	 * Note if there is a rpipe metric associated with the remote host,
	 * we should not inherit receive window size from listener.
	 */
	acceptor->tcp_rwnd = MSS_ROUNDUP(
	    (acceptor->tcp_rwnd == 0 ? listener->tcp_rwnd_max :
	    acceptor->tcp_rwnd), acceptor->tcp_mss);
	if (acceptor->tcp_snd_ws_ok)
		SET_WS_VALUE(acceptor);
	/*
	 * Note that this is the only place tcp_rwnd_set() is called for
	 * accepting a connection.  We need to call it here instead of
	 * after the 3-way handshake because we need to tell the other
	 * side our rwnd in the SYN-ACK segment.
	 */
	(void) tcp_rwnd_set(acceptor, acceptor->tcp_rwnd);

	return (0);
}
The current threshold is - # of timeout > q0len/4 => SYN alert on 1549*0Sstevel@tonic-gate * # of timeout drops back to <= q0len/32 => SYN alert off 1550*0Sstevel@tonic-gate */ 1551*0Sstevel@tonic-gate static boolean_t 1552*0Sstevel@tonic-gate tcp_drop_q0(tcp_t *tcp) 1553*0Sstevel@tonic-gate { 1554*0Sstevel@tonic-gate tcp_t *eager; 1555*0Sstevel@tonic-gate 1556*0Sstevel@tonic-gate assert(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 1557*0Sstevel@tonic-gate /* 1558*0Sstevel@tonic-gate * New one is added after next_q0 so prev_q0 points to the oldest 1559*0Sstevel@tonic-gate * Also do not drop any established connections that are deferred on 1560*0Sstevel@tonic-gate * q0 due to q being full 1561*0Sstevel@tonic-gate */ 1562*0Sstevel@tonic-gate 1563*0Sstevel@tonic-gate eager = tcp->tcp_eager_prev_q0; 1564*0Sstevel@tonic-gate while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { 1565*0Sstevel@tonic-gate /* XXX should move the eager to the head */ 1566*0Sstevel@tonic-gate eager = eager->tcp_eager_prev_q0; 1567*0Sstevel@tonic-gate if (eager == tcp) { 1568*0Sstevel@tonic-gate eager = tcp->tcp_eager_prev_q0; 1569*0Sstevel@tonic-gate break; 1570*0Sstevel@tonic-gate } 1571*0Sstevel@tonic-gate } 1572*0Sstevel@tonic-gate dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow" 1573*0Sstevel@tonic-gate " (%d pending) on %s, drop one", tcp_conn_req_max_q0, 1574*0Sstevel@tonic-gate tcp->tcp_conn_req_cnt_q0, 1575*0Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1576*0Sstevel@tonic-gate 1577*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpHalfOpenDrop); 1578*0Sstevel@tonic-gate bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1579*0Sstevel@tonic-gate return (B_TRUE); 1580*0Sstevel@tonic-gate } 1581*0Sstevel@tonic-gate 1582*0Sstevel@tonic-gate /* ARGSUSED */ 1583*0Sstevel@tonic-gate static tcp_t * 1584*0Sstevel@tonic-gate tcp_conn_request(tcp_t *tcp, mblk_t *mp, uint_t sock_id, uint_t ip_hdr_len) 1585*0Sstevel@tonic-gate { 1586*0Sstevel@tonic-gate tcp_t *eager; 
1587*0Sstevel@tonic-gate struct ip *ipha; 1588*0Sstevel@tonic-gate int err; 1589*0Sstevel@tonic-gate 1590*0Sstevel@tonic-gate #ifdef DEBUG 1591*0Sstevel@tonic-gate printf("tcp_conn_request ###################\n"); 1592*0Sstevel@tonic-gate #endif 1593*0Sstevel@tonic-gate 1594*0Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 1595*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpListenDrop); 1596*0Sstevel@tonic-gate dprintf("tcp_conn_request: listen backlog (max=%d) " 1597*0Sstevel@tonic-gate "overflow (%d pending) on %s", 1598*0Sstevel@tonic-gate tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 1599*0Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1600*0Sstevel@tonic-gate return (NULL); 1601*0Sstevel@tonic-gate } 1602*0Sstevel@tonic-gate 1603*0Sstevel@tonic-gate assert(OK_32PTR(mp->b_rptr)); 1604*0Sstevel@tonic-gate 1605*0Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q0 >= 1606*0Sstevel@tonic-gate tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { 1607*0Sstevel@tonic-gate /* 1608*0Sstevel@tonic-gate * Q0 is full. Drop a pending half-open req from the queue 1609*0Sstevel@tonic-gate * to make room for the new SYN req. Also mark the time we 1610*0Sstevel@tonic-gate * drop a SYN. 
1611*0Sstevel@tonic-gate */ 1612*0Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 1613*0Sstevel@tonic-gate if (!tcp_drop_q0(tcp)) { 1614*0Sstevel@tonic-gate freemsg(mp); 1615*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpListenDropQ0); 1616*0Sstevel@tonic-gate dprintf("tcp_conn_request: listen half-open queue " 1617*0Sstevel@tonic-gate "(max=%d) full (%d pending) on %s", 1618*0Sstevel@tonic-gate tcp_conn_req_max_q0, 1619*0Sstevel@tonic-gate tcp->tcp_conn_req_cnt_q0, 1620*0Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1621*0Sstevel@tonic-gate return (NULL); 1622*0Sstevel@tonic-gate } 1623*0Sstevel@tonic-gate } 1624*0Sstevel@tonic-gate 1625*0Sstevel@tonic-gate ipha = (struct ip *)mp->b_rptr; 1626*0Sstevel@tonic-gate if (IN_CLASSD(ntohl(ipha->ip_src.s_addr)) || 1627*0Sstevel@tonic-gate ipha->ip_src.s_addr == INADDR_BROADCAST || 1628*0Sstevel@tonic-gate ipha->ip_src.s_addr == INADDR_ANY || 1629*0Sstevel@tonic-gate ipha->ip_dst.s_addr == INADDR_BROADCAST) { 1630*0Sstevel@tonic-gate freemsg(mp); 1631*0Sstevel@tonic-gate return (NULL); 1632*0Sstevel@tonic-gate } 1633*0Sstevel@tonic-gate /* 1634*0Sstevel@tonic-gate * We allow the connection to proceed 1635*0Sstevel@tonic-gate * by generating a detached tcp state vector and put it in 1636*0Sstevel@tonic-gate * the eager queue. When an accept happens, it will be 1637*0Sstevel@tonic-gate * dequeued sequentially. 
1638*0Sstevel@tonic-gate */ 1639*0Sstevel@tonic-gate if ((eager = (tcp_t *)bkmem_alloc(sizeof (tcp_t))) == NULL) { 1640*0Sstevel@tonic-gate freemsg(mp); 1641*0Sstevel@tonic-gate errno = ENOBUFS; 1642*0Sstevel@tonic-gate return (NULL); 1643*0Sstevel@tonic-gate } 1644*0Sstevel@tonic-gate if ((errno = tcp_init_values(eager, NULL)) != 0) { 1645*0Sstevel@tonic-gate freemsg(mp); 1646*0Sstevel@tonic-gate bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1647*0Sstevel@tonic-gate return (NULL); 1648*0Sstevel@tonic-gate } 1649*0Sstevel@tonic-gate 1650*0Sstevel@tonic-gate /* 1651*0Sstevel@tonic-gate * Eager connection inherits address form from its listener, 1652*0Sstevel@tonic-gate * but its packet form comes from the version of the received 1653*0Sstevel@tonic-gate * SYN segment. 1654*0Sstevel@tonic-gate */ 1655*0Sstevel@tonic-gate eager->tcp_family = tcp->tcp_family; 1656*0Sstevel@tonic-gate 1657*0Sstevel@tonic-gate err = tcp_accept_comm(tcp, eager, mp, ip_hdr_len); 1658*0Sstevel@tonic-gate if (err) { 1659*0Sstevel@tonic-gate bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1660*0Sstevel@tonic-gate return (NULL); 1661*0Sstevel@tonic-gate } 1662*0Sstevel@tonic-gate 1663*0Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1664*0Sstevel@tonic-gate eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1665*0Sstevel@tonic-gate tcp->tcp_eager_next_q0 = eager; 1666*0Sstevel@tonic-gate eager->tcp_eager_prev_q0 = tcp; 1667*0Sstevel@tonic-gate 1668*0Sstevel@tonic-gate /* Set tcp_listener before adding it to tcp_conn_fanout */ 1669*0Sstevel@tonic-gate eager->tcp_listener = tcp; 1670*0Sstevel@tonic-gate tcp->tcp_conn_req_cnt_q0++; 1671*0Sstevel@tonic-gate 1672*0Sstevel@tonic-gate return (eager); 1673*0Sstevel@tonic-gate } 1674*0Sstevel@tonic-gate 1675*0Sstevel@tonic-gate /* 1676*0Sstevel@tonic-gate * To get around the non-interrupt problem of inetboot. 
/*
 * To get around the non-interrupt problem of inetboot.
 * Keep on processing packets until a certain state is reached or the
 * TCP is destroyed because of getting a RST packet.
 *
 * Returns 0 when the desired state is reached (or the peer closed the
 * connection cleanly), -1 on MAC-level input failure or if the
 * connection died with a recorded socket error.
 */
static int
tcp_state_wait(int sock_id, tcp_t *tcp, int state)
{
	int i;
	struct inetgram *in_gram;
	mblk_t *mp;
	int timeout;
	boolean_t changed = B_FALSE;

	/*
	 * We need to make sure that the MAC does not wait longer
	 * than RTO for any packet so that TCP can do retransmission.
	 * But if the MAC timeout is less than tcp_rto, we are fine
	 * and do not need to change it.
	 */
	timeout = sockets[sock_id].in_timeout;
	if (timeout > tcp->tcp_rto) {
		sockets[sock_id].in_timeout = tcp->tcp_rto;
		changed = B_TRUE;
	}
retry:
	if (sockets[sock_id].inq == NULL) {
		/* Go out and check the wire */
		for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
			if (sockets[sock_id].input[i] != NULL) {
				if (sockets[sock_id].input[i](sock_id) < 0) {
					/* Restore the timeout we changed. */
					if (changed) {
						sockets[sock_id].in_timeout =
						    timeout;
					}
					return (-1);
				}
			}
		}
	}

	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Stop draining once the target state is reached. */
		if (tcp != NULL && tcp->tcp_state == state)
			break;

		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#ifdef DEBUG
			printf("tcp_state_wait for state %d: unexpected "
			    "packet level %d frame found\n", state,
			    in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		/* Detach the mblk before freeing its inetgram wrapper. */
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);

		/*
		 * The other side may have closed this connection or
		 * RST us.  But we need to continue to process other
		 * packets in the socket's queue because they may be
		 * belong to another TCP connections.
		 */
		if (sockets[sock_id].pcb == NULL) {
			tcp = NULL;
		}
	}

	/* If the other side has closed the connection, just return. */
	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
#ifdef DEBUG
		printf("tcp_state_wait other side dead: state %d "
		    "error %d\n", state, sockets[sock_id].so_error);
#endif
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
	/*
	 * TCPS_ALL_ACKED is not a valid TCP state, it is just used as an
	 * indicator to tcp_state_wait to mean that it is being called
	 * to wait till we have received acks for all the new segments sent.
	 */
	if ((state == TCPS_ALL_ACKED) && (tcp->tcp_suna == tcp->tcp_snxt)) {
		goto done;
	}
	if (tcp->tcp_state != state) {
		/* Fire the retransmit timer if the RTO has expired. */
		if (prom_gettime() > tcp->tcp_rto_timeout)
			tcp_timer(tcp, sock_id);
		goto retry;
	}
done:
	/* Restore the MAC timeout if we shortened it above. */
	if (changed)
		sockets[sock_id].in_timeout = timeout;

	tcp_drain_needed(sock_id, tcp);
	return (0);
}
*/ 1779*0Sstevel@tonic-gate static int 1780*0Sstevel@tonic-gate tcp_verify_cksum(mblk_t *mp) 1781*0Sstevel@tonic-gate { 1782*0Sstevel@tonic-gate struct ip *iph; 1783*0Sstevel@tonic-gate tcpha_t *tcph; 1784*0Sstevel@tonic-gate int len; 1785*0Sstevel@tonic-gate uint16_t old_sum; 1786*0Sstevel@tonic-gate 1787*0Sstevel@tonic-gate iph = (struct ip *)mp->b_rptr; 1788*0Sstevel@tonic-gate tcph = (tcpha_t *)(iph + 1); 1789*0Sstevel@tonic-gate len = ntohs(iph->ip_len); 1790*0Sstevel@tonic-gate 1791*0Sstevel@tonic-gate /* 1792*0Sstevel@tonic-gate * Calculate the TCP checksum. Need to include the psuedo header, 1793*0Sstevel@tonic-gate * which is similar to the real IP header starting at the TTL field. 1794*0Sstevel@tonic-gate */ 1795*0Sstevel@tonic-gate iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 1796*0Sstevel@tonic-gate old_sum = tcph->tha_sum; 1797*0Sstevel@tonic-gate tcph->tha_sum = 0; 1798*0Sstevel@tonic-gate iph->ip_ttl = 0; 1799*0Sstevel@tonic-gate if (old_sum == tcp_cksum((uint16_t *)&(iph->ip_ttl), 1800*0Sstevel@tonic-gate len - IP_SIMPLE_HDR_LENGTH + 12)) { 1801*0Sstevel@tonic-gate return (0); 1802*0Sstevel@tonic-gate } else { 1803*0Sstevel@tonic-gate tcp_cksum_errors++; 1804*0Sstevel@tonic-gate return (-1); 1805*0Sstevel@tonic-gate } 1806*0Sstevel@tonic-gate } 1807*0Sstevel@tonic-gate 1808*0Sstevel@tonic-gate /* To find a TCP connection matching the incoming segment. 
*/ 1809*0Sstevel@tonic-gate static tcp_t * 1810*0Sstevel@tonic-gate tcp_lookup_ipv4(struct ip *iph, tcpha_t *tcph, int min_state, int *sock_id) 1811*0Sstevel@tonic-gate { 1812*0Sstevel@tonic-gate int i; 1813*0Sstevel@tonic-gate tcp_t *tcp; 1814*0Sstevel@tonic-gate 1815*0Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1816*0Sstevel@tonic-gate if (sockets[i].type == INETBOOT_STREAM && 1817*0Sstevel@tonic-gate (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1818*0Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1819*0Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1820*0Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1821*0Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source && 1822*0Sstevel@tonic-gate tcp->tcp_state >= min_state) { 1823*0Sstevel@tonic-gate *sock_id = i; 1824*0Sstevel@tonic-gate return (tcp); 1825*0Sstevel@tonic-gate } 1826*0Sstevel@tonic-gate } 1827*0Sstevel@tonic-gate } 1828*0Sstevel@tonic-gate /* Find it in the time wait list. */ 1829*0Sstevel@tonic-gate for (tcp = tcp_time_wait_head; tcp != NULL; 1830*0Sstevel@tonic-gate tcp = tcp->tcp_time_wait_next) { 1831*0Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1832*0Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1833*0Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1834*0Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source && 1835*0Sstevel@tonic-gate tcp->tcp_state >= min_state) { 1836*0Sstevel@tonic-gate *sock_id = -1; 1837*0Sstevel@tonic-gate return (tcp); 1838*0Sstevel@tonic-gate } 1839*0Sstevel@tonic-gate } 1840*0Sstevel@tonic-gate return (NULL); 1841*0Sstevel@tonic-gate } 1842*0Sstevel@tonic-gate 1843*0Sstevel@tonic-gate /* To find a TCP listening connection matching the incoming segment. 
*/ 1844*0Sstevel@tonic-gate static tcp_t * 1845*0Sstevel@tonic-gate tcp_lookup_listener_ipv4(in_addr_t addr, in_port_t port, int *sock_id) 1846*0Sstevel@tonic-gate { 1847*0Sstevel@tonic-gate int i; 1848*0Sstevel@tonic-gate tcp_t *tcp; 1849*0Sstevel@tonic-gate 1850*0Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1851*0Sstevel@tonic-gate if (sockets[i].type == INETBOOT_STREAM && 1852*0Sstevel@tonic-gate (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1853*0Sstevel@tonic-gate if (tcp->tcp_lport == port && 1854*0Sstevel@tonic-gate (tcp->tcp_bound_source == addr || 1855*0Sstevel@tonic-gate tcp->tcp_bound_source == INADDR_ANY)) { 1856*0Sstevel@tonic-gate *sock_id = i; 1857*0Sstevel@tonic-gate return (tcp); 1858*0Sstevel@tonic-gate } 1859*0Sstevel@tonic-gate } 1860*0Sstevel@tonic-gate } 1861*0Sstevel@tonic-gate 1862*0Sstevel@tonic-gate return (NULL); 1863*0Sstevel@tonic-gate } 1864*0Sstevel@tonic-gate 1865*0Sstevel@tonic-gate /* To find a TCP eager matching the incoming segment. */ 1866*0Sstevel@tonic-gate static tcp_t * 1867*0Sstevel@tonic-gate tcp_lookup_eager_ipv4(tcp_t *listener, struct ip *iph, tcpha_t *tcph) 1868*0Sstevel@tonic-gate { 1869*0Sstevel@tonic-gate tcp_t *tcp; 1870*0Sstevel@tonic-gate 1871*0Sstevel@tonic-gate #ifdef DEBUG 1872*0Sstevel@tonic-gate printf("tcp_lookup_eager_ipv4 ###############\n"); 1873*0Sstevel@tonic-gate #endif 1874*0Sstevel@tonic-gate for (tcp = listener->tcp_eager_next_q; tcp != NULL; 1875*0Sstevel@tonic-gate tcp = tcp->tcp_eager_next_q) { 1876*0Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1877*0Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1878*0Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1879*0Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1880*0Sstevel@tonic-gate return (tcp); 1881*0Sstevel@tonic-gate } 1882*0Sstevel@tonic-gate } 1883*0Sstevel@tonic-gate 1884*0Sstevel@tonic-gate for (tcp = listener->tcp_eager_next_q0; tcp != listener; 1885*0Sstevel@tonic-gate tcp = 
/*
 * To destroy a TCP control block.
 * Releases the tcp's internal resources, unhooks it from the TIME_WAIT
 * list if present, clears the owning socket's pcb (recording err as the
 * socket error when nonzero), and frees the control block itself.
 * sock_id may be -1 for a tcp with no associated socket (e.g. one on
 * the TIME_WAIT list).
 */
static void
tcp_clean_death(int sock_id, tcp_t *tcp, int err)
{
	tcp_free(tcp);
	if (tcp->tcp_state == TCPS_TIME_WAIT)
		tcp_time_wait_remove(tcp);

	if (sock_id >= 0) {
		sockets[sock_id].pcb = NULL;
		if (err != 0)
			sockets[sock_id].so_error = err;
	}
	bkmem_free((caddr_t)tcp, sizeof (tcp_t));
}
 *    This is called after tcp_mss_set() when the desired MSS value is known.
 *    This makes sure that our window size is a mutiple of the other side's
 *    MSS.
 * 2) Handling SO_RCVBUF option.
 *
 * It is ASSUMED that the requested size is a multiple of the current MSS.
 *
 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
 * user requests so.
 */
static int
tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
{
	uint32_t mss = tcp->tcp_mss;
	uint32_t old_max_rwnd;
	uint32_t max_transmittable_rwnd;

	/*
	 * Baseline for the no-shrink check: the previously advertised
	 * maximum if one was ever set, else the current window.
	 */
	if (tcp->tcp_rwnd_max != 0)
		old_max_rwnd = tcp->tcp_rwnd_max;
	else
		old_max_rwnd = tcp->tcp_rwnd;

	/*
	 * Insist on a receive window that is at least
	 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
	 * funny TCP interactions of Nagle algorithm, SWS avoidance
	 * and delayed acknowledgement.
	 */
	rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss);

	/*
	 * If window size info has already been exchanged, TCP should not
	 * shrink the window.  Shrinking window is doable if done carefully.
	 * We may add that support later.  But so far there is not a real
	 * need to do that.
	 */
	if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
		/* MSS may have changed, do a round up again. */
		rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
	}

	/*
	 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
	 * can be applied even before the window scale option is decided.
	 */
	max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws;
	if (rwnd > max_transmittable_rwnd) {
		/* Clamp to the largest MSS multiple that still fits. */
		rwnd = max_transmittable_rwnd -
		    (max_transmittable_rwnd % mss);
		if (rwnd < mss)
			rwnd = max_transmittable_rwnd;
		/*
		 * If we're over the limit we may have to back down tcp_rwnd.
		 * The increment below won't work for us.  So we set all three
		 * here and the increment below will have no effect.
		 */
		tcp->tcp_rwnd = old_max_rwnd = rwnd;
	}

	/*
	 * Increment the current rwnd by the amount the maximum grew (we
	 * can not overwrite it since we might be in the middle of a
	 * connection.)
	 */
	tcp->tcp_rwnd += rwnd - old_max_rwnd;
	/* Advertise the (scaled-down) window in the template TCP header. */
	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
	if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
		tcp->tcp_cwnd_max = rwnd;
	tcp->tcp_rwnd_max = rwnd;

	return (rwnd);
}

/*
 * Extract option values from a tcp header.  We put any found values into the
 * tcpopt struct and return a bitmask saying which options were found.
 */
static int
tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
{
	uchar_t	*endp;
	int	len;
	uint32_t	mss;
	uchar_t	*up = (uchar_t *)tcph;
	int	found = 0;
	int32_t	sack_len;
	tcp_seq	sack_begin, sack_end;
	tcp_t	*tcp;

	/* Walk the option bytes between the fixed header and data offset. */
	endp = up + TCP_HDR_LENGTH(tcph);
	up += TCP_MIN_HEADER_LENGTH;
	while (up < endp) {
		len = endp - up;	/* bytes remaining in option space */
		switch (*up) {
		case TCPOPT_EOL:
			break;

		case TCPOPT_NOP:
			up++;
			continue;

		case TCPOPT_MAXSEG:
			/* Malformed length terminates option parsing. */
			if (len < TCPOPT_MAXSEG_LEN ||
			    up[1] != TCPOPT_MAXSEG_LEN)
				break;

			mss = BE16_TO_U16(up+2);
			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
			tcpopt->tcp_opt_mss = mss;
			found |= TCP_OPT_MSS_PRESENT;

			up += TCPOPT_MAXSEG_LEN;
			continue;

		case TCPOPT_WSCALE:
			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
				break;

			/* Shift counts above TCP_MAX_WINSHIFT are clamped. */
			if (up[2] > TCP_MAX_WINSHIFT)
				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
			else
				tcpopt->tcp_opt_wscale = up[2];
			found |= TCP_OPT_WSCALE_PRESENT;

			up += TCPOPT_WS_LEN;
			continue;

		case TCPOPT_SACK_PERMITTED:
			if (len < TCPOPT_SACK_OK_LEN ||
			    up[1] != TCPOPT_SACK_OK_LEN)
				break;
			found |= TCP_OPT_SACK_OK_PRESENT;
			up += TCPOPT_SACK_OK_LEN;
			continue;

		case TCPOPT_SACK:
			if (len <= 2 || up[1] <= 2 || len < up[1])
				break;

			/* If TCP is not interested in SACK blks... */
			if ((tcp = tcpopt->tcp) == NULL) {
				up += up[1];
				continue;
			}
			sack_len = up[1] - TCPOPT_HEADER_LEN;
			up += TCPOPT_HEADER_LEN;

			/*
			 * If the list is empty, allocate one and assume
			 * nothing is sack'ed.
			 */
			assert(tcp->tcp_sack_info != NULL);
			if (tcp->tcp_notsack_list == NULL) {
				tcp_notsack_update(&(tcp->tcp_notsack_list),
				    tcp->tcp_suna, tcp->tcp_snxt,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));

				/*
				 * Make sure tcp_notsack_list is not NULL.
				 * This happens when kmem_alloc(KM_NOSLEEP)
				 * returns NULL.
				 */
				if (tcp->tcp_notsack_list == NULL) {
					up += sack_len;
					continue;
				}
				tcp->tcp_fack = tcp->tcp_suna;
			}

			/* Consume 8-byte (begin, end) SACK block pairs. */
			while (sack_len > 0) {
				if (up + 8 > endp) {
					up = endp;
					break;
				}
				sack_begin = BE32_TO_U32(up);
				up += 4;
				sack_end = BE32_TO_U32(up);
				up += 4;
				sack_len -= 8;
				/*
				 * Bounds checking.  Make sure the SACK
				 * info is within tcp_suna and tcp_snxt.
				 * If this SACK blk is out of bound, ignore
				 * it but continue to parse the following
				 * blks.
				 */
				if (SEQ_LEQ(sack_end, sack_begin) ||
				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
					continue;
				}
				tcp_notsack_insert(&(tcp->tcp_notsack_list),
				    sack_begin, sack_end,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));
				/* Track the forward-most SACKed sequence. */
				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
					tcp->tcp_fack = sack_end;
				}
			}
			found |= TCP_OPT_SACK_PRESENT;
			continue;

		case TCPOPT_TSTAMP:
			if (len < TCPOPT_TSTAMP_LEN ||
			    up[1] != TCPOPT_TSTAMP_LEN)
				break;

			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);

			found |= TCP_OPT_TSTAMP_PRESENT;

			up += TCPOPT_TSTAMP_LEN;
			continue;

		default:
			/* Skip unknown options by their declared length. */
			if (len <= 1 || len < (int)up[1] || up[1] == 0)
				break;
			up += up[1];
			continue;
		}
		break;	/* reached only via a "break" above: stop parsing */
	}
	return (found);
}

/*
 * Set the mss associated with a particular tcp based on its current value,
 * and a new one passed in.
 *  Observe minimums and maximums, and reset
 * other state variables that we want to view as multiples of mss.
 *
 * This function is called in various places mainly because
 * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the
 *    other side's SYN/SYN-ACK packet arrives.
 * 2) PMTUd may get us a new MSS.
 * 3) If the other side stops sending us timestamp option, we need to
 *    increase the MSS size to use the extra bytes available.
 */
static void
tcp_mss_set(tcp_t *tcp, uint32_t mss)
{
	uint32_t mss_max;

	/* IPv4-only boot stack: the upper bound is the IPv4 maximum. */
	mss_max = tcp_mss_max_ipv4;

	/* Clamp the requested MSS into [tcp_mss_min, mss_max]. */
	if (mss < tcp_mss_min)
		mss = tcp_mss_min;
	if (mss > mss_max)
		mss = mss_max;
	/*
	 * Unless naglim has been set by our client to
	 * a non-mss value, force naglim to track mss.
	 * This can help to aggregate small writes.
	 */
	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
		tcp->tcp_naglim = mss;
	/*
	 * TCP should be able to buffer at least 4 MSS data for obvious
	 * performance reason.
	 */
	if ((mss << 2) > tcp->tcp_xmit_hiwater)
		tcp->tcp_xmit_hiwater = mss << 2;
	tcp->tcp_mss = mss;
	/*
	 * Initialize cwnd according to draft-floyd-incr-init-win-01.txt.
	 * Previously, we use tcp_slow_start_initial to control the size
	 * of the initial cwnd.  Now, when tcp_slow_start_initial * mss
	 * is smaller than the cwnd calculated from the formula suggested in
	 * the draft, we use tcp_slow_start_initial * mss as the cwnd.
	 * Otherwise, use the cwnd from the draft's formula.  The default
	 * of tcp_slow_start_initial is 2.
	 */
	tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss,
	    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
	tcp->tcp_cwnd_cnt = 0;	/* restart the cwnd increment counter */
}

/*
 * Process all TCP option in SYN segment.
 *
 * This function sets up the correct tcp_mss value according to the
 * MSS option value and our header size.  It also sets up the window scale
 * and timestamp values, and initialize SACK info blocks.  But it does not
 * change receive window size after setting the tcp_mss value.  The caller
 * should do the appropriate change.
 */
void
tcp_process_options(tcp_t *tcp, tcph_t *tcph)
{
	int options;
	tcp_opt_t tcpopt;
	uint32_t mss_max;
	char *tmp_tcph;

	/* tcpopt.tcp == NULL: not interested in SACK blocks here. */
	tcpopt.tcp = NULL;
	options = tcp_parse_options(tcph, &tcpopt);

	/*
	 * Process MSS option.  Note that MSS option value does not account
	 * for IP or TCP options.
This means that it is equal to MTU - minimum 2225*0Sstevel@tonic-gate * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 2226*0Sstevel@tonic-gate * IPv6. 2227*0Sstevel@tonic-gate */ 2228*0Sstevel@tonic-gate if (!(options & TCP_OPT_MSS_PRESENT)) { 2229*0Sstevel@tonic-gate tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 2230*0Sstevel@tonic-gate } else { 2231*0Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) 2232*0Sstevel@tonic-gate mss_max = tcp_mss_max_ipv4; 2233*0Sstevel@tonic-gate if (tcpopt.tcp_opt_mss < tcp_mss_min) 2234*0Sstevel@tonic-gate tcpopt.tcp_opt_mss = tcp_mss_min; 2235*0Sstevel@tonic-gate else if (tcpopt.tcp_opt_mss > mss_max) 2236*0Sstevel@tonic-gate tcpopt.tcp_opt_mss = mss_max; 2237*0Sstevel@tonic-gate } 2238*0Sstevel@tonic-gate 2239*0Sstevel@tonic-gate /* Process Window Scale option. */ 2240*0Sstevel@tonic-gate if (options & TCP_OPT_WSCALE_PRESENT) { 2241*0Sstevel@tonic-gate tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 2242*0Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_TRUE; 2243*0Sstevel@tonic-gate } else { 2244*0Sstevel@tonic-gate tcp->tcp_snd_ws = B_FALSE; 2245*0Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_FALSE; 2246*0Sstevel@tonic-gate tcp->tcp_rcv_ws = B_FALSE; 2247*0Sstevel@tonic-gate } 2248*0Sstevel@tonic-gate 2249*0Sstevel@tonic-gate /* Process Timestamp option. */ 2250*0Sstevel@tonic-gate if ((options & TCP_OPT_TSTAMP_PRESENT) && 2251*0Sstevel@tonic-gate (tcp->tcp_snd_ts_ok || !tcp->tcp_active_open)) { 2252*0Sstevel@tonic-gate tmp_tcph = (char *)tcp->tcp_tcph; 2253*0Sstevel@tonic-gate 2254*0Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_TRUE; 2255*0Sstevel@tonic-gate tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 2256*0Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 2257*0Sstevel@tonic-gate assert(OK_32PTR(tmp_tcph)); 2258*0Sstevel@tonic-gate assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 2259*0Sstevel@tonic-gate 2260*0Sstevel@tonic-gate /* Fill in our template header with basic timestamp option. 
		 */
		tmp_tcph += tcp->tcp_tcp_hdr_len;
		tmp_tcph[0] = TCPOPT_NOP;
		tmp_tcph[1] = TCPOPT_NOP;
		tmp_tcph[2] = TCPOPT_TSTAMP;
		tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
		/* Timestamp option grows both header lengths by 12 bytes. */
		tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN;
		/* Bump the header's data-offset field by 3 32-bit words. */
		tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4);
	} else {
		tcp->tcp_snd_ts_ok = B_FALSE;
	}

	/*
	 * Process SACK options.  If SACK is enabled for this connection,
	 * then allocate the SACK info structure.
	 */
	if ((options & TCP_OPT_SACK_OK_PRESENT) &&
	    (tcp->tcp_snd_sack_ok ||
	    (tcp_sack_permitted != 0 && !tcp->tcp_active_open))) {
		/* This should be true only in the passive case. */
		if (tcp->tcp_sack_info == NULL) {
			tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc(
			    sizeof (tcp_sack_info_t));
		}
		if (tcp->tcp_sack_info == NULL) {
			/* Allocation failed: run without SACK. */
			tcp->tcp_snd_sack_ok = B_FALSE;
		} else {
			tcp->tcp_snd_sack_ok = B_TRUE;
			/*
			 * The timestamp option consumes option space,
			 * leaving room for at most 3 SACK blocks;
			 * otherwise 4 fit.
			 */
			if (tcp->tcp_snd_ts_ok) {
				tcp->tcp_max_sack_blk = 3;
			} else {
				tcp->tcp_max_sack_blk = 4;
			}
		}
	} else {
		/*
		 * Resetting tcp_snd_sack_ok to B_FALSE so that
		 * no SACK info will be used for this
		 * connection.  This assumes that SACK usage
		 * permission is negotiated.  This may need
		 * to be changed once this is clarified.
		 */
		if (tcp->tcp_sack_info != NULL) {
			bkmem_free((caddr_t)tcp->tcp_sack_info,
			    sizeof (tcp_sack_info_t));
			tcp->tcp_sack_info = NULL;
		}
		tcp->tcp_snd_sack_ok = B_FALSE;
	}

	/*
	 * Now we know the exact TCP/IP header length, subtract
	 * that from tcp_mss to get our side's MSS.
	 */
	tcp->tcp_mss -= tcp->tcp_hdr_len;
	/*
	 * Here we assume that the other side's header size will be equal to
	 * our header size.  We calculate the real MSS accordingly.  Need to
	 * take into additional stuffs IPsec puts in.
	 *
	 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
	 */
	tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len -
	    (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH);

	/*
	 * Set MSS to the smaller one of both ends of the connection.
	 * We should not have called tcp_mss_set() before, but our
	 * side of the MSS should have been set to a proper value
	 * by tcp_adapt_ire().  tcp_mss_set() will also set up the
	 * STREAM head parameters properly.
	 *
	 * If we have a larger-than-16-bit window but the other side
	 * didn't want to do window scale, tcp_rwnd_set() will take
	 * care of that.
	 */
	tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
}

/*
 * This function does PAWS protection check.  Returns B_TRUE if the
 * segment passes the PAWS test, else returns B_FALSE.
 */
boolean_t
tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
{
	uint8_t	flags;
	int	options;
	uint8_t *up;

	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
	/*
	 * If timestamp option is aligned nicely, get values inline,
	 * otherwise call general routine to parse.
	 *  Only do that
	 * if timestamp is the only option.
	 */
	if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH +
	    TCPOPT_REAL_TS_LEN &&
	    OK_32PTR((up = ((uint8_t *)tcph) +
	    TCP_MIN_HEADER_LENGTH)) &&
	    *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
		/* Fast path: NOP,NOP,TSTAMP layout — read values directly. */
		tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
		tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));

		options = TCP_OPT_TSTAMP_PRESENT;
	} else {
		/* Ask the parser to collect SACK blocks only if we use SACK. */
		if (tcp->tcp_snd_sack_ok) {
			tcpoptp->tcp = tcp;
		} else {
			tcpoptp->tcp = NULL;
		}
		options = tcp_parse_options(tcph, tcpoptp);
	}

	if (options & TCP_OPT_TSTAMP_PRESENT) {
		/*
		 * Do PAWS per RFC 1323 section 4.2.  Accept RST
		 * regardless of the timestamp, page 18 RFC 1323.bis.
		 */
		if ((flags & TH_RST) == 0 &&
		    TSTMP_LT(tcpoptp->tcp_opt_ts_val,
		    tcp->tcp_ts_recent)) {
			if (TSTMP_LT(prom_gettime(),
			    tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
				/* This segment is not acceptable. */
				return (B_FALSE);
			} else {
				/*
				 * Connection has been idle for
				 * too long.  Reset the timestamp
				 * and assume the segment is valid.
				 */
				tcp->tcp_ts_recent =
				    tcpoptp->tcp_opt_ts_val;
			}
		}
	} else {
		/*
		 * If we don't get a timestamp on every packet, we
		 * figure we can't really trust 'em, so we stop sending
		 * and parsing them.
		 */
		tcp->tcp_snd_ts_ok = B_FALSE;

		/* Drop the timestamp option from our template header. */
		tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4);
		/* The freed 12 bytes go back into the usable MSS. */
		tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN);
		if (tcp->tcp_snd_sack_ok) {
			assert(tcp->tcp_sack_info != NULL);
			/* Without timestamps, 4 SACK blocks fit. */
			tcp->tcp_max_sack_blk = 4;
		}
	}
	return (B_TRUE);
}

/*
 * tcp_get_seg_mp() is called to get the pointer to a segment in the
 * send queue which starts at the given seq. no.
 *
 * Parameters:
 *	tcp_t *tcp: the tcp instance pointer.
 *	uint32_t seq: the starting seq. no of the requested segment.
 *	int32_t *off: after the execution, *off will be the offset to
 *		the returned mblk which points to the requested seq no.
 *
 * Return:
 *	A mblk_t pointer pointing to the requested segment in send queue.
2429*0Sstevel@tonic-gate */ 2430*0Sstevel@tonic-gate static mblk_t * 2431*0Sstevel@tonic-gate tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 2432*0Sstevel@tonic-gate { 2433*0Sstevel@tonic-gate int32_t cnt; 2434*0Sstevel@tonic-gate mblk_t *mp; 2435*0Sstevel@tonic-gate 2436*0Sstevel@tonic-gate /* Defensive coding. Make sure we don't send incorrect data. */ 2437*0Sstevel@tonic-gate if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt) || 2438*0Sstevel@tonic-gate off == NULL) { 2439*0Sstevel@tonic-gate return (NULL); 2440*0Sstevel@tonic-gate } 2441*0Sstevel@tonic-gate cnt = seq - tcp->tcp_suna; 2442*0Sstevel@tonic-gate mp = tcp->tcp_xmit_head; 2443*0Sstevel@tonic-gate while (cnt > 0 && mp) { 2444*0Sstevel@tonic-gate cnt -= mp->b_wptr - mp->b_rptr; 2445*0Sstevel@tonic-gate if (cnt < 0) { 2446*0Sstevel@tonic-gate cnt += mp->b_wptr - mp->b_rptr; 2447*0Sstevel@tonic-gate break; 2448*0Sstevel@tonic-gate } 2449*0Sstevel@tonic-gate mp = mp->b_cont; 2450*0Sstevel@tonic-gate } 2451*0Sstevel@tonic-gate assert(mp != NULL); 2452*0Sstevel@tonic-gate *off = cnt; 2453*0Sstevel@tonic-gate return (mp); 2454*0Sstevel@tonic-gate } 2455*0Sstevel@tonic-gate 2456*0Sstevel@tonic-gate /* 2457*0Sstevel@tonic-gate * This function handles all retransmissions if SACK is enabled for this 2458*0Sstevel@tonic-gate * connection. First it calculates how many segments can be retransmitted 2459*0Sstevel@tonic-gate * based on tcp_pipe. Then it goes thru the notsack list to find eligible 2460*0Sstevel@tonic-gate * segments. A segment is eligible if sack_cnt for that segment is greater 2461*0Sstevel@tonic-gate * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 2462*0Sstevel@tonic-gate * all eligible segments, it checks to see if TCP can send some new segments 2463*0Sstevel@tonic-gate * (fast recovery). If it can, it returns 1. Otherwise it returns 0. 
 *
 * Parameters:
 *	tcp_t *tcp: the tcp structure of the connection.
 *
 * Return:
 *	1 if the pipe is not full (new data can be sent), 0 otherwise
 */
static int32_t
tcp_sack_rxmit(tcp_t *tcp, int sock_id)
{
	notsack_blk_t	*notsack_blk;
	int32_t		usable_swnd;
	int32_t		mss;
	uint32_t	seg_len;
	mblk_t		*xmit_mp;

	assert(tcp->tcp_sack_info != NULL);
	assert(tcp->tcp_notsack_list != NULL);
	assert(tcp->tcp_rexmit == B_FALSE);

	/* Defensive coding in case there is a bug... */
	if (tcp->tcp_notsack_list == NULL) {
		return (0);
	}
	notsack_blk = tcp->tcp_notsack_list;
	mss = tcp->tcp_mss;

	/*
	 * Limit the num of outstanding data in the network to be
	 * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
	 */
	usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;

	/* At least retransmit 1 MSS of data. */
	if (usable_swnd <= 0) {
		usable_swnd = mss;
	}

	/* Make sure no new RTT samples will be taken. */
	tcp->tcp_csuna = tcp->tcp_snxt;

	notsack_blk = tcp->tcp_notsack_list;
	while (usable_swnd > 0) {
		mblk_t		*snxt_mp, *tmp_mp;
		tcp_seq		begin = tcp->tcp_sack_snxt;
		tcp_seq		end;
		int32_t		off;

		/*
		 * Find the next hole at or beyond tcp_sack_snxt that has
		 * been SACKed past often enough to be considered lost.
		 */
		for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
			if (SEQ_GT(notsack_blk->end, begin) &&
			    (notsack_blk->sack_cnt >=
			    tcp_dupack_fast_retransmit)) {
				end = notsack_blk->end;
				if (SEQ_LT(begin, notsack_blk->begin)) {
					begin = notsack_blk->begin;
				}
				break;
			}
		}
		/*
		 * All holes are filled.  Manipulate tcp_cwnd to send more
		 * if we can.  Note that after the SACK recovery, tcp_cwnd is
		 * set to tcp_cwnd_ssthresh.
		 */
		if (notsack_blk == NULL) {
			usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
			if (usable_swnd <= 0) {
				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
				assert(tcp->tcp_cwnd > 0);
				return (0);
			} else {
				usable_swnd = usable_swnd / mss;
				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
				    MAX(usable_swnd * mss, mss);
				return (1);
			}
		}

		/*
		 * Note that we may send more than usable_swnd allows here
		 * because of round off, but no more than 1 MSS of data.
		 */
		seg_len = end - begin;
		if (seg_len > mss)
			seg_len = mss;
		snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
		assert(snxt_mp != NULL);
		/* This should not happen.  Defensive coding again... */
		if (snxt_mp == NULL) {
			return (0);
		}

		xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
		    &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);

		if (xmit_mp == NULL)
			return (0);

		usable_swnd -= seg_len;
		tcp->tcp_pipe += seg_len;
		tcp->tcp_sack_snxt = begin + seg_len;
		TCP_DUMP_PACKET("tcp_sack_rxmit", xmit_mp);
		(void) ipv4_tcp_output(sock_id, xmit_mp);
		freeb(xmit_mp);

		/*
		 * Update the send timestamp to avoid false retransmission.
		 */
		snxt_mp->b_prev = (mblk_t *)prom_gettime();

		BUMP_MIB(tcp_mib.tcpRetransSegs);
		UPDATE_MIB(tcp_mib.tcpRetransBytes, seg_len);
		BUMP_MIB(tcp_mib.tcpOutSackRetransSegs);
		/*
		 * Update tcp_rexmit_max to extend this SACK recovery phase.
		 * This happens when new data sent during fast recovery is
		 * also lost.  If TCP retransmits those new data, it needs
		 * to extend SACK recover phase to avoid starting another
		 * fast retransmit/recovery unnecessarily.
		 */
		if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
			tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
		}
	}
	return (0);
}

/*
 * Main receive-side segment processing for an established or connecting
 * endpoint.  (Definition continues beyond this chunk.)
 */
static void
tcp_rput_data(tcp_t *tcp, mblk_t *mp, int sock_id)
{
	uchar_t		*rptr;
	struct ip	*iph;
	tcp_t		*tcp1;
	tcpha_t		*tcph;
	uint32_t	seg_ack;
	int		seg_len;
	uint_t		ip_hdr_len;
	uint32_t	seg_seq;
	mblk_t		*mp1;
	uint_t		flags;
	uint32_t	new_swnd = 0;
	int		mss;
	boolean_t	ofo_seg = B_FALSE;	/* Out of order segment */
	int32_t		gap;
	int32_t		rgap;
	tcp_opt_t	tcpopt;
	int32_t		bytes_acked;
	int		npkt;
	uint32_t	cwnd;
	uint32_t	add;

#ifdef DEBUG
	printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n",
	    sock_id, mp, mp->b_datap);
#endif

	/* Dump the packet when debugging.
*/ 2621*0Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_rput_data", mp); 2622*0Sstevel@tonic-gate 2623*0Sstevel@tonic-gate assert(OK_32PTR(mp->b_rptr)); 2624*0Sstevel@tonic-gate 2625*0Sstevel@tonic-gate rptr = mp->b_rptr; 2626*0Sstevel@tonic-gate iph = (struct ip *)rptr; 2627*0Sstevel@tonic-gate ip_hdr_len = IPH_HDR_LENGTH(rptr); 2628*0Sstevel@tonic-gate if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) { 2629*0Sstevel@tonic-gate #ifdef DEBUG 2630*0Sstevel@tonic-gate printf("Not simple IP header\n"); 2631*0Sstevel@tonic-gate #endif 2632*0Sstevel@tonic-gate /* We cannot handle IP option yet... */ 2633*0Sstevel@tonic-gate tcp_drops++; 2634*0Sstevel@tonic-gate freeb(mp); 2635*0Sstevel@tonic-gate return; 2636*0Sstevel@tonic-gate } 2637*0Sstevel@tonic-gate /* The TCP header must be aligned. */ 2638*0Sstevel@tonic-gate tcph = (tcpha_t *)&rptr[ip_hdr_len]; 2639*0Sstevel@tonic-gate seg_seq = ntohl(tcph->tha_seq); 2640*0Sstevel@tonic-gate seg_ack = ntohl(tcph->tha_ack); 2641*0Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 2642*0Sstevel@tonic-gate seg_len = (int)(mp->b_wptr - rptr) - 2643*0Sstevel@tonic-gate (ip_hdr_len + TCP_HDR_LENGTH(((tcph_t *)tcph))); 2644*0Sstevel@tonic-gate /* In inetboot, b_cont should always be NULL. */ 2645*0Sstevel@tonic-gate assert(mp->b_cont == NULL); 2646*0Sstevel@tonic-gate 2647*0Sstevel@tonic-gate /* Verify the checksum. */ 2648*0Sstevel@tonic-gate if (tcp_verify_cksum(mp) < 0) { 2649*0Sstevel@tonic-gate #ifdef DEBUG 2650*0Sstevel@tonic-gate printf("tcp_rput_data: wrong cksum\n"); 2651*0Sstevel@tonic-gate #endif 2652*0Sstevel@tonic-gate freemsg(mp); 2653*0Sstevel@tonic-gate return; 2654*0Sstevel@tonic-gate } 2655*0Sstevel@tonic-gate 2656*0Sstevel@tonic-gate /* 2657*0Sstevel@tonic-gate * This segment is not for us, try to find its 2658*0Sstevel@tonic-gate * intended receiver. 
2659*0Sstevel@tonic-gate */ 2660*0Sstevel@tonic-gate if (tcp == NULL || 2661*0Sstevel@tonic-gate tcph->tha_lport != tcp->tcp_fport || 2662*0Sstevel@tonic-gate tcph->tha_fport != tcp->tcp_lport || 2663*0Sstevel@tonic-gate iph->ip_src.s_addr != tcp->tcp_remote || 2664*0Sstevel@tonic-gate iph->ip_dst.s_addr != tcp->tcp_bound_source) { 2665*0Sstevel@tonic-gate #ifdef DEBUG 2666*0Sstevel@tonic-gate printf("tcp_rput_data: not for us, state %d\n", 2667*0Sstevel@tonic-gate tcp->tcp_state); 2668*0Sstevel@tonic-gate #endif 2669*0Sstevel@tonic-gate /* 2670*0Sstevel@tonic-gate * First try to find a established connection. If none 2671*0Sstevel@tonic-gate * is found, look for a listener. 2672*0Sstevel@tonic-gate * 2673*0Sstevel@tonic-gate * If a listener is found, we need to check to see if the 2674*0Sstevel@tonic-gate * incoming segment is for one of its eagers. If it is, 2675*0Sstevel@tonic-gate * give it to the eager. If not, listener should take care 2676*0Sstevel@tonic-gate * of it. 2677*0Sstevel@tonic-gate */ 2678*0Sstevel@tonic-gate if ((tcp1 = tcp_lookup_ipv4(iph, tcph, TCPS_SYN_SENT, 2679*0Sstevel@tonic-gate &sock_id)) != NULL || 2680*0Sstevel@tonic-gate (tcp1 = tcp_lookup_listener_ipv4(iph->ip_dst.s_addr, 2681*0Sstevel@tonic-gate tcph->tha_fport, &sock_id)) != NULL) { 2682*0Sstevel@tonic-gate if (tcp1->tcp_state == TCPS_LISTEN) { 2683*0Sstevel@tonic-gate if ((tcp = tcp_lookup_eager_ipv4(tcp1, 2684*0Sstevel@tonic-gate iph, tcph)) == NULL) { 2685*0Sstevel@tonic-gate /* No eager... 
sent to listener */ 2686*0Sstevel@tonic-gate #ifdef DEBUG 2687*0Sstevel@tonic-gate printf("found the listener: %s\n", 2688*0Sstevel@tonic-gate tcp_display(tcp1, NULL, 2689*0Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 2690*0Sstevel@tonic-gate #endif 2691*0Sstevel@tonic-gate tcp = tcp1; 2692*0Sstevel@tonic-gate } 2693*0Sstevel@tonic-gate #ifdef DEBUG 2694*0Sstevel@tonic-gate else { 2695*0Sstevel@tonic-gate printf("found the eager: %s\n", 2696*0Sstevel@tonic-gate tcp_display(tcp, NULL, 2697*0Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 2698*0Sstevel@tonic-gate } 2699*0Sstevel@tonic-gate #endif 2700*0Sstevel@tonic-gate } else { 2701*0Sstevel@tonic-gate /* Non listener found... */ 2702*0Sstevel@tonic-gate #ifdef DEBUG 2703*0Sstevel@tonic-gate printf("found the connection: %s\n", 2704*0Sstevel@tonic-gate tcp_display(tcp1, NULL, 2705*0Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 2706*0Sstevel@tonic-gate #endif 2707*0Sstevel@tonic-gate tcp = tcp1; 2708*0Sstevel@tonic-gate } 2709*0Sstevel@tonic-gate } else { 2710*0Sstevel@tonic-gate /* 2711*0Sstevel@tonic-gate * No connection for this segment... 2712*0Sstevel@tonic-gate * Send a RST to the other side. 2713*0Sstevel@tonic-gate */ 2714*0Sstevel@tonic-gate tcp_xmit_listeners_reset(sock_id, mp, ip_hdr_len); 2715*0Sstevel@tonic-gate return; 2716*0Sstevel@tonic-gate } 2717*0Sstevel@tonic-gate } 2718*0Sstevel@tonic-gate 2719*0Sstevel@tonic-gate flags = tcph->tha_flags & 0xFF; 2720*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInSegs); 2721*0Sstevel@tonic-gate if (tcp->tcp_state == TCPS_TIME_WAIT) { 2722*0Sstevel@tonic-gate tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 2723*0Sstevel@tonic-gate seg_len, (tcph_t *)tcph, sock_id); 2724*0Sstevel@tonic-gate return; 2725*0Sstevel@tonic-gate } 2726*0Sstevel@tonic-gate /* 2727*0Sstevel@tonic-gate * From this point we can assume that the tcp is not compressed, 2728*0Sstevel@tonic-gate * since we would have branched off to tcp_time_wait_processing() 2729*0Sstevel@tonic-gate * in such a case. 
2730*0Sstevel@tonic-gate */ 2731*0Sstevel@tonic-gate assert(tcp != NULL && tcp->tcp_state != TCPS_TIME_WAIT); 2732*0Sstevel@tonic-gate 2733*0Sstevel@tonic-gate /* 2734*0Sstevel@tonic-gate * After this point, we know we have the correct TCP, so update 2735*0Sstevel@tonic-gate * the receive time. 2736*0Sstevel@tonic-gate */ 2737*0Sstevel@tonic-gate tcp->tcp_last_recv_time = prom_gettime(); 2738*0Sstevel@tonic-gate 2739*0Sstevel@tonic-gate /* In inetboot, we do not handle urgent pointer... */ 2740*0Sstevel@tonic-gate if (flags & TH_URG) { 2741*0Sstevel@tonic-gate freemsg(mp); 2742*0Sstevel@tonic-gate DEBUG_1("tcp_rput_data(%d): received segment with urgent " 2743*0Sstevel@tonic-gate "pointer\n", sock_id); 2744*0Sstevel@tonic-gate tcp_drops++; 2745*0Sstevel@tonic-gate return; 2746*0Sstevel@tonic-gate } 2747*0Sstevel@tonic-gate 2748*0Sstevel@tonic-gate switch (tcp->tcp_state) { 2749*0Sstevel@tonic-gate case TCPS_LISTEN: 2750*0Sstevel@tonic-gate if ((flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { 2751*0Sstevel@tonic-gate if (flags & TH_RST) { 2752*0Sstevel@tonic-gate freemsg(mp); 2753*0Sstevel@tonic-gate return; 2754*0Sstevel@tonic-gate } 2755*0Sstevel@tonic-gate if (flags & TH_ACK) { 2756*0Sstevel@tonic-gate tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK", 2757*0Sstevel@tonic-gate sock_id, mp, seg_ack, 0, TH_RST, 2758*0Sstevel@tonic-gate ip_hdr_len); 2759*0Sstevel@tonic-gate return; 2760*0Sstevel@tonic-gate } 2761*0Sstevel@tonic-gate if (!(flags & TH_SYN)) { 2762*0Sstevel@tonic-gate freemsg(mp); 2763*0Sstevel@tonic-gate return; 2764*0Sstevel@tonic-gate } 2765*0Sstevel@tonic-gate printf("tcp_rput_data: %d\n", __LINE__); 2766*0Sstevel@tonic-gate prom_panic("inetboot"); 2767*0Sstevel@tonic-gate } 2768*0Sstevel@tonic-gate if (tcp->tcp_conn_req_max > 0) { 2769*0Sstevel@tonic-gate tcp = tcp_conn_request(tcp, mp, sock_id, ip_hdr_len); 2770*0Sstevel@tonic-gate if (tcp == NULL) { 2771*0Sstevel@tonic-gate freemsg(mp); 2772*0Sstevel@tonic-gate return; 2773*0Sstevel@tonic-gate } 
2774*0Sstevel@tonic-gate #ifdef DEBUG 2775*0Sstevel@tonic-gate printf("tcp_rput_data: new tcp created\n"); 2776*0Sstevel@tonic-gate #endif 2777*0Sstevel@tonic-gate } 2778*0Sstevel@tonic-gate tcp->tcp_irs = seg_seq; 2779*0Sstevel@tonic-gate tcp->tcp_rack = seg_seq; 2780*0Sstevel@tonic-gate tcp->tcp_rnxt = seg_seq + 1; 2781*0Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2782*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpPassiveOpens); 2783*0Sstevel@tonic-gate goto syn_rcvd; 2784*0Sstevel@tonic-gate case TCPS_SYN_SENT: 2785*0Sstevel@tonic-gate if (flags & TH_ACK) { 2786*0Sstevel@tonic-gate /* 2787*0Sstevel@tonic-gate * Note that our stack cannot send data before a 2788*0Sstevel@tonic-gate * connection is established, therefore the 2789*0Sstevel@tonic-gate * following check is valid. Otherwise, it has 2790*0Sstevel@tonic-gate * to be changed. 2791*0Sstevel@tonic-gate */ 2792*0Sstevel@tonic-gate if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2793*0Sstevel@tonic-gate SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2794*0Sstevel@tonic-gate if (flags & TH_RST) { 2795*0Sstevel@tonic-gate freemsg(mp); 2796*0Sstevel@tonic-gate return; 2797*0Sstevel@tonic-gate } 2798*0Sstevel@tonic-gate tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2799*0Sstevel@tonic-gate tcp, mp, seg_ack, 0, TH_RST, 2800*0Sstevel@tonic-gate ip_hdr_len, sock_id); 2801*0Sstevel@tonic-gate return; 2802*0Sstevel@tonic-gate } 2803*0Sstevel@tonic-gate assert(tcp->tcp_suna + 1 == seg_ack); 2804*0Sstevel@tonic-gate } 2805*0Sstevel@tonic-gate if (flags & TH_RST) { 2806*0Sstevel@tonic-gate freemsg(mp); 2807*0Sstevel@tonic-gate if (flags & TH_ACK) { 2808*0Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, ECONNREFUSED); 2809*0Sstevel@tonic-gate } 2810*0Sstevel@tonic-gate return; 2811*0Sstevel@tonic-gate } 2812*0Sstevel@tonic-gate if (!(flags & TH_SYN)) { 2813*0Sstevel@tonic-gate freemsg(mp); 2814*0Sstevel@tonic-gate return; 2815*0Sstevel@tonic-gate } 2816*0Sstevel@tonic-gate 2817*0Sstevel@tonic-gate /* Process all TCP options. 
*/ 2818*0Sstevel@tonic-gate tcp_process_options(tcp, (tcph_t *)tcph); 2819*0Sstevel@tonic-gate /* 2820*0Sstevel@tonic-gate * The following changes our rwnd to be a multiple of the 2821*0Sstevel@tonic-gate * MIN(peer MSS, our MSS) for performance reason. 2822*0Sstevel@tonic-gate */ 2823*0Sstevel@tonic-gate (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rwnd, 2824*0Sstevel@tonic-gate tcp->tcp_mss)); 2825*0Sstevel@tonic-gate 2826*0Sstevel@tonic-gate /* Is the other end ECN capable? */ 2827*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 2828*0Sstevel@tonic-gate if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2829*0Sstevel@tonic-gate tcp->tcp_ecn_ok = B_FALSE; 2830*0Sstevel@tonic-gate } 2831*0Sstevel@tonic-gate } 2832*0Sstevel@tonic-gate /* 2833*0Sstevel@tonic-gate * Clear ECN flags because it may interfere with later 2834*0Sstevel@tonic-gate * processing. 2835*0Sstevel@tonic-gate */ 2836*0Sstevel@tonic-gate flags &= ~(TH_ECE|TH_CWR); 2837*0Sstevel@tonic-gate 2838*0Sstevel@tonic-gate tcp->tcp_irs = seg_seq; 2839*0Sstevel@tonic-gate tcp->tcp_rack = seg_seq; 2840*0Sstevel@tonic-gate tcp->tcp_rnxt = seg_seq + 1; 2841*0Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2842*0Sstevel@tonic-gate 2843*0Sstevel@tonic-gate if (flags & TH_ACK) { 2844*0Sstevel@tonic-gate /* One for the SYN */ 2845*0Sstevel@tonic-gate tcp->tcp_suna = tcp->tcp_iss + 1; 2846*0Sstevel@tonic-gate tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2847*0Sstevel@tonic-gate tcp->tcp_state = TCPS_ESTABLISHED; 2848*0Sstevel@tonic-gate 2849*0Sstevel@tonic-gate /* 2850*0Sstevel@tonic-gate * If SYN was retransmitted, need to reset all 2851*0Sstevel@tonic-gate * retransmission info. This is because this 2852*0Sstevel@tonic-gate * segment will be treated as a dup ACK. 
2853*0Sstevel@tonic-gate */ 2854*0Sstevel@tonic-gate if (tcp->tcp_rexmit) { 2855*0Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 2856*0Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2857*0Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_snxt; 2858*0Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 2859*0Sstevel@tonic-gate 2860*0Sstevel@tonic-gate /* 2861*0Sstevel@tonic-gate * Set tcp_cwnd back to 1 MSS, per 2862*0Sstevel@tonic-gate * recommendation from 2863*0Sstevel@tonic-gate * draft-floyd-incr-init-win-01.txt, 2864*0Sstevel@tonic-gate * Increasing TCP's Initial Window. 2865*0Sstevel@tonic-gate */ 2866*0Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_mss; 2867*0Sstevel@tonic-gate } 2868*0Sstevel@tonic-gate 2869*0Sstevel@tonic-gate tcp->tcp_swl1 = seg_seq; 2870*0Sstevel@tonic-gate tcp->tcp_swl2 = seg_ack; 2871*0Sstevel@tonic-gate 2872*0Sstevel@tonic-gate new_swnd = BE16_TO_U16(((tcph_t *)tcph)->th_win); 2873*0Sstevel@tonic-gate tcp->tcp_swnd = new_swnd; 2874*0Sstevel@tonic-gate if (new_swnd > tcp->tcp_max_swnd) 2875*0Sstevel@tonic-gate tcp->tcp_max_swnd = new_swnd; 2876*0Sstevel@tonic-gate 2877*0Sstevel@tonic-gate /* 2878*0Sstevel@tonic-gate * Always send the three-way handshake ack immediately 2879*0Sstevel@tonic-gate * in order to make the connection complete as soon as 2880*0Sstevel@tonic-gate * possible on the accepting host. 2881*0Sstevel@tonic-gate */ 2882*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 2883*0Sstevel@tonic-gate /* 2884*0Sstevel@tonic-gate * Check to see if there is data to be sent. If 2885*0Sstevel@tonic-gate * yes, set the transmit flag. Then check to see 2886*0Sstevel@tonic-gate * if received data processing needs to be done. 2887*0Sstevel@tonic-gate * If not, go straight to xmit_check. This short 2888*0Sstevel@tonic-gate * cut is OK as we don't support T/TCP. 
2889*0Sstevel@tonic-gate */ 2890*0Sstevel@tonic-gate if (tcp->tcp_unsent) 2891*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 2892*0Sstevel@tonic-gate 2893*0Sstevel@tonic-gate if (seg_len == 0) { 2894*0Sstevel@tonic-gate freemsg(mp); 2895*0Sstevel@tonic-gate goto xmit_check; 2896*0Sstevel@tonic-gate } 2897*0Sstevel@tonic-gate 2898*0Sstevel@tonic-gate flags &= ~TH_SYN; 2899*0Sstevel@tonic-gate seg_seq++; 2900*0Sstevel@tonic-gate break; 2901*0Sstevel@tonic-gate } 2902*0Sstevel@tonic-gate syn_rcvd: 2903*0Sstevel@tonic-gate tcp->tcp_state = TCPS_SYN_RCVD; 2904*0Sstevel@tonic-gate mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 2905*0Sstevel@tonic-gate NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 2906*0Sstevel@tonic-gate if (mp1 != NULL) { 2907*0Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_rput_data replying SYN", mp1); 2908*0Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp1); 2909*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2910*0Sstevel@tonic-gate freeb(mp1); 2911*0Sstevel@tonic-gate /* 2912*0Sstevel@tonic-gate * Let's wait till our SYN has been ACKED since we 2913*0Sstevel@tonic-gate * don't have a timer. 2914*0Sstevel@tonic-gate */ 2915*0Sstevel@tonic-gate if (tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED) < 0) { 2916*0Sstevel@tonic-gate freemsg(mp); 2917*0Sstevel@tonic-gate return; 2918*0Sstevel@tonic-gate } 2919*0Sstevel@tonic-gate } 2920*0Sstevel@tonic-gate freemsg(mp); 2921*0Sstevel@tonic-gate return; 2922*0Sstevel@tonic-gate default: 2923*0Sstevel@tonic-gate break; 2924*0Sstevel@tonic-gate } 2925*0Sstevel@tonic-gate mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH((tcph_t *)tcph); 2926*0Sstevel@tonic-gate new_swnd = ntohs(tcph->tha_win) << 2927*0Sstevel@tonic-gate ((flags & TH_SYN) ? 
0 : tcp->tcp_snd_ws); 2928*0Sstevel@tonic-gate mss = tcp->tcp_mss; 2929*0Sstevel@tonic-gate 2930*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 2931*0Sstevel@tonic-gate if (!tcp_paws_check(tcp, (tcph_t *)tcph, &tcpopt)) { 2932*0Sstevel@tonic-gate /* 2933*0Sstevel@tonic-gate * This segment is not acceptable. 2934*0Sstevel@tonic-gate * Drop it and send back an ACK. 2935*0Sstevel@tonic-gate */ 2936*0Sstevel@tonic-gate freemsg(mp); 2937*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 2938*0Sstevel@tonic-gate goto ack_check; 2939*0Sstevel@tonic-gate } 2940*0Sstevel@tonic-gate } else if (tcp->tcp_snd_sack_ok) { 2941*0Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 2942*0Sstevel@tonic-gate tcpopt.tcp = tcp; 2943*0Sstevel@tonic-gate /* 2944*0Sstevel@tonic-gate * SACK info in already updated in tcp_parse_options. Ignore 2945*0Sstevel@tonic-gate * all other TCP options... 2946*0Sstevel@tonic-gate */ 2947*0Sstevel@tonic-gate (void) tcp_parse_options((tcph_t *)tcph, &tcpopt); 2948*0Sstevel@tonic-gate } 2949*0Sstevel@tonic-gate try_again:; 2950*0Sstevel@tonic-gate gap = seg_seq - tcp->tcp_rnxt; 2951*0Sstevel@tonic-gate rgap = tcp->tcp_rwnd - (gap + seg_len); 2952*0Sstevel@tonic-gate /* 2953*0Sstevel@tonic-gate * gap is the amount of sequence space between what we expect to see 2954*0Sstevel@tonic-gate * and what we got for seg_seq. A positive value for gap means 2955*0Sstevel@tonic-gate * something got lost. A negative value means we got some old stuff. 2956*0Sstevel@tonic-gate */ 2957*0Sstevel@tonic-gate if (gap < 0) { 2958*0Sstevel@tonic-gate /* Old stuff present. Is the SYN in there? */ 2959*0Sstevel@tonic-gate if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 2960*0Sstevel@tonic-gate (seg_len != 0)) { 2961*0Sstevel@tonic-gate flags &= ~TH_SYN; 2962*0Sstevel@tonic-gate seg_seq++; 2963*0Sstevel@tonic-gate /* Recompute the gaps after noting the SYN. 
*/ 2964*0Sstevel@tonic-gate goto try_again; 2965*0Sstevel@tonic-gate } 2966*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataDupSegs); 2967*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 2968*0Sstevel@tonic-gate (seg_len > -gap ? -gap : seg_len)); 2969*0Sstevel@tonic-gate /* Remove the old stuff from seg_len. */ 2970*0Sstevel@tonic-gate seg_len += gap; 2971*0Sstevel@tonic-gate /* 2972*0Sstevel@tonic-gate * Anything left? 2973*0Sstevel@tonic-gate * Make sure to check for unack'd FIN when rest of data 2974*0Sstevel@tonic-gate * has been previously ack'd. 2975*0Sstevel@tonic-gate */ 2976*0Sstevel@tonic-gate if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 2977*0Sstevel@tonic-gate /* 2978*0Sstevel@tonic-gate * Resets are only valid if they lie within our offered 2979*0Sstevel@tonic-gate * window. If the RST bit is set, we just ignore this 2980*0Sstevel@tonic-gate * segment. 2981*0Sstevel@tonic-gate */ 2982*0Sstevel@tonic-gate if (flags & TH_RST) { 2983*0Sstevel@tonic-gate freemsg(mp); 2984*0Sstevel@tonic-gate return; 2985*0Sstevel@tonic-gate } 2986*0Sstevel@tonic-gate 2987*0Sstevel@tonic-gate /* 2988*0Sstevel@tonic-gate * This segment is "unacceptable". None of its 2989*0Sstevel@tonic-gate * sequence space lies within our advertized window. 2990*0Sstevel@tonic-gate * 2991*0Sstevel@tonic-gate * Adjust seg_len to the original value for tracing. 
2992*0Sstevel@tonic-gate */ 2993*0Sstevel@tonic-gate seg_len -= gap; 2994*0Sstevel@tonic-gate #ifdef DEBUG 2995*0Sstevel@tonic-gate printf("tcp_rput: unacceptable, gap %d, rgap " 2996*0Sstevel@tonic-gate "%d, flags 0x%x, seg_seq %u, seg_ack %u, " 2997*0Sstevel@tonic-gate "seg_len %d, rnxt %u, snxt %u, %s", 2998*0Sstevel@tonic-gate gap, rgap, flags, seg_seq, seg_ack, 2999*0Sstevel@tonic-gate seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 3000*0Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 3001*0Sstevel@tonic-gate #endif 3002*0Sstevel@tonic-gate 3003*0Sstevel@tonic-gate /* 3004*0Sstevel@tonic-gate * Arrange to send an ACK in response to the 3005*0Sstevel@tonic-gate * unacceptable segment per RFC 793 page 69. There 3006*0Sstevel@tonic-gate * is only one small difference between ours and the 3007*0Sstevel@tonic-gate * acceptability test in the RFC - we accept ACK-only 3008*0Sstevel@tonic-gate * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 3009*0Sstevel@tonic-gate * will be generated. 3010*0Sstevel@tonic-gate * 3011*0Sstevel@tonic-gate * Note that we have to ACK an ACK-only packet at least 3012*0Sstevel@tonic-gate * for stacks that send 0-length keep-alives with 3013*0Sstevel@tonic-gate * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 3014*0Sstevel@tonic-gate * section 4.2.3.6. As long as we don't ever generate 3015*0Sstevel@tonic-gate * an unacceptable packet in response to an incoming 3016*0Sstevel@tonic-gate * packet that is unacceptable, it should not cause 3017*0Sstevel@tonic-gate * "ACK wars". 3018*0Sstevel@tonic-gate */ 3019*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 3020*0Sstevel@tonic-gate 3021*0Sstevel@tonic-gate /* 3022*0Sstevel@tonic-gate * Continue processing this segment in order to use the 3023*0Sstevel@tonic-gate * ACK information it contains, but skip all other 3024*0Sstevel@tonic-gate * sequence-number processing. 
Processing the ACK 3025*0Sstevel@tonic-gate * information is necessary in order to 3026*0Sstevel@tonic-gate * re-synchronize connections that may have lost 3027*0Sstevel@tonic-gate * synchronization. 3028*0Sstevel@tonic-gate * 3029*0Sstevel@tonic-gate * We clear seg_len and flag fields related to 3030*0Sstevel@tonic-gate * sequence number processing as they are not 3031*0Sstevel@tonic-gate * to be trusted for an unacceptable segment. 3032*0Sstevel@tonic-gate */ 3033*0Sstevel@tonic-gate seg_len = 0; 3034*0Sstevel@tonic-gate flags &= ~(TH_SYN | TH_FIN | TH_URG); 3035*0Sstevel@tonic-gate goto process_ack; 3036*0Sstevel@tonic-gate } 3037*0Sstevel@tonic-gate 3038*0Sstevel@tonic-gate /* Fix seg_seq, and chew the gap off the front. */ 3039*0Sstevel@tonic-gate seg_seq = tcp->tcp_rnxt; 3040*0Sstevel@tonic-gate do { 3041*0Sstevel@tonic-gate mblk_t *mp2; 3042*0Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3043*0Sstevel@tonic-gate (uintptr_t)UINT_MAX); 3044*0Sstevel@tonic-gate gap += (uint_t)(mp->b_wptr - mp->b_rptr); 3045*0Sstevel@tonic-gate if (gap > 0) { 3046*0Sstevel@tonic-gate mp->b_rptr = mp->b_wptr - gap; 3047*0Sstevel@tonic-gate break; 3048*0Sstevel@tonic-gate } 3049*0Sstevel@tonic-gate mp2 = mp; 3050*0Sstevel@tonic-gate mp = mp->b_cont; 3051*0Sstevel@tonic-gate freeb(mp2); 3052*0Sstevel@tonic-gate } while (gap < 0); 3053*0Sstevel@tonic-gate } 3054*0Sstevel@tonic-gate /* 3055*0Sstevel@tonic-gate * rgap is the amount of stuff received out of window. A negative 3056*0Sstevel@tonic-gate * value is the amount out of window. 
3057*0Sstevel@tonic-gate */ 3058*0Sstevel@tonic-gate if (rgap < 0) { 3059*0Sstevel@tonic-gate mblk_t *mp2; 3060*0Sstevel@tonic-gate 3061*0Sstevel@tonic-gate if (tcp->tcp_rwnd == 0) 3062*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInWinProbe); 3063*0Sstevel@tonic-gate else { 3064*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 3065*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 3066*0Sstevel@tonic-gate } 3067*0Sstevel@tonic-gate 3068*0Sstevel@tonic-gate /* 3069*0Sstevel@tonic-gate * seg_len does not include the FIN, so if more than 3070*0Sstevel@tonic-gate * just the FIN is out of window, we act like we don't 3071*0Sstevel@tonic-gate * see it. (If just the FIN is out of window, rgap 3072*0Sstevel@tonic-gate * will be zero and we will go ahead and acknowledge 3073*0Sstevel@tonic-gate * the FIN.) 3074*0Sstevel@tonic-gate */ 3075*0Sstevel@tonic-gate flags &= ~TH_FIN; 3076*0Sstevel@tonic-gate 3077*0Sstevel@tonic-gate /* Fix seg_len and make sure there is something left. */ 3078*0Sstevel@tonic-gate seg_len += rgap; 3079*0Sstevel@tonic-gate if (seg_len <= 0) { 3080*0Sstevel@tonic-gate /* 3081*0Sstevel@tonic-gate * Resets are only valid if they lie within our offered 3082*0Sstevel@tonic-gate * window. If the RST bit is set, we just ignore this 3083*0Sstevel@tonic-gate * segment. 3084*0Sstevel@tonic-gate */ 3085*0Sstevel@tonic-gate if (flags & TH_RST) { 3086*0Sstevel@tonic-gate freemsg(mp); 3087*0Sstevel@tonic-gate return; 3088*0Sstevel@tonic-gate } 3089*0Sstevel@tonic-gate 3090*0Sstevel@tonic-gate /* Per RFC 793, we need to send back an ACK. */ 3091*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 3092*0Sstevel@tonic-gate 3093*0Sstevel@tonic-gate /* 3094*0Sstevel@tonic-gate * If this is a zero window probe, continue to 3095*0Sstevel@tonic-gate * process the ACK part. But we need to set seg_len 3096*0Sstevel@tonic-gate * to 0 to avoid data processing. Otherwise just 3097*0Sstevel@tonic-gate * drop the segment and send back an ACK. 
3098*0Sstevel@tonic-gate */ 3099*0Sstevel@tonic-gate if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 3100*0Sstevel@tonic-gate flags &= ~(TH_SYN | TH_URG); 3101*0Sstevel@tonic-gate seg_len = 0; 3102*0Sstevel@tonic-gate /* Let's see if we can update our rwnd */ 3103*0Sstevel@tonic-gate tcp_rcv_drain(sock_id, tcp); 3104*0Sstevel@tonic-gate goto process_ack; 3105*0Sstevel@tonic-gate } else { 3106*0Sstevel@tonic-gate freemsg(mp); 3107*0Sstevel@tonic-gate goto ack_check; 3108*0Sstevel@tonic-gate } 3109*0Sstevel@tonic-gate } 3110*0Sstevel@tonic-gate /* Pitch out of window stuff off the end. */ 3111*0Sstevel@tonic-gate rgap = seg_len; 3112*0Sstevel@tonic-gate mp2 = mp; 3113*0Sstevel@tonic-gate do { 3114*0Sstevel@tonic-gate assert((uintptr_t)(mp2->b_wptr - 3115*0Sstevel@tonic-gate mp2->b_rptr) <= (uintptr_t)INT_MAX); 3116*0Sstevel@tonic-gate rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 3117*0Sstevel@tonic-gate if (rgap < 0) { 3118*0Sstevel@tonic-gate mp2->b_wptr += rgap; 3119*0Sstevel@tonic-gate if ((mp1 = mp2->b_cont) != NULL) { 3120*0Sstevel@tonic-gate mp2->b_cont = NULL; 3121*0Sstevel@tonic-gate freemsg(mp1); 3122*0Sstevel@tonic-gate } 3123*0Sstevel@tonic-gate break; 3124*0Sstevel@tonic-gate } 3125*0Sstevel@tonic-gate } while ((mp2 = mp2->b_cont) != NULL); 3126*0Sstevel@tonic-gate } 3127*0Sstevel@tonic-gate ok:; 3128*0Sstevel@tonic-gate /* 3129*0Sstevel@tonic-gate * TCP should check ECN info for segments inside the window only. 3130*0Sstevel@tonic-gate * Therefore the check should be done here. 3131*0Sstevel@tonic-gate */ 3132*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 3133*0Sstevel@tonic-gate uchar_t tos = ((struct ip *)rptr)->ip_tos; 3134*0Sstevel@tonic-gate 3135*0Sstevel@tonic-gate if (flags & TH_CWR) { 3136*0Sstevel@tonic-gate tcp->tcp_ecn_echo_on = B_FALSE; 3137*0Sstevel@tonic-gate } 3138*0Sstevel@tonic-gate /* 3139*0Sstevel@tonic-gate * Note that both ECN_CE and CWR can be set in the 3140*0Sstevel@tonic-gate * same segment. 
In this case, we once again turn 3141*0Sstevel@tonic-gate * on ECN_ECHO. 3142*0Sstevel@tonic-gate */ 3143*0Sstevel@tonic-gate if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 3144*0Sstevel@tonic-gate tcp->tcp_ecn_echo_on = B_TRUE; 3145*0Sstevel@tonic-gate } 3146*0Sstevel@tonic-gate } 3147*0Sstevel@tonic-gate 3148*0Sstevel@tonic-gate /* 3149*0Sstevel@tonic-gate * Check whether we can update tcp_ts_recent. This test is 3150*0Sstevel@tonic-gate * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 3151*0Sstevel@tonic-gate * Extensions for High Performance: An Update", Internet Draft. 3152*0Sstevel@tonic-gate */ 3153*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok && 3154*0Sstevel@tonic-gate TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 3155*0Sstevel@tonic-gate SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 3156*0Sstevel@tonic-gate tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 3157*0Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 3158*0Sstevel@tonic-gate } 3159*0Sstevel@tonic-gate 3160*0Sstevel@tonic-gate if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 3161*0Sstevel@tonic-gate /* 3162*0Sstevel@tonic-gate * FIN in an out of order segment. We record this in 3163*0Sstevel@tonic-gate * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 3164*0Sstevel@tonic-gate * Clear the FIN so that any check on FIN flag will fail. 3165*0Sstevel@tonic-gate * Remember that FIN also counts in the sequence number 3166*0Sstevel@tonic-gate * space. So we need to ack out of order FIN only segments. 3167*0Sstevel@tonic-gate */ 3168*0Sstevel@tonic-gate if (flags & TH_FIN) { 3169*0Sstevel@tonic-gate tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 3170*0Sstevel@tonic-gate tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 3171*0Sstevel@tonic-gate flags &= ~TH_FIN; 3172*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 3173*0Sstevel@tonic-gate } 3174*0Sstevel@tonic-gate if (seg_len > 0) { 3175*0Sstevel@tonic-gate /* Fill in the SACK blk list. 
*/ 3176*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 3177*0Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 3178*0Sstevel@tonic-gate tcp_sack_insert(tcp->tcp_sack_list, 3179*0Sstevel@tonic-gate seg_seq, seg_seq + seg_len, 3180*0Sstevel@tonic-gate &(tcp->tcp_num_sack_blk)); 3181*0Sstevel@tonic-gate } 3182*0Sstevel@tonic-gate 3183*0Sstevel@tonic-gate /* 3184*0Sstevel@tonic-gate * Attempt reassembly and see if we have something 3185*0Sstevel@tonic-gate * ready to go. 3186*0Sstevel@tonic-gate */ 3187*0Sstevel@tonic-gate mp = tcp_reass(tcp, mp, seg_seq); 3188*0Sstevel@tonic-gate /* Always ack out of order packets */ 3189*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED | TH_PUSH; 3190*0Sstevel@tonic-gate if (mp != NULL) { 3191*0Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - 3192*0Sstevel@tonic-gate mp->b_rptr) <= (uintptr_t)INT_MAX); 3193*0Sstevel@tonic-gate seg_len = mp->b_cont ? msgdsize(mp) : 3194*0Sstevel@tonic-gate (int)(mp->b_wptr - mp->b_rptr); 3195*0Sstevel@tonic-gate seg_seq = tcp->tcp_rnxt; 3196*0Sstevel@tonic-gate /* 3197*0Sstevel@tonic-gate * A gap is filled and the seq num and len 3198*0Sstevel@tonic-gate * of the gap match that of a previously 3199*0Sstevel@tonic-gate * received FIN, put the FIN flag back in. 3200*0Sstevel@tonic-gate */ 3201*0Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3202*0Sstevel@tonic-gate seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3203*0Sstevel@tonic-gate flags |= TH_FIN; 3204*0Sstevel@tonic-gate tcp->tcp_valid_bits &= 3205*0Sstevel@tonic-gate ~TCP_OFO_FIN_VALID; 3206*0Sstevel@tonic-gate } 3207*0Sstevel@tonic-gate } else { 3208*0Sstevel@tonic-gate /* 3209*0Sstevel@tonic-gate * Keep going even with NULL mp. 3210*0Sstevel@tonic-gate * There may be a useful ACK or something else 3211*0Sstevel@tonic-gate * we don't want to miss. 3212*0Sstevel@tonic-gate * 3213*0Sstevel@tonic-gate * But TCP should not perform fast retransmit 3214*0Sstevel@tonic-gate * because of the ack number. 
TCP uses 3215*0Sstevel@tonic-gate * seg_len == 0 to determine if it is a pure 3216*0Sstevel@tonic-gate * ACK. And this is not a pure ACK. 3217*0Sstevel@tonic-gate */ 3218*0Sstevel@tonic-gate seg_len = 0; 3219*0Sstevel@tonic-gate ofo_seg = B_TRUE; 3220*0Sstevel@tonic-gate } 3221*0Sstevel@tonic-gate } 3222*0Sstevel@tonic-gate } else if (seg_len > 0) { 3223*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 3224*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 3225*0Sstevel@tonic-gate /* 3226*0Sstevel@tonic-gate * If an out of order FIN was received before, and the seq 3227*0Sstevel@tonic-gate * num and len of the new segment match that of the FIN, 3228*0Sstevel@tonic-gate * put the FIN flag back in. 3229*0Sstevel@tonic-gate */ 3230*0Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3231*0Sstevel@tonic-gate seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3232*0Sstevel@tonic-gate flags |= TH_FIN; 3233*0Sstevel@tonic-gate tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3234*0Sstevel@tonic-gate } 3235*0Sstevel@tonic-gate } 3236*0Sstevel@tonic-gate if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3237*0Sstevel@tonic-gate if (flags & TH_RST) { 3238*0Sstevel@tonic-gate freemsg(mp); 3239*0Sstevel@tonic-gate switch (tcp->tcp_state) { 3240*0Sstevel@tonic-gate case TCPS_SYN_RCVD: 3241*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ECONNREFUSED); 3242*0Sstevel@tonic-gate break; 3243*0Sstevel@tonic-gate case TCPS_ESTABLISHED: 3244*0Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 3245*0Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 3246*0Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 3247*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3248*0Sstevel@tonic-gate break; 3249*0Sstevel@tonic-gate case TCPS_CLOSING: 3250*0Sstevel@tonic-gate case TCPS_LAST_ACK: 3251*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, 0); 3252*0Sstevel@tonic-gate break; 3253*0Sstevel@tonic-gate default: 3254*0Sstevel@tonic-gate 
assert(tcp->tcp_state != TCPS_TIME_WAIT); 3255*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ENXIO); 3256*0Sstevel@tonic-gate break; 3257*0Sstevel@tonic-gate } 3258*0Sstevel@tonic-gate return; 3259*0Sstevel@tonic-gate } 3260*0Sstevel@tonic-gate if (flags & TH_SYN) { 3261*0Sstevel@tonic-gate /* 3262*0Sstevel@tonic-gate * See RFC 793, Page 71 3263*0Sstevel@tonic-gate * 3264*0Sstevel@tonic-gate * The seq number must be in the window as it should 3265*0Sstevel@tonic-gate * be "fixed" above. If it is outside window, it should 3266*0Sstevel@tonic-gate * be already rejected. Note that we allow seg_seq to be 3267*0Sstevel@tonic-gate * rnxt + rwnd because we want to accept 0 window probe. 3268*0Sstevel@tonic-gate */ 3269*0Sstevel@tonic-gate assert(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3270*0Sstevel@tonic-gate SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3271*0Sstevel@tonic-gate freemsg(mp); 3272*0Sstevel@tonic-gate /* 3273*0Sstevel@tonic-gate * If the ACK flag is not set, just use our snxt as the 3274*0Sstevel@tonic-gate * seq number of the RST segment. 
3275*0Sstevel@tonic-gate */ 3276*0Sstevel@tonic-gate if (!(flags & TH_ACK)) { 3277*0Sstevel@tonic-gate seg_ack = tcp->tcp_snxt; 3278*0Sstevel@tonic-gate } 3279*0Sstevel@tonic-gate tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, 3280*0Sstevel@tonic-gate seg_seq + 1, TH_RST|TH_ACK, 0, sock_id); 3281*0Sstevel@tonic-gate assert(tcp->tcp_state != TCPS_TIME_WAIT); 3282*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3283*0Sstevel@tonic-gate return; 3284*0Sstevel@tonic-gate } 3285*0Sstevel@tonic-gate 3286*0Sstevel@tonic-gate process_ack: 3287*0Sstevel@tonic-gate if (!(flags & TH_ACK)) { 3288*0Sstevel@tonic-gate #ifdef DEBUG 3289*0Sstevel@tonic-gate printf("No ack in segment, dropped it, seq:%x\n", seg_seq); 3290*0Sstevel@tonic-gate #endif 3291*0Sstevel@tonic-gate freemsg(mp); 3292*0Sstevel@tonic-gate goto xmit_check; 3293*0Sstevel@tonic-gate } 3294*0Sstevel@tonic-gate } 3295*0Sstevel@tonic-gate bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3296*0Sstevel@tonic-gate 3297*0Sstevel@tonic-gate if (tcp->tcp_state == TCPS_SYN_RCVD) { 3298*0Sstevel@tonic-gate tcp_t *listener = tcp->tcp_listener; 3299*0Sstevel@tonic-gate #ifdef DEBUG 3300*0Sstevel@tonic-gate printf("Done with eager 3-way handshake\n"); 3301*0Sstevel@tonic-gate #endif 3302*0Sstevel@tonic-gate /* 3303*0Sstevel@tonic-gate * NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0' 3304*0Sstevel@tonic-gate * but that would mean we have an ack that ignored our SYN. 
3305*0Sstevel@tonic-gate */ 3306*0Sstevel@tonic-gate if (bytes_acked < 1 || SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3307*0Sstevel@tonic-gate freemsg(mp); 3308*0Sstevel@tonic-gate tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 3309*0Sstevel@tonic-gate tcp, NULL, seg_ack, 0, TH_RST, 0, sock_id); 3310*0Sstevel@tonic-gate return; 3311*0Sstevel@tonic-gate } 3312*0Sstevel@tonic-gate 3313*0Sstevel@tonic-gate /* 3314*0Sstevel@tonic-gate * if the conn_req_q is full defer processing 3315*0Sstevel@tonic-gate * until space is availabe after accept() 3316*0Sstevel@tonic-gate * processing 3317*0Sstevel@tonic-gate */ 3318*0Sstevel@tonic-gate if (listener->tcp_conn_req_cnt_q < 3319*0Sstevel@tonic-gate listener->tcp_conn_req_max) { 3320*0Sstevel@tonic-gate tcp_t *tail; 3321*0Sstevel@tonic-gate 3322*0Sstevel@tonic-gate listener->tcp_conn_req_cnt_q0--; 3323*0Sstevel@tonic-gate listener->tcp_conn_req_cnt_q++; 3324*0Sstevel@tonic-gate 3325*0Sstevel@tonic-gate /* Move from SYN_RCVD to ESTABLISHED list */ 3326*0Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3327*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0; 3328*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3329*0Sstevel@tonic-gate tcp->tcp_eager_next_q0; 3330*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0 = NULL; 3331*0Sstevel@tonic-gate tcp->tcp_eager_next_q0 = NULL; 3332*0Sstevel@tonic-gate 3333*0Sstevel@tonic-gate /* 3334*0Sstevel@tonic-gate * Insert at end of the queue because sockfs 3335*0Sstevel@tonic-gate * sends down T_CONN_RES in chronological 3336*0Sstevel@tonic-gate * order. Leaving the older conn indications 3337*0Sstevel@tonic-gate * at front of the queue helps reducing search 3338*0Sstevel@tonic-gate * time. 
3339*0Sstevel@tonic-gate */ 3340*0Sstevel@tonic-gate tail = listener->tcp_eager_last_q; 3341*0Sstevel@tonic-gate if (tail != NULL) { 3342*0Sstevel@tonic-gate tail->tcp_eager_next_q = tcp; 3343*0Sstevel@tonic-gate } else { 3344*0Sstevel@tonic-gate listener->tcp_eager_next_q = tcp; 3345*0Sstevel@tonic-gate } 3346*0Sstevel@tonic-gate listener->tcp_eager_last_q = tcp; 3347*0Sstevel@tonic-gate tcp->tcp_eager_next_q = NULL; 3348*0Sstevel@tonic-gate } else { 3349*0Sstevel@tonic-gate /* 3350*0Sstevel@tonic-gate * Defer connection on q0 and set deferred 3351*0Sstevel@tonic-gate * connection bit true 3352*0Sstevel@tonic-gate */ 3353*0Sstevel@tonic-gate tcp->tcp_conn_def_q0 = B_TRUE; 3354*0Sstevel@tonic-gate 3355*0Sstevel@tonic-gate /* take tcp out of q0 ... */ 3356*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3357*0Sstevel@tonic-gate tcp->tcp_eager_next_q0; 3358*0Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3359*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0; 3360*0Sstevel@tonic-gate 3361*0Sstevel@tonic-gate /* ... and place it at the end of q0 */ 3362*0Sstevel@tonic-gate tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 3363*0Sstevel@tonic-gate tcp->tcp_eager_next_q0 = listener; 3364*0Sstevel@tonic-gate listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 3365*0Sstevel@tonic-gate listener->tcp_eager_prev_q0 = tcp; 3366*0Sstevel@tonic-gate } 3367*0Sstevel@tonic-gate 3368*0Sstevel@tonic-gate tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 3369*0Sstevel@tonic-gate bytes_acked--; 3370*0Sstevel@tonic-gate 3371*0Sstevel@tonic-gate /* 3372*0Sstevel@tonic-gate * If SYN was retransmitted, need to reset all 3373*0Sstevel@tonic-gate * retransmission info as this segment will be 3374*0Sstevel@tonic-gate * treated as a dup ACK. 
3375*0Sstevel@tonic-gate */ 3376*0Sstevel@tonic-gate if (tcp->tcp_rexmit) { 3377*0Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 3378*0Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3379*0Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_snxt; 3380*0Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3381*0Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 3382*0Sstevel@tonic-gate tcp->tcp_cwnd = mss; 3383*0Sstevel@tonic-gate } 3384*0Sstevel@tonic-gate 3385*0Sstevel@tonic-gate /* 3386*0Sstevel@tonic-gate * We set the send window to zero here. 3387*0Sstevel@tonic-gate * This is needed if there is data to be 3388*0Sstevel@tonic-gate * processed already on the queue. 3389*0Sstevel@tonic-gate * Later (at swnd_update label), the 3390*0Sstevel@tonic-gate * "new_swnd > tcp_swnd" condition is satisfied 3391*0Sstevel@tonic-gate * the XMIT_NEEDED flag is set in the current 3392*0Sstevel@tonic-gate * (SYN_RCVD) state. This ensures tcp_wput_data() is 3393*0Sstevel@tonic-gate * called if there is already data on queue in 3394*0Sstevel@tonic-gate * this state. 3395*0Sstevel@tonic-gate */ 3396*0Sstevel@tonic-gate tcp->tcp_swnd = 0; 3397*0Sstevel@tonic-gate 3398*0Sstevel@tonic-gate if (new_swnd > tcp->tcp_max_swnd) 3399*0Sstevel@tonic-gate tcp->tcp_max_swnd = new_swnd; 3400*0Sstevel@tonic-gate tcp->tcp_swl1 = seg_seq; 3401*0Sstevel@tonic-gate tcp->tcp_swl2 = seg_ack; 3402*0Sstevel@tonic-gate tcp->tcp_state = TCPS_ESTABLISHED; 3403*0Sstevel@tonic-gate tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 3404*0Sstevel@tonic-gate } 3405*0Sstevel@tonic-gate /* This code follows 4.4BSD-Lite2 mostly. */ 3406*0Sstevel@tonic-gate if (bytes_acked < 0) 3407*0Sstevel@tonic-gate goto est; 3408*0Sstevel@tonic-gate 3409*0Sstevel@tonic-gate /* 3410*0Sstevel@tonic-gate * If TCP is ECN capable and the congestion experience bit is 3411*0Sstevel@tonic-gate * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 3412*0Sstevel@tonic-gate * done once per window (or more loosely, per RTT). 
3413*0Sstevel@tonic-gate */ 3414*0Sstevel@tonic-gate if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3415*0Sstevel@tonic-gate tcp->tcp_cwr = B_FALSE; 3416*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3417*0Sstevel@tonic-gate if (!tcp->tcp_cwr) { 3418*0Sstevel@tonic-gate npkt = (MIN(tcp->tcp_cwnd, tcp->tcp_swnd) >> 1) / mss; 3419*0Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3420*0Sstevel@tonic-gate tcp->tcp_cwnd = npkt * mss; 3421*0Sstevel@tonic-gate /* 3422*0Sstevel@tonic-gate * If the cwnd is 0, use the timer to clock out 3423*0Sstevel@tonic-gate * new segments. This is required by the ECN spec. 3424*0Sstevel@tonic-gate */ 3425*0Sstevel@tonic-gate if (npkt == 0) { 3426*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3427*0Sstevel@tonic-gate /* 3428*0Sstevel@tonic-gate * This makes sure that when the ACK comes 3429*0Sstevel@tonic-gate * back, we will increase tcp_cwnd by 1 MSS. 3430*0Sstevel@tonic-gate */ 3431*0Sstevel@tonic-gate tcp->tcp_cwnd_cnt = 0; 3432*0Sstevel@tonic-gate } 3433*0Sstevel@tonic-gate tcp->tcp_cwr = B_TRUE; 3434*0Sstevel@tonic-gate /* 3435*0Sstevel@tonic-gate * This marks the end of the current window of in 3436*0Sstevel@tonic-gate * flight data. That is why we don't use 3437*0Sstevel@tonic-gate * tcp_suna + tcp_swnd. Only data in flight can 3438*0Sstevel@tonic-gate * provide ECN info. 3439*0Sstevel@tonic-gate */ 3440*0Sstevel@tonic-gate tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3441*0Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_FALSE; 3442*0Sstevel@tonic-gate } 3443*0Sstevel@tonic-gate } 3444*0Sstevel@tonic-gate 3445*0Sstevel@tonic-gate mp1 = tcp->tcp_xmit_head; 3446*0Sstevel@tonic-gate if (bytes_acked == 0) { 3447*0Sstevel@tonic-gate if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3448*0Sstevel@tonic-gate int dupack_cnt; 3449*0Sstevel@tonic-gate 3450*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDupAck); 3451*0Sstevel@tonic-gate /* 3452*0Sstevel@tonic-gate * Fast retransmit. 
When we have seen exactly three 3453*0Sstevel@tonic-gate * identical ACKs while we have unacked data 3454*0Sstevel@tonic-gate * outstanding we take it as a hint that our peer 3455*0Sstevel@tonic-gate * dropped something. 3456*0Sstevel@tonic-gate * 3457*0Sstevel@tonic-gate * If TCP is retransmitting, don't do fast retransmit. 3458*0Sstevel@tonic-gate */ 3459*0Sstevel@tonic-gate if (mp1 != NULL && tcp->tcp_suna != tcp->tcp_snxt && 3460*0Sstevel@tonic-gate ! tcp->tcp_rexmit) { 3461*0Sstevel@tonic-gate /* Do Limited Transmit */ 3462*0Sstevel@tonic-gate if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3463*0Sstevel@tonic-gate tcp_dupack_fast_retransmit) { 3464*0Sstevel@tonic-gate /* 3465*0Sstevel@tonic-gate * RFC 3042 3466*0Sstevel@tonic-gate * 3467*0Sstevel@tonic-gate * What we need to do is temporarily 3468*0Sstevel@tonic-gate * increase tcp_cwnd so that new 3469*0Sstevel@tonic-gate * data can be sent if it is allowed 3470*0Sstevel@tonic-gate * by the receive window (tcp_rwnd). 3471*0Sstevel@tonic-gate * tcp_wput_data() will take care of 3472*0Sstevel@tonic-gate * the rest. 3473*0Sstevel@tonic-gate * 3474*0Sstevel@tonic-gate * If the connection is SACK capable, 3475*0Sstevel@tonic-gate * only do limited xmit when there 3476*0Sstevel@tonic-gate * is SACK info. 3477*0Sstevel@tonic-gate * 3478*0Sstevel@tonic-gate * Note how tcp_cwnd is incremented. 3479*0Sstevel@tonic-gate * The first dup ACK will increase 3480*0Sstevel@tonic-gate * it by 1 MSS. The second dup ACK 3481*0Sstevel@tonic-gate * will increase it by 2 MSS. This 3482*0Sstevel@tonic-gate * means that only 1 new segment will 3483*0Sstevel@tonic-gate * be sent for each dup ACK. 
3484*0Sstevel@tonic-gate */ 3485*0Sstevel@tonic-gate if (tcp->tcp_unsent > 0 && 3486*0Sstevel@tonic-gate (!tcp->tcp_snd_sack_ok || 3487*0Sstevel@tonic-gate (tcp->tcp_snd_sack_ok && 3488*0Sstevel@tonic-gate tcp->tcp_notsack_list != NULL))) { 3489*0Sstevel@tonic-gate tcp->tcp_cwnd += mss << 3490*0Sstevel@tonic-gate (tcp->tcp_dupack_cnt - 1); 3491*0Sstevel@tonic-gate flags |= TH_LIMIT_XMIT; 3492*0Sstevel@tonic-gate } 3493*0Sstevel@tonic-gate } else if (dupack_cnt == 3494*0Sstevel@tonic-gate tcp_dupack_fast_retransmit) { 3495*0Sstevel@tonic-gate 3496*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3497*0Sstevel@tonic-gate /* 3498*0Sstevel@tonic-gate * If we have reduced tcp_ssthresh 3499*0Sstevel@tonic-gate * because of ECN, do not reduce it again 3500*0Sstevel@tonic-gate * unless it is already one window of data 3501*0Sstevel@tonic-gate * away. After one window of data, tcp_cwr 3502*0Sstevel@tonic-gate * should then be cleared. Note that 3503*0Sstevel@tonic-gate * for non ECN capable connection, tcp_cwr 3504*0Sstevel@tonic-gate * should always be false. 3505*0Sstevel@tonic-gate * 3506*0Sstevel@tonic-gate * Adjust cwnd since the duplicate 3507*0Sstevel@tonic-gate * ack indicates that a packet was 3508*0Sstevel@tonic-gate * dropped (due to congestion.) 
3509*0Sstevel@tonic-gate */ 3510*0Sstevel@tonic-gate if (!tcp->tcp_cwr) { 3511*0Sstevel@tonic-gate npkt = (MIN(tcp->tcp_cwnd, 3512*0Sstevel@tonic-gate tcp->tcp_swnd) >> 1) / mss; 3513*0Sstevel@tonic-gate if (npkt < 2) 3514*0Sstevel@tonic-gate npkt = 2; 3515*0Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = npkt * mss; 3516*0Sstevel@tonic-gate tcp->tcp_cwnd = (npkt + 3517*0Sstevel@tonic-gate tcp->tcp_dupack_cnt) * mss; 3518*0Sstevel@tonic-gate } 3519*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 3520*0Sstevel@tonic-gate tcp->tcp_cwr = B_TRUE; 3521*0Sstevel@tonic-gate tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3522*0Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_FALSE; 3523*0Sstevel@tonic-gate } 3524*0Sstevel@tonic-gate 3525*0Sstevel@tonic-gate /* 3526*0Sstevel@tonic-gate * We do Hoe's algorithm. Refer to her 3527*0Sstevel@tonic-gate * paper "Improving the Start-up Behavior 3528*0Sstevel@tonic-gate * of a Congestion Control Scheme for TCP," 3529*0Sstevel@tonic-gate * appeared in SIGCOMM'96. 3530*0Sstevel@tonic-gate * 3531*0Sstevel@tonic-gate * Save highest seq no we have sent so far. 3532*0Sstevel@tonic-gate * Be careful about the invisible FIN byte. 3533*0Sstevel@tonic-gate */ 3534*0Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3535*0Sstevel@tonic-gate (tcp->tcp_unsent == 0)) { 3536*0Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_fss; 3537*0Sstevel@tonic-gate } else { 3538*0Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_snxt; 3539*0Sstevel@tonic-gate } 3540*0Sstevel@tonic-gate 3541*0Sstevel@tonic-gate /* 3542*0Sstevel@tonic-gate * Do not allow bursty traffic during. 3543*0Sstevel@tonic-gate * fast recovery. Refer to Fall and Floyd's 3544*0Sstevel@tonic-gate * paper "Simulation-based Comparisons of 3545*0Sstevel@tonic-gate * Tahoe, Reno and SACK TCP" (in CCR ??) 3546*0Sstevel@tonic-gate * This is a best current practise. 
3547*0Sstevel@tonic-gate */ 3548*0Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_SS; 3549*0Sstevel@tonic-gate 3550*0Sstevel@tonic-gate /* 3551*0Sstevel@tonic-gate * For SACK: 3552*0Sstevel@tonic-gate * Calculate tcp_pipe, which is the 3553*0Sstevel@tonic-gate * estimated number of bytes in 3554*0Sstevel@tonic-gate * network. 3555*0Sstevel@tonic-gate * 3556*0Sstevel@tonic-gate * tcp_fack is the highest sack'ed seq num 3557*0Sstevel@tonic-gate * TCP has received. 3558*0Sstevel@tonic-gate * 3559*0Sstevel@tonic-gate * tcp_pipe is explained in the above quoted 3560*0Sstevel@tonic-gate * Fall and Floyd's paper. tcp_fack is 3561*0Sstevel@tonic-gate * explained in Mathis and Mahdavi's 3562*0Sstevel@tonic-gate * "Forward Acknowledgment: Refining TCP 3563*0Sstevel@tonic-gate * Congestion Control" in SIGCOMM '96. 3564*0Sstevel@tonic-gate */ 3565*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 3566*0Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 3567*0Sstevel@tonic-gate if (tcp->tcp_notsack_list != NULL) { 3568*0Sstevel@tonic-gate tcp->tcp_pipe = tcp->tcp_snxt - 3569*0Sstevel@tonic-gate tcp->tcp_fack; 3570*0Sstevel@tonic-gate tcp->tcp_sack_snxt = seg_ack; 3571*0Sstevel@tonic-gate flags |= TH_NEED_SACK_REXMIT; 3572*0Sstevel@tonic-gate } else { 3573*0Sstevel@tonic-gate /* 3574*0Sstevel@tonic-gate * Always initialize tcp_pipe 3575*0Sstevel@tonic-gate * even though we don't have 3576*0Sstevel@tonic-gate * any SACK info. If later 3577*0Sstevel@tonic-gate * we get SACK info and 3578*0Sstevel@tonic-gate * tcp_pipe is not initialized, 3579*0Sstevel@tonic-gate * funny things will happen. 
3580*0Sstevel@tonic-gate */ 3581*0Sstevel@tonic-gate tcp->tcp_pipe = 3582*0Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh; 3583*0Sstevel@tonic-gate } 3584*0Sstevel@tonic-gate } else { 3585*0Sstevel@tonic-gate flags |= TH_REXMIT_NEEDED; 3586*0Sstevel@tonic-gate } /* tcp_snd_sack_ok */ 3587*0Sstevel@tonic-gate 3588*0Sstevel@tonic-gate } else { 3589*0Sstevel@tonic-gate /* 3590*0Sstevel@tonic-gate * Here we perform congestion 3591*0Sstevel@tonic-gate * avoidance, but NOT slow start. 3592*0Sstevel@tonic-gate * This is known as the Fast 3593*0Sstevel@tonic-gate * Recovery Algorithm. 3594*0Sstevel@tonic-gate */ 3595*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && 3596*0Sstevel@tonic-gate tcp->tcp_notsack_list != NULL) { 3597*0Sstevel@tonic-gate flags |= TH_NEED_SACK_REXMIT; 3598*0Sstevel@tonic-gate tcp->tcp_pipe -= mss; 3599*0Sstevel@tonic-gate if (tcp->tcp_pipe < 0) 3600*0Sstevel@tonic-gate tcp->tcp_pipe = 0; 3601*0Sstevel@tonic-gate } else { 3602*0Sstevel@tonic-gate /* 3603*0Sstevel@tonic-gate * We know that one more packet has 3604*0Sstevel@tonic-gate * left the pipe thus we can update 3605*0Sstevel@tonic-gate * cwnd. 3606*0Sstevel@tonic-gate */ 3607*0Sstevel@tonic-gate cwnd = tcp->tcp_cwnd + mss; 3608*0Sstevel@tonic-gate if (cwnd > tcp->tcp_cwnd_max) 3609*0Sstevel@tonic-gate cwnd = tcp->tcp_cwnd_max; 3610*0Sstevel@tonic-gate tcp->tcp_cwnd = cwnd; 3611*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3612*0Sstevel@tonic-gate } 3613*0Sstevel@tonic-gate } 3614*0Sstevel@tonic-gate } 3615*0Sstevel@tonic-gate } else if (tcp->tcp_zero_win_probe) { 3616*0Sstevel@tonic-gate /* 3617*0Sstevel@tonic-gate * If the window has opened, need to arrange 3618*0Sstevel@tonic-gate * to send additional data. 
3619*0Sstevel@tonic-gate */ 3620*0Sstevel@tonic-gate if (new_swnd != 0) { 3621*0Sstevel@tonic-gate /* tcp_suna != tcp_snxt */ 3622*0Sstevel@tonic-gate /* Packet contains a window update */ 3623*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInWinUpdate); 3624*0Sstevel@tonic-gate tcp->tcp_zero_win_probe = 0; 3625*0Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 3626*0Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 3627*0Sstevel@tonic-gate 3628*0Sstevel@tonic-gate /* 3629*0Sstevel@tonic-gate * Transmit starting with tcp_suna since 3630*0Sstevel@tonic-gate * the one byte probe is not ack'ed. 3631*0Sstevel@tonic-gate * If TCP has sent more than one identical 3632*0Sstevel@tonic-gate * probe, tcp_rexmit will be set. That means 3633*0Sstevel@tonic-gate * tcp_ss_rexmit() will send out the one 3634*0Sstevel@tonic-gate * byte along with new data. Otherwise, 3635*0Sstevel@tonic-gate * fake the retransmission. 3636*0Sstevel@tonic-gate */ 3637*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3638*0Sstevel@tonic-gate if (!tcp->tcp_rexmit) { 3639*0Sstevel@tonic-gate tcp->tcp_rexmit = B_TRUE; 3640*0Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 3641*0Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3642*0Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 3643*0Sstevel@tonic-gate } 3644*0Sstevel@tonic-gate } 3645*0Sstevel@tonic-gate } 3646*0Sstevel@tonic-gate goto swnd_update; 3647*0Sstevel@tonic-gate } 3648*0Sstevel@tonic-gate 3649*0Sstevel@tonic-gate /* 3650*0Sstevel@tonic-gate * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 3651*0Sstevel@tonic-gate * If the ACK value acks something that we have not yet sent, it might 3652*0Sstevel@tonic-gate * be an old duplicate segment. Send an ACK to re-synchronize the 3653*0Sstevel@tonic-gate * other side. 
3654*0Sstevel@tonic-gate * Note: reset in response to unacceptable ACK in SYN_RECEIVE 3655*0Sstevel@tonic-gate * state is handled above, so we can always just drop the segment and 3656*0Sstevel@tonic-gate * send an ACK here. 3657*0Sstevel@tonic-gate * 3658*0Sstevel@tonic-gate * Should we send ACKs in response to ACK only segments? 3659*0Sstevel@tonic-gate */ 3660*0Sstevel@tonic-gate if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3661*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInAckUnsent); 3662*0Sstevel@tonic-gate /* drop the received segment */ 3663*0Sstevel@tonic-gate freemsg(mp); 3664*0Sstevel@tonic-gate 3665*0Sstevel@tonic-gate /* Send back an ACK. */ 3666*0Sstevel@tonic-gate mp = tcp_ack_mp(tcp); 3667*0Sstevel@tonic-gate 3668*0Sstevel@tonic-gate if (mp == NULL) { 3669*0Sstevel@tonic-gate return; 3670*0Sstevel@tonic-gate } 3671*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutAck); 3672*0Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 3673*0Sstevel@tonic-gate freeb(mp); 3674*0Sstevel@tonic-gate return; 3675*0Sstevel@tonic-gate } 3676*0Sstevel@tonic-gate 3677*0Sstevel@tonic-gate /* 3678*0Sstevel@tonic-gate * TCP gets a new ACK, update the notsack'ed list to delete those 3679*0Sstevel@tonic-gate * blocks that are covered by this ACK. 3680*0Sstevel@tonic-gate */ 3681*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 3682*0Sstevel@tonic-gate tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 3683*0Sstevel@tonic-gate &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 3684*0Sstevel@tonic-gate } 3685*0Sstevel@tonic-gate 3686*0Sstevel@tonic-gate /* 3687*0Sstevel@tonic-gate * If we got an ACK after fast retransmit, check to see 3688*0Sstevel@tonic-gate * if it is a partial ACK. If it is not and the congestion 3689*0Sstevel@tonic-gate * window was inflated to account for the other side's 3690*0Sstevel@tonic-gate * cached packets, retract it. If it is, do Hoe's algorithm. 
3691*0Sstevel@tonic-gate */ 3692*0Sstevel@tonic-gate if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 3693*0Sstevel@tonic-gate assert(tcp->tcp_rexmit == B_FALSE); 3694*0Sstevel@tonic-gate if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 3695*0Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 3696*0Sstevel@tonic-gate /* 3697*0Sstevel@tonic-gate * Restore the orig tcp_cwnd_ssthresh after 3698*0Sstevel@tonic-gate * fast retransmit phase. 3699*0Sstevel@tonic-gate */ 3700*0Sstevel@tonic-gate if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 3701*0Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 3702*0Sstevel@tonic-gate } 3703*0Sstevel@tonic-gate tcp->tcp_rexmit_max = seg_ack; 3704*0Sstevel@tonic-gate tcp->tcp_cwnd_cnt = 0; 3705*0Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3706*0Sstevel@tonic-gate 3707*0Sstevel@tonic-gate /* 3708*0Sstevel@tonic-gate * Remove all notsack info to avoid confusion with 3709*0Sstevel@tonic-gate * the next fast retrasnmit/recovery phase. 3710*0Sstevel@tonic-gate */ 3711*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && 3712*0Sstevel@tonic-gate tcp->tcp_notsack_list != NULL) { 3713*0Sstevel@tonic-gate TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 3714*0Sstevel@tonic-gate } 3715*0Sstevel@tonic-gate } else { 3716*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && 3717*0Sstevel@tonic-gate tcp->tcp_notsack_list != NULL) { 3718*0Sstevel@tonic-gate flags |= TH_NEED_SACK_REXMIT; 3719*0Sstevel@tonic-gate tcp->tcp_pipe -= mss; 3720*0Sstevel@tonic-gate if (tcp->tcp_pipe < 0) 3721*0Sstevel@tonic-gate tcp->tcp_pipe = 0; 3722*0Sstevel@tonic-gate } else { 3723*0Sstevel@tonic-gate /* 3724*0Sstevel@tonic-gate * Hoe's algorithm: 3725*0Sstevel@tonic-gate * 3726*0Sstevel@tonic-gate * Retransmit the unack'ed segment and 3727*0Sstevel@tonic-gate * restart fast recovery. Note that we 3728*0Sstevel@tonic-gate * need to scale back tcp_cwnd to the 3729*0Sstevel@tonic-gate * original value when we started fast 3730*0Sstevel@tonic-gate * recovery. 
This is to prevent overly 3731*0Sstevel@tonic-gate * aggressive behaviour in sending new 3732*0Sstevel@tonic-gate * segments. 3733*0Sstevel@tonic-gate */ 3734*0Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 3735*0Sstevel@tonic-gate tcp_dupack_fast_retransmit * mss; 3736*0Sstevel@tonic-gate tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 3737*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3738*0Sstevel@tonic-gate flags |= TH_REXMIT_NEEDED; 3739*0Sstevel@tonic-gate } 3740*0Sstevel@tonic-gate } 3741*0Sstevel@tonic-gate } else { 3742*0Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 3743*0Sstevel@tonic-gate if (tcp->tcp_rexmit) { 3744*0Sstevel@tonic-gate /* 3745*0Sstevel@tonic-gate * TCP is retranmitting. If the ACK ack's all 3746*0Sstevel@tonic-gate * outstanding data, update tcp_rexmit_max and 3747*0Sstevel@tonic-gate * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 3748*0Sstevel@tonic-gate * to the correct value. 3749*0Sstevel@tonic-gate * 3750*0Sstevel@tonic-gate * Note that SEQ_LEQ() is used. This is to avoid 3751*0Sstevel@tonic-gate * unnecessary fast retransmit caused by dup ACKs 3752*0Sstevel@tonic-gate * received when TCP does slow start retransmission 3753*0Sstevel@tonic-gate * after a time out. During this phase, TCP may 3754*0Sstevel@tonic-gate * send out segments which are already received. 3755*0Sstevel@tonic-gate * This causes dup ACKs to be sent back. 
3756*0Sstevel@tonic-gate */ 3757*0Sstevel@tonic-gate if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 3758*0Sstevel@tonic-gate if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 3759*0Sstevel@tonic-gate tcp->tcp_rexmit_nxt = seg_ack; 3760*0Sstevel@tonic-gate } 3761*0Sstevel@tonic-gate if (seg_ack != tcp->tcp_rexmit_max) { 3762*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3763*0Sstevel@tonic-gate } 3764*0Sstevel@tonic-gate } else { 3765*0Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 3766*0Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3767*0Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3768*0Sstevel@tonic-gate } 3769*0Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 3770*0Sstevel@tonic-gate } 3771*0Sstevel@tonic-gate } 3772*0Sstevel@tonic-gate 3773*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInAckSegs); 3774*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInAckBytes, bytes_acked); 3775*0Sstevel@tonic-gate tcp->tcp_suna = seg_ack; 3776*0Sstevel@tonic-gate if (tcp->tcp_zero_win_probe != 0) { 3777*0Sstevel@tonic-gate tcp->tcp_zero_win_probe = 0; 3778*0Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 3779*0Sstevel@tonic-gate } 3780*0Sstevel@tonic-gate 3781*0Sstevel@tonic-gate /* 3782*0Sstevel@tonic-gate * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 3783*0Sstevel@tonic-gate * Note that it cannot be the SYN being ack'ed. The code flow 3784*0Sstevel@tonic-gate * will not reach here. 3785*0Sstevel@tonic-gate */ 3786*0Sstevel@tonic-gate if (mp1 == NULL) { 3787*0Sstevel@tonic-gate goto fin_acked; 3788*0Sstevel@tonic-gate } 3789*0Sstevel@tonic-gate 3790*0Sstevel@tonic-gate /* 3791*0Sstevel@tonic-gate * Update the congestion window. 3792*0Sstevel@tonic-gate * 3793*0Sstevel@tonic-gate * If TCP is not ECN capable or TCP is ECN capable but the 3794*0Sstevel@tonic-gate * congestion experience bit is not set, increase the tcp_cwnd as 3795*0Sstevel@tonic-gate * usual. 
3796*0Sstevel@tonic-gate */ 3797*0Sstevel@tonic-gate if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 3798*0Sstevel@tonic-gate cwnd = tcp->tcp_cwnd; 3799*0Sstevel@tonic-gate add = mss; 3800*0Sstevel@tonic-gate 3801*0Sstevel@tonic-gate if (cwnd >= tcp->tcp_cwnd_ssthresh) { 3802*0Sstevel@tonic-gate /* 3803*0Sstevel@tonic-gate * This is to prevent an increase of less than 1 MSS of 3804*0Sstevel@tonic-gate * tcp_cwnd. With partial increase, tcp_wput_data() 3805*0Sstevel@tonic-gate * may send out tinygrams in order to preserve mblk 3806*0Sstevel@tonic-gate * boundaries. 3807*0Sstevel@tonic-gate * 3808*0Sstevel@tonic-gate * By initializing tcp_cwnd_cnt to new tcp_cwnd and 3809*0Sstevel@tonic-gate * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 3810*0Sstevel@tonic-gate * increased by 1 MSS for every RTTs. 3811*0Sstevel@tonic-gate */ 3812*0Sstevel@tonic-gate if (tcp->tcp_cwnd_cnt <= 0) { 3813*0Sstevel@tonic-gate tcp->tcp_cwnd_cnt = cwnd + add; 3814*0Sstevel@tonic-gate } else { 3815*0Sstevel@tonic-gate tcp->tcp_cwnd_cnt -= add; 3816*0Sstevel@tonic-gate add = 0; 3817*0Sstevel@tonic-gate } 3818*0Sstevel@tonic-gate } 3819*0Sstevel@tonic-gate tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 3820*0Sstevel@tonic-gate } 3821*0Sstevel@tonic-gate 3822*0Sstevel@tonic-gate /* Can we update the RTT estimates? */ 3823*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 3824*0Sstevel@tonic-gate /* Ignore zero timestamp echo-reply. */ 3825*0Sstevel@tonic-gate if (tcpopt.tcp_opt_ts_ecr != 0) { 3826*0Sstevel@tonic-gate tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3827*0Sstevel@tonic-gate tcpopt.tcp_opt_ts_ecr)); 3828*0Sstevel@tonic-gate } 3829*0Sstevel@tonic-gate 3830*0Sstevel@tonic-gate /* If needed, restart the timer. 
*/ 3831*0Sstevel@tonic-gate if (tcp->tcp_set_timer == 1) { 3832*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3833*0Sstevel@tonic-gate tcp->tcp_set_timer = 0; 3834*0Sstevel@tonic-gate } 3835*0Sstevel@tonic-gate /* 3836*0Sstevel@tonic-gate * Update tcp_csuna in case the other side stops sending 3837*0Sstevel@tonic-gate * us timestamps. 3838*0Sstevel@tonic-gate */ 3839*0Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 3840*0Sstevel@tonic-gate } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 3841*0Sstevel@tonic-gate /* 3842*0Sstevel@tonic-gate * An ACK sequence we haven't seen before, so get the RTT 3843*0Sstevel@tonic-gate * and update the RTO. 3844*0Sstevel@tonic-gate */ 3845*0Sstevel@tonic-gate tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3846*0Sstevel@tonic-gate (uint32_t)mp1->b_prev)); 3847*0Sstevel@tonic-gate 3848*0Sstevel@tonic-gate /* Remeber the last sequence to be ACKed */ 3849*0Sstevel@tonic-gate tcp->tcp_csuna = seg_ack; 3850*0Sstevel@tonic-gate if (tcp->tcp_set_timer == 1) { 3851*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3852*0Sstevel@tonic-gate tcp->tcp_set_timer = 0; 3853*0Sstevel@tonic-gate } 3854*0Sstevel@tonic-gate } else { 3855*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpRttNoUpdate); 3856*0Sstevel@tonic-gate } 3857*0Sstevel@tonic-gate 3858*0Sstevel@tonic-gate /* Eat acknowledged bytes off the xmit queue. 
*/ 3859*0Sstevel@tonic-gate for (;;) { 3860*0Sstevel@tonic-gate mblk_t *mp2; 3861*0Sstevel@tonic-gate uchar_t *wptr; 3862*0Sstevel@tonic-gate 3863*0Sstevel@tonic-gate wptr = mp1->b_wptr; 3864*0Sstevel@tonic-gate assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 3865*0Sstevel@tonic-gate bytes_acked -= (int)(wptr - mp1->b_rptr); 3866*0Sstevel@tonic-gate if (bytes_acked < 0) { 3867*0Sstevel@tonic-gate mp1->b_rptr = wptr + bytes_acked; 3868*0Sstevel@tonic-gate break; 3869*0Sstevel@tonic-gate } 3870*0Sstevel@tonic-gate mp1->b_prev = NULL; 3871*0Sstevel@tonic-gate mp2 = mp1; 3872*0Sstevel@tonic-gate mp1 = mp1->b_cont; 3873*0Sstevel@tonic-gate freeb(mp2); 3874*0Sstevel@tonic-gate if (bytes_acked == 0) { 3875*0Sstevel@tonic-gate if (mp1 == NULL) { 3876*0Sstevel@tonic-gate /* Everything is ack'ed, clear the tail. */ 3877*0Sstevel@tonic-gate tcp->tcp_xmit_tail = NULL; 3878*0Sstevel@tonic-gate goto pre_swnd_update; 3879*0Sstevel@tonic-gate } 3880*0Sstevel@tonic-gate if (mp2 != tcp->tcp_xmit_tail) 3881*0Sstevel@tonic-gate break; 3882*0Sstevel@tonic-gate tcp->tcp_xmit_tail = mp1; 3883*0Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - 3884*0Sstevel@tonic-gate mp1->b_rptr) <= (uintptr_t)INT_MAX); 3885*0Sstevel@tonic-gate tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 3886*0Sstevel@tonic-gate mp1->b_rptr); 3887*0Sstevel@tonic-gate break; 3888*0Sstevel@tonic-gate } 3889*0Sstevel@tonic-gate if (mp1 == NULL) { 3890*0Sstevel@tonic-gate /* 3891*0Sstevel@tonic-gate * More was acked but there is nothing more 3892*0Sstevel@tonic-gate * outstanding. This means that the FIN was 3893*0Sstevel@tonic-gate * just acked or that we're talking to a clown. 
3894*0Sstevel@tonic-gate */ 3895*0Sstevel@tonic-gate fin_acked: 3896*0Sstevel@tonic-gate assert(tcp->tcp_fin_sent); 3897*0Sstevel@tonic-gate tcp->tcp_xmit_tail = NULL; 3898*0Sstevel@tonic-gate if (tcp->tcp_fin_sent) { 3899*0Sstevel@tonic-gate tcp->tcp_fin_acked = B_TRUE; 3900*0Sstevel@tonic-gate } else { 3901*0Sstevel@tonic-gate /* 3902*0Sstevel@tonic-gate * We should never got here because 3903*0Sstevel@tonic-gate * we have already checked that the 3904*0Sstevel@tonic-gate * number of bytes ack'ed should be 3905*0Sstevel@tonic-gate * smaller than or equal to what we 3906*0Sstevel@tonic-gate * have sent so far (it is the 3907*0Sstevel@tonic-gate * acceptability check of the ACK). 3908*0Sstevel@tonic-gate * We can only get here if the send 3909*0Sstevel@tonic-gate * queue is corrupted. 3910*0Sstevel@tonic-gate * 3911*0Sstevel@tonic-gate * Terminate the connection and 3912*0Sstevel@tonic-gate * panic the system. It is better 3913*0Sstevel@tonic-gate * for us to panic instead of 3914*0Sstevel@tonic-gate * continuing to avoid other disaster. 3915*0Sstevel@tonic-gate */ 3916*0Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 3917*0Sstevel@tonic-gate tcp->tcp_rnxt, TH_RST|TH_ACK, 0, sock_id); 3918*0Sstevel@tonic-gate printf("Memory corruption " 3919*0Sstevel@tonic-gate "detected for connection %s.\n", 3920*0Sstevel@tonic-gate tcp_display(tcp, NULL, 3921*0Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 3922*0Sstevel@tonic-gate /* We should never get here... 
*/ 3923*0Sstevel@tonic-gate prom_panic("tcp_rput_data"); 3924*0Sstevel@tonic-gate return; 3925*0Sstevel@tonic-gate } 3926*0Sstevel@tonic-gate goto pre_swnd_update; 3927*0Sstevel@tonic-gate } 3928*0Sstevel@tonic-gate assert(mp2 != tcp->tcp_xmit_tail); 3929*0Sstevel@tonic-gate } 3930*0Sstevel@tonic-gate if (tcp->tcp_unsent) { 3931*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3932*0Sstevel@tonic-gate } 3933*0Sstevel@tonic-gate pre_swnd_update: 3934*0Sstevel@tonic-gate tcp->tcp_xmit_head = mp1; 3935*0Sstevel@tonic-gate swnd_update: 3936*0Sstevel@tonic-gate /* 3937*0Sstevel@tonic-gate * The following check is different from most other implementations. 3938*0Sstevel@tonic-gate * For bi-directional transfer, when segments are dropped, the 3939*0Sstevel@tonic-gate * "normal" check will not accept a window update in those 3940*0Sstevel@tonic-gate * retransmitted segemnts. Failing to do that, TCP may send out 3941*0Sstevel@tonic-gate * segments which are outside receiver's window. As TCP accepts 3942*0Sstevel@tonic-gate * the ack in those retransmitted segments, if the window update in 3943*0Sstevel@tonic-gate * the same segment is not accepted, TCP will incorrectly calculates 3944*0Sstevel@tonic-gate * that it can send more segments. This can create a deadlock 3945*0Sstevel@tonic-gate * with the receiver if its window becomes zero. 3946*0Sstevel@tonic-gate */ 3947*0Sstevel@tonic-gate if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 3948*0Sstevel@tonic-gate SEQ_LT(tcp->tcp_swl1, seg_seq) || 3949*0Sstevel@tonic-gate (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 3950*0Sstevel@tonic-gate /* 3951*0Sstevel@tonic-gate * The criteria for update is: 3952*0Sstevel@tonic-gate * 3953*0Sstevel@tonic-gate * 1. the segment acknowledges some data. Or 3954*0Sstevel@tonic-gate * 2. the segment is new, i.e. it has a higher seq num. Or 3955*0Sstevel@tonic-gate * 3. the segment is not old and the advertised window is 3956*0Sstevel@tonic-gate * larger than the previous advertised window. 
3957*0Sstevel@tonic-gate */ 3958*0Sstevel@tonic-gate if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 3959*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3960*0Sstevel@tonic-gate tcp->tcp_swnd = new_swnd; 3961*0Sstevel@tonic-gate if (new_swnd > tcp->tcp_max_swnd) 3962*0Sstevel@tonic-gate tcp->tcp_max_swnd = new_swnd; 3963*0Sstevel@tonic-gate tcp->tcp_swl1 = seg_seq; 3964*0Sstevel@tonic-gate tcp->tcp_swl2 = seg_ack; 3965*0Sstevel@tonic-gate } 3966*0Sstevel@tonic-gate est: 3967*0Sstevel@tonic-gate if (tcp->tcp_state > TCPS_ESTABLISHED) { 3968*0Sstevel@tonic-gate switch (tcp->tcp_state) { 3969*0Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 3970*0Sstevel@tonic-gate if (tcp->tcp_fin_acked) { 3971*0Sstevel@tonic-gate tcp->tcp_state = TCPS_FIN_WAIT_2; 3972*0Sstevel@tonic-gate /* 3973*0Sstevel@tonic-gate * We implement the non-standard BSD/SunOS 3974*0Sstevel@tonic-gate * FIN_WAIT_2 flushing algorithm. 3975*0Sstevel@tonic-gate * If there is no user attached to this 3976*0Sstevel@tonic-gate * TCP endpoint, then this TCP struct 3977*0Sstevel@tonic-gate * could hang around forever in FIN_WAIT_2 3978*0Sstevel@tonic-gate * state if the peer forgets to send us 3979*0Sstevel@tonic-gate * a FIN. To prevent this, we wait only 3980*0Sstevel@tonic-gate * 2*MSL (a convenient time value) for 3981*0Sstevel@tonic-gate * the FIN to arrive. If it doesn't show up, 3982*0Sstevel@tonic-gate * we flush the TCP endpoint. This algorithm, 3983*0Sstevel@tonic-gate * though a violation of RFC-793, has worked 3984*0Sstevel@tonic-gate * for over 10 years in BSD systems. 3985*0Sstevel@tonic-gate * Note: SunOS 4.x waits 675 seconds before 3986*0Sstevel@tonic-gate * flushing the FIN_WAIT_2 connection. 3987*0Sstevel@tonic-gate */ 3988*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, 3989*0Sstevel@tonic-gate tcp_fin_wait_2_flush_interval); 3990*0Sstevel@tonic-gate } 3991*0Sstevel@tonic-gate break; 3992*0Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 3993*0Sstevel@tonic-gate break; /* Shutdown hook? 
*/ 3994*0Sstevel@tonic-gate case TCPS_LAST_ACK: 3995*0Sstevel@tonic-gate freemsg(mp); 3996*0Sstevel@tonic-gate if (tcp->tcp_fin_acked) { 3997*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, 0); 3998*0Sstevel@tonic-gate return; 3999*0Sstevel@tonic-gate } 4000*0Sstevel@tonic-gate goto xmit_check; 4001*0Sstevel@tonic-gate case TCPS_CLOSING: 4002*0Sstevel@tonic-gate if (tcp->tcp_fin_acked) { 4003*0Sstevel@tonic-gate tcp->tcp_state = TCPS_TIME_WAIT; 4004*0Sstevel@tonic-gate tcp_time_wait_append(tcp); 4005*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 4006*0Sstevel@tonic-gate } 4007*0Sstevel@tonic-gate /*FALLTHRU*/ 4008*0Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 4009*0Sstevel@tonic-gate freemsg(mp); 4010*0Sstevel@tonic-gate goto xmit_check; 4011*0Sstevel@tonic-gate default: 4012*0Sstevel@tonic-gate assert(tcp->tcp_state != TCPS_TIME_WAIT); 4013*0Sstevel@tonic-gate break; 4014*0Sstevel@tonic-gate } 4015*0Sstevel@tonic-gate } 4016*0Sstevel@tonic-gate if (flags & TH_FIN) { 4017*0Sstevel@tonic-gate /* Make sure we ack the fin */ 4018*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 4019*0Sstevel@tonic-gate if (!tcp->tcp_fin_rcvd) { 4020*0Sstevel@tonic-gate tcp->tcp_fin_rcvd = B_TRUE; 4021*0Sstevel@tonic-gate tcp->tcp_rnxt++; 4022*0Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 4023*0Sstevel@tonic-gate 4024*0Sstevel@tonic-gate switch (tcp->tcp_state) { 4025*0Sstevel@tonic-gate case TCPS_SYN_RCVD: 4026*0Sstevel@tonic-gate case TCPS_ESTABLISHED: 4027*0Sstevel@tonic-gate tcp->tcp_state = TCPS_CLOSE_WAIT; 4028*0Sstevel@tonic-gate /* Keepalive? 
*/ 4029*0Sstevel@tonic-gate break; 4030*0Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 4031*0Sstevel@tonic-gate if (!tcp->tcp_fin_acked) { 4032*0Sstevel@tonic-gate tcp->tcp_state = TCPS_CLOSING; 4033*0Sstevel@tonic-gate break; 4034*0Sstevel@tonic-gate } 4035*0Sstevel@tonic-gate /* FALLTHRU */ 4036*0Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 4037*0Sstevel@tonic-gate tcp->tcp_state = TCPS_TIME_WAIT; 4038*0Sstevel@tonic-gate tcp_time_wait_append(tcp); 4039*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 4040*0Sstevel@tonic-gate if (seg_len) { 4041*0Sstevel@tonic-gate /* 4042*0Sstevel@tonic-gate * implies data piggybacked on FIN. 4043*0Sstevel@tonic-gate * break to handle data. 4044*0Sstevel@tonic-gate */ 4045*0Sstevel@tonic-gate break; 4046*0Sstevel@tonic-gate } 4047*0Sstevel@tonic-gate freemsg(mp); 4048*0Sstevel@tonic-gate goto ack_check; 4049*0Sstevel@tonic-gate } 4050*0Sstevel@tonic-gate } 4051*0Sstevel@tonic-gate } 4052*0Sstevel@tonic-gate if (mp == NULL) 4053*0Sstevel@tonic-gate goto xmit_check; 4054*0Sstevel@tonic-gate if (seg_len == 0) { 4055*0Sstevel@tonic-gate freemsg(mp); 4056*0Sstevel@tonic-gate goto xmit_check; 4057*0Sstevel@tonic-gate } 4058*0Sstevel@tonic-gate if (mp->b_rptr == mp->b_wptr) { 4059*0Sstevel@tonic-gate /* 4060*0Sstevel@tonic-gate * The header has been consumed, so we remove the 4061*0Sstevel@tonic-gate * zero-length mblk here. 4062*0Sstevel@tonic-gate */ 4063*0Sstevel@tonic-gate mp1 = mp; 4064*0Sstevel@tonic-gate mp = mp->b_cont; 4065*0Sstevel@tonic-gate freeb(mp1); 4066*0Sstevel@tonic-gate } 4067*0Sstevel@tonic-gate /* 4068*0Sstevel@tonic-gate * ACK every other segments, unless the input queue is empty 4069*0Sstevel@tonic-gate * as we don't have a timer available. 
4070*0Sstevel@tonic-gate */ 4071*0Sstevel@tonic-gate if (++tcp->tcp_rack_cnt == 2 || sockets[sock_id].inq == NULL) { 4072*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 4073*0Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 4074*0Sstevel@tonic-gate } 4075*0Sstevel@tonic-gate tcp->tcp_rnxt += seg_len; 4076*0Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 4077*0Sstevel@tonic-gate 4078*0Sstevel@tonic-gate /* Update SACK list */ 4079*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4080*0Sstevel@tonic-gate tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 4081*0Sstevel@tonic-gate &(tcp->tcp_num_sack_blk)); 4082*0Sstevel@tonic-gate } 4083*0Sstevel@tonic-gate 4084*0Sstevel@tonic-gate if (tcp->tcp_listener) { 4085*0Sstevel@tonic-gate /* 4086*0Sstevel@tonic-gate * Side queue inbound data until the accept happens. 4087*0Sstevel@tonic-gate * tcp_accept/tcp_rput drains this when the accept happens. 4088*0Sstevel@tonic-gate */ 4089*0Sstevel@tonic-gate tcp_rcv_enqueue(tcp, mp, seg_len); 4090*0Sstevel@tonic-gate } else { 4091*0Sstevel@tonic-gate /* Just queue the data until the app calls read. */ 4092*0Sstevel@tonic-gate tcp_rcv_enqueue(tcp, mp, seg_len); 4093*0Sstevel@tonic-gate /* 4094*0Sstevel@tonic-gate * Make sure the timer is running if we have data waiting 4095*0Sstevel@tonic-gate * for a push bit. This provides resiliency against 4096*0Sstevel@tonic-gate * implementations that do not correctly generate push bits. 4097*0Sstevel@tonic-gate */ 4098*0Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) 4099*0Sstevel@tonic-gate flags |= TH_TIMER_NEEDED; 4100*0Sstevel@tonic-gate } 4101*0Sstevel@tonic-gate 4102*0Sstevel@tonic-gate xmit_check: 4103*0Sstevel@tonic-gate /* Is there anything left to do? 
*/ 4104*0Sstevel@tonic-gate if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 4105*0Sstevel@tonic-gate TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_TIMER_NEEDED)) == 0) 4106*0Sstevel@tonic-gate return; 4107*0Sstevel@tonic-gate 4108*0Sstevel@tonic-gate /* Any transmit work to do and a non-zero window? */ 4109*0Sstevel@tonic-gate if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 4110*0Sstevel@tonic-gate TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 4111*0Sstevel@tonic-gate if (flags & TH_REXMIT_NEEDED) { 4112*0Sstevel@tonic-gate uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 4113*0Sstevel@tonic-gate 4114*0Sstevel@tonic-gate if (snd_size > mss) 4115*0Sstevel@tonic-gate snd_size = mss; 4116*0Sstevel@tonic-gate if (snd_size > tcp->tcp_swnd) 4117*0Sstevel@tonic-gate snd_size = tcp->tcp_swnd; 4118*0Sstevel@tonic-gate mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 4119*0Sstevel@tonic-gate NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 4120*0Sstevel@tonic-gate B_TRUE); 4121*0Sstevel@tonic-gate 4122*0Sstevel@tonic-gate if (mp1 != NULL) { 4123*0Sstevel@tonic-gate tcp->tcp_xmit_head->b_prev = 4124*0Sstevel@tonic-gate (mblk_t *)prom_gettime(); 4125*0Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 4126*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpRetransSegs); 4127*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpRetransBytes, snd_size); 4128*0Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp1); 4129*0Sstevel@tonic-gate freeb(mp1); 4130*0Sstevel@tonic-gate } 4131*0Sstevel@tonic-gate } 4132*0Sstevel@tonic-gate if (flags & TH_NEED_SACK_REXMIT) { 4133*0Sstevel@tonic-gate if (tcp_sack_rxmit(tcp, sock_id) != 0) { 4134*0Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 4135*0Sstevel@tonic-gate } 4136*0Sstevel@tonic-gate } 4137*0Sstevel@tonic-gate /* 4138*0Sstevel@tonic-gate * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 4139*0Sstevel@tonic-gate * out new segment. 
Note that tcp_rexmit should not be 4140*0Sstevel@tonic-gate * set, otherwise TH_LIMIT_XMIT should not be set. 4141*0Sstevel@tonic-gate */ 4142*0Sstevel@tonic-gate if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 4143*0Sstevel@tonic-gate if (!tcp->tcp_rexmit) { 4144*0Sstevel@tonic-gate tcp_wput_data(tcp, NULL, sock_id); 4145*0Sstevel@tonic-gate } else { 4146*0Sstevel@tonic-gate tcp_ss_rexmit(tcp, sock_id); 4147*0Sstevel@tonic-gate } 4148*0Sstevel@tonic-gate /* 4149*0Sstevel@tonic-gate * The TCP could be closed in tcp_state_wait via 4150*0Sstevel@tonic-gate * tcp_wput_data (tcp_ss_rexmit could call 4151*0Sstevel@tonic-gate * tcp_wput_data as well). 4152*0Sstevel@tonic-gate */ 4153*0Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 4154*0Sstevel@tonic-gate return; 4155*0Sstevel@tonic-gate } 4156*0Sstevel@tonic-gate /* 4157*0Sstevel@tonic-gate * Adjust tcp_cwnd back to normal value after sending 4158*0Sstevel@tonic-gate * new data segments. 4159*0Sstevel@tonic-gate */ 4160*0Sstevel@tonic-gate if (flags & TH_LIMIT_XMIT) { 4161*0Sstevel@tonic-gate tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 4162*0Sstevel@tonic-gate } 4163*0Sstevel@tonic-gate 4164*0Sstevel@tonic-gate /* Anything more to do? */ 4165*0Sstevel@tonic-gate if ((flags & (TH_ACK_NEEDED|TH_TIMER_NEEDED)) == 0) 4166*0Sstevel@tonic-gate return; 4167*0Sstevel@tonic-gate } 4168*0Sstevel@tonic-gate ack_check: 4169*0Sstevel@tonic-gate if (flags & TH_ACK_NEEDED) { 4170*0Sstevel@tonic-gate /* 4171*0Sstevel@tonic-gate * Time to send an ack for some reason. 
 */
		if ((mp1 = tcp_ack_mp(tcp)) != NULL) {
			TCP_DUMP_PACKET("tcp_rput_data: ack mp", mp1);
			(void) ipv4_tcp_output(sock_id, mp1);
			BUMP_MIB(tcp_mib.tcpOutAck);
			freeb(mp1);
		}
	}
}

/*
 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
 * retransmission after a timeout.
 *
 * To limit the number of duplicate segments, we limit the number of
 * segments to be sent in one time to tcp_snd_burst, the burst variable.
 *
 * Retransmission proceeds from tcp_rexmit_nxt up to (but not including)
 * tcp_rexmit_max, one mss-sized chunk at a time, bounded by the smaller
 * of the congestion and send windows.  Any remaining unsent data is then
 * handed off to tcp_wput_data() when the window still has room.
 */
static void
tcp_ss_rexmit(tcp_t *tcp, int sock_id)
{
	uint32_t	snxt;		/* next seq no to (re)transmit */
	uint32_t	smax;		/* end of the special rexmit range */
	int32_t		win;		/* usable window left, in bytes */
	int32_t		mss;
	int32_t		off;		/* byte offset into snxt_mp */
	int32_t		burst = tcp->tcp_snd_burst;
	mblk_t		*snxt_mp;

	/*
	 * Note that tcp_rexmit can be set even though TCP has retransmitted
	 * all unack'ed segments.
	 */
	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
		smax = tcp->tcp_rexmit_max;
		snxt = tcp->tcp_rexmit_nxt;
		if (SEQ_LT(snxt, tcp->tcp_suna)) {
			snxt = tcp->tcp_suna;
		}
		/*
		 * Usable window is the lesser of cwnd and swnd, minus
		 * what is already in flight beyond tcp_suna.
		 */
		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
		win -= snxt - tcp->tcp_suna;
		mss = tcp->tcp_mss;
		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);

		while (SEQ_LT(snxt, smax) && (win > 0) &&
		    (burst > 0) && (snxt_mp != NULL)) {
			mblk_t	*xmit_mp;
			mblk_t	*old_snxt_mp = snxt_mp;
			uint32_t cnt = mss;

			/* Clamp the chunk to the window and rexmit range. */
			if (win < cnt) {
				cnt = win;
			}
			if (SEQ_GT(snxt + cnt, smax)) {
				cnt = smax - snxt;
			}
			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);

			/* Allocation failure: give up; retry on next timer. */
			if (xmit_mp == NULL)
				return;

			(void) ipv4_tcp_output(sock_id, xmit_mp);
			freeb(xmit_mp);

			snxt += cnt;
			win -= cnt;
			/*
			 * Update the send timestamp to avoid false
			 * retransmission.
			 */
			old_snxt_mp->b_prev = (mblk_t *)prom_gettime();
			BUMP_MIB(tcp_mib.tcpRetransSegs);
			UPDATE_MIB(tcp_mib.tcpRetransBytes, cnt);

			tcp->tcp_rexmit_nxt = snxt;
			burst--;
		}
		/*
		 * If we have transmitted all we have at the time
		 * we started the retransmission, we can leave
		 * the rest of the job to tcp_wput_data(). But we
		 * need to check the send window first. If the
		 * win is not 0, go on with tcp_wput_data().
		 */
		if (SEQ_LT(snxt, smax) || win == 0) {
			return;
		}
	}
	/* Only call tcp_wput_data() if there is data to be sent. */
	if (tcp->tcp_unsent) {
		tcp_wput_data(tcp, NULL, sock_id);
	}
}

/*
 * tcp_timer is the timer service routine. It handles all timer events for
 * a tcp instance except keepalives. It figures out from the state of the
 * tcp instance what kind of action needs to be done at the time it is called.
 */
static void
tcp_timer(tcp_t	*tcp, int sock_id)
{
	mblk_t		*mp;
	uint32_t	first_threshold;	/* ms before RTT reset */
	uint32_t	second_threshold;	/* ms before giving up */
	uint32_t	ms;			/* next timeout interval */
	uint32_t	mss;			/* bytes to retransmit below */

	first_threshold =  tcp->tcp_first_timer_threshold;
	second_threshold = tcp->tcp_second_timer_threshold;
	switch (tcp->tcp_state) {
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		/* No timer activity in these states. */
		return;
	case TCPS_SYN_RCVD:
	case TCPS_SYN_SENT:
		/* Connection establishment uses its own thresholds. */
		first_threshold =  tcp->tcp_first_ctimer_threshold;
		second_threshold = tcp->tcp_second_ctimer_threshold;
		break;
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_CLOSE_WAIT:
	case TCPS_LAST_ACK:
		/* If we have data to rexmit */
		if (tcp->tcp_suna != tcp->tcp_snxt) {
			int32_t time_to_wait;

			BUMP_MIB(tcp_mib.tcpTimRetrans);
			if (tcp->tcp_xmit_head == NULL)
				break;
			/*
			 * b_prev of the head mblk holds the send timestamp
			 * (set in tcp_timer/tcp_ss_rexmit and elsewhere).
			 */
			time_to_wait = (int32_t)(prom_gettime() -
			    (uint32_t)tcp->tcp_xmit_head->b_prev);
			time_to_wait = tcp->tcp_rto - time_to_wait;
			if (time_to_wait > 0) {
				/*
				 * Timer fired too early, so restart it.
				 */
				TCP_TIMER_RESTART(tcp, time_to_wait);
				return;
			}
			/*
			 * When we probe zero windows, we force the swnd open.
			 * If our peer acks with a closed window swnd will be
			 * set to zero by tcp_rput(). As long as we are
			 * receiving acks tcp_rput will
			 * reset 'tcp_ms_we_have_waited' so as not to trip the
			 * first and second interval actions.  NOTE: the timer
			 * interval is allowed to continue its exponential
			 * backoff.
			 */
			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
				DEBUG_1("tcp_timer (%d): zero win", sock_id);
				break;
			} else {
				/*
				 * After retransmission, we need to do
				 * slow start.  Set the ssthresh to one
				 * half of current effective window and
				 * cwnd to one MSS.  Also reset
				 * tcp_cwnd_cnt.
				 *
				 * Note that if tcp_ssthresh is reduced because
				 * of ECN, do not reduce it again unless it is
				 * already one window of data away (tcp_cwr
				 * should then be cleared) or this is a
				 * timeout for a retransmitted segment.
				 */
				uint32_t npkt;

				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
					npkt = (MIN((tcp->tcp_timer_backoff ?
					    tcp->tcp_cwnd_ssthresh :
					    tcp->tcp_cwnd),
					    tcp->tcp_swnd) >> 1) /
					    tcp->tcp_mss;
					/* ssthresh never below 2 segments */
					if (npkt < 2)
						npkt = 2;
					tcp->tcp_cwnd_ssthresh = npkt *
					    tcp->tcp_mss;
				}
				tcp->tcp_cwnd = tcp->tcp_mss;
				tcp->tcp_cwnd_cnt = 0;
				if (tcp->tcp_ecn_ok) {
					tcp->tcp_cwr = B_TRUE;
					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
					tcp->tcp_ecn_cwr_sent = B_FALSE;
				}
			}
			break;
		}
		/*
		 * We have something to send yet we cannot send.  The
		 * reason can be:
		 *
		 * 1. Zero send window: we need to do zero window probe.
		 * 2. Zero cwnd: because of ECN, we need to "clock out"
		 * segments.
		 * 3. SWS avoidance: receiver may have shrunk window,
		 * reset our knowledge.
		 *
		 * Note that condition 2 can happen with either 1 or
		 * 3.  But 1 and 3 are exclusive.
		 */
		if (tcp->tcp_unsent != 0) {
			if (tcp->tcp_cwnd == 0) {
				/*
				 * Set tcp_cwnd to 1 MSS so that a
				 * new segment can be sent out.  We
				 * are "clocking out" new data when
				 * the network is really congested.
				 */
				assert(tcp->tcp_ecn_ok);
				tcp->tcp_cwnd = tcp->tcp_mss;
			}
			if (tcp->tcp_swnd == 0) {
				/* Extend window for zero window probe */
				tcp->tcp_swnd++;
				tcp->tcp_zero_win_probe = B_TRUE;
				BUMP_MIB(tcp_mib.tcpOutWinProbe);
			} else {
				/*
				 * Handle timeout from sender SWS avoidance.
				 * Reset our knowledge of the max send window
				 * since the receiver might have reduced its
				 * receive buffer.  Avoid setting tcp_max_swnd
				 * to one since that will essentially disable
				 * the SWS checks.
				 *
				 * Note that since we don't have a SWS
				 * state variable, if the timeout is set
				 * for ECN but not for SWS, this
				 * code will also be executed.  This is
				 * fine as tcp_max_swnd is updated
				 * constantly and it will not affect
				 * anything.
				 */
				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
			}
			tcp_wput_data(tcp, NULL, sock_id);
			return;
		}
		/* Is there a FIN that needs to be retransmitted? */
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
		    !tcp->tcp_fin_acked)
			break;
		/* Nothing to do, return without restarting timer. */
		return;
	case TCPS_FIN_WAIT_2:
		/*
		 * User closed the TCP endpoint and peer ACK'ed our FIN.
		 * We waited some time for peer's FIN, but it hasn't
		 * arrived.  We flush the connection now to avoid
		 * case where the peer has rebooted.
		 */
		/* FALLTHRU */
	case TCPS_TIME_WAIT:
		(void) tcp_clean_death(sock_id, tcp, 0);
		return;
	default:
		DEBUG_3("tcp_timer (%d): strange state (%d) %s", sock_id,
		    tcp->tcp_state, tcp_display(tcp, NULL,
		    DISP_PORT_ONLY));
		return;
	}
	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
		/*
		 * For zero window probe, we need to send indefinitely,
		 * unless we have not heard from the other side for some
		 * time...
		 */
		if ((tcp->tcp_zero_win_probe == 0) ||
		    ((prom_gettime() - tcp->tcp_last_recv_time) >
		    second_threshold)) {
			BUMP_MIB(tcp_mib.tcpTimRetransDrop);
			/*
			 * If TCP is in SYN_RCVD state, send back a
			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
			 * should be zero in TCPS_SYN_RCVD state.
			 */
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
				    "in SYN_RCVD",
				    tcp, NULL, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_RST | TH_ACK, 0, sock_id);
			}
			(void) tcp_clean_death(sock_id, tcp,
			    tcp->tcp_client_errno ?
			    tcp->tcp_client_errno : ETIMEDOUT);
			return;
		} else {
			/*
			 * Set tcp_ms_we_have_waited to second_threshold
			 * so that in next timeout, we will do the above
			 * check (lbolt - tcp_last_recv_time).  This is
			 * also to avoid overflow.
			 *
			 * We don't need to decrement tcp_timer_backoff
			 * to avoid overflow because it will be decremented
			 * later if new timeout value is greater than
			 * tcp_rexmit_interval_max.  In the case when
			 * tcp_rexmit_interval_max is greater than
			 * second_threshold, it means that we will wait
			 * longer than second_threshold to send the next
			 * window probe.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
		}
	} else if (ms > first_threshold && tcp->tcp_rtt_sa != 0) {
		/*
		 * We have been retransmitting for too long...  The RTT
		 * we calculated is probably incorrect.  Reinitialize it.
		 * Need to compensate for 0 tcp_rtt_sa.  Reset
		 * tcp_rtt_update so that we won't accidentally cache a
		 * bad value.  But only do this if this is not a zero
		 * window probe.
		 */
		if (tcp->tcp_zero_win_probe == 0) {
			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
			    (tcp->tcp_rtt_sa >> 5);
			tcp->tcp_rtt_sa = 0;
			tcp->tcp_rtt_update = 0;
		}
	}
	/* Exponential backoff: RTO = (srtt + variance terms) << backoff. */
	tcp->tcp_timer_backoff++;
	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
	    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
	    tcp_rexmit_interval_min) {
		/*
		 * This means the original RTO is tcp_rexmit_interval_min.
		 * So we will use tcp_rexmit_interval_min as the RTO value
		 * and do the backoff.
		 */
		ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff;
	} else {
		ms <<= tcp->tcp_timer_backoff;
	}
	if (ms > tcp_rexmit_interval_max) {
		ms = tcp_rexmit_interval_max;
		/*
		 * ms is at max, decrement tcp_timer_backoff to avoid
		 * overflow.
		 */
		tcp->tcp_timer_backoff--;
	}
	tcp->tcp_ms_we_have_waited += ms;
	if (tcp->tcp_zero_win_probe == 0) {
		tcp->tcp_rto = ms;
	}
	TCP_TIMER_RESTART(tcp, ms);
	/*
	 * This is after a timeout and tcp_rto is backed off.  Set
	 * tcp_set_timer to 1 so that next time RTO is updated, we will
	 * restart the timer with a correct value.
	 */
	tcp->tcp_set_timer = 1;
	/*
	 * Retransmit one segment: the lesser of the outstanding data,
	 * one MSS, and the (non-zero) send window.
	 */
	mss = tcp->tcp_snxt - tcp->tcp_suna;
	if (mss > tcp->tcp_mss)
		mss = tcp->tcp_mss;
	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
		mss = tcp->tcp_swnd;

	if ((mp = tcp->tcp_xmit_head) != NULL)
		mp->b_prev = (mblk_t *)prom_gettime();
	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
	    B_TRUE);
	if (mp == NULL)
		return;
	tcp->tcp_csuna = tcp->tcp_snxt;
	BUMP_MIB(tcp_mib.tcpRetransSegs);
	UPDATE_MIB(tcp_mib.tcpRetransBytes, mss);
	/* Dump the packet when debugging. */
	TCP_DUMP_PACKET("tcp_timer", mp);

	(void) ipv4_tcp_output(sock_id, mp);
	freeb(mp);

	/*
	 * When slow start after retransmission begins, start with
	 * this seq no.  tcp_rexmit_max marks the end of special slow
	 * start phase.  tcp_snd_burst controls how many segments
	 * can be sent because of an ack.
	 */
	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
	tcp->tcp_snd_burst = TCP_CWND_SS;
	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
	    (tcp->tcp_unsent == 0)) {
		tcp->tcp_rexmit_max = tcp->tcp_fss;
	} else {
		tcp->tcp_rexmit_max = tcp->tcp_snxt;
	}
	tcp->tcp_rexmit = B_TRUE;
	tcp->tcp_dupack_cnt = 0;

	/*
	 * Remove all rexmit SACK blk to start from fresh.
	 */
	if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
		tcp->tcp_num_notsack_blk = 0;
		tcp->tcp_cnt_notsack_list = 0;
	}
}

/*
 * The TCP normal data output path.
 * NOTE: the logic of the fast path is duplicated from this function.
4580*0Sstevel@tonic-gate */ 4581*0Sstevel@tonic-gate static void 4582*0Sstevel@tonic-gate tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 4583*0Sstevel@tonic-gate { 4584*0Sstevel@tonic-gate int len; 4585*0Sstevel@tonic-gate mblk_t *local_time; 4586*0Sstevel@tonic-gate mblk_t *mp1; 4587*0Sstevel@tonic-gate uchar_t *rptr; 4588*0Sstevel@tonic-gate uint32_t snxt; 4589*0Sstevel@tonic-gate int tail_unsent; 4590*0Sstevel@tonic-gate int tcpstate; 4591*0Sstevel@tonic-gate int usable = 0; 4592*0Sstevel@tonic-gate mblk_t *xmit_tail; 4593*0Sstevel@tonic-gate int32_t num_burst_seg; 4594*0Sstevel@tonic-gate int32_t mss; 4595*0Sstevel@tonic-gate int32_t num_sack_blk = 0; 4596*0Sstevel@tonic-gate int32_t tcp_hdr_len; 4597*0Sstevel@tonic-gate ipaddr_t *dst; 4598*0Sstevel@tonic-gate ipaddr_t *src; 4599*0Sstevel@tonic-gate 4600*0Sstevel@tonic-gate #ifdef DEBUG 4601*0Sstevel@tonic-gate printf("tcp_wput_data(%d) ##############################\n", sock_id); 4602*0Sstevel@tonic-gate #endif 4603*0Sstevel@tonic-gate tcpstate = tcp->tcp_state; 4604*0Sstevel@tonic-gate if (mp == NULL) { 4605*0Sstevel@tonic-gate /* Really tacky... but we need this for detached closes. */ 4606*0Sstevel@tonic-gate len = tcp->tcp_unsent; 4607*0Sstevel@tonic-gate goto data_null; 4608*0Sstevel@tonic-gate } 4609*0Sstevel@tonic-gate 4610*0Sstevel@tonic-gate /* 4611*0Sstevel@tonic-gate * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 4612*0Sstevel@tonic-gate * or before a connection attempt has begun. 4613*0Sstevel@tonic-gate * 4614*0Sstevel@tonic-gate * The following should not happen in inetboot.... 
4615*0Sstevel@tonic-gate */ 4616*0Sstevel@tonic-gate if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 4617*0Sstevel@tonic-gate (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4618*0Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4619*0Sstevel@tonic-gate printf("tcp_wput_data: data after ordrel, %s\n", 4620*0Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 4621*0Sstevel@tonic-gate } 4622*0Sstevel@tonic-gate freemsg(mp); 4623*0Sstevel@tonic-gate return; 4624*0Sstevel@tonic-gate } 4625*0Sstevel@tonic-gate 4626*0Sstevel@tonic-gate /* Strip empties */ 4627*0Sstevel@tonic-gate for (;;) { 4628*0Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 4629*0Sstevel@tonic-gate (uintptr_t)INT_MAX); 4630*0Sstevel@tonic-gate len = (int)(mp->b_wptr - mp->b_rptr); 4631*0Sstevel@tonic-gate if (len > 0) 4632*0Sstevel@tonic-gate break; 4633*0Sstevel@tonic-gate mp1 = mp; 4634*0Sstevel@tonic-gate mp = mp->b_cont; 4635*0Sstevel@tonic-gate freeb(mp1); 4636*0Sstevel@tonic-gate if (mp == NULL) { 4637*0Sstevel@tonic-gate return; 4638*0Sstevel@tonic-gate } 4639*0Sstevel@tonic-gate } 4640*0Sstevel@tonic-gate 4641*0Sstevel@tonic-gate /* If we are the first on the list ... 
*/ 4642*0Sstevel@tonic-gate if (tcp->tcp_xmit_head == NULL) { 4643*0Sstevel@tonic-gate tcp->tcp_xmit_head = mp; 4644*0Sstevel@tonic-gate tcp->tcp_xmit_tail = mp; 4645*0Sstevel@tonic-gate tcp->tcp_xmit_tail_unsent = len; 4646*0Sstevel@tonic-gate } else { 4647*0Sstevel@tonic-gate tcp->tcp_xmit_last->b_cont = mp; 4648*0Sstevel@tonic-gate len += tcp->tcp_unsent; 4649*0Sstevel@tonic-gate } 4650*0Sstevel@tonic-gate 4651*0Sstevel@tonic-gate /* Tack on however many more positive length mblks we have */ 4652*0Sstevel@tonic-gate if ((mp1 = mp->b_cont) != NULL) { 4653*0Sstevel@tonic-gate do { 4654*0Sstevel@tonic-gate int tlen; 4655*0Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - 4656*0Sstevel@tonic-gate mp1->b_rptr) <= (uintptr_t)INT_MAX); 4657*0Sstevel@tonic-gate tlen = (int)(mp1->b_wptr - mp1->b_rptr); 4658*0Sstevel@tonic-gate if (tlen <= 0) { 4659*0Sstevel@tonic-gate mp->b_cont = mp1->b_cont; 4660*0Sstevel@tonic-gate freeb(mp1); 4661*0Sstevel@tonic-gate } else { 4662*0Sstevel@tonic-gate len += tlen; 4663*0Sstevel@tonic-gate mp = mp1; 4664*0Sstevel@tonic-gate } 4665*0Sstevel@tonic-gate } while ((mp1 = mp->b_cont) != NULL); 4666*0Sstevel@tonic-gate } 4667*0Sstevel@tonic-gate tcp->tcp_xmit_last = mp; 4668*0Sstevel@tonic-gate tcp->tcp_unsent = len; 4669*0Sstevel@tonic-gate 4670*0Sstevel@tonic-gate data_null: 4671*0Sstevel@tonic-gate snxt = tcp->tcp_snxt; 4672*0Sstevel@tonic-gate xmit_tail = tcp->tcp_xmit_tail; 4673*0Sstevel@tonic-gate tail_unsent = tcp->tcp_xmit_tail_unsent; 4674*0Sstevel@tonic-gate 4675*0Sstevel@tonic-gate /* 4676*0Sstevel@tonic-gate * Note that tcp_mss has been adjusted to take into account the 4677*0Sstevel@tonic-gate * timestamp option if applicable. Because SACK options do not 4678*0Sstevel@tonic-gate * appear in every TCP segments and they are of variable lengths, 4679*0Sstevel@tonic-gate * they cannot be included in tcp_mss. 
Thus we need to calculate 4680*0Sstevel@tonic-gate * the actual segment length when we need to send a segment which 4681*0Sstevel@tonic-gate * includes SACK options. 4682*0Sstevel@tonic-gate */ 4683*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4684*0Sstevel@tonic-gate int32_t opt_len; 4685*0Sstevel@tonic-gate 4686*0Sstevel@tonic-gate num_sack_blk = MIN(tcp->tcp_max_sack_blk, 4687*0Sstevel@tonic-gate tcp->tcp_num_sack_blk); 4688*0Sstevel@tonic-gate opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 4689*0Sstevel@tonic-gate 2 + TCPOPT_HEADER_LEN; 4690*0Sstevel@tonic-gate mss = tcp->tcp_mss - opt_len; 4691*0Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len + opt_len; 4692*0Sstevel@tonic-gate } else { 4693*0Sstevel@tonic-gate mss = tcp->tcp_mss; 4694*0Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len; 4695*0Sstevel@tonic-gate } 4696*0Sstevel@tonic-gate 4697*0Sstevel@tonic-gate if ((tcp->tcp_suna == snxt) && 4698*0Sstevel@tonic-gate (prom_gettime() - tcp->tcp_last_recv_time) >= tcp->tcp_rto) { 4699*0Sstevel@tonic-gate tcp->tcp_cwnd = MIN(tcp_slow_start_after_idle * mss, 4700*0Sstevel@tonic-gate MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 4701*0Sstevel@tonic-gate } 4702*0Sstevel@tonic-gate if (tcpstate == TCPS_SYN_RCVD) { 4703*0Sstevel@tonic-gate /* 4704*0Sstevel@tonic-gate * The three-way connection establishment handshake is not 4705*0Sstevel@tonic-gate * complete yet. We want to queue the data for transmission 4706*0Sstevel@tonic-gate * after entering ESTABLISHED state (RFC793). Setting usable to 4707*0Sstevel@tonic-gate * zero cause a jump to "done" label effectively leaving data 4708*0Sstevel@tonic-gate * on the queue. 
4709*0Sstevel@tonic-gate */ 4710*0Sstevel@tonic-gate 4711*0Sstevel@tonic-gate usable = 0; 4712*0Sstevel@tonic-gate } else { 4713*0Sstevel@tonic-gate int usable_r = tcp->tcp_swnd; 4714*0Sstevel@tonic-gate 4715*0Sstevel@tonic-gate /* 4716*0Sstevel@tonic-gate * In the special case when cwnd is zero, which can only 4717*0Sstevel@tonic-gate * happen if the connection is ECN capable, return now. 4718*0Sstevel@tonic-gate * New segments is sent using tcp_timer(). The timer 4719*0Sstevel@tonic-gate * is set in tcp_rput_data(). 4720*0Sstevel@tonic-gate */ 4721*0Sstevel@tonic-gate if (tcp->tcp_cwnd == 0) { 4722*0Sstevel@tonic-gate /* 4723*0Sstevel@tonic-gate * Note that tcp_cwnd is 0 before 3-way handshake is 4724*0Sstevel@tonic-gate * finished. 4725*0Sstevel@tonic-gate */ 4726*0Sstevel@tonic-gate assert(tcp->tcp_ecn_ok || 4727*0Sstevel@tonic-gate tcp->tcp_state < TCPS_ESTABLISHED); 4728*0Sstevel@tonic-gate return; 4729*0Sstevel@tonic-gate } 4730*0Sstevel@tonic-gate 4731*0Sstevel@tonic-gate /* usable = MIN(swnd, cwnd) - unacked_bytes */ 4732*0Sstevel@tonic-gate if (usable_r > tcp->tcp_cwnd) 4733*0Sstevel@tonic-gate usable_r = tcp->tcp_cwnd; 4734*0Sstevel@tonic-gate 4735*0Sstevel@tonic-gate /* NOTE: trouble if xmitting while SYN not acked? */ 4736*0Sstevel@tonic-gate usable_r -= snxt; 4737*0Sstevel@tonic-gate usable_r += tcp->tcp_suna; 4738*0Sstevel@tonic-gate 4739*0Sstevel@tonic-gate /* usable = MIN(usable, unsent) */ 4740*0Sstevel@tonic-gate if (usable_r > len) 4741*0Sstevel@tonic-gate usable_r = len; 4742*0Sstevel@tonic-gate 4743*0Sstevel@tonic-gate /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 4744*0Sstevel@tonic-gate if (usable_r != 0) 4745*0Sstevel@tonic-gate usable = usable_r; 4746*0Sstevel@tonic-gate } 4747*0Sstevel@tonic-gate 4748*0Sstevel@tonic-gate local_time = (mblk_t *)prom_gettime(); 4749*0Sstevel@tonic-gate 4750*0Sstevel@tonic-gate /* 4751*0Sstevel@tonic-gate * "Our" Nagle Algorithm. This is not the same as in the old 4752*0Sstevel@tonic-gate * BSD. 
This is more in line with the true intent of Nagle. 4753*0Sstevel@tonic-gate * 4754*0Sstevel@tonic-gate * The conditions are: 4755*0Sstevel@tonic-gate * 1. The amount of unsent data (or amount of data which can be 4756*0Sstevel@tonic-gate * sent, whichever is smaller) is less than Nagle limit. 4757*0Sstevel@tonic-gate * 2. The last sent size is also less than Nagle limit. 4758*0Sstevel@tonic-gate * 3. There is unack'ed data. 4759*0Sstevel@tonic-gate * 4. Urgent pointer is not set. Send urgent data ignoring the 4760*0Sstevel@tonic-gate * Nagle algorithm. This reduces the probability that urgent 4761*0Sstevel@tonic-gate * bytes get "merged" together. 4762*0Sstevel@tonic-gate * 5. The app has not closed the connection. This eliminates the 4763*0Sstevel@tonic-gate * wait time of the receiving side waiting for the last piece of 4764*0Sstevel@tonic-gate * (small) data. 4765*0Sstevel@tonic-gate * 4766*0Sstevel@tonic-gate * If all are satisified, exit without sending anything. Note 4767*0Sstevel@tonic-gate * that Nagle limit can be smaller than 1 MSS. Nagle limit is 4768*0Sstevel@tonic-gate * the smaller of 1 MSS and global tcp_naglim_def (default to be 4769*0Sstevel@tonic-gate * 4095). 
4770*0Sstevel@tonic-gate */ 4771*0Sstevel@tonic-gate if (usable < (int)tcp->tcp_naglim && 4772*0Sstevel@tonic-gate tcp->tcp_naglim > tcp->tcp_last_sent_len && 4773*0Sstevel@tonic-gate snxt != tcp->tcp_suna && 4774*0Sstevel@tonic-gate !(tcp->tcp_valid_bits & TCP_URG_VALID)) 4775*0Sstevel@tonic-gate goto done; 4776*0Sstevel@tonic-gate 4777*0Sstevel@tonic-gate num_burst_seg = tcp->tcp_snd_burst; 4778*0Sstevel@tonic-gate for (;;) { 4779*0Sstevel@tonic-gate tcph_t *tcph; 4780*0Sstevel@tonic-gate mblk_t *new_mp; 4781*0Sstevel@tonic-gate 4782*0Sstevel@tonic-gate if (num_burst_seg-- == 0) 4783*0Sstevel@tonic-gate goto done; 4784*0Sstevel@tonic-gate 4785*0Sstevel@tonic-gate len = mss; 4786*0Sstevel@tonic-gate if (len > usable) { 4787*0Sstevel@tonic-gate len = usable; 4788*0Sstevel@tonic-gate if (len <= 0) { 4789*0Sstevel@tonic-gate /* Terminate the loop */ 4790*0Sstevel@tonic-gate goto done; 4791*0Sstevel@tonic-gate } 4792*0Sstevel@tonic-gate /* 4793*0Sstevel@tonic-gate * Sender silly-window avoidance. 4794*0Sstevel@tonic-gate * Ignore this if we are going to send a 4795*0Sstevel@tonic-gate * zero window probe out. 4796*0Sstevel@tonic-gate * 4797*0Sstevel@tonic-gate * TODO: force data into microscopic window ?? 4798*0Sstevel@tonic-gate * ==> (!pushed || (unsent > usable)) 4799*0Sstevel@tonic-gate */ 4800*0Sstevel@tonic-gate if (len < (tcp->tcp_max_swnd >> 1) && 4801*0Sstevel@tonic-gate (tcp->tcp_unsent - (snxt - tcp->tcp_snxt)) > len && 4802*0Sstevel@tonic-gate !((tcp->tcp_valid_bits & TCP_URG_VALID) && 4803*0Sstevel@tonic-gate len == 1) && (! tcp->tcp_zero_win_probe)) { 4804*0Sstevel@tonic-gate /* 4805*0Sstevel@tonic-gate * If the retransmit timer is not running 4806*0Sstevel@tonic-gate * we start it so that we will retransmit 4807*0Sstevel@tonic-gate * in the case when the the receiver has 4808*0Sstevel@tonic-gate * decremented the window. 
4809*0Sstevel@tonic-gate */ 4810*0Sstevel@tonic-gate if (snxt == tcp->tcp_snxt && 4811*0Sstevel@tonic-gate snxt == tcp->tcp_suna) { 4812*0Sstevel@tonic-gate /* 4813*0Sstevel@tonic-gate * We are not supposed to send 4814*0Sstevel@tonic-gate * anything. So let's wait a little 4815*0Sstevel@tonic-gate * bit longer before breaking SWS 4816*0Sstevel@tonic-gate * avoidance. 4817*0Sstevel@tonic-gate * 4818*0Sstevel@tonic-gate * What should the value be? 4819*0Sstevel@tonic-gate * Suggestion: MAX(init rexmit time, 4820*0Sstevel@tonic-gate * tcp->tcp_rto) 4821*0Sstevel@tonic-gate */ 4822*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4823*0Sstevel@tonic-gate } 4824*0Sstevel@tonic-gate goto done; 4825*0Sstevel@tonic-gate } 4826*0Sstevel@tonic-gate } 4827*0Sstevel@tonic-gate 4828*0Sstevel@tonic-gate tcph = tcp->tcp_tcph; 4829*0Sstevel@tonic-gate 4830*0Sstevel@tonic-gate usable -= len; /* Approximate - can be adjusted later */ 4831*0Sstevel@tonic-gate if (usable > 0) 4832*0Sstevel@tonic-gate tcph->th_flags[0] = TH_ACK; 4833*0Sstevel@tonic-gate else 4834*0Sstevel@tonic-gate tcph->th_flags[0] = (TH_ACK | TH_PUSH); 4835*0Sstevel@tonic-gate 4836*0Sstevel@tonic-gate U32_TO_ABE32(snxt, tcph->th_seq); 4837*0Sstevel@tonic-gate 4838*0Sstevel@tonic-gate if (tcp->tcp_valid_bits) { 4839*0Sstevel@tonic-gate uchar_t *prev_rptr = xmit_tail->b_rptr; 4840*0Sstevel@tonic-gate uint32_t prev_snxt = tcp->tcp_snxt; 4841*0Sstevel@tonic-gate 4842*0Sstevel@tonic-gate if (tail_unsent == 0) { 4843*0Sstevel@tonic-gate assert(xmit_tail->b_cont != NULL); 4844*0Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 4845*0Sstevel@tonic-gate prev_rptr = xmit_tail->b_rptr; 4846*0Sstevel@tonic-gate tail_unsent = (int)(xmit_tail->b_wptr - 4847*0Sstevel@tonic-gate xmit_tail->b_rptr); 4848*0Sstevel@tonic-gate } else { 4849*0Sstevel@tonic-gate xmit_tail->b_rptr = xmit_tail->b_wptr - 4850*0Sstevel@tonic-gate tail_unsent; 4851*0Sstevel@tonic-gate } 4852*0Sstevel@tonic-gate mp = tcp_xmit_mp(tcp, xmit_tail, len, 
NULL, NULL, 4853*0Sstevel@tonic-gate snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 4854*0Sstevel@tonic-gate /* Restore tcp_snxt so we get amount sent right. */ 4855*0Sstevel@tonic-gate tcp->tcp_snxt = prev_snxt; 4856*0Sstevel@tonic-gate if (prev_rptr == xmit_tail->b_rptr) 4857*0Sstevel@tonic-gate xmit_tail->b_prev = local_time; 4858*0Sstevel@tonic-gate else 4859*0Sstevel@tonic-gate xmit_tail->b_rptr = prev_rptr; 4860*0Sstevel@tonic-gate 4861*0Sstevel@tonic-gate if (mp == NULL) 4862*0Sstevel@tonic-gate break; 4863*0Sstevel@tonic-gate 4864*0Sstevel@tonic-gate mp1 = mp->b_cont; 4865*0Sstevel@tonic-gate 4866*0Sstevel@tonic-gate snxt += len; 4867*0Sstevel@tonic-gate tcp->tcp_last_sent_len = (ushort_t)len; 4868*0Sstevel@tonic-gate while (mp1->b_cont) { 4869*0Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 4870*0Sstevel@tonic-gate xmit_tail->b_prev = local_time; 4871*0Sstevel@tonic-gate mp1 = mp1->b_cont; 4872*0Sstevel@tonic-gate } 4873*0Sstevel@tonic-gate tail_unsent = xmit_tail->b_wptr - mp1->b_wptr; 4874*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutDataSegs); 4875*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4876*0Sstevel@tonic-gate /* Dump the packet when debugging. */ 4877*0Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_wput_data (valid bits)", mp); 4878*0Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 4879*0Sstevel@tonic-gate freeb(mp); 4880*0Sstevel@tonic-gate continue; 4881*0Sstevel@tonic-gate } 4882*0Sstevel@tonic-gate 4883*0Sstevel@tonic-gate snxt += len; /* Adjust later if we don't send all of len */ 4884*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutDataSegs); 4885*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4886*0Sstevel@tonic-gate 4887*0Sstevel@tonic-gate if (tail_unsent) { 4888*0Sstevel@tonic-gate /* Are the bytes above us in flight? 
*/ 4889*0Sstevel@tonic-gate rptr = xmit_tail->b_wptr - tail_unsent; 4890*0Sstevel@tonic-gate if (rptr != xmit_tail->b_rptr) { 4891*0Sstevel@tonic-gate tail_unsent -= len; 4892*0Sstevel@tonic-gate len += tcp_hdr_len; 4893*0Sstevel@tonic-gate tcp->tcp_ipha->ip_len = htons(len); 4894*0Sstevel@tonic-gate mp = dupb(xmit_tail); 4895*0Sstevel@tonic-gate if (!mp) 4896*0Sstevel@tonic-gate break; 4897*0Sstevel@tonic-gate mp->b_rptr = rptr; 4898*0Sstevel@tonic-gate goto must_alloc; 4899*0Sstevel@tonic-gate } 4900*0Sstevel@tonic-gate } else { 4901*0Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 4902*0Sstevel@tonic-gate assert((uintptr_t)(xmit_tail->b_wptr - 4903*0Sstevel@tonic-gate xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 4904*0Sstevel@tonic-gate tail_unsent = (int)(xmit_tail->b_wptr - 4905*0Sstevel@tonic-gate xmit_tail->b_rptr); 4906*0Sstevel@tonic-gate } 4907*0Sstevel@tonic-gate 4908*0Sstevel@tonic-gate tail_unsent -= len; 4909*0Sstevel@tonic-gate tcp->tcp_last_sent_len = (ushort_t)len; 4910*0Sstevel@tonic-gate 4911*0Sstevel@tonic-gate len += tcp_hdr_len; 4912*0Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) 4913*0Sstevel@tonic-gate tcp->tcp_ipha->ip_len = htons(len); 4914*0Sstevel@tonic-gate 4915*0Sstevel@tonic-gate xmit_tail->b_prev = local_time; 4916*0Sstevel@tonic-gate 4917*0Sstevel@tonic-gate mp = dupb(xmit_tail); 4918*0Sstevel@tonic-gate if (mp == NULL) 4919*0Sstevel@tonic-gate goto out_of_mem; 4920*0Sstevel@tonic-gate 4921*0Sstevel@tonic-gate len = tcp_hdr_len; 4922*0Sstevel@tonic-gate /* 4923*0Sstevel@tonic-gate * There are four reasons to allocate a new hdr mblk: 4924*0Sstevel@tonic-gate * 1) The bytes above us are in use by another packet 4925*0Sstevel@tonic-gate * 2) We don't have good alignment 4926*0Sstevel@tonic-gate * 3) The mblk is being shared 4927*0Sstevel@tonic-gate * 4) We don't have enough room for a header 4928*0Sstevel@tonic-gate */ 4929*0Sstevel@tonic-gate rptr = mp->b_rptr - len; 4930*0Sstevel@tonic-gate if (!OK_32PTR(rptr) || 
4931*0Sstevel@tonic-gate rptr < mp->b_datap) { 4932*0Sstevel@tonic-gate /* NOTE: we assume allocb returns an OK_32PTR */ 4933*0Sstevel@tonic-gate 4934*0Sstevel@tonic-gate must_alloc:; 4935*0Sstevel@tonic-gate mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 4936*0Sstevel@tonic-gate tcp_wroff_xtra, 0); 4937*0Sstevel@tonic-gate if (mp1 == NULL) { 4938*0Sstevel@tonic-gate freemsg(mp); 4939*0Sstevel@tonic-gate goto out_of_mem; 4940*0Sstevel@tonic-gate } 4941*0Sstevel@tonic-gate mp1->b_cont = mp; 4942*0Sstevel@tonic-gate mp = mp1; 4943*0Sstevel@tonic-gate /* Leave room for Link Level header */ 4944*0Sstevel@tonic-gate len = tcp_hdr_len; 4945*0Sstevel@tonic-gate rptr = &mp->b_rptr[tcp_wroff_xtra]; 4946*0Sstevel@tonic-gate mp->b_wptr = &rptr[len]; 4947*0Sstevel@tonic-gate } 4948*0Sstevel@tonic-gate 4949*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 4950*0Sstevel@tonic-gate U32_TO_BE32((uint32_t)local_time, 4951*0Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 4952*0Sstevel@tonic-gate U32_TO_BE32(tcp->tcp_ts_recent, 4953*0Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 4954*0Sstevel@tonic-gate } else { 4955*0Sstevel@tonic-gate assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 4956*0Sstevel@tonic-gate } 4957*0Sstevel@tonic-gate 4958*0Sstevel@tonic-gate mp->b_rptr = rptr; 4959*0Sstevel@tonic-gate 4960*0Sstevel@tonic-gate /* Copy the template header. 
*/ 4961*0Sstevel@tonic-gate dst = (ipaddr_t *)rptr; 4962*0Sstevel@tonic-gate src = (ipaddr_t *)tcp->tcp_iphc; 4963*0Sstevel@tonic-gate dst[0] = src[0]; 4964*0Sstevel@tonic-gate dst[1] = src[1]; 4965*0Sstevel@tonic-gate dst[2] = src[2]; 4966*0Sstevel@tonic-gate dst[3] = src[3]; 4967*0Sstevel@tonic-gate dst[4] = src[4]; 4968*0Sstevel@tonic-gate dst[5] = src[5]; 4969*0Sstevel@tonic-gate dst[6] = src[6]; 4970*0Sstevel@tonic-gate dst[7] = src[7]; 4971*0Sstevel@tonic-gate dst[8] = src[8]; 4972*0Sstevel@tonic-gate dst[9] = src[9]; 4973*0Sstevel@tonic-gate len = tcp->tcp_hdr_len; 4974*0Sstevel@tonic-gate if (len -= 40) { 4975*0Sstevel@tonic-gate len >>= 2; 4976*0Sstevel@tonic-gate dst += 10; 4977*0Sstevel@tonic-gate src += 10; 4978*0Sstevel@tonic-gate do { 4979*0Sstevel@tonic-gate *dst++ = *src++; 4980*0Sstevel@tonic-gate } while (--len); 4981*0Sstevel@tonic-gate } 4982*0Sstevel@tonic-gate 4983*0Sstevel@tonic-gate /* 4984*0Sstevel@tonic-gate * Set tcph to point to the header of the outgoing packet, 4985*0Sstevel@tonic-gate * not to the template header. 4986*0Sstevel@tonic-gate */ 4987*0Sstevel@tonic-gate tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 4988*0Sstevel@tonic-gate 4989*0Sstevel@tonic-gate /* 4990*0Sstevel@tonic-gate * Set the ECN info in the TCP header if it is not a zero 4991*0Sstevel@tonic-gate * window probe. Zero window probe is only sent in 4992*0Sstevel@tonic-gate * tcp_wput_data() and tcp_timer(). 
4993*0Sstevel@tonic-gate */ 4994*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 4995*0Sstevel@tonic-gate SET_ECT(tcp, rptr); 4996*0Sstevel@tonic-gate 4997*0Sstevel@tonic-gate if (tcp->tcp_ecn_echo_on) 4998*0Sstevel@tonic-gate tcph->th_flags[0] |= TH_ECE; 4999*0Sstevel@tonic-gate if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5000*0Sstevel@tonic-gate tcph->th_flags[0] |= TH_CWR; 5001*0Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_TRUE; 5002*0Sstevel@tonic-gate } 5003*0Sstevel@tonic-gate } 5004*0Sstevel@tonic-gate 5005*0Sstevel@tonic-gate /* Fill in SACK options */ 5006*0Sstevel@tonic-gate if (num_sack_blk > 0) { 5007*0Sstevel@tonic-gate uchar_t *wptr = rptr + tcp->tcp_hdr_len; 5008*0Sstevel@tonic-gate sack_blk_t *tmp; 5009*0Sstevel@tonic-gate int32_t i; 5010*0Sstevel@tonic-gate 5011*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5012*0Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5013*0Sstevel@tonic-gate wptr[2] = TCPOPT_SACK; 5014*0Sstevel@tonic-gate wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5015*0Sstevel@tonic-gate sizeof (sack_blk_t); 5016*0Sstevel@tonic-gate wptr += TCPOPT_REAL_SACK_LEN; 5017*0Sstevel@tonic-gate 5018*0Sstevel@tonic-gate tmp = tcp->tcp_sack_list; 5019*0Sstevel@tonic-gate for (i = 0; i < num_sack_blk; i++) { 5020*0Sstevel@tonic-gate U32_TO_BE32(tmp[i].begin, wptr); 5021*0Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5022*0Sstevel@tonic-gate U32_TO_BE32(tmp[i].end, wptr); 5023*0Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5024*0Sstevel@tonic-gate } 5025*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5026*0Sstevel@tonic-gate << 4); 5027*0Sstevel@tonic-gate } 5028*0Sstevel@tonic-gate 5029*0Sstevel@tonic-gate if (tail_unsent) { 5030*0Sstevel@tonic-gate mp1 = mp->b_cont; 5031*0Sstevel@tonic-gate if (mp1 == NULL) 5032*0Sstevel@tonic-gate mp1 = mp; 5033*0Sstevel@tonic-gate /* 5034*0Sstevel@tonic-gate * If we're a little short, tack on more mblks 5035*0Sstevel@tonic-gate * as long as we don't need to split an 
mblk. 5036*0Sstevel@tonic-gate */ 5037*0Sstevel@tonic-gate while (tail_unsent < 0 && 5038*0Sstevel@tonic-gate tail_unsent + (int)(xmit_tail->b_cont->b_wptr - 5039*0Sstevel@tonic-gate xmit_tail->b_cont->b_rptr) <= 0) { 5040*0Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 5041*0Sstevel@tonic-gate /* Stash for rtt use later */ 5042*0Sstevel@tonic-gate xmit_tail->b_prev = local_time; 5043*0Sstevel@tonic-gate mp1->b_cont = dupb(xmit_tail); 5044*0Sstevel@tonic-gate mp1 = mp1->b_cont; 5045*0Sstevel@tonic-gate assert((uintptr_t)(xmit_tail->b_wptr - 5046*0Sstevel@tonic-gate xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5047*0Sstevel@tonic-gate tail_unsent += (int)(xmit_tail->b_wptr - 5048*0Sstevel@tonic-gate xmit_tail->b_rptr); 5049*0Sstevel@tonic-gate if (mp1 == NULL) { 5050*0Sstevel@tonic-gate freemsg(mp); 5051*0Sstevel@tonic-gate goto out_of_mem; 5052*0Sstevel@tonic-gate } 5053*0Sstevel@tonic-gate } 5054*0Sstevel@tonic-gate /* Trim back any surplus on the last mblk */ 5055*0Sstevel@tonic-gate if (tail_unsent > 0) 5056*0Sstevel@tonic-gate mp1->b_wptr -= tail_unsent; 5057*0Sstevel@tonic-gate if (tail_unsent < 0) { 5058*0Sstevel@tonic-gate uint32_t ip_len; 5059*0Sstevel@tonic-gate 5060*0Sstevel@tonic-gate /* 5061*0Sstevel@tonic-gate * We did not send everything we could in 5062*0Sstevel@tonic-gate * order to preserve mblk boundaries. 5063*0Sstevel@tonic-gate */ 5064*0Sstevel@tonic-gate usable -= tail_unsent; 5065*0Sstevel@tonic-gate snxt += tail_unsent; 5066*0Sstevel@tonic-gate tcp->tcp_last_sent_len += tail_unsent; 5067*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpOutDataBytes, 5068*0Sstevel@tonic-gate tail_unsent); 5069*0Sstevel@tonic-gate /* Adjust the IP length field. 
*/ 5070*0Sstevel@tonic-gate ip_len = ntohs(((struct ip *)rptr)->ip_len) + 5071*0Sstevel@tonic-gate tail_unsent; 5072*0Sstevel@tonic-gate ((struct ip *)rptr)->ip_len = htons(ip_len); 5073*0Sstevel@tonic-gate tail_unsent = 0; 5074*0Sstevel@tonic-gate } 5075*0Sstevel@tonic-gate } 5076*0Sstevel@tonic-gate 5077*0Sstevel@tonic-gate if (mp == NULL) 5078*0Sstevel@tonic-gate goto out_of_mem; 5079*0Sstevel@tonic-gate 5080*0Sstevel@tonic-gate /* 5081*0Sstevel@tonic-gate * Performance hit! We need to pullup the whole message 5082*0Sstevel@tonic-gate * in order to do checksum and for the MAC output routine. 5083*0Sstevel@tonic-gate */ 5084*0Sstevel@tonic-gate if (mp->b_cont != NULL) { 5085*0Sstevel@tonic-gate int mp_size; 5086*0Sstevel@tonic-gate #ifdef DEBUG 5087*0Sstevel@tonic-gate printf("Multiple mblk %d\n", msgdsize(mp)); 5088*0Sstevel@tonic-gate #endif 5089*0Sstevel@tonic-gate new_mp = allocb(msgdsize(mp) + tcp_wroff_xtra, 0); 5090*0Sstevel@tonic-gate new_mp->b_rptr += tcp_wroff_xtra; 5091*0Sstevel@tonic-gate new_mp->b_wptr = new_mp->b_rptr; 5092*0Sstevel@tonic-gate while (mp != NULL) { 5093*0Sstevel@tonic-gate mp_size = mp->b_wptr - mp->b_rptr; 5094*0Sstevel@tonic-gate bcopy(mp->b_rptr, new_mp->b_wptr, mp_size); 5095*0Sstevel@tonic-gate new_mp->b_wptr += mp_size; 5096*0Sstevel@tonic-gate mp = mp->b_cont; 5097*0Sstevel@tonic-gate } 5098*0Sstevel@tonic-gate freemsg(mp); 5099*0Sstevel@tonic-gate mp = new_mp; 5100*0Sstevel@tonic-gate } 5101*0Sstevel@tonic-gate tcp_set_cksum(mp); 5102*0Sstevel@tonic-gate ((struct ip *)mp->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5103*0Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_wput_data", mp); 5104*0Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 5105*0Sstevel@tonic-gate freemsg(mp); 5106*0Sstevel@tonic-gate } 5107*0Sstevel@tonic-gate out_of_mem:; 5108*0Sstevel@tonic-gate /* Pretend that all we were trying to send really got sent */ 5109*0Sstevel@tonic-gate if (tail_unsent < 0) { 5110*0Sstevel@tonic-gate do { 5111*0Sstevel@tonic-gate 
xmit_tail = xmit_tail->b_cont; 5112*0Sstevel@tonic-gate xmit_tail->b_prev = local_time; 5113*0Sstevel@tonic-gate assert((uintptr_t)(xmit_tail->b_wptr - 5114*0Sstevel@tonic-gate xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5115*0Sstevel@tonic-gate tail_unsent += (int)(xmit_tail->b_wptr - 5116*0Sstevel@tonic-gate xmit_tail->b_rptr); 5117*0Sstevel@tonic-gate } while (tail_unsent < 0); 5118*0Sstevel@tonic-gate } 5119*0Sstevel@tonic-gate done:; 5120*0Sstevel@tonic-gate tcp->tcp_xmit_tail = xmit_tail; 5121*0Sstevel@tonic-gate tcp->tcp_xmit_tail_unsent = tail_unsent; 5122*0Sstevel@tonic-gate len = tcp->tcp_snxt - snxt; 5123*0Sstevel@tonic-gate if (len) { 5124*0Sstevel@tonic-gate /* 5125*0Sstevel@tonic-gate * If new data was sent, need to update the notsack 5126*0Sstevel@tonic-gate * list, which is, afterall, data blocks that have 5127*0Sstevel@tonic-gate * not been sack'ed by the receiver. New data is 5128*0Sstevel@tonic-gate * not sack'ed. 5129*0Sstevel@tonic-gate */ 5130*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 5131*0Sstevel@tonic-gate /* len is a negative value. */ 5132*0Sstevel@tonic-gate tcp->tcp_pipe -= len; 5133*0Sstevel@tonic-gate tcp_notsack_update(&(tcp->tcp_notsack_list), 5134*0Sstevel@tonic-gate tcp->tcp_snxt, snxt, 5135*0Sstevel@tonic-gate &(tcp->tcp_num_notsack_blk), 5136*0Sstevel@tonic-gate &(tcp->tcp_cnt_notsack_list)); 5137*0Sstevel@tonic-gate } 5138*0Sstevel@tonic-gate tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 5139*0Sstevel@tonic-gate tcp->tcp_rack = tcp->tcp_rnxt; 5140*0Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 5141*0Sstevel@tonic-gate if ((snxt + len) == tcp->tcp_suna) { 5142*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5143*0Sstevel@tonic-gate } 5144*0Sstevel@tonic-gate /* 5145*0Sstevel@tonic-gate * Note that len is the amount we just sent but with a negative 5146*0Sstevel@tonic-gate * sign. 
We update tcp_unsent here since we may come back to 5147*0Sstevel@tonic-gate * tcp_wput_data from tcp_state_wait. 5148*0Sstevel@tonic-gate */ 5149*0Sstevel@tonic-gate len += tcp->tcp_unsent; 5150*0Sstevel@tonic-gate tcp->tcp_unsent = len; 5151*0Sstevel@tonic-gate 5152*0Sstevel@tonic-gate /* 5153*0Sstevel@tonic-gate * Let's wait till all the segments have been acked, since we 5154*0Sstevel@tonic-gate * don't have a timer. 5155*0Sstevel@tonic-gate */ 5156*0Sstevel@tonic-gate (void) tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED); 5157*0Sstevel@tonic-gate return; 5158*0Sstevel@tonic-gate } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 5159*0Sstevel@tonic-gate /* 5160*0Sstevel@tonic-gate * Didn't send anything. Make sure the timer is running 5161*0Sstevel@tonic-gate * so that we will probe a zero window. 5162*0Sstevel@tonic-gate */ 5163*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5164*0Sstevel@tonic-gate } 5165*0Sstevel@tonic-gate 5166*0Sstevel@tonic-gate /* Note that len is the amount we just sent but with a negative sign */ 5167*0Sstevel@tonic-gate len += tcp->tcp_unsent; 5168*0Sstevel@tonic-gate tcp->tcp_unsent = len; 5169*0Sstevel@tonic-gate 5170*0Sstevel@tonic-gate } 5171*0Sstevel@tonic-gate 5172*0Sstevel@tonic-gate static void 5173*0Sstevel@tonic-gate tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 5174*0Sstevel@tonic-gate uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph, 5175*0Sstevel@tonic-gate int sock_id) 5176*0Sstevel@tonic-gate { 5177*0Sstevel@tonic-gate int32_t bytes_acked; 5178*0Sstevel@tonic-gate int32_t gap; 5179*0Sstevel@tonic-gate int32_t rgap; 5180*0Sstevel@tonic-gate tcp_opt_t tcpopt; 5181*0Sstevel@tonic-gate uint_t flags; 5182*0Sstevel@tonic-gate uint32_t new_swnd = 0; 5183*0Sstevel@tonic-gate 5184*0Sstevel@tonic-gate #ifdef DEBUG 5185*0Sstevel@tonic-gate printf("Time wait processing called ###############3\n"); 5186*0Sstevel@tonic-gate #endif 5187*0Sstevel@tonic-gate 5188*0Sstevel@tonic-gate /* Just make sure we 
send the right sock_id to tcp_clean_death */ 5189*0Sstevel@tonic-gate if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp)) 5190*0Sstevel@tonic-gate sock_id = -1; 5191*0Sstevel@tonic-gate 5192*0Sstevel@tonic-gate flags = (unsigned int)tcph->th_flags[0] & 0xFF; 5193*0Sstevel@tonic-gate new_swnd = BE16_TO_U16(tcph->th_win) << 5194*0Sstevel@tonic-gate ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 5195*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5196*0Sstevel@tonic-gate if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 5197*0Sstevel@tonic-gate freemsg(mp); 5198*0Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5199*0Sstevel@tonic-gate tcp->tcp_rnxt, TH_ACK, 0, -1); 5200*0Sstevel@tonic-gate return; 5201*0Sstevel@tonic-gate } 5202*0Sstevel@tonic-gate } 5203*0Sstevel@tonic-gate gap = seg_seq - tcp->tcp_rnxt; 5204*0Sstevel@tonic-gate rgap = tcp->tcp_rwnd - (gap + seg_len); 5205*0Sstevel@tonic-gate if (gap < 0) { 5206*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataDupSegs); 5207*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 5208*0Sstevel@tonic-gate (seg_len > -gap ? -gap : seg_len)); 5209*0Sstevel@tonic-gate seg_len += gap; 5210*0Sstevel@tonic-gate if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 5211*0Sstevel@tonic-gate if (flags & TH_RST) { 5212*0Sstevel@tonic-gate freemsg(mp); 5213*0Sstevel@tonic-gate return; 5214*0Sstevel@tonic-gate } 5215*0Sstevel@tonic-gate if ((flags & TH_FIN) && seg_len == -1) { 5216*0Sstevel@tonic-gate /* 5217*0Sstevel@tonic-gate * When TCP receives a duplicate FIN in 5218*0Sstevel@tonic-gate * TIME_WAIT state, restart the 2 MSL timer. 5219*0Sstevel@tonic-gate * See page 73 in RFC 793. Make sure this TCP 5220*0Sstevel@tonic-gate * is already on the TIME_WAIT list. If not, 5221*0Sstevel@tonic-gate * just restart the timer. 
5222*0Sstevel@tonic-gate */ 5223*0Sstevel@tonic-gate tcp_time_wait_remove(tcp); 5224*0Sstevel@tonic-gate tcp_time_wait_append(tcp); 5225*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 5226*0Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5227*0Sstevel@tonic-gate tcp->tcp_rnxt, TH_ACK, 0, -1); 5228*0Sstevel@tonic-gate freemsg(mp); 5229*0Sstevel@tonic-gate return; 5230*0Sstevel@tonic-gate } 5231*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5232*0Sstevel@tonic-gate seg_len = 0; 5233*0Sstevel@tonic-gate goto process_ack; 5234*0Sstevel@tonic-gate } 5235*0Sstevel@tonic-gate 5236*0Sstevel@tonic-gate /* Fix seg_seq, and chew the gap off the front. */ 5237*0Sstevel@tonic-gate seg_seq = tcp->tcp_rnxt; 5238*0Sstevel@tonic-gate } 5239*0Sstevel@tonic-gate 5240*0Sstevel@tonic-gate if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 5241*0Sstevel@tonic-gate /* 5242*0Sstevel@tonic-gate * Make sure that when we accept the connection, pick 5243*0Sstevel@tonic-gate * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 5244*0Sstevel@tonic-gate * old connection. 5245*0Sstevel@tonic-gate * 5246*0Sstevel@tonic-gate * The next ISS generated is equal to tcp_iss_incr_extra 5247*0Sstevel@tonic-gate * + ISS_INCR/2 + other components depending on the 5248*0Sstevel@tonic-gate * value of tcp_strong_iss. We pre-calculate the new 5249*0Sstevel@tonic-gate * ISS here and compare with tcp_snxt to determine if 5250*0Sstevel@tonic-gate * we need to make adjustment to tcp_iss_incr_extra. 5251*0Sstevel@tonic-gate * 5252*0Sstevel@tonic-gate * Note that since we are now in the global queue 5253*0Sstevel@tonic-gate * perimeter and need to do a lateral_put() to the 5254*0Sstevel@tonic-gate * listener queue, there can be other connection requests/ 5255*0Sstevel@tonic-gate * attempts while the lateral_put() is going on. That 5256*0Sstevel@tonic-gate * means what we calculate here may not be correct. 
This 5257*0Sstevel@tonic-gate * is extremely difficult to solve unless TCP and IP 5258*0Sstevel@tonic-gate * modules are merged and there is no perimeter, but just 5259*0Sstevel@tonic-gate * locks. The above calculation is ugly and is a 5260*0Sstevel@tonic-gate * waste of CPU cycles... 5261*0Sstevel@tonic-gate */ 5262*0Sstevel@tonic-gate uint32_t new_iss = tcp_iss_incr_extra; 5263*0Sstevel@tonic-gate int32_t adj; 5264*0Sstevel@tonic-gate 5265*0Sstevel@tonic-gate /* Add time component and min random (i.e. 1). */ 5266*0Sstevel@tonic-gate new_iss += (prom_gettime() >> ISS_NSEC_SHT) + 1; 5267*0Sstevel@tonic-gate if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 5268*0Sstevel@tonic-gate /* 5269*0Sstevel@tonic-gate * New ISS not guaranteed to be ISS_INCR/2 5270*0Sstevel@tonic-gate * ahead of the current tcp_snxt, so add the 5271*0Sstevel@tonic-gate * difference to tcp_iss_incr_extra. 5272*0Sstevel@tonic-gate */ 5273*0Sstevel@tonic-gate tcp_iss_incr_extra += adj; 5274*0Sstevel@tonic-gate } 5275*0Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, 0); 5276*0Sstevel@tonic-gate 5277*0Sstevel@tonic-gate /* 5278*0Sstevel@tonic-gate * This is a passive open. Right now we do not 5279*0Sstevel@tonic-gate * do anything... 5280*0Sstevel@tonic-gate */ 5281*0Sstevel@tonic-gate freemsg(mp); 5282*0Sstevel@tonic-gate return; 5283*0Sstevel@tonic-gate } 5284*0Sstevel@tonic-gate 5285*0Sstevel@tonic-gate /* 5286*0Sstevel@tonic-gate * rgap is the amount of stuff received out of window. A negative 5287*0Sstevel@tonic-gate * value is the amount out of window. 5288*0Sstevel@tonic-gate */ 5289*0Sstevel@tonic-gate if (rgap < 0) { 5290*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 5291*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 5292*0Sstevel@tonic-gate /* Fix seg_len and make sure there is something left. 
*/ 5293*0Sstevel@tonic-gate seg_len += rgap; 5294*0Sstevel@tonic-gate if (seg_len <= 0) { 5295*0Sstevel@tonic-gate if (flags & TH_RST) { 5296*0Sstevel@tonic-gate freemsg(mp); 5297*0Sstevel@tonic-gate return; 5298*0Sstevel@tonic-gate } 5299*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5300*0Sstevel@tonic-gate seg_len = 0; 5301*0Sstevel@tonic-gate goto process_ack; 5302*0Sstevel@tonic-gate } 5303*0Sstevel@tonic-gate } 5304*0Sstevel@tonic-gate /* 5305*0Sstevel@tonic-gate * Check whether we can update tcp_ts_recent. This test is 5306*0Sstevel@tonic-gate * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 5307*0Sstevel@tonic-gate * Extensions for High Performance: An Update", Internet Draft. 5308*0Sstevel@tonic-gate */ 5309*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok && 5310*0Sstevel@tonic-gate TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 5311*0Sstevel@tonic-gate SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 5312*0Sstevel@tonic-gate tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 5313*0Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 5314*0Sstevel@tonic-gate } 5315*0Sstevel@tonic-gate 5316*0Sstevel@tonic-gate if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 5317*0Sstevel@tonic-gate /* Always ack out of order packets */ 5318*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5319*0Sstevel@tonic-gate seg_len = 0; 5320*0Sstevel@tonic-gate } else if (seg_len > 0) { 5321*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 5322*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 5323*0Sstevel@tonic-gate } 5324*0Sstevel@tonic-gate if (flags & TH_RST) { 5325*0Sstevel@tonic-gate freemsg(mp); 5326*0Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, 0); 5327*0Sstevel@tonic-gate return; 5328*0Sstevel@tonic-gate } 5329*0Sstevel@tonic-gate if (flags & TH_SYN) { 5330*0Sstevel@tonic-gate freemsg(mp); 5331*0Sstevel@tonic-gate tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, seg_seq + 1, 5332*0Sstevel@tonic-gate TH_RST|TH_ACK, 0, -1); 
5333*0Sstevel@tonic-gate /* 5334*0Sstevel@tonic-gate * Do not delete the TCP structure if it is in 5335*0Sstevel@tonic-gate * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 5336*0Sstevel@tonic-gate */ 5337*0Sstevel@tonic-gate return; 5338*0Sstevel@tonic-gate } 5339*0Sstevel@tonic-gate process_ack: 5340*0Sstevel@tonic-gate if (flags & TH_ACK) { 5341*0Sstevel@tonic-gate bytes_acked = (int)(seg_ack - tcp->tcp_suna); 5342*0Sstevel@tonic-gate if (bytes_acked <= 0) { 5343*0Sstevel@tonic-gate if (bytes_acked == 0 && seg_len == 0 && 5344*0Sstevel@tonic-gate new_swnd == tcp->tcp_swnd) 5345*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDupAck); 5346*0Sstevel@tonic-gate } else { 5347*0Sstevel@tonic-gate /* Acks something not sent */ 5348*0Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5349*0Sstevel@tonic-gate } 5350*0Sstevel@tonic-gate } 5351*0Sstevel@tonic-gate freemsg(mp); 5352*0Sstevel@tonic-gate if (flags & TH_ACK_NEEDED) { 5353*0Sstevel@tonic-gate /* 5354*0Sstevel@tonic-gate * Time to send an ack for some reason. 5355*0Sstevel@tonic-gate */ 5356*0Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5357*0Sstevel@tonic-gate tcp->tcp_rnxt, TH_ACK, 0, -1); 5358*0Sstevel@tonic-gate } 5359*0Sstevel@tonic-gate } 5360*0Sstevel@tonic-gate 5361*0Sstevel@tonic-gate static int 5362*0Sstevel@tonic-gate tcp_init_values(tcp_t *tcp, struct inetboot_socket *isp) 5363*0Sstevel@tonic-gate { 5364*0Sstevel@tonic-gate int err; 5365*0Sstevel@tonic-gate 5366*0Sstevel@tonic-gate tcp->tcp_family = AF_INET; 5367*0Sstevel@tonic-gate tcp->tcp_ipversion = IPV4_VERSION; 5368*0Sstevel@tonic-gate 5369*0Sstevel@tonic-gate /* 5370*0Sstevel@tonic-gate * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 5371*0Sstevel@tonic-gate * will be close to tcp_rexmit_interval_initial. 
By doing this, we 5372*0Sstevel@tonic-gate * allow the algorithm to adjust slowly to large fluctuations of RTT 5373*0Sstevel@tonic-gate * during first few transmissions of a connection as seen in slow 5374*0Sstevel@tonic-gate * links. 5375*0Sstevel@tonic-gate */ 5376*0Sstevel@tonic-gate tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 5377*0Sstevel@tonic-gate tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 5378*0Sstevel@tonic-gate tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 5379*0Sstevel@tonic-gate tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 5380*0Sstevel@tonic-gate tcp_conn_grace_period; 5381*0Sstevel@tonic-gate if (tcp->tcp_rto < tcp_rexmit_interval_min) 5382*0Sstevel@tonic-gate tcp->tcp_rto = tcp_rexmit_interval_min; 5383*0Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 5384*0Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 5385*0Sstevel@tonic-gate tcp->tcp_last_recv_time = prom_gettime(); 5386*0Sstevel@tonic-gate tcp->tcp_cwnd_max = tcp_cwnd_max_; 5387*0Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_INFINITE; 5388*0Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 5389*0Sstevel@tonic-gate /* For Ethernet, the mtu returned is actually 1550... */ 5390*0Sstevel@tonic-gate if (mac_get_type() == IFT_ETHER) { 5391*0Sstevel@tonic-gate tcp->tcp_if_mtu = mac_get_mtu() - 50; 5392*0Sstevel@tonic-gate } else { 5393*0Sstevel@tonic-gate tcp->tcp_if_mtu = mac_get_mtu(); 5394*0Sstevel@tonic-gate } 5395*0Sstevel@tonic-gate tcp->tcp_mss = tcp->tcp_if_mtu; 5396*0Sstevel@tonic-gate 5397*0Sstevel@tonic-gate tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 5398*0Sstevel@tonic-gate tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 5399*0Sstevel@tonic-gate tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 5400*0Sstevel@tonic-gate /* 5401*0Sstevel@tonic-gate * Fix it to tcp_ip_abort_linterval later if it turns out to be a 5402*0Sstevel@tonic-gate * passive open. 
5403*0Sstevel@tonic-gate */ 5404*0Sstevel@tonic-gate tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 5405*0Sstevel@tonic-gate 5406*0Sstevel@tonic-gate tcp->tcp_naglim = tcp_naglim_def; 5407*0Sstevel@tonic-gate 5408*0Sstevel@tonic-gate /* NOTE: ISS is now set in tcp_adapt_ire(). */ 5409*0Sstevel@tonic-gate 5410*0Sstevel@tonic-gate /* Initialize the header template */ 5411*0Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) { 5412*0Sstevel@tonic-gate err = tcp_header_init_ipv4(tcp); 5413*0Sstevel@tonic-gate } 5414*0Sstevel@tonic-gate if (err) 5415*0Sstevel@tonic-gate return (err); 5416*0Sstevel@tonic-gate 5417*0Sstevel@tonic-gate /* 5418*0Sstevel@tonic-gate * Init the window scale to the max so tcp_rwnd_set() won't pare 5419*0Sstevel@tonic-gate * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 5420*0Sstevel@tonic-gate */ 5421*0Sstevel@tonic-gate tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 5422*0Sstevel@tonic-gate tcp->tcp_xmit_lowater = tcp_xmit_lowat; 5423*0Sstevel@tonic-gate if (isp != NULL) { 5424*0Sstevel@tonic-gate tcp->tcp_xmit_hiwater = isp->so_sndbuf; 5425*0Sstevel@tonic-gate tcp->tcp_rwnd = isp->so_rcvbuf; 5426*0Sstevel@tonic-gate tcp->tcp_rwnd_max = isp->so_rcvbuf; 5427*0Sstevel@tonic-gate } 5428*0Sstevel@tonic-gate tcp->tcp_state = TCPS_IDLE; 5429*0Sstevel@tonic-gate return (0); 5430*0Sstevel@tonic-gate } 5431*0Sstevel@tonic-gate 5432*0Sstevel@tonic-gate /* 5433*0Sstevel@tonic-gate * Initialize the IPv4 header. Loses any record of any IP options. 5434*0Sstevel@tonic-gate */ 5435*0Sstevel@tonic-gate static int 5436*0Sstevel@tonic-gate tcp_header_init_ipv4(tcp_t *tcp) 5437*0Sstevel@tonic-gate { 5438*0Sstevel@tonic-gate tcph_t *tcph; 5439*0Sstevel@tonic-gate 5440*0Sstevel@tonic-gate /* 5441*0Sstevel@tonic-gate * This is a simple initialization. If there's 5442*0Sstevel@tonic-gate * already a template, it should never be too small, 5443*0Sstevel@tonic-gate * so reuse it. Otherwise, allocate space for the new one. 
	 */
	if (tcp->tcp_iphc != NULL) {
		assert(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
		bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
	} else {
		tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
		tcp->tcp_iphc = bkmem_zalloc(tcp->tcp_iphc_len);
		if (tcp->tcp_iphc == NULL) {
			/* Leave no stale length behind on allocation failure */
			tcp->tcp_iphc_len = 0;
			return (ENOMEM);
		}
	}
	/* The IP header sits at the front of the combined template. */
	tcp->tcp_ipha = (struct ip *)tcp->tcp_iphc;
	tcp->tcp_ipversion = IPV4_VERSION;

	/*
	 * Note that it does not include TCP options yet. It will
	 * after the connection is established.
	 */
	tcp->tcp_hdr_len = sizeof (struct ip) + sizeof (tcph_t);
	tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
	tcp->tcp_ip_hdr_len = sizeof (struct ip);
	tcp->tcp_ipha->ip_v = IP_VERSION;
	/* We don't support IP options... */
	tcp->tcp_ipha->ip_hl = IP_SIMPLE_HDR_LENGTH_IN_WORDS;
	tcp->tcp_ipha->ip_p = IPPROTO_TCP;
	/* We are not supposed to do PMTU discovery... */
	tcp->tcp_ipha->ip_sum = 0;

	/* TCP header template immediately follows the IP header. */
	tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (struct ip));
	tcp->tcp_tcph = tcph;
	/* Data offset = 5 32-bit words (20-byte header, no options yet). */
	tcph->th_offset_and_rsrvd[0] = (5 << 4);
	return (0);
}

/*
 * Send out a control packet on the tcp connection specified.  This routine
 * is typically called where we need a simple ACK or RST generated.
 *
 * This function is called with or without a mp.
 */
static void
tcp_xmit_ctl(char *str, tcp_t *tcp, mblk_t *mp, uint32_t seq,
    uint32_t ack, int ctl, uint_t ip_hdr_len, int sock_id)
{
	uchar_t		*rptr;
	tcph_t		*tcph;
	struct ip	*iph = NULL;
	int		tcp_hdr_len;
	int		tcp_ip_hdr_len;

	tcp_hdr_len = tcp->tcp_hdr_len;
	tcp_ip_hdr_len = tcp->tcp_ip_hdr_len;

	/*
	 * If an inbound segment was handed in, it is only consulted (and
	 * then freed); the control packet itself is always freshly built.
	 */
	if (mp) {
		assert(ip_hdr_len != 0);
		rptr = mp->b_rptr;
		tcph = (tcph_t *)(rptr + ip_hdr_len);
		/* Don't reply to a RST segment. */
		if (tcph->th_flags[0] & TH_RST) {
			freeb(mp);
			return;
		}
		freemsg(mp);
		rptr = NULL;
	} else {
		assert(ip_hdr_len == 0);
	}
	/* If a text string is passed in with the request, print it out. */
	if (str != NULL) {
		dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, "
		    "ctl 0x%x\n", sock_id, str, seq, ack, ctl);
	}
	/* Allocate a fresh mblk big enough for headers plus link-level room */
	mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0);
	if (mp == NULL) {
		dprintf("tcp_xmit_ctl(%d): Cannot allocate memory\n", sock_id);
		return;
	}
	rptr = &mp->b_rptr[tcp_wroff_xtra];
	mp->b_rptr = rptr;
	mp->b_wptr = &rptr[tcp_hdr_len];
	/* Stamp in the prototype IP + TCP header template */
	bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len);

	iph = (struct ip *)rptr;
	iph->ip_len = htons(tcp_hdr_len);

	tcph = (tcph_t *)&rptr[tcp_ip_hdr_len];
	tcph->th_flags[0] = (uint8_t)ctl;
	if (ctl & TH_RST) {
		BUMP_MIB(tcp_mib.tcpOutRsts);
		BUMP_MIB(tcp_mib.tcpOutControl);
		/*
		 * Don't send TSopt w/ TH_RST packets per RFC 1323.
		 */
		if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) {
			/* Trim the timestamp option off the template copy */
			mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN];
			*(mp->b_wptr) = TCPOPT_EOL;
			iph->ip_len = htons(tcp_hdr_len -
			    TCPOPT_REAL_TS_LEN);
			/* Data offset shrinks by 3 words (the TS option) */
			tcph->th_offset_and_rsrvd[0] -= (3 << 4);
		}
	}
	if (ctl & TH_ACK) {
		uint32_t now = prom_gettime();

		/* Fill in TSval/TSecr in the template's timestamp option */
		if (tcp->tcp_snd_ts_ok) {
			U32_TO_BE32(now,
			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
			U32_TO_BE32(tcp->tcp_ts_recent,
			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
		}
		/* Everything up to 'ack' is now acknowledged */
		tcp->tcp_rack = ack;
		tcp->tcp_rack_cnt = 0;
		BUMP_MIB(tcp_mib.tcpOutAck);
	}
	BUMP_MIB(tcp_mib.tcpOutSegs);
	U32_TO_BE32(seq, tcph->th_seq);
	U32_TO_BE32(ack, tcph->th_ack);

	tcp_set_cksum(mp);
	iph->ip_ttl = (uint8_t)tcp_ipv4_ttl;
	TCP_DUMP_PACKET("tcp_xmit_ctl", mp);
	(void) ipv4_tcp_output(sock_id, mp);
	freeb(mp);
}

/* Generate an ACK-only (no data) segment for a TCP endpoint */
static mblk_t *
tcp_ack_mp(tcp_t *tcp)
{
	if (tcp->tcp_valid_bits) {
		/*
		 * For the complex case where we have to send some
		 * controls (FIN or SYN), let tcp_xmit_mp do it.
		 * When sending an ACK-only segment (no data)
		 * into a zero window, always set the seq number to
		 * suna, since snxt will be extended past the window.
		 * If we used snxt, the receiver might consider the ACK
		 * unacceptable.
		 */
		return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
		    (tcp->tcp_zero_win_probe) ?
		    tcp->tcp_suna :
		    tcp->tcp_snxt, B_FALSE, NULL, B_FALSE));
	} else {
		/* Generate a simple ACK */
		uchar_t	*rptr;
		tcph_t	*tcph;
		mblk_t	*mp1;
		int32_t	tcp_hdr_len;
		int32_t	num_sack_blk = 0;
		int32_t	sack_opt_len;

		/*
		 * Allocate space for TCP + IP headers
		 * and link-level header
		 */
		if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
			num_sack_blk = MIN(tcp->tcp_max_sack_blk,
			    tcp->tcp_num_sack_blk);
			/* NOP + NOP + SACK header + the SACK blocks */
			sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
			    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
			tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len;
		} else {
			tcp_hdr_len = tcp->tcp_hdr_len;
		}
		mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, 0);
		if (mp1 == NULL)
			return (NULL);

		/* copy in prototype TCP + IP header */
		rptr = mp1->b_rptr + tcp_wroff_xtra;
		mp1->b_rptr = rptr;
		mp1->b_wptr = rptr + tcp_hdr_len;
		bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);

		tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];

		/*
		 * Set the TCP sequence number.
		 * When sending an ACK-only segment (no data)
		 * into a zero window, always set the seq number to
		 * suna, since snxt will be extended past the window.
		 * If we used snxt, the receiver might consider the ACK
		 * unacceptable.
		 */
		U32_TO_ABE32((tcp->tcp_zero_win_probe) ?
		    tcp->tcp_suna : tcp->tcp_snxt, tcph->th_seq);

		/* Set up the TCP flag field. */
		tcph->th_flags[0] = (uchar_t)TH_ACK;
		if (tcp->tcp_ecn_echo_on)
			tcph->th_flags[0] |= TH_ECE;

		/* Record that everything received so far is being acked */
		tcp->tcp_rack = tcp->tcp_rnxt;
		tcp->tcp_rack_cnt = 0;

		/* fill in timestamp option if in use */
		if (tcp->tcp_snd_ts_ok) {
			uint32_t llbolt = (uint32_t)prom_gettime();

			U32_TO_BE32(llbolt,
			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
			U32_TO_BE32(tcp->tcp_ts_recent,
			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
		}

		/* Fill in SACK options */
		if (num_sack_blk > 0) {
			/* Options start right after the base TCP header */
			uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
			sack_blk_t *tmp;
			int32_t	i;

			wptr[0] = TCPOPT_NOP;
			wptr[1] = TCPOPT_NOP;
			wptr[2] = TCPOPT_SACK;
			wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
			    sizeof (sack_blk_t);
			wptr += TCPOPT_REAL_SACK_LEN;

			/* Each SACK block is a big-endian begin/end pair */
			tmp = tcp->tcp_sack_list;
			for (i = 0; i < num_sack_blk; i++) {
				U32_TO_BE32(tmp[i].begin, wptr);
				wptr += sizeof (tcp_seq);
				U32_TO_BE32(tmp[i].end, wptr);
				wptr += sizeof (tcp_seq);
			}
			/*
			 * Grow the data offset: 2 words per SACK block plus
			 * 1 word for the NOP/NOP/SACK-header prefix.
			 */
			tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1)
			    << 4);
		}

		((struct ip *)rptr)->ip_len = htons(tcp_hdr_len);
		tcp_set_cksum(mp1);
		((struct ip *)rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl;
		return (mp1);
	}
}

/*
 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
 * ip and tcp header ready to pass down to IP.  If the mp passed in is
 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
 * otherwise it will dup partial mblks.)
 * Otherwise, an appropriate ACK packet will be generated.  This
 * routine is not usually called to send new data for the first time.
It 5690*0Sstevel@tonic-gate * is mostly called out of the timer for retransmits, and to generate ACKs. 5691*0Sstevel@tonic-gate * 5692*0Sstevel@tonic-gate * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 5693*0Sstevel@tonic-gate * be adjusted by *offset. And after dupb(), the offset and the ending mblk 5694*0Sstevel@tonic-gate * of the original mblk chain will be returned in *offset and *end_mp. 5695*0Sstevel@tonic-gate */ 5696*0Sstevel@tonic-gate static mblk_t * 5697*0Sstevel@tonic-gate tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 5698*0Sstevel@tonic-gate mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 5699*0Sstevel@tonic-gate boolean_t rexmit) 5700*0Sstevel@tonic-gate { 5701*0Sstevel@tonic-gate int data_length; 5702*0Sstevel@tonic-gate int32_t off = 0; 5703*0Sstevel@tonic-gate uint_t flags; 5704*0Sstevel@tonic-gate mblk_t *mp1; 5705*0Sstevel@tonic-gate mblk_t *mp2; 5706*0Sstevel@tonic-gate mblk_t *new_mp; 5707*0Sstevel@tonic-gate uchar_t *rptr; 5708*0Sstevel@tonic-gate tcph_t *tcph; 5709*0Sstevel@tonic-gate int32_t num_sack_blk = 0; 5710*0Sstevel@tonic-gate int32_t sack_opt_len = 0; 5711*0Sstevel@tonic-gate 5712*0Sstevel@tonic-gate /* Allocate for our maximum TCP header + link-level */ 5713*0Sstevel@tonic-gate mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 5714*0Sstevel@tonic-gate tcp_wroff_xtra, 0); 5715*0Sstevel@tonic-gate if (mp1 == NULL) 5716*0Sstevel@tonic-gate return (NULL); 5717*0Sstevel@tonic-gate data_length = 0; 5718*0Sstevel@tonic-gate 5719*0Sstevel@tonic-gate /* 5720*0Sstevel@tonic-gate * Note that tcp_mss has been adjusted to take into account the 5721*0Sstevel@tonic-gate * timestamp option if applicable. Because SACK options do not 5722*0Sstevel@tonic-gate * appear in every TCP segments and they are of variable lengths, 5723*0Sstevel@tonic-gate * they cannot be included in tcp_mss. 
Thus we need to calculate 5724*0Sstevel@tonic-gate * the actual segment length when we need to send a segment which 5725*0Sstevel@tonic-gate * includes SACK options. 5726*0Sstevel@tonic-gate */ 5727*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5728*0Sstevel@tonic-gate num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5729*0Sstevel@tonic-gate tcp->tcp_num_sack_blk); 5730*0Sstevel@tonic-gate sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5731*0Sstevel@tonic-gate TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 5732*0Sstevel@tonic-gate if (max_to_send + sack_opt_len > tcp->tcp_mss) 5733*0Sstevel@tonic-gate max_to_send -= sack_opt_len; 5734*0Sstevel@tonic-gate } 5735*0Sstevel@tonic-gate 5736*0Sstevel@tonic-gate if (offset != NULL) { 5737*0Sstevel@tonic-gate off = *offset; 5738*0Sstevel@tonic-gate /* We use offset as an indicator that end_mp is not NULL. */ 5739*0Sstevel@tonic-gate *end_mp = NULL; 5740*0Sstevel@tonic-gate } 5741*0Sstevel@tonic-gate for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 5742*0Sstevel@tonic-gate /* This could be faster with cooperation from downstream */ 5743*0Sstevel@tonic-gate if (mp2 != mp1 && !sendall && 5744*0Sstevel@tonic-gate data_length + (int)(mp->b_wptr - mp->b_rptr) > 5745*0Sstevel@tonic-gate max_to_send) 5746*0Sstevel@tonic-gate /* 5747*0Sstevel@tonic-gate * Don't send the next mblk since the whole mblk 5748*0Sstevel@tonic-gate * does not fit. 
5749*0Sstevel@tonic-gate */ 5750*0Sstevel@tonic-gate break; 5751*0Sstevel@tonic-gate mp2->b_cont = dupb(mp); 5752*0Sstevel@tonic-gate mp2 = mp2->b_cont; 5753*0Sstevel@tonic-gate if (mp2 == NULL) { 5754*0Sstevel@tonic-gate freemsg(mp1); 5755*0Sstevel@tonic-gate return (NULL); 5756*0Sstevel@tonic-gate } 5757*0Sstevel@tonic-gate mp2->b_rptr += off; 5758*0Sstevel@tonic-gate assert((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 5759*0Sstevel@tonic-gate (uintptr_t)INT_MAX); 5760*0Sstevel@tonic-gate 5761*0Sstevel@tonic-gate data_length += (int)(mp2->b_wptr - mp2->b_rptr); 5762*0Sstevel@tonic-gate if (data_length > max_to_send) { 5763*0Sstevel@tonic-gate mp2->b_wptr -= data_length - max_to_send; 5764*0Sstevel@tonic-gate data_length = max_to_send; 5765*0Sstevel@tonic-gate off = mp2->b_wptr - mp->b_rptr; 5766*0Sstevel@tonic-gate break; 5767*0Sstevel@tonic-gate } else { 5768*0Sstevel@tonic-gate off = 0; 5769*0Sstevel@tonic-gate } 5770*0Sstevel@tonic-gate } 5771*0Sstevel@tonic-gate if (offset != NULL) { 5772*0Sstevel@tonic-gate *offset = off; 5773*0Sstevel@tonic-gate *end_mp = mp; 5774*0Sstevel@tonic-gate } 5775*0Sstevel@tonic-gate if (seg_len != NULL) { 5776*0Sstevel@tonic-gate *seg_len = data_length; 5777*0Sstevel@tonic-gate } 5778*0Sstevel@tonic-gate 5779*0Sstevel@tonic-gate rptr = mp1->b_rptr + tcp_wroff_xtra; 5780*0Sstevel@tonic-gate mp1->b_rptr = rptr; 5781*0Sstevel@tonic-gate mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 5782*0Sstevel@tonic-gate bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5783*0Sstevel@tonic-gate tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5784*0Sstevel@tonic-gate U32_TO_ABE32(seq, tcph->th_seq); 5785*0Sstevel@tonic-gate 5786*0Sstevel@tonic-gate /* 5787*0Sstevel@tonic-gate * Use tcp_unsent to determine if the PUSH bit should be used assumes 5788*0Sstevel@tonic-gate * that this function was called from tcp_wput_data. 
Thus, when called 5789*0Sstevel@tonic-gate * to retransmit data the setting of the PUSH bit may appear some 5790*0Sstevel@tonic-gate * what random in that it might get set when it should not. This 5791*0Sstevel@tonic-gate * should not pose any performance issues. 5792*0Sstevel@tonic-gate */ 5793*0Sstevel@tonic-gate if (data_length != 0 && (tcp->tcp_unsent == 0 || 5794*0Sstevel@tonic-gate tcp->tcp_unsent == data_length)) { 5795*0Sstevel@tonic-gate flags = TH_ACK | TH_PUSH; 5796*0Sstevel@tonic-gate } else { 5797*0Sstevel@tonic-gate flags = TH_ACK; 5798*0Sstevel@tonic-gate } 5799*0Sstevel@tonic-gate 5800*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 5801*0Sstevel@tonic-gate if (tcp->tcp_ecn_echo_on) 5802*0Sstevel@tonic-gate flags |= TH_ECE; 5803*0Sstevel@tonic-gate 5804*0Sstevel@tonic-gate /* 5805*0Sstevel@tonic-gate * Only set ECT bit and ECN_CWR if a segment contains new data. 5806*0Sstevel@tonic-gate * There is no TCP flow control for non-data segments, and 5807*0Sstevel@tonic-gate * only data segment is transmitted reliably. 5808*0Sstevel@tonic-gate */ 5809*0Sstevel@tonic-gate if (data_length > 0 && !rexmit) { 5810*0Sstevel@tonic-gate SET_ECT(tcp, rptr); 5811*0Sstevel@tonic-gate if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5812*0Sstevel@tonic-gate flags |= TH_CWR; 5813*0Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_TRUE; 5814*0Sstevel@tonic-gate } 5815*0Sstevel@tonic-gate } 5816*0Sstevel@tonic-gate } 5817*0Sstevel@tonic-gate 5818*0Sstevel@tonic-gate if (tcp->tcp_valid_bits) { 5819*0Sstevel@tonic-gate uint32_t u1; 5820*0Sstevel@tonic-gate 5821*0Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 5822*0Sstevel@tonic-gate seq == tcp->tcp_iss) { 5823*0Sstevel@tonic-gate uchar_t *wptr; 5824*0Sstevel@tonic-gate 5825*0Sstevel@tonic-gate /* 5826*0Sstevel@tonic-gate * Tack on the MSS option. It is always needed 5827*0Sstevel@tonic-gate * for both active and passive open. 
5828*0Sstevel@tonic-gate */ 5829*0Sstevel@tonic-gate wptr = mp1->b_wptr; 5830*0Sstevel@tonic-gate wptr[0] = TCPOPT_MAXSEG; 5831*0Sstevel@tonic-gate wptr[1] = TCPOPT_MAXSEG_LEN; 5832*0Sstevel@tonic-gate wptr += 2; 5833*0Sstevel@tonic-gate /* 5834*0Sstevel@tonic-gate * MSS option value should be interface MTU - MIN 5835*0Sstevel@tonic-gate * TCP/IP header. 5836*0Sstevel@tonic-gate */ 5837*0Sstevel@tonic-gate u1 = tcp->tcp_if_mtu - IP_SIMPLE_HDR_LENGTH - 5838*0Sstevel@tonic-gate TCP_MIN_HEADER_LENGTH; 5839*0Sstevel@tonic-gate U16_TO_BE16(u1, wptr); 5840*0Sstevel@tonic-gate mp1->b_wptr = wptr + 2; 5841*0Sstevel@tonic-gate /* Update the offset to cover the additional word */ 5842*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += (1 << 4); 5843*0Sstevel@tonic-gate 5844*0Sstevel@tonic-gate /* 5845*0Sstevel@tonic-gate * Note that the following way of filling in 5846*0Sstevel@tonic-gate * TCP options are not optimal. Some NOPs can 5847*0Sstevel@tonic-gate * be saved. But there is no need at this time 5848*0Sstevel@tonic-gate * to optimize it. When it is needed, we will 5849*0Sstevel@tonic-gate * do it. 
5850*0Sstevel@tonic-gate */ 5851*0Sstevel@tonic-gate switch (tcp->tcp_state) { 5852*0Sstevel@tonic-gate case TCPS_SYN_SENT: 5853*0Sstevel@tonic-gate flags = TH_SYN; 5854*0Sstevel@tonic-gate 5855*0Sstevel@tonic-gate if (tcp->tcp_snd_ws_ok) { 5856*0Sstevel@tonic-gate wptr = mp1->b_wptr; 5857*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5858*0Sstevel@tonic-gate wptr[1] = TCPOPT_WSCALE; 5859*0Sstevel@tonic-gate wptr[2] = TCPOPT_WS_LEN; 5860*0Sstevel@tonic-gate wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5861*0Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5862*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5863*0Sstevel@tonic-gate (1 << 4); 5864*0Sstevel@tonic-gate } 5865*0Sstevel@tonic-gate 5866*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5867*0Sstevel@tonic-gate uint32_t llbolt; 5868*0Sstevel@tonic-gate 5869*0Sstevel@tonic-gate llbolt = prom_gettime(); 5870*0Sstevel@tonic-gate wptr = mp1->b_wptr; 5871*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5872*0Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5873*0Sstevel@tonic-gate wptr[2] = TCPOPT_TSTAMP; 5874*0Sstevel@tonic-gate wptr[3] = TCPOPT_TSTAMP_LEN; 5875*0Sstevel@tonic-gate wptr += 4; 5876*0Sstevel@tonic-gate U32_TO_BE32(llbolt, wptr); 5877*0Sstevel@tonic-gate wptr += 4; 5878*0Sstevel@tonic-gate assert(tcp->tcp_ts_recent == 0); 5879*0Sstevel@tonic-gate U32_TO_BE32(0L, wptr); 5880*0Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_TS_LEN; 5881*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5882*0Sstevel@tonic-gate (3 << 4); 5883*0Sstevel@tonic-gate } 5884*0Sstevel@tonic-gate 5885*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 5886*0Sstevel@tonic-gate wptr = mp1->b_wptr; 5887*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5888*0Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5889*0Sstevel@tonic-gate wptr[2] = TCPOPT_SACK_PERMITTED; 5890*0Sstevel@tonic-gate wptr[3] = TCPOPT_SACK_OK_LEN; 5891*0Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5892*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5893*0Sstevel@tonic-gate (1 
<< 4); 5894*0Sstevel@tonic-gate } 5895*0Sstevel@tonic-gate 5896*0Sstevel@tonic-gate /* 5897*0Sstevel@tonic-gate * Set up all the bits to tell other side 5898*0Sstevel@tonic-gate * we are ECN capable. 5899*0Sstevel@tonic-gate */ 5900*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 5901*0Sstevel@tonic-gate flags |= (TH_ECE | TH_CWR); 5902*0Sstevel@tonic-gate } 5903*0Sstevel@tonic-gate break; 5904*0Sstevel@tonic-gate case TCPS_SYN_RCVD: 5905*0Sstevel@tonic-gate flags |= TH_SYN; 5906*0Sstevel@tonic-gate 5907*0Sstevel@tonic-gate if (tcp->tcp_snd_ws_ok) { 5908*0Sstevel@tonic-gate wptr = mp1->b_wptr; 5909*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5910*0Sstevel@tonic-gate wptr[1] = TCPOPT_WSCALE; 5911*0Sstevel@tonic-gate wptr[2] = TCPOPT_WS_LEN; 5912*0Sstevel@tonic-gate wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5913*0Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5914*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += (1 << 4); 5915*0Sstevel@tonic-gate } 5916*0Sstevel@tonic-gate 5917*0Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 5918*0Sstevel@tonic-gate wptr = mp1->b_wptr; 5919*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5920*0Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5921*0Sstevel@tonic-gate wptr[2] = TCPOPT_SACK_PERMITTED; 5922*0Sstevel@tonic-gate wptr[3] = TCPOPT_SACK_OK_LEN; 5923*0Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5924*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5925*0Sstevel@tonic-gate (1 << 4); 5926*0Sstevel@tonic-gate } 5927*0Sstevel@tonic-gate 5928*0Sstevel@tonic-gate /* 5929*0Sstevel@tonic-gate * If the other side is ECN capable, reply 5930*0Sstevel@tonic-gate * that we are also ECN capable. 
5931*0Sstevel@tonic-gate */ 5932*0Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 5933*0Sstevel@tonic-gate flags |= TH_ECE; 5934*0Sstevel@tonic-gate } 5935*0Sstevel@tonic-gate break; 5936*0Sstevel@tonic-gate default: 5937*0Sstevel@tonic-gate break; 5938*0Sstevel@tonic-gate } 5939*0Sstevel@tonic-gate /* allocb() of adequate mblk assures space */ 5940*0Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - 5941*0Sstevel@tonic-gate mp1->b_rptr) <= (uintptr_t)INT_MAX); 5942*0Sstevel@tonic-gate if (flags & TH_SYN) 5943*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutControl); 5944*0Sstevel@tonic-gate } 5945*0Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 5946*0Sstevel@tonic-gate (seq + data_length) == tcp->tcp_fss) { 5947*0Sstevel@tonic-gate if (!tcp->tcp_fin_acked) { 5948*0Sstevel@tonic-gate flags |= TH_FIN; 5949*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutControl); 5950*0Sstevel@tonic-gate } 5951*0Sstevel@tonic-gate if (!tcp->tcp_fin_sent) { 5952*0Sstevel@tonic-gate tcp->tcp_fin_sent = B_TRUE; 5953*0Sstevel@tonic-gate switch (tcp->tcp_state) { 5954*0Sstevel@tonic-gate case TCPS_SYN_RCVD: 5955*0Sstevel@tonic-gate case TCPS_ESTABLISHED: 5956*0Sstevel@tonic-gate tcp->tcp_state = TCPS_FIN_WAIT_1; 5957*0Sstevel@tonic-gate break; 5958*0Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 5959*0Sstevel@tonic-gate tcp->tcp_state = TCPS_LAST_ACK; 5960*0Sstevel@tonic-gate break; 5961*0Sstevel@tonic-gate } 5962*0Sstevel@tonic-gate if (tcp->tcp_suna == tcp->tcp_snxt) 5963*0Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5964*0Sstevel@tonic-gate tcp->tcp_snxt = tcp->tcp_fss + 1; 5965*0Sstevel@tonic-gate } 5966*0Sstevel@tonic-gate } 5967*0Sstevel@tonic-gate } 5968*0Sstevel@tonic-gate tcph->th_flags[0] = (uchar_t)flags; 5969*0Sstevel@tonic-gate tcp->tcp_rack = tcp->tcp_rnxt; 5970*0Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 5971*0Sstevel@tonic-gate 5972*0Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5973*0Sstevel@tonic-gate if (tcp->tcp_state != TCPS_SYN_SENT) { 5974*0Sstevel@tonic-gate 
uint32_t llbolt = prom_gettime(); 5975*0Sstevel@tonic-gate 5976*0Sstevel@tonic-gate U32_TO_BE32(llbolt, 5977*0Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5978*0Sstevel@tonic-gate U32_TO_BE32(tcp->tcp_ts_recent, 5979*0Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5980*0Sstevel@tonic-gate } 5981*0Sstevel@tonic-gate } 5982*0Sstevel@tonic-gate 5983*0Sstevel@tonic-gate if (num_sack_blk > 0) { 5984*0Sstevel@tonic-gate uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5985*0Sstevel@tonic-gate sack_blk_t *tmp; 5986*0Sstevel@tonic-gate int32_t i; 5987*0Sstevel@tonic-gate 5988*0Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5989*0Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5990*0Sstevel@tonic-gate wptr[2] = TCPOPT_SACK; 5991*0Sstevel@tonic-gate wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5992*0Sstevel@tonic-gate sizeof (sack_blk_t); 5993*0Sstevel@tonic-gate wptr += TCPOPT_REAL_SACK_LEN; 5994*0Sstevel@tonic-gate 5995*0Sstevel@tonic-gate tmp = tcp->tcp_sack_list; 5996*0Sstevel@tonic-gate for (i = 0; i < num_sack_blk; i++) { 5997*0Sstevel@tonic-gate U32_TO_BE32(tmp[i].begin, wptr); 5998*0Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5999*0Sstevel@tonic-gate U32_TO_BE32(tmp[i].end, wptr); 6000*0Sstevel@tonic-gate wptr += sizeof (tcp_seq); 6001*0Sstevel@tonic-gate } 6002*0Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 6003*0Sstevel@tonic-gate } 6004*0Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 6005*0Sstevel@tonic-gate data_length += (int)(mp1->b_wptr - rptr); 6006*0Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) 6007*0Sstevel@tonic-gate ((struct ip *)rptr)->ip_len = htons(data_length); 6008*0Sstevel@tonic-gate 6009*0Sstevel@tonic-gate /* 6010*0Sstevel@tonic-gate * Performance hit! We need to pullup the whole message 6011*0Sstevel@tonic-gate * in order to do checksum and for the MAC output routine. 
 */
	if (mp1->b_cont != NULL) {
		int	mp_size;
#ifdef DEBUG
		printf("Multiple mblk %d\n", msgdsize(mp1));
#endif
		/* Flatten the chain into one freshly allocated mblk. */
		new_mp = allocb(msgdsize(mp1) + tcp_wroff_xtra, 0);
		new_mp->b_rptr += tcp_wroff_xtra;
		new_mp->b_wptr = new_mp->b_rptr;
		while (mp1 != NULL) {
			mp_size = mp1->b_wptr - mp1->b_rptr;
			bcopy(mp1->b_rptr, new_mp->b_wptr, mp_size);
			new_mp->b_wptr += mp_size;
			mp1 = mp1->b_cont;
		}
		/*
		 * NOTE(review): mp1 is NULL at this point (the copy loop
		 * above walked it off the end of the chain), so this
		 * freemsg() is a no-op and the original chain appears to
		 * be leaked -- confirm against mblk ownership rules.
		 */
		freemsg(mp1);
		mp1 = new_mp;
	}
	tcp_set_cksum(mp1);
	/* Fill in the TTL field as it is 0 in the header template. */
	((struct ip *)mp1->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl;

	return (mp1);
}

/*
 * Generate a "no listener here" reset in response to the
 * connection request contained within 'mp'.
 *
 * sock_id    - socket to transmit the reset on.
 * mp         - the offending inbound segment (IP header included);
 *              consumed by this routine (freed here or by
 *              tcp_xmit_early_reset()).
 * ip_hdr_len - length of the IP header preceding the TCP header in mp.
 */
static void
tcp_xmit_listeners_reset(int sock_id, mblk_t *mp, uint_t ip_hdr_len)
{
	uchar_t		*rptr;
	uint32_t	seg_len;
	tcph_t		*tcph;
	uint32_t	seg_seq;
	uint32_t	seg_ack;
	uint_t		flags;

	rptr = mp->b_rptr;

	/* Pull the sequence/ack numbers and flags out of the segment. */
	tcph = (tcph_t *)&rptr[ip_hdr_len];
	seg_seq = BE32_TO_U32(tcph->th_seq);
	seg_ack = BE32_TO_U32(tcph->th_ack);
	flags = tcph->th_flags[0];

	seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
	if (flags & TH_RST) {
		/* Never respond to a RST with another RST. */
		freeb(mp);
	} else if (flags & TH_ACK) {
		/* Segment carried an ACK: reset with seq = its ack value. */
		tcp_xmit_early_reset("no tcp, reset",
		    sock_id, mp, seg_ack, 0, TH_RST, ip_hdr_len);
	} else {
		/*
		 * No ACK: send RST|ACK acknowledging everything received.
		 * A SYN consumes one unit of sequence space.
		 */
		if (flags & TH_SYN)
			seg_len++;
		tcp_xmit_early_reset("no tcp, reset/ack", sock_id,
		    mp, 0, seg_seq + seg_len,
		    TH_RST | TH_ACK, ip_hdr_len);
	}
}

/*
 * Non overlapping byte exchanger: swap 'len' bytes between buffers
 * 'a' and 'b' in place.  The regions must not overlap.
 */
static void
tcp_xchg(uchar_t *a, uchar_t *b, int len)
{
	uchar_t	uch;

	while (len-- > 0) {
		uch = a[len];
		a[len] = b[len];
		b[len] = uch;
	}
}

/*
 * Generate a reset based on an inbound packet for which there is no active
 * tcp state that we can find.
 */
static void
tcp_xmit_early_reset(char *str, int sock_id, mblk_t *mp, uint32_t seq,
    uint32_t ack, int ctl, uint_t ip_hdr_len)
{
	struct ip	*iph = NULL;
	ushort_t	len;
	tcph_t		*tcph;
	int		i;
	ipaddr_t	addr;
	mblk_t		*new_mp;

	/*
	 * str        - diagnostic tag printed with dprintf (may be NULL).
	 * sock_id    - socket to transmit on.
	 * mp         - the offending inbound segment; always consumed.
	 * seq/ack    - sequence and ack numbers to place in the reset.
	 * ctl        - TCP flags for the reset (TH_RST, optionally TH_ACK).
	 * ip_hdr_len - length of the inbound segment's IP header.
	 */
	if (str != NULL) {
		dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
		    "flags 0x%x\n", str, seq, ack, ctl);
	}

	/*
	 * We skip reversing source route here.
	 * (for now we replace all IP options with EOL)
	 */
	iph = (struct ip *)mp->b_rptr;
	for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
		mp->b_rptr[i] = IPOPT_EOL;
	/*
	 * Make sure that src address is not a limited broadcast
	 * address. Not all broadcast address checking for the
	 * src address is possible, since we don't know the
	 * netmask of the src addr.
	 * No check for destination address is done, since
	 * IP will not pass up a packet with a broadcast dest address
	 * to TCP.
	 */
	if (iph->ip_src.s_addr == INADDR_ANY ||
	    iph->ip_src.s_addr == INADDR_BROADCAST) {
		freemsg(mp);
		return;
	}

	/* Never answer a RST with a RST. */
	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	if (tcph->th_flags[0] & TH_RST) {
		freemsg(mp);
		return;
	}
	/*
	 * Now copy the original header to a new buffer. The reason
	 * for doing this is that we need to put extra room before
	 * the header for the MAC layer address. The original mblk
	 * does not have this extra head room.
	 */
	len = ip_hdr_len + sizeof (tcph_t);
	if ((new_mp = allocb(len + tcp_wroff_xtra, 0)) == NULL) {
		freemsg(mp);
		return;
	}
	new_mp->b_rptr += tcp_wroff_xtra;
	bcopy(mp->b_rptr, new_mp->b_rptr, len);
	new_mp->b_wptr = new_mp->b_rptr + len;
	freemsg(mp);
	mp = new_mp;
	iph = (struct ip *)mp->b_rptr;
	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];

	/* Build a bare 20-byte TCP header (data offset 5 words). */
	tcph->th_offset_and_rsrvd[0] = (5 << 4);
	/* Swap the port fields so the reset flows back to the sender. */
	tcp_xchg(tcph->th_fport, tcph->th_lport, 2);
	U32_TO_BE32(ack, tcph->th_ack);
	U32_TO_BE32(seq, tcph->th_seq);
	U16_TO_BE16(0, tcph->th_win);
	bzero(tcph->th_sum, sizeof (int16_t));
	tcph->th_flags[0] = (uint8_t)ctl;
	if (ctl & TH_RST) {
		BUMP_MIB(tcp_mib.tcpOutRsts);
		BUMP_MIB(tcp_mib.tcpOutControl);
	}

	iph->ip_len = htons(len);
	/* Swap addresses */
	addr = iph->ip_src.s_addr;
	iph->ip_src = iph->ip_dst;
	iph->ip_dst.s_addr = addr;
	iph->ip_id = 0;
	/* TTL must be 0 while tcp_set_cksum() runs (pseudo-header trick). */
	iph->ip_ttl = 0;
	tcp_set_cksum(mp);
	iph->ip_ttl = (uint8_t)tcp_ipv4_ttl;

	/* Dump the packet when debugging. */
	TCP_DUMP_PACKET("tcp_xmit_early_reset", mp);
	(void) ipv4_tcp_output(sock_id, mp);
	freemsg(mp);
}

/*
 * Compute and install the TCP checksum for the segment in 'mp', which
 * must start with a simple (option-free) IP header followed immediately
 * by the TCP header.  iph->ip_len must already be set.
 */
static void
tcp_set_cksum(mblk_t *mp)
{
	struct ip	*iph;
	tcpha_t		*tcph;
	int		len;

	iph = (struct ip *)mp->b_rptr;
	tcph = (tcpha_t *)(iph + 1);
	len = ntohs(iph->ip_len);
	/*
	 * Calculate the TCP checksum. Need to include the pseudo header,
	 * which is similar to the real IP header starting at the TTL field.
	 */
	/*
	 * Temporarily overload ip_sum with the TCP length so that the
	 * TTL..dst-addr region of the real IP header doubles as the
	 * pseudo header; restored to 0 afterwards (the real IP checksum
	 * is filled in elsewhere).
	 */
	iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH);
	tcph->tha_sum = 0;
	tcph->tha_sum = tcp_cksum((uint16_t *)&(iph->ip_ttl),
	    len - IP_SIMPLE_HDR_LENGTH + 12);
	iph->ip_sum = 0;
}

/*
 * Compute the ones-complement Internet Checksum over 'len' bytes
 * starting at 'buf'.  Returns the folded, complemented 16-bit sum
 * ready to be stored in a header checksum field.
 */
static uint16_t
tcp_cksum(uint16_t *buf, uint32_t len)
{
	/*
	 * Compute Internet Checksum for "count" bytes
	 * beginning at location "addr".
	 */
	int32_t	sum = 0;

	while (len > 1) {
		/* This is the inner loop */
		sum += *buf++;
		len -= 2;
	}

	/*
	 * Add left-over byte, if any.
	 * NOTE(review): the '* 256' places the odd byte in the high half
	 * of the 16-bit word, which matches big-endian hosts (SPARC);
	 * confirm this is intended before reusing on little-endian.
	 */
	if (len > 0)
		sum += *(unsigned char *)buf * 256;

	/* Fold 32-bit sum to 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return ((uint16_t)~sum);
}

/*
 * Type three generator adapted from the random() function in 4.4 BSD:
 */

/*
 * Copyright (c) 1983, 1993
 *	The Regents of the University of California.  All rights reserved.
6234*0Sstevel@tonic-gate * 6235*0Sstevel@tonic-gate * Redistribution and use in source and binary forms, with or without 6236*0Sstevel@tonic-gate * modification, are permitted provided that the following conditions 6237*0Sstevel@tonic-gate * are met: 6238*0Sstevel@tonic-gate * 1. Redistributions of source code must retain the above copyright 6239*0Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer. 6240*0Sstevel@tonic-gate * 2. Redistributions in binary form must reproduce the above copyright 6241*0Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer in the 6242*0Sstevel@tonic-gate * documentation and/or other materials provided with the distribution. 6243*0Sstevel@tonic-gate * 3. All advertising materials mentioning features or use of this software 6244*0Sstevel@tonic-gate * must display the following acknowledgement: 6245*0Sstevel@tonic-gate * This product includes software developed by the University of 6246*0Sstevel@tonic-gate * California, Berkeley and its contributors. 6247*0Sstevel@tonic-gate * 4. Neither the name of the University nor the names of its contributors 6248*0Sstevel@tonic-gate * may be used to endorse or promote products derived from this software 6249*0Sstevel@tonic-gate * without specific prior written permission. 6250*0Sstevel@tonic-gate * 6251*0Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 6252*0Sstevel@tonic-gate * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 6253*0Sstevel@tonic-gate * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 6254*0Sstevel@tonic-gate * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Type 3 -- x**31 + x**3 + 1 */
#define	DEG_3		31
#define	SEP_3		3


/* Protected by tcp_random_lock */
static int tcp_randtbl[DEG_3 + 1];

/*
 * Front/rear pointers into the additive-feedback state array; slot 0 of
 * tcp_randtbl is unused so the state proper is tcp_randtbl[1..DEG_3].
 */
static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1];
static int *tcp_random_rptr = &tcp_randtbl[1];

static int *tcp_random_state = &tcp_randtbl[1];
static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1];

/*
 * Seed and warm up the additive-feedback generator used by tcp_random().
 * The seed mixes the PROM millisecond clock with the wall clock; the rest
 * of the state is filled by a linear congruential recurrence, then the
 * generator is cycled 10 * DEG_3 times to decorrelate the initial state.
 */
static void
tcp_random_init(void)
{
	int i;
	uint32_t hrt;
	uint32_t wallclock;
	uint32_t result;

	/*
	 *
	 * XXX We don't have high resolution time in standalone... The
	 * following is just some approximation on the comment below.
	 *
	 * Use high-res timer and current time for seed. Gethrtime() returns
	 * a longlong, which may contain resolution down to nanoseconds.
	 * The current time will either be a 32-bit or a 64-bit quantity.
	 * XOR the two together in a 64-bit result variable.
	 * Convert the result to a 32-bit value by multiplying the high-order
	 * 32-bits by the low-order 32-bits.
	 *
	 * XXX We don't have gethrtime() in prom and the wallclock....
	 */

	hrt = prom_gettime();
	wallclock = (uint32_t)time(NULL);
	result = wallclock ^ hrt;
	tcp_random_state[0] = result;

	for (i = 1; i < DEG_3; i++)
		tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1]
		    + 12345;
	tcp_random_fptr = &tcp_random_state[SEP_3];
	tcp_random_rptr = &tcp_random_state[0];
	for (i = 0; i < 10 * DEG_3; i++)
		(void) tcp_random();
}

/*
 * tcp_random: Return a random number in the range [1 - (128K + 1)].
 * This range is selected to be approximately centered on TCP_ISS / 2,
 * and easy to compute. We get this value by generating a 32-bit random
 * number, selecting out the high-order 17 bits, and then adding one so
 * that we never return zero.
 */
static int
tcp_random(void)
{
	int i;

	/* Additive feedback: front word += rear word (mod 2^32). */
	*tcp_random_fptr += *tcp_random_rptr;

	/*
	 * The high-order bits are more random than the low-order bits,
	 * so we select out the high-order 17 bits and add one so that
	 * we never return zero.
	 */
	i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1;
	/* Advance both pointers, wrapping each around the state array. */
	if (++tcp_random_fptr >= tcp_random_end_ptr) {
		tcp_random_fptr = tcp_random_state;
		++tcp_random_rptr;
	} else if (++tcp_random_rptr >= tcp_random_end_ptr)
		tcp_random_rptr = tcp_random_state;

	return (i);
}

/*
 * Generate ISS, taking into account NDD changes may happen halfway through.
 * (If the iss is not zero, set it.)
 */
static void
tcp_iss_init(tcp_t *tcp)
{
	/*
	 * ISS = running increment + time-derived component + random
	 * component; then seed all the sequence-tracking fields of the
	 * connection from it.
	 */
	tcp_iss_incr_extra += (ISS_INCR >> 1);
	tcp->tcp_iss = tcp_iss_incr_extra;
	tcp->tcp_iss += (prom_gettime() >> ISS_NSEC_SHT) + tcp_random();
	tcp->tcp_valid_bits = TCP_ISS_VALID;
	/* No FIN sequence yet: park tcp_fss just below the ISS. */
	tcp->tcp_fss = tcp->tcp_iss - 1;
	tcp->tcp_suna = tcp->tcp_iss;
	tcp->tcp_snxt = tcp->tcp_iss + 1;	/* SYN consumes one seq */
	tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
	tcp->tcp_csuna = tcp->tcp_snxt;
}

/*
 * Diagnostic routine used to return a string associated with the tcp state.
 * Note that if the caller does not supply a buffer, it will use an internal
 * static string. This means that if multiple threads call this function at
 * the same time, output can be corrupted... Note also that this function
 * does not check the size of the supplied buffer. The caller has to make
 * sure that it is big enough.
 */
static char *
tcp_display(tcp_t *tcp, char *sup_buf, char format)
{
	char		buf1[30];
	static char	priv_buf[INET_ADDRSTRLEN * 2 + 80];
	char		*buf;
	char		*cp;
	char		local_addrbuf[INET_ADDRSTRLEN];
	char		remote_addrbuf[INET_ADDRSTRLEN];
	struct in_addr	addr;

	/*
	 * sup_buf - caller-supplied output buffer, or NULL to use the
	 *	     shared static buffer.
	 * format  - DISP_ADDR_AND_PORT or DISP_PORT_ONLY.
	 * Returns the buffer actually written (or "NULL_TCP").
	 */
	if (sup_buf != NULL)
		buf = sup_buf;
	else
		buf = priv_buf;

	if (tcp == NULL)
		return ("NULL_TCP");
	/* Map the numeric state to its printable name. */
	switch (tcp->tcp_state) {
	case TCPS_CLOSED:
		cp = "TCP_CLOSED";
		break;
	case TCPS_IDLE:
		cp = "TCP_IDLE";
		break;
	case TCPS_BOUND:
		cp = "TCP_BOUND";
		break;
	case TCPS_LISTEN:
		cp = "TCP_LISTEN";
		break;
	case TCPS_SYN_SENT:
		cp = "TCP_SYN_SENT";
		break;
	case TCPS_SYN_RCVD:
		cp = "TCP_SYN_RCVD";
		break;
	case TCPS_ESTABLISHED:
		cp = "TCP_ESTABLISHED";
		break;
	case TCPS_CLOSE_WAIT:
		cp = "TCP_CLOSE_WAIT";
		break;
	case TCPS_FIN_WAIT_1:
		cp = "TCP_FIN_WAIT_1";
		break;
	case TCPS_CLOSING:
		cp = "TCP_CLOSING";
		break;
	case TCPS_LAST_ACK:
		cp = "TCP_LAST_ACK";
		break;
	case TCPS_FIN_WAIT_2:
		cp = "TCP_FIN_WAIT_2";
		break;
	case TCPS_TIME_WAIT:
		cp = "TCP_TIME_WAIT";
		break;
	default:
		(void) sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
		cp = buf1;
		break;
	}
	switch (format) {
	case DISP_ADDR_AND_PORT:
		/*
		 * Note that we use the remote address in the tcp_b
		 * structure. This means that it will print out
		 * the real destination address, not the next hop's
		 * address if source routing is used.
		 */
		addr.s_addr = tcp->tcp_bound_source;
		bcopy(inet_ntoa(addr), local_addrbuf, sizeof (local_addrbuf));
		addr.s_addr = tcp->tcp_remote;
		bcopy(inet_ntoa(addr), remote_addrbuf,
		    sizeof (remote_addrbuf));
		/*
		 * NOTE(review): sizeof (priv_buf) is used as the bound even
		 * when the caller supplied its own buffer -- consistent with
		 * the header comment's "caller must make it big enough".
		 */
		(void) snprintf(buf, sizeof (priv_buf), "[%s.%u, %s.%u] %s",
		    local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
		    ntohs(tcp->tcp_fport), cp);
		break;
	case DISP_PORT_ONLY:
	default:
		(void) snprintf(buf, sizeof (priv_buf), "[%u, %u] %s",
		    ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
		break;
	}

	return (buf);
}

/*
 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
 * is filled, return as much as we can. The message passed in may be
 * multi-part, chained using b_cont. "start" is the starting sequence
 * number for this piece.
 */
static mblk_t *
tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
{
	uint32_t	end;
	mblk_t		*mp1;
	mblk_t		*mp2;
	mblk_t		*next_mp;
	uint32_t	u1;

	/* Walk through all the new pieces.
*/ 6475*0Sstevel@tonic-gate do { 6476*0Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 6477*0Sstevel@tonic-gate (uintptr_t)INT_MAX); 6478*0Sstevel@tonic-gate end = start + (int)(mp->b_wptr - mp->b_rptr); 6479*0Sstevel@tonic-gate next_mp = mp->b_cont; 6480*0Sstevel@tonic-gate if (start == end) { 6481*0Sstevel@tonic-gate /* Empty. Blast it. */ 6482*0Sstevel@tonic-gate freeb(mp); 6483*0Sstevel@tonic-gate continue; 6484*0Sstevel@tonic-gate } 6485*0Sstevel@tonic-gate mp->b_cont = NULL; 6486*0Sstevel@tonic-gate TCP_REASS_SET_SEQ(mp, start); 6487*0Sstevel@tonic-gate TCP_REASS_SET_END(mp, end); 6488*0Sstevel@tonic-gate mp1 = tcp->tcp_reass_tail; 6489*0Sstevel@tonic-gate if (!mp1) { 6490*0Sstevel@tonic-gate tcp->tcp_reass_tail = mp; 6491*0Sstevel@tonic-gate tcp->tcp_reass_head = mp; 6492*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6493*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6494*0Sstevel@tonic-gate continue; 6495*0Sstevel@tonic-gate } 6496*0Sstevel@tonic-gate /* New stuff completely beyond tail? */ 6497*0Sstevel@tonic-gate if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 6498*0Sstevel@tonic-gate /* Link it on end. */ 6499*0Sstevel@tonic-gate mp1->b_cont = mp; 6500*0Sstevel@tonic-gate tcp->tcp_reass_tail = mp; 6501*0Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6502*0Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6503*0Sstevel@tonic-gate continue; 6504*0Sstevel@tonic-gate } 6505*0Sstevel@tonic-gate mp1 = tcp->tcp_reass_head; 6506*0Sstevel@tonic-gate u1 = TCP_REASS_SEQ(mp1); 6507*0Sstevel@tonic-gate /* New stuff at the front? */ 6508*0Sstevel@tonic-gate if (SEQ_LT(start, u1)) { 6509*0Sstevel@tonic-gate /* Yes... Check for overlap. 
*/ 6510*0Sstevel@tonic-gate mp->b_cont = mp1; 6511*0Sstevel@tonic-gate tcp->tcp_reass_head = mp; 6512*0Sstevel@tonic-gate tcp_reass_elim_overlap(tcp, mp); 6513*0Sstevel@tonic-gate continue; 6514*0Sstevel@tonic-gate } 6515*0Sstevel@tonic-gate /* 6516*0Sstevel@tonic-gate * The new piece fits somewhere between the head and tail. 6517*0Sstevel@tonic-gate * We find our slot, where mp1 precedes us and mp2 trails. 6518*0Sstevel@tonic-gate */ 6519*0Sstevel@tonic-gate for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 6520*0Sstevel@tonic-gate u1 = TCP_REASS_SEQ(mp2); 6521*0Sstevel@tonic-gate if (SEQ_LEQ(start, u1)) 6522*0Sstevel@tonic-gate break; 6523*0Sstevel@tonic-gate } 6524*0Sstevel@tonic-gate /* Link ourselves in */ 6525*0Sstevel@tonic-gate mp->b_cont = mp2; 6526*0Sstevel@tonic-gate mp1->b_cont = mp; 6527*0Sstevel@tonic-gate 6528*0Sstevel@tonic-gate /* Trim overlap with following mblk(s) first */ 6529*0Sstevel@tonic-gate tcp_reass_elim_overlap(tcp, mp); 6530*0Sstevel@tonic-gate 6531*0Sstevel@tonic-gate /* Trim overlap with preceding mblk */ 6532*0Sstevel@tonic-gate tcp_reass_elim_overlap(tcp, mp1); 6533*0Sstevel@tonic-gate 6534*0Sstevel@tonic-gate } while (start = end, mp = next_mp); 6535*0Sstevel@tonic-gate mp1 = tcp->tcp_reass_head; 6536*0Sstevel@tonic-gate /* Anything ready to go? 
*/
    /*
     * Tail of tcp_reass(): hand back the longest contiguous run of data
     * starting at tcp_rnxt, leaving any out-of-order remainder queued.
     */
    if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt)
        return (NULL);    /* Head of list is not the next expected byte. */
    /* Eat what we can off the queue */
    for (;;) {
        mp = mp1->b_cont;
        end = TCP_REASS_END(mp1);
        /* Clear the reassembly tags before passing the mblk upstream. */
        TCP_REASS_SET_SEQ(mp1, 0);
        TCP_REASS_SET_END(mp1, 0);
        if (!mp) {
            /* Consumed the whole list. */
            tcp->tcp_reass_tail = NULL;
            break;
        }
        if (end != TCP_REASS_SEQ(mp)) {
            /* Hole after mp1; detach the contiguous prefix here. */
            mp1->b_cont = NULL;
            break;
        }
        mp1 = mp;
    }
    /* Return the contiguous prefix; the remainder becomes the new head. */
    mp1 = tcp->tcp_reass_head;
    tcp->tcp_reass_head = mp;
    return (mp1);
}

/*
 * Eliminate any overlap that mp may have over later mblks in the
 * reassembly list.  Trailing mblks wholly covered by mp are unlinked
 * and freed; when mp only partially covers its successor, mp is
 * trimmed back (b_wptr and its END tag) so the ranges no longer
 * intersect.  Updates the duplicate-data MIB counters accordingly.
 */
static void
tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
{
    uint32_t end;
    mblk_t *mp1;
    uint32_t u1;

    end = TCP_REASS_END(mp);
    while ((mp1 = mp->b_cont) != NULL) {
        u1 = TCP_REASS_SEQ(mp1);
        if (!SEQ_GT(end, u1))
            break;    /* No overlap with the next mblk; done. */
        if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) {
            /* mp only partially covers mp1: trim mp back to u1. */
            mp->b_wptr -= end - u1;
            TCP_REASS_SET_END(mp, u1);
            BUMP_MIB(tcp_mib.tcpInDataPartDupSegs);
            UPDATE_MIB(tcp_mib.tcpInDataPartDupBytes, end - u1);
            break;
        }
        /* mp1 is wholly contained in mp: unlink and free it. */
        mp->b_cont = mp1->b_cont;
        freeb(mp1);
        BUMP_MIB(tcp_mib.tcpInDataDupSegs);
        UPDATE_MIB(tcp_mib.tcpInDataDupBytes, end - u1);
    }
    /* If we ran off the end of the list, mp is the new tail. */
    if (!mp1)
        tcp->tcp_reass_tail = mp;
}

/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 */
static void
tcp_time_wait_remove(tcp_t *tcp)
{
    /* An expire time of zero is the "not on the list" sentinel. */
    if (tcp->tcp_time_wait_expire == 0) {
        assert(tcp->tcp_time_wait_next == NULL);
        assert(tcp->tcp_time_wait_prev == NULL);
        return;
    }
    assert(tcp->tcp_state == TCPS_TIME_WAIT);
    if (tcp == tcp_time_wait_head) {
        /* Unlink from the head of the doubly linked list. */
        assert(tcp->tcp_time_wait_prev == NULL);
        tcp_time_wait_head = tcp->tcp_time_wait_next;
        if (tcp_time_wait_head != NULL) {
            tcp_time_wait_head->tcp_time_wait_prev = NULL;
        } else {
            tcp_time_wait_tail = NULL;
        }
    } else if (tcp == tcp_time_wait_tail) {
        /* Unlink from the tail. */
        assert(tcp != tcp_time_wait_head);
        assert(tcp->tcp_time_wait_next == NULL);
        tcp_time_wait_tail = tcp->tcp_time_wait_prev;
        assert(tcp_time_wait_tail != NULL);
        tcp_time_wait_tail->tcp_time_wait_next = NULL;
    } else {
        /* Unlink from the middle. */
        assert(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
        assert(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
tcp->tcp_time_wait_prev->tcp_time_wait_next =
            tcp->tcp_time_wait_next;
        tcp->tcp_time_wait_next->tcp_time_wait_prev =
            tcp->tcp_time_wait_prev;
    }
    /* Mark the tcp as off-list; expire == 0 is the "not queued" sentinel. */
    tcp->tcp_time_wait_next = NULL;
    tcp->tcp_time_wait_prev = NULL;
    tcp->tcp_time_wait_expire = 0;
}

/*
 * Add a connection to the list of detached TIME_WAIT connections
 * and set its time to expire ...
 */
static void
tcp_time_wait_append(tcp_t *tcp)
{
    /*
     * Stamp the absolute expiry time.  Zero is reserved as the
     * "not on the list" sentinel, so nudge a wrapped value to 1.
     */
    tcp->tcp_time_wait_expire = prom_gettime() + tcp_time_wait_interval;
    if (tcp->tcp_time_wait_expire == 0)
        tcp->tcp_time_wait_expire = 1;

    /* Append at the tail; the list stays ordered by expiry time. */
    if (tcp_time_wait_head == NULL) {
        assert(tcp_time_wait_tail == NULL);
        tcp_time_wait_head = tcp;
    } else {
        assert(tcp_time_wait_tail != NULL);
        assert(tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT);
        tcp_time_wait_tail->tcp_time_wait_next = tcp;
        tcp->tcp_time_wait_prev = tcp_time_wait_tail;
    }
    tcp_time_wait_tail = tcp;

    /* for ndd stats about compression */
    tcp_cum_timewait++;
}

/*
 * Periodic qtimeout routine run on the default queue.
 * Performs 2 functions.
 * 1.
Does TIME_WAIT compression on all recently added tcps. List
 *    traversal is done backwards from the tail.
 * 2. Blows away all tcps whose TIME_WAIT has expired. List traversal
 *    is done forwards from the head.
 *
 * NOTE(review): only the reaping pass (2) is visible in this boot
 * version; no compression traversal appears below -- confirm against
 * the full kernel tcp.c before relying on (1).
 */
void
tcp_time_wait_collector(void)
{
    tcp_t *tcp;
    uint32_t now;

    /*
     * In order to reap time waits reliably, we should use a
     * source of time that is not adjustable by the user
     */
    now = prom_gettime();
    while ((tcp = tcp_time_wait_head) != NULL) {
        /*
         * Compare times using modular arithmetic, since
         * lbolt can wrapover.  The list is ordered by expiry,
         * so stop at the first entry still in the future.
         */
        if ((int32_t)(now - tcp->tcp_time_wait_expire) < 0) {
            break;
        }
        /*
         * Note that the err must be 0 as there is no socket
         * associated with this TCP...
         */
        (void) tcp_clean_death(-1, tcp, 0);
    }
    /* Schedule next run time (now + 10000 ticks of prom_gettime()). */
    tcp_time_wait_runtime = prom_gettime() + 10000;
}

/* Debug aid: dump every TIME_WAIT connection and its expiry time. */
void
tcp_time_wait_report(void)
{
    tcp_t *tcp;

    printf("Current time %u\n", prom_gettime());
    for (tcp = tcp_time_wait_head; tcp != NULL;
        tcp = tcp->tcp_time_wait_next) {
        printf("%s expires at %u\n", tcp_display(tcp, NULL,
            DISP_ADDR_AND_PORT), tcp->tcp_time_wait_expire);
    }
}

/*
 * Send up all messages queued on tcp_rcv_list.
 * Have to set tcp_co_norm since we use putnext.
 */
static void
tcp_rcv_drain(int sock_id, tcp_t *tcp)
{
    mblk_t *mp;
    struct inetgram *in_gram;
    mblk_t *in_mp;
    int len;

    /* Don't drain if the app has not finished reading all the data.
*/
    if (sockets[sock_id].so_rcvbuf <= 0)
        return;

    /* We might have come here just to update the rwnd */
    if (tcp->tcp_rcv_list == NULL)
        goto win_update;

    /* One inetgram plus one mblk big enough to hold the whole rcv list. */
    if ((in_gram = (struct inetgram *)bkmem_zalloc(
        sizeof (struct inetgram))) == NULL) {
        return;
    }
    if ((in_mp = allocb(tcp->tcp_rcv_cnt, 0)) == NULL) {
        bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
        return;
    }
    in_gram->igm_level = APP_LVL;
    in_gram->igm_mp = in_mp;
    in_gram->igm_id = 0;

    /* Coalesce the b_cont chain into the single flat mblk, freeing as we go. */
    while ((mp = tcp->tcp_rcv_list) != NULL) {
        tcp->tcp_rcv_list = mp->b_cont;
        len = mp->b_wptr - mp->b_rptr;
        bcopy(mp->b_rptr, in_mp->b_wptr, len);
        in_mp->b_wptr += len;
        freeb(mp);
    }

    tcp->tcp_rcv_last_tail = NULL;
    tcp->tcp_rcv_cnt = 0;
    add_grams(&sockets[sock_id].inq, in_gram);

    /* This means that so_rcvbuf can be less than 0. */
    sockets[sock_id].so_rcvbuf -= in_mp->b_wptr - in_mp->b_rptr;
win_update:
    /*
     * Increase the receive window to max.  But we need to do receiver
     * SWS avoidance.  This means that we need to check that the increase
     * of the receive window is at least 1 MSS.
     */
    if (sockets[sock_id].so_rcvbuf > 0 &&
        (tcp->tcp_rwnd_max - tcp->tcp_rwnd >= tcp->tcp_mss)) {
        tcp->tcp_rwnd = tcp->tcp_rwnd_max;
        U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
            tcp->tcp_tcph->th_win);
    }
}

/*
 * Wrapper for recvfrom to call
 */
void
tcp_rcv_drain_sock(int sock_id)
{
    tcp_t *tcp;
    if ((tcp = sockets[sock_id].pcb) == NULL)
        return;
    tcp_rcv_drain(sock_id, tcp);
}

/*
 * If the inq == NULL and the tcp_rcv_list != NULL, we have data that
 * recvfrom could read. Place a magic message in the inq to let recvfrom
 * know that it needs to call tcp_rcv_drain_sock to pullup the data.
6780*0Sstevel@tonic-gate */ 6781*0Sstevel@tonic-gate static void 6782*0Sstevel@tonic-gate tcp_drain_needed(int sock_id, tcp_t *tcp) 6783*0Sstevel@tonic-gate { 6784*0Sstevel@tonic-gate struct inetgram *in_gram; 6785*0Sstevel@tonic-gate #ifdef DEBUG 6786*0Sstevel@tonic-gate printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n", 6787*0Sstevel@tonic-gate sockets[sock_id].inq, tcp->tcp_rcv_list); 6788*0Sstevel@tonic-gate #endif 6789*0Sstevel@tonic-gate if ((sockets[sock_id].inq != NULL) || 6790*0Sstevel@tonic-gate (tcp->tcp_rcv_list == NULL)) 6791*0Sstevel@tonic-gate return; 6792*0Sstevel@tonic-gate 6793*0Sstevel@tonic-gate if ((in_gram = (struct inetgram *)bkmem_zalloc( 6794*0Sstevel@tonic-gate sizeof (struct inetgram))) == NULL) 6795*0Sstevel@tonic-gate return; 6796*0Sstevel@tonic-gate 6797*0Sstevel@tonic-gate in_gram->igm_level = APP_LVL; 6798*0Sstevel@tonic-gate in_gram->igm_mp = NULL; 6799*0Sstevel@tonic-gate in_gram->igm_id = TCP_CALLB_MAGIC_ID; 6800*0Sstevel@tonic-gate 6801*0Sstevel@tonic-gate add_grams(&sockets[sock_id].inq, in_gram); 6802*0Sstevel@tonic-gate } 6803*0Sstevel@tonic-gate 6804*0Sstevel@tonic-gate /* 6805*0Sstevel@tonic-gate * Queue data on tcp_rcv_list which is a b_next chain. 6806*0Sstevel@tonic-gate * Each element of the chain is a b_cont chain. 6807*0Sstevel@tonic-gate * 6808*0Sstevel@tonic-gate * M_DATA messages are added to the current element. 6809*0Sstevel@tonic-gate * Other messages are added as new (b_next) elements. 
*/
static void
tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
{
    /* Caller must pass the exact byte count of the message. */
    assert(seg_len == msgdsize(mp));
    if (tcp->tcp_rcv_list == NULL) {
        tcp->tcp_rcv_list = mp;
    } else {
        tcp->tcp_rcv_last_tail->b_cont = mp;
    }
    /* Walk to the last mblk of the appended chain to cache the tail. */
    while (mp->b_cont)
        mp = mp->b_cont;
    tcp->tcp_rcv_last_tail = mp;
    tcp->tcp_rcv_cnt += seg_len;
    /* Shrink the advertised window by the bytes now buffered locally. */
    tcp->tcp_rwnd -= seg_len;
#ifdef DEBUG
    printf("tcp_rcv_enqueue rwnd %d\n", tcp->tcp_rwnd);
#endif
    U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
}

/* The minimum of smoothed mean deviation in RTO calculation. */
#define	TCP_SD_MIN	400

/*
 * Set RTO for this connection. The formula is from Jacobson and Karels'
 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
 * are the same as those in Appendix A.2 of that paper.
 *
 * m = new measurement
 * sa = smoothed RTT average (8 * average estimates).
 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
*/
static void
tcp_set_rto(tcp_t *tcp, int32_t rtt)
{
    int32_t m = rtt;
    uint32_t sa = tcp->tcp_rtt_sa;
    uint32_t sv = tcp->tcp_rtt_sd;
    uint32_t rto;

    BUMP_MIB(tcp_mib.tcpRttUpdate);
    tcp->tcp_rtt_update++;

    /*
     * A nonzero tcp_rtt_sa means we already have RTT history; fold the
     * new measurement into the existing estimators.  (sa == 0 denotes
     * reinitialization, handled in the else branch below.)
     */
    if (sa != 0) {
        /*
         * Update average estimator:
         *    new rtt = 7/8 old rtt + 1/8 Error
         */

        /* m is now Error in estimate. */
        m -= sa >> 3;
        if ((int32_t)(sa += m) <= 0) {
            /*
             * Don't allow the smoothed average to be negative.
             * We use 0 to denote reinitialization of the
             * variables.
             */
            sa = 1;
        }

        /*
         * Update deviation estimator:
         *    new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
         */
        if (m < 0)
            m = -m;
        m -= sv >> 2;
        sv += m;
    } else {
        /*
         * This follows BSD's implementation. So the reinitialized
         * RTO is 3 * m. We cannot go less than 2 because if the
         * link is bandwidth dominated, doubling the window size
         * during slow start means doubling the RTT. We want to be
         * more conservative when we reinitialize our estimates. 3
         * is just a convenient number.
         */
        sa = m << 3;
        sv = m << 1;
    }
    if (sv < TCP_SD_MIN) {
        /*
         * We do not know that if sa captures the delay ACK
         * effect as in a long train of segments, a receiver
         * does not delay its ACKs. So set the minimum of sv
         * to be TCP_SD_MIN, which is default to 400 ms, twice
         * of BSD DATO. That means the minimum of mean
         * deviation is 100 ms.
         */
        sv = TCP_SD_MIN;
    }
    tcp->tcp_rtt_sa = sa;
    tcp->tcp_rtt_sd = sv;
    /*
     * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
     *
     * Add tcp_rexmit_interval extra in case of extreme environment
     * where the algorithm fails to work. The default value of
     * tcp_rexmit_interval_extra should be 0.
     *
     * As we use a finer grained clock than BSD and update
     * RTO for every ACKs, add in another .25 of RTT to the
     * deviation of RTO to accomodate burstiness of 1/4 of
     * window size.
     */
    rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5);

    /* Clamp the result into the configured [min, max] interval. */
    if (rto > tcp_rexmit_interval_max) {
        tcp->tcp_rto = tcp_rexmit_interval_max;
    } else if (rto < tcp_rexmit_interval_min) {
        tcp->tcp_rto = tcp_rexmit_interval_min;
    } else {
        tcp->tcp_rto = rto;
    }

    /* Now, we can reset tcp_timer_backoff to use the new RTO... */
    tcp->tcp_timer_backoff = 0;
}

/*
 * Initiate closedown sequence on an active connection.
 * Return value zero for OK return, non-zero for error return.
 */
static int
tcp_xmit_end(tcp_t *tcp, int sock_id)
{
    mblk_t *mp;

    if (tcp->tcp_state < TCPS_SYN_RCVD ||
        tcp->tcp_state > TCPS_CLOSE_WAIT) {
        /*
         * Invalid state, only states TCPS_SYN_RCVD,
         * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
         */
        return (-1);
    }

    /* The FIN takes the sequence number just past all unsent data. */
    tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
    tcp->tcp_valid_bits |= TCP_FSS_VALID;
    /*
     * If there is nothing more unsent, send the FIN now.
     * Otherwise, it will go out with the last segment.
*/
    if (tcp->tcp_unsent == 0) {
        /* Build a segment carrying the FIN sequence number. */
        mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
            tcp->tcp_fss, B_FALSE, NULL, B_FALSE);

        if (mp != NULL) {
            /* Dump the packet when debugging. */
            TCP_DUMP_PACKET("tcp_xmit_end", mp);
            (void) ipv4_tcp_output(sock_id, mp);
            freeb(mp);
        } else {
            /*
             * Couldn't allocate msg. Pretend we got it out.
             * Wait for rexmit timeout.
             */
            tcp->tcp_snxt = tcp->tcp_fss + 1;
            TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
        }

        /*
         * If needed, update tcp_rexmit_snxt as tcp_snxt is
         * changed.
         */
        if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
            tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
        }
    } else {
        /* Unsent data pending: the FIN rides out with the last segment. */
        tcp_wput_data(tcp, NULL, B_FALSE);
    }

    return (0);
}

/*
 * Set a socket-, TCP- or IP-level option on this connection.
 * Returns 0 on success, -1 with errno set on failure.  Only the
 * SOL_SOCKET options SO_RCVBUF, SO_SNDBUF and SO_LINGER are handled
 * below; every other option yields ENOPROTOOPT.
 */
int
tcp_opt_set(tcp_t *tcp, int level, int option, const void *optval,
    socklen_t optlen)
{
    switch (level) {
    case SOL_SOCKET: {
        switch (option) {
        case SO_RCVBUF:
            if (optlen == sizeof (int)) {
                int val = *(int *)optval;

                /* Receive buffer is capped at tcp_max_buf. */
                if (val > tcp_max_buf) {
                    errno = ENOBUFS;
                    break;
                }
                /* Silently ignore zero */
                if (val != 0) {
                    /* Round up to a multiple of the MSS. */
                    val = MSS_ROUNDUP(val, tcp->tcp_mss);
                    (void) tcp_rwnd_set(tcp, val);
                }
            } else {
                errno = EINVAL;
            }
            break;
        case SO_SNDBUF:
            if (optlen == sizeof (int)) {
                tcp->tcp_xmit_hiwater = *(int *)optval;
                if (tcp->tcp_xmit_hiwater > tcp_max_buf)
                    tcp->tcp_xmit_hiwater = tcp_max_buf;
            } else {
                errno = EINVAL;
            }
            break;
        case SO_LINGER:
            if (optlen == sizeof (struct linger)) {
struct linger *lgr = (struct linger *)optval;

                if (lgr->l_onoff) {
                    tcp->tcp_linger = 1;
                    tcp->tcp_lingertime = lgr->l_linger;
                } else {
                    /* Linger off: clear both the flag and the timeout. */
                    tcp->tcp_linger = 0;
                    tcp->tcp_lingertime = 0;
                }
            } else {
                errno = EINVAL;
            }
            break;
        default:
            errno = ENOPROTOOPT;
            break;
        }
        break;
    } /* case SOL_SOCKET */
    case IPPROTO_TCP: {
        /* No TCP-level options are supported. */
        switch (option) {
        default:
            errno = ENOPROTOOPT;
            break;
        }
        break;
    } /* case IPPROTO_TCP */
    case IPPROTO_IP: {
        /* No IP-level options are supported. */
        switch (option) {
        default:
            errno = ENOPROTOOPT;
            break;
        }
        break;
    } /* case IPPROTO_IP */
    default:
        errno = ENOPROTOOPT;
        break;
    } /* switch (level) */

    /*
     * NOTE(review): errno is never cleared on entry, so a stale nonzero
     * errno left over from an earlier, unrelated failure would make a
     * successful option set appear to fail here -- verify callers reset
     * errno, or clear it at the top of this function.
     */
    if (errno != 0)
        return (-1);
    else
        return (0);
}