xref: /onnv-gate/usr/src/uts/common/inet/tcp/tcp_input.c (revision 13062:36a559d3de13)
111754SKacheong.Poon@Sun.COM /*
211754SKacheong.Poon@Sun.COM  * CDDL HEADER START
311754SKacheong.Poon@Sun.COM  *
411754SKacheong.Poon@Sun.COM  * The contents of this file are subject to the terms of the
511754SKacheong.Poon@Sun.COM  * Common Development and Distribution License (the "License").
611754SKacheong.Poon@Sun.COM  * You may not use this file except in compliance with the License.
711754SKacheong.Poon@Sun.COM  *
811754SKacheong.Poon@Sun.COM  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
911754SKacheong.Poon@Sun.COM  * or http://www.opensolaris.org/os/licensing.
1011754SKacheong.Poon@Sun.COM  * See the License for the specific language governing permissions
1111754SKacheong.Poon@Sun.COM  * and limitations under the License.
1211754SKacheong.Poon@Sun.COM  *
1311754SKacheong.Poon@Sun.COM  * When distributing Covered Code, include this CDDL HEADER in each
1411754SKacheong.Poon@Sun.COM  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1511754SKacheong.Poon@Sun.COM  * If applicable, add the following below this CDDL HEADER, with the
1611754SKacheong.Poon@Sun.COM  * fields enclosed by brackets "[]" replaced with your own identifying
1711754SKacheong.Poon@Sun.COM  * information: Portions Copyright [yyyy] [name of copyright owner]
1811754SKacheong.Poon@Sun.COM  *
1911754SKacheong.Poon@Sun.COM  * CDDL HEADER END
2011754SKacheong.Poon@Sun.COM  */
2111754SKacheong.Poon@Sun.COM 
2211754SKacheong.Poon@Sun.COM /*
2312056SKacheong.Poon@Sun.COM  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
2411754SKacheong.Poon@Sun.COM  */
2511754SKacheong.Poon@Sun.COM 
2611754SKacheong.Poon@Sun.COM /* This file contains all TCP input processing functions. */
2711754SKacheong.Poon@Sun.COM 
2811754SKacheong.Poon@Sun.COM #include <sys/types.h>
2911754SKacheong.Poon@Sun.COM #include <sys/stream.h>
3011754SKacheong.Poon@Sun.COM #include <sys/strsun.h>
3111754SKacheong.Poon@Sun.COM #include <sys/strsubr.h>
3211754SKacheong.Poon@Sun.COM #include <sys/stropts.h>
3311754SKacheong.Poon@Sun.COM #include <sys/strlog.h>
3411754SKacheong.Poon@Sun.COM #define	_SUN_TPI_VERSION 2
3511754SKacheong.Poon@Sun.COM #include <sys/tihdr.h>
3611754SKacheong.Poon@Sun.COM #include <sys/suntpi.h>
3711754SKacheong.Poon@Sun.COM #include <sys/xti_inet.h>
3811754SKacheong.Poon@Sun.COM #include <sys/squeue_impl.h>
3911754SKacheong.Poon@Sun.COM #include <sys/squeue.h>
4011754SKacheong.Poon@Sun.COM #include <sys/tsol/tnet.h>
4111754SKacheong.Poon@Sun.COM 
4211754SKacheong.Poon@Sun.COM #include <inet/common.h>
4311754SKacheong.Poon@Sun.COM #include <inet/ip.h>
4411754SKacheong.Poon@Sun.COM #include <inet/tcp.h>
4511754SKacheong.Poon@Sun.COM #include <inet/tcp_impl.h>
4611754SKacheong.Poon@Sun.COM #include <inet/tcp_cluster.h>
4711754SKacheong.Poon@Sun.COM #include <inet/proto_set.h>
4811754SKacheong.Poon@Sun.COM #include <inet/ipsec_impl.h>
4911754SKacheong.Poon@Sun.COM 
/*
 * RFC 1323-recommended phrasing of the TSTAMP option, for easier parsing.
 *
 * The constant is built so that, on either byte order, its in-memory byte
 * sequence is NOP, NOP, TSTAMP kind, TSTAMP length.  The option length is
 * taken from TCPOPT_TSTAMP_LEN rather than spelled as a bare number.
 */

#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP						\
	((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |			\
	(TCPOPT_TSTAMP << 8) | TCPOPT_TSTAMP_LEN)
#else
#define	TCPOPT_NOP_NOP_TSTAMP						\
	((TCPOPT_TSTAMP_LEN << 24) | (TCPOPT_TSTAMP << 16) |		\
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif
6111754SKacheong.Poon@Sun.COM 
/*
 * Flags returned from tcp_parse_options().  Each flag is a distinct bit so
 * the return value can be tested with a bitwise AND.
 */
#define	TCP_OPT_MSS_PRESENT	0x01
#define	TCP_OPT_WSCALE_PRESENT	0x02
#define	TCP_OPT_TSTAMP_PRESENT	0x04
#define	TCP_OPT_SACK_OK_PRESENT	0x08
#define	TCP_OPT_SACK_PRESENT	0x10
7011754SKacheong.Poon@Sun.COM 
/*
 * PAWS needs a timer for 24 days.  This is the number of ticks in 24 days.
 *
 * The first factor is widened to clock_t before multiplying so the whole
 * product is computed in clock_t.  Computing it in int and casting the
 * result afterwards overflows once hz exceeds 1035.
 */
#define	PAWS_TIMEOUT	((clock_t)24 * 24 * 60 * 60 * hz)
7511754SKacheong.Poon@Sun.COM 
/*
 * Since tcp_listener is not cleared atomically with tcp_detached
 * being cleared we need this extra bit to tell a detached connection
 * apart from one that is in the process of being accepted.
 *
 * Evaluates to non-zero when the tcp_t is detached and its
 * tcp_hard_binding bit is clear.  The argument is evaluated twice.
 */
#define	TCP_IS_DETACHED_NONEAGER(tcp)	\
	(TCP_IS_DETACHED(tcp) &&	\
	    (!(tcp)->tcp_hard_binding))
8411754SKacheong.Poon@Sun.COM 
/*
 * Steps to do when a tcp_t moves to TIME-WAIT state.
 *
 * This connection is done, we don't need to account for it.  Decrement
 * the listener connection counter if needed.
 *
 * Decrement the connection counter of the stack.  Note that this counter
 * is per CPU.  So the total number of connections in a stack is the sum of all
 * of them.  Since there is no lock for handling all of them exclusively, the
 * resulting sum is only an approximation.
 *
 * Unconditionally clear the exclusive binding bit so this TIME-WAIT
 * connection won't interfere with new ones.
 *
 * Start the TIME-WAIT timer.  If upper layer has not closed the connection,
 * the timer is handled within the context of this tcp_t.  When the timer
 * fires, tcp_clean_death() is called.  If upper layer closes the connection
 * during this period, tcp_time_wait_append() will be called to add this
 * tcp_t to the global TIME-WAIT list.  Note that this means that the
 * actual wait time in TIME-WAIT state will be longer than the
 * tcps_time_wait_interval since the period before upper layer closes the
 * connection is not accounted for when tcp_time_wait_append() is called.
 *
 * If upper layer has closed the connection, call tcp_time_wait_append()
 * directly.
 *
 * Usage notes: every argument is evaluated more than once, so pass plain
 * variables.  The macro expands to a bare brace block rather than a
 * do { } while (0) statement, so it must not be used as the unbraced body
 * of an if that has an else branch.
 */
#define	SET_TIME_WAIT(tcps, tcp, connp)				\
{								\
	(tcp)->tcp_state = TCPS_TIME_WAIT;			\
	if ((tcp)->tcp_listen_cnt != NULL)			\
		TCP_DECR_LISTEN_CNT(tcp);			\
	atomic_dec_64(						\
	    (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt); \
	(connp)->conn_exclbind = 0;				\
	if (!TCP_IS_DETACHED(tcp)) {				\
		TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \
	} else {						\
		tcp_time_wait_append(tcp);			\
		TCP_DBGSTAT(tcps, tcp_rput_time_wait);		\
	}							\
}
12711754SKacheong.Poon@Sun.COM 
/*
 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
 * data, TCP will not respond with an ACK.  RFC 793 requires that
 * TCP responds with an ACK for such a bogus ACK.  By not following
 * the RFC, we prevent TCP from getting into an ACK storm if somehow
 * an attacker successfully spoofs an acceptable segment to our
 * peer; or when our peer is "confused."
 */
static uint32_t tcp_drop_ack_unsent_cnt = 10;
13811754SKacheong.Poon@Sun.COM 
/*
 * To protect TCP against an attacker using a small window and requesting
 * a large amount of data (DoS attack by consuming memory), TCP checks the
 * window advertised in the last ACK of the 3-way handshake.  TCP uses
 * the tcp_mss (the size of one packet) value for comparison.  The window
 * should be larger than tcp_mss.  But while a sane TCP should advertise
 * a receive window larger than or equal to 4*MSS to avoid stop and go
 * traffic, not all TCP stacks do that.  This is especially true when
 * tcp_mss is a big value.
 *
 * To work around this issue, an additional fixed value for comparison
 * is also used.  If the advertised window is smaller than both tcp_mss
 * and tcp_init_wnd_chk, the ACK is considered as invalid.  So for large
 * tcp_mss value (say, 8K), a window larger than tcp_init_wnd_chk but
 * smaller than 8K is considered to be OK.
 */
static uint32_t tcp_init_wnd_chk = 4096;
15611754SKacheong.Poon@Sun.COM 
15711754SKacheong.Poon@Sun.COM /* Process ICMP source quench message or not. */
15811754SKacheong.Poon@Sun.COM static boolean_t tcp_icmp_source_quench = B_FALSE;
15911754SKacheong.Poon@Sun.COM 
16011754SKacheong.Poon@Sun.COM static boolean_t tcp_outbound_squeue_switch = B_FALSE;
16111754SKacheong.Poon@Sun.COM 
16211754SKacheong.Poon@Sun.COM static mblk_t	*tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
16311754SKacheong.Poon@Sun.COM 		    ip_recv_attr_t *);
16411754SKacheong.Poon@Sun.COM static mblk_t	*tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
16511754SKacheong.Poon@Sun.COM 		    ip_recv_attr_t *);
16611754SKacheong.Poon@Sun.COM static boolean_t	tcp_drop_q0(tcp_t *);
16711754SKacheong.Poon@Sun.COM static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
16811754SKacheong.Poon@Sun.COM static mblk_t	*tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
16911754SKacheong.Poon@Sun.COM 		    ip_recv_attr_t *);
17011754SKacheong.Poon@Sun.COM static void	tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
17111754SKacheong.Poon@Sun.COM static int	tcp_parse_options(tcpha_t *, tcp_opt_t *);
17211754SKacheong.Poon@Sun.COM static void	tcp_process_options(tcp_t *, tcpha_t *);
17311754SKacheong.Poon@Sun.COM static mblk_t	*tcp_reass(tcp_t *, mblk_t *, uint32_t);
17411754SKacheong.Poon@Sun.COM static void	tcp_reass_elim_overlap(tcp_t *, mblk_t *);
17511754SKacheong.Poon@Sun.COM static void	tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
17611754SKacheong.Poon@Sun.COM static void	tcp_set_rto(tcp_t *, time_t);
17711754SKacheong.Poon@Sun.COM static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
17811754SKacheong.Poon@Sun.COM 
17911754SKacheong.Poon@Sun.COM /*
18011754SKacheong.Poon@Sun.COM  * Set the MSS associated with a particular tcp based on its current value,
18111754SKacheong.Poon@Sun.COM  * and a new one passed in. Observe minimums and maximums, and reset other
18211754SKacheong.Poon@Sun.COM  * state variables that we want to view as multiples of MSS.
18311754SKacheong.Poon@Sun.COM  *
18411754SKacheong.Poon@Sun.COM  * The value of MSS could be either increased or descreased.
18511754SKacheong.Poon@Sun.COM  */
18611754SKacheong.Poon@Sun.COM void
tcp_mss_set(tcp_t * tcp,uint32_t mss)18711754SKacheong.Poon@Sun.COM tcp_mss_set(tcp_t *tcp, uint32_t mss)
18811754SKacheong.Poon@Sun.COM {
18911754SKacheong.Poon@Sun.COM 	uint32_t	mss_max;
19011754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
19111754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
19211754SKacheong.Poon@Sun.COM 
19311754SKacheong.Poon@Sun.COM 	if (connp->conn_ipversion == IPV4_VERSION)
19411754SKacheong.Poon@Sun.COM 		mss_max = tcps->tcps_mss_max_ipv4;
19511754SKacheong.Poon@Sun.COM 	else
19611754SKacheong.Poon@Sun.COM 		mss_max = tcps->tcps_mss_max_ipv6;
19711754SKacheong.Poon@Sun.COM 
19811754SKacheong.Poon@Sun.COM 	if (mss < tcps->tcps_mss_min)
19911754SKacheong.Poon@Sun.COM 		mss = tcps->tcps_mss_min;
20011754SKacheong.Poon@Sun.COM 	if (mss > mss_max)
20111754SKacheong.Poon@Sun.COM 		mss = mss_max;
20211754SKacheong.Poon@Sun.COM 	/*
20311754SKacheong.Poon@Sun.COM 	 * Unless naglim has been set by our client to
20411754SKacheong.Poon@Sun.COM 	 * a non-mss value, force naglim to track mss.
20511754SKacheong.Poon@Sun.COM 	 * This can help to aggregate small writes.
20611754SKacheong.Poon@Sun.COM 	 */
20711754SKacheong.Poon@Sun.COM 	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
20811754SKacheong.Poon@Sun.COM 		tcp->tcp_naglim = mss;
20911754SKacheong.Poon@Sun.COM 	/*
21011754SKacheong.Poon@Sun.COM 	 * TCP should be able to buffer at least 4 MSS data for obvious
21111754SKacheong.Poon@Sun.COM 	 * performance reason.
21211754SKacheong.Poon@Sun.COM 	 */
21311754SKacheong.Poon@Sun.COM 	if ((mss << 2) > connp->conn_sndbuf)
21411754SKacheong.Poon@Sun.COM 		connp->conn_sndbuf = mss << 2;
21511754SKacheong.Poon@Sun.COM 
21611754SKacheong.Poon@Sun.COM 	/*
21711754SKacheong.Poon@Sun.COM 	 * Set the send lowater to at least twice of MSS.
21811754SKacheong.Poon@Sun.COM 	 */
21911754SKacheong.Poon@Sun.COM 	if ((mss << 1) > connp->conn_sndlowat)
22011754SKacheong.Poon@Sun.COM 		connp->conn_sndlowat = mss << 1;
22111754SKacheong.Poon@Sun.COM 
22211754SKacheong.Poon@Sun.COM 	/*
22311754SKacheong.Poon@Sun.COM 	 * Update tcp_cwnd according to the new value of MSS. Keep the
22411754SKacheong.Poon@Sun.COM 	 * previous ratio to preserve the transmit rate.
22511754SKacheong.Poon@Sun.COM 	 */
22611754SKacheong.Poon@Sun.COM 	tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
22711754SKacheong.Poon@Sun.COM 	tcp->tcp_cwnd_cnt = 0;
22811754SKacheong.Poon@Sun.COM 
22911754SKacheong.Poon@Sun.COM 	tcp->tcp_mss = mss;
23011754SKacheong.Poon@Sun.COM 	(void) tcp_maxpsz_set(tcp, B_TRUE);
23111754SKacheong.Poon@Sun.COM }
23211754SKacheong.Poon@Sun.COM 
23311754SKacheong.Poon@Sun.COM /*
23411754SKacheong.Poon@Sun.COM  * Extract option values from a tcp header.  We put any found values into the
23511754SKacheong.Poon@Sun.COM  * tcpopt struct and return a bitmask saying which options were found.
23611754SKacheong.Poon@Sun.COM  */
23711754SKacheong.Poon@Sun.COM static int
tcp_parse_options(tcpha_t * tcpha,tcp_opt_t * tcpopt)23811754SKacheong.Poon@Sun.COM tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
23911754SKacheong.Poon@Sun.COM {
24011754SKacheong.Poon@Sun.COM 	uchar_t		*endp;
24111754SKacheong.Poon@Sun.COM 	int		len;
24211754SKacheong.Poon@Sun.COM 	uint32_t	mss;
24311754SKacheong.Poon@Sun.COM 	uchar_t		*up = (uchar_t *)tcpha;
24411754SKacheong.Poon@Sun.COM 	int		found = 0;
24511754SKacheong.Poon@Sun.COM 	int32_t		sack_len;
24611754SKacheong.Poon@Sun.COM 	tcp_seq		sack_begin, sack_end;
24711754SKacheong.Poon@Sun.COM 	tcp_t		*tcp;
24811754SKacheong.Poon@Sun.COM 
24911754SKacheong.Poon@Sun.COM 	endp = up + TCP_HDR_LENGTH(tcpha);
25011754SKacheong.Poon@Sun.COM 	up += TCP_MIN_HEADER_LENGTH;
25111754SKacheong.Poon@Sun.COM 	while (up < endp) {
25211754SKacheong.Poon@Sun.COM 		len = endp - up;
25311754SKacheong.Poon@Sun.COM 		switch (*up) {
25411754SKacheong.Poon@Sun.COM 		case TCPOPT_EOL:
25511754SKacheong.Poon@Sun.COM 			break;
25611754SKacheong.Poon@Sun.COM 
25711754SKacheong.Poon@Sun.COM 		case TCPOPT_NOP:
25811754SKacheong.Poon@Sun.COM 			up++;
25911754SKacheong.Poon@Sun.COM 			continue;
26011754SKacheong.Poon@Sun.COM 
26111754SKacheong.Poon@Sun.COM 		case TCPOPT_MAXSEG:
26211754SKacheong.Poon@Sun.COM 			if (len < TCPOPT_MAXSEG_LEN ||
26311754SKacheong.Poon@Sun.COM 			    up[1] != TCPOPT_MAXSEG_LEN)
26411754SKacheong.Poon@Sun.COM 				break;
26511754SKacheong.Poon@Sun.COM 
26611754SKacheong.Poon@Sun.COM 			mss = BE16_TO_U16(up+2);
26711754SKacheong.Poon@Sun.COM 			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
26811754SKacheong.Poon@Sun.COM 			tcpopt->tcp_opt_mss = mss;
26911754SKacheong.Poon@Sun.COM 			found |= TCP_OPT_MSS_PRESENT;
27011754SKacheong.Poon@Sun.COM 
27111754SKacheong.Poon@Sun.COM 			up += TCPOPT_MAXSEG_LEN;
27211754SKacheong.Poon@Sun.COM 			continue;
27311754SKacheong.Poon@Sun.COM 
27411754SKacheong.Poon@Sun.COM 		case TCPOPT_WSCALE:
27511754SKacheong.Poon@Sun.COM 			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
27611754SKacheong.Poon@Sun.COM 				break;
27711754SKacheong.Poon@Sun.COM 
27811754SKacheong.Poon@Sun.COM 			if (up[2] > TCP_MAX_WINSHIFT)
27911754SKacheong.Poon@Sun.COM 				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
28011754SKacheong.Poon@Sun.COM 			else
28111754SKacheong.Poon@Sun.COM 				tcpopt->tcp_opt_wscale = up[2];
28211754SKacheong.Poon@Sun.COM 			found |= TCP_OPT_WSCALE_PRESENT;
28311754SKacheong.Poon@Sun.COM 
28411754SKacheong.Poon@Sun.COM 			up += TCPOPT_WS_LEN;
28511754SKacheong.Poon@Sun.COM 			continue;
28611754SKacheong.Poon@Sun.COM 
28711754SKacheong.Poon@Sun.COM 		case TCPOPT_SACK_PERMITTED:
28811754SKacheong.Poon@Sun.COM 			if (len < TCPOPT_SACK_OK_LEN ||
28911754SKacheong.Poon@Sun.COM 			    up[1] != TCPOPT_SACK_OK_LEN)
29011754SKacheong.Poon@Sun.COM 				break;
29111754SKacheong.Poon@Sun.COM 			found |= TCP_OPT_SACK_OK_PRESENT;
29211754SKacheong.Poon@Sun.COM 			up += TCPOPT_SACK_OK_LEN;
29311754SKacheong.Poon@Sun.COM 			continue;
29411754SKacheong.Poon@Sun.COM 
29511754SKacheong.Poon@Sun.COM 		case TCPOPT_SACK:
29611754SKacheong.Poon@Sun.COM 			if (len <= 2 || up[1] <= 2 || len < up[1])
29711754SKacheong.Poon@Sun.COM 				break;
29811754SKacheong.Poon@Sun.COM 
29911754SKacheong.Poon@Sun.COM 			/* If TCP is not interested in SACK blks... */
30011754SKacheong.Poon@Sun.COM 			if ((tcp = tcpopt->tcp) == NULL) {
30111754SKacheong.Poon@Sun.COM 				up += up[1];
30211754SKacheong.Poon@Sun.COM 				continue;
30311754SKacheong.Poon@Sun.COM 			}
30411754SKacheong.Poon@Sun.COM 			sack_len = up[1] - TCPOPT_HEADER_LEN;
30511754SKacheong.Poon@Sun.COM 			up += TCPOPT_HEADER_LEN;
30611754SKacheong.Poon@Sun.COM 
30711754SKacheong.Poon@Sun.COM 			/*
30811754SKacheong.Poon@Sun.COM 			 * If the list is empty, allocate one and assume
30911754SKacheong.Poon@Sun.COM 			 * nothing is sack'ed.
31011754SKacheong.Poon@Sun.COM 			 */
31111754SKacheong.Poon@Sun.COM 			if (tcp->tcp_notsack_list == NULL) {
31211754SKacheong.Poon@Sun.COM 				tcp_notsack_update(&(tcp->tcp_notsack_list),
31311754SKacheong.Poon@Sun.COM 				    tcp->tcp_suna, tcp->tcp_snxt,
31411754SKacheong.Poon@Sun.COM 				    &(tcp->tcp_num_notsack_blk),
31511754SKacheong.Poon@Sun.COM 				    &(tcp->tcp_cnt_notsack_list));
31611754SKacheong.Poon@Sun.COM 
31711754SKacheong.Poon@Sun.COM 				/*
31811754SKacheong.Poon@Sun.COM 				 * Make sure tcp_notsack_list is not NULL.
31911754SKacheong.Poon@Sun.COM 				 * This happens when kmem_alloc(KM_NOSLEEP)
32011754SKacheong.Poon@Sun.COM 				 * returns NULL.
32111754SKacheong.Poon@Sun.COM 				 */
32211754SKacheong.Poon@Sun.COM 				if (tcp->tcp_notsack_list == NULL) {
32311754SKacheong.Poon@Sun.COM 					up += sack_len;
32411754SKacheong.Poon@Sun.COM 					continue;
32511754SKacheong.Poon@Sun.COM 				}
32611754SKacheong.Poon@Sun.COM 				tcp->tcp_fack = tcp->tcp_suna;
32711754SKacheong.Poon@Sun.COM 			}
32811754SKacheong.Poon@Sun.COM 
32911754SKacheong.Poon@Sun.COM 			while (sack_len > 0) {
33011754SKacheong.Poon@Sun.COM 				if (up + 8 > endp) {
33111754SKacheong.Poon@Sun.COM 					up = endp;
33211754SKacheong.Poon@Sun.COM 					break;
33311754SKacheong.Poon@Sun.COM 				}
33411754SKacheong.Poon@Sun.COM 				sack_begin = BE32_TO_U32(up);
33511754SKacheong.Poon@Sun.COM 				up += 4;
33611754SKacheong.Poon@Sun.COM 				sack_end = BE32_TO_U32(up);
33711754SKacheong.Poon@Sun.COM 				up += 4;
33811754SKacheong.Poon@Sun.COM 				sack_len -= 8;
33911754SKacheong.Poon@Sun.COM 				/*
34011754SKacheong.Poon@Sun.COM 				 * Bounds checking.  Make sure the SACK
34111754SKacheong.Poon@Sun.COM 				 * info is within tcp_suna and tcp_snxt.
34211754SKacheong.Poon@Sun.COM 				 * If this SACK blk is out of bound, ignore
34311754SKacheong.Poon@Sun.COM 				 * it but continue to parse the following
34411754SKacheong.Poon@Sun.COM 				 * blks.
34511754SKacheong.Poon@Sun.COM 				 */
34611754SKacheong.Poon@Sun.COM 				if (SEQ_LEQ(sack_end, sack_begin) ||
34711754SKacheong.Poon@Sun.COM 				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
34811754SKacheong.Poon@Sun.COM 				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
34911754SKacheong.Poon@Sun.COM 					continue;
35011754SKacheong.Poon@Sun.COM 				}
35111754SKacheong.Poon@Sun.COM 				tcp_notsack_insert(&(tcp->tcp_notsack_list),
35211754SKacheong.Poon@Sun.COM 				    sack_begin, sack_end,
35311754SKacheong.Poon@Sun.COM 				    &(tcp->tcp_num_notsack_blk),
35411754SKacheong.Poon@Sun.COM 				    &(tcp->tcp_cnt_notsack_list));
35511754SKacheong.Poon@Sun.COM 				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
35611754SKacheong.Poon@Sun.COM 					tcp->tcp_fack = sack_end;
35711754SKacheong.Poon@Sun.COM 				}
35811754SKacheong.Poon@Sun.COM 			}
35911754SKacheong.Poon@Sun.COM 			found |= TCP_OPT_SACK_PRESENT;
36011754SKacheong.Poon@Sun.COM 			continue;
36111754SKacheong.Poon@Sun.COM 
36211754SKacheong.Poon@Sun.COM 		case TCPOPT_TSTAMP:
36311754SKacheong.Poon@Sun.COM 			if (len < TCPOPT_TSTAMP_LEN ||
36411754SKacheong.Poon@Sun.COM 			    up[1] != TCPOPT_TSTAMP_LEN)
36511754SKacheong.Poon@Sun.COM 				break;
36611754SKacheong.Poon@Sun.COM 
36711754SKacheong.Poon@Sun.COM 			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
36811754SKacheong.Poon@Sun.COM 			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
36911754SKacheong.Poon@Sun.COM 
37011754SKacheong.Poon@Sun.COM 			found |= TCP_OPT_TSTAMP_PRESENT;
37111754SKacheong.Poon@Sun.COM 
37211754SKacheong.Poon@Sun.COM 			up += TCPOPT_TSTAMP_LEN;
37311754SKacheong.Poon@Sun.COM 			continue;
37411754SKacheong.Poon@Sun.COM 
37511754SKacheong.Poon@Sun.COM 		default:
37611754SKacheong.Poon@Sun.COM 			if (len <= 1 || len < (int)up[1] || up[1] == 0)
37711754SKacheong.Poon@Sun.COM 				break;
37811754SKacheong.Poon@Sun.COM 			up += up[1];
37911754SKacheong.Poon@Sun.COM 			continue;
38011754SKacheong.Poon@Sun.COM 		}
38111754SKacheong.Poon@Sun.COM 		break;
38211754SKacheong.Poon@Sun.COM 	}
38311754SKacheong.Poon@Sun.COM 	return (found);
38411754SKacheong.Poon@Sun.COM }
38511754SKacheong.Poon@Sun.COM 
38611754SKacheong.Poon@Sun.COM /*
38711754SKacheong.Poon@Sun.COM  * Process all TCP option in SYN segment.  Note that this function should
38811754SKacheong.Poon@Sun.COM  * be called after tcp_set_destination() is called so that the necessary info
38911754SKacheong.Poon@Sun.COM  * from IRE is already set in the tcp structure.
39011754SKacheong.Poon@Sun.COM  *
39111754SKacheong.Poon@Sun.COM  * This function sets up the correct tcp_mss value according to the
39211754SKacheong.Poon@Sun.COM  * MSS option value and our header size.  It also sets up the window scale
39311754SKacheong.Poon@Sun.COM  * and timestamp values, and initialize SACK info blocks.  But it does not
39411754SKacheong.Poon@Sun.COM  * change receive window size after setting the tcp_mss value.  The caller
39511754SKacheong.Poon@Sun.COM  * should do the appropriate change.
39611754SKacheong.Poon@Sun.COM  */
39711754SKacheong.Poon@Sun.COM static void
tcp_process_options(tcp_t * tcp,tcpha_t * tcpha)39811754SKacheong.Poon@Sun.COM tcp_process_options(tcp_t *tcp, tcpha_t *tcpha)
39911754SKacheong.Poon@Sun.COM {
40011754SKacheong.Poon@Sun.COM 	int options;
40111754SKacheong.Poon@Sun.COM 	tcp_opt_t tcpopt;
40211754SKacheong.Poon@Sun.COM 	uint32_t mss_max;
40311754SKacheong.Poon@Sun.COM 	char *tmp_tcph;
40411754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
40511754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
40611754SKacheong.Poon@Sun.COM 
40711754SKacheong.Poon@Sun.COM 	tcpopt.tcp = NULL;
40811754SKacheong.Poon@Sun.COM 	options = tcp_parse_options(tcpha, &tcpopt);
40911754SKacheong.Poon@Sun.COM 
41011754SKacheong.Poon@Sun.COM 	/*
41111754SKacheong.Poon@Sun.COM 	 * Process MSS option.  Note that MSS option value does not account
41211754SKacheong.Poon@Sun.COM 	 * for IP or TCP options.  This means that it is equal to MTU - minimum
41311754SKacheong.Poon@Sun.COM 	 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for
41411754SKacheong.Poon@Sun.COM 	 * IPv6.
41511754SKacheong.Poon@Sun.COM 	 */
41611754SKacheong.Poon@Sun.COM 	if (!(options & TCP_OPT_MSS_PRESENT)) {
41711754SKacheong.Poon@Sun.COM 		if (connp->conn_ipversion == IPV4_VERSION)
41811754SKacheong.Poon@Sun.COM 			tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4;
41911754SKacheong.Poon@Sun.COM 		else
42011754SKacheong.Poon@Sun.COM 			tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6;
42111754SKacheong.Poon@Sun.COM 	} else {
42211754SKacheong.Poon@Sun.COM 		if (connp->conn_ipversion == IPV4_VERSION)
42311754SKacheong.Poon@Sun.COM 			mss_max = tcps->tcps_mss_max_ipv4;
42411754SKacheong.Poon@Sun.COM 		else
42511754SKacheong.Poon@Sun.COM 			mss_max = tcps->tcps_mss_max_ipv6;
42611754SKacheong.Poon@Sun.COM 		if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min)
42711754SKacheong.Poon@Sun.COM 			tcpopt.tcp_opt_mss = tcps->tcps_mss_min;
42811754SKacheong.Poon@Sun.COM 		else if (tcpopt.tcp_opt_mss > mss_max)
42911754SKacheong.Poon@Sun.COM 			tcpopt.tcp_opt_mss = mss_max;
43011754SKacheong.Poon@Sun.COM 	}
43111754SKacheong.Poon@Sun.COM 
43211754SKacheong.Poon@Sun.COM 	/* Process Window Scale option. */
43311754SKacheong.Poon@Sun.COM 	if (options & TCP_OPT_WSCALE_PRESENT) {
43411754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale;
43511754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ws_ok = B_TRUE;
43611754SKacheong.Poon@Sun.COM 	} else {
43711754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ws = B_FALSE;
43811754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ws_ok = B_FALSE;
43911754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_ws = B_FALSE;
44011754SKacheong.Poon@Sun.COM 	}
44111754SKacheong.Poon@Sun.COM 
44211754SKacheong.Poon@Sun.COM 	/* Process Timestamp option. */
44311754SKacheong.Poon@Sun.COM 	if ((options & TCP_OPT_TSTAMP_PRESENT) &&
44411754SKacheong.Poon@Sun.COM 	    (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) {
44511754SKacheong.Poon@Sun.COM 		tmp_tcph = (char *)tcp->tcp_tcpha;
44611754SKacheong.Poon@Sun.COM 
44711754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ts_ok = B_TRUE;
44811754SKacheong.Poon@Sun.COM 		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
44911754SKacheong.Poon@Sun.COM 		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
45011754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(tmp_tcph));
45111754SKacheong.Poon@Sun.COM 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
45211754SKacheong.Poon@Sun.COM 
45311754SKacheong.Poon@Sun.COM 		/* Fill in our template header with basic timestamp option. */
45411754SKacheong.Poon@Sun.COM 		tmp_tcph += connp->conn_ht_ulp_len;
45511754SKacheong.Poon@Sun.COM 		tmp_tcph[0] = TCPOPT_NOP;
45611754SKacheong.Poon@Sun.COM 		tmp_tcph[1] = TCPOPT_NOP;
45711754SKacheong.Poon@Sun.COM 		tmp_tcph[2] = TCPOPT_TSTAMP;
45811754SKacheong.Poon@Sun.COM 		tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
45911754SKacheong.Poon@Sun.COM 		connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN;
46011754SKacheong.Poon@Sun.COM 		connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN;
46111754SKacheong.Poon@Sun.COM 		tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4);
46211754SKacheong.Poon@Sun.COM 	} else {
46311754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ts_ok = B_FALSE;
46411754SKacheong.Poon@Sun.COM 	}
46511754SKacheong.Poon@Sun.COM 
46611754SKacheong.Poon@Sun.COM 	/*
46711754SKacheong.Poon@Sun.COM 	 * Process SACK options.  If SACK is enabled for this connection,
46811754SKacheong.Poon@Sun.COM 	 * then allocate the SACK info structure.  Note the following ways
46911754SKacheong.Poon@Sun.COM 	 * when tcp_snd_sack_ok is set to true.
47011754SKacheong.Poon@Sun.COM 	 *
47111754SKacheong.Poon@Sun.COM 	 * For active connection: in tcp_set_destination() called in
47211754SKacheong.Poon@Sun.COM 	 * tcp_connect().
47311754SKacheong.Poon@Sun.COM 	 *
47411754SKacheong.Poon@Sun.COM 	 * For passive connection: in tcp_set_destination() called in
47511754SKacheong.Poon@Sun.COM 	 * tcp_input_listener().
47611754SKacheong.Poon@Sun.COM 	 *
47711754SKacheong.Poon@Sun.COM 	 * That's the reason why the extra TCP_IS_DETACHED() check is there.
47811754SKacheong.Poon@Sun.COM 	 * That check makes sure that if we did not send a SACK OK option,
47911754SKacheong.Poon@Sun.COM 	 * we will not enable SACK for this connection even though the other
48011754SKacheong.Poon@Sun.COM 	 * side sends us SACK OK option.  For active connection, the SACK
48111754SKacheong.Poon@Sun.COM 	 * info structure has already been allocated.  So we need to free
48211754SKacheong.Poon@Sun.COM 	 * it if SACK is disabled.
48311754SKacheong.Poon@Sun.COM 	 */
48411754SKacheong.Poon@Sun.COM 	if ((options & TCP_OPT_SACK_OK_PRESENT) &&
48511754SKacheong.Poon@Sun.COM 	    (tcp->tcp_snd_sack_ok ||
48611754SKacheong.Poon@Sun.COM 	    (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) {
48712056SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_num_sack_blk == 0);
48812056SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_notsack_list == NULL);
48912056SKacheong.Poon@Sun.COM 
49012056SKacheong.Poon@Sun.COM 		tcp->tcp_snd_sack_ok = B_TRUE;
49112056SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_ts_ok) {
49212056SKacheong.Poon@Sun.COM 			tcp->tcp_max_sack_blk = 3;
49311754SKacheong.Poon@Sun.COM 		} else {
49412056SKacheong.Poon@Sun.COM 			tcp->tcp_max_sack_blk = 4;
49511754SKacheong.Poon@Sun.COM 		}
49612056SKacheong.Poon@Sun.COM 	} else if (tcp->tcp_snd_sack_ok) {
49711754SKacheong.Poon@Sun.COM 		/*
49811754SKacheong.Poon@Sun.COM 		 * Resetting tcp_snd_sack_ok to B_FALSE so that
49911754SKacheong.Poon@Sun.COM 		 * no SACK info will be used for this
50011754SKacheong.Poon@Sun.COM 		 * connection.  This assumes that SACK usage
50111754SKacheong.Poon@Sun.COM 		 * permission is negotiated.  This may need
50211754SKacheong.Poon@Sun.COM 		 * to be changed once this is clarified.
50311754SKacheong.Poon@Sun.COM 		 */
50412056SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_num_sack_blk == 0);
50512056SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_notsack_list == NULL);
50611754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_sack_ok = B_FALSE;
50711754SKacheong.Poon@Sun.COM 	}
50811754SKacheong.Poon@Sun.COM 
50911754SKacheong.Poon@Sun.COM 	/*
51011754SKacheong.Poon@Sun.COM 	 * Now we know the exact TCP/IP header length, subtract
51111754SKacheong.Poon@Sun.COM 	 * that from tcp_mss to get our side's MSS.
51211754SKacheong.Poon@Sun.COM 	 */
51311754SKacheong.Poon@Sun.COM 	tcp->tcp_mss -= connp->conn_ht_iphc_len;
51411754SKacheong.Poon@Sun.COM 
51511754SKacheong.Poon@Sun.COM 	/*
51611754SKacheong.Poon@Sun.COM 	 * Here we assume that the other side's header size will be equal to
51711754SKacheong.Poon@Sun.COM 	 * our header size.  We calculate the real MSS accordingly.  Need to
51811754SKacheong.Poon@Sun.COM 	 * take into additional stuffs IPsec puts in.
51911754SKacheong.Poon@Sun.COM 	 *
52011754SKacheong.Poon@Sun.COM 	 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
52111754SKacheong.Poon@Sun.COM 	 */
52211754SKacheong.Poon@Sun.COM 	tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len +
52311754SKacheong.Poon@Sun.COM 	    tcp->tcp_ipsec_overhead -
52411754SKacheong.Poon@Sun.COM 	    ((connp->conn_ipversion == IPV4_VERSION ?
52511754SKacheong.Poon@Sun.COM 	    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
52611754SKacheong.Poon@Sun.COM 
52711754SKacheong.Poon@Sun.COM 	/*
52811754SKacheong.Poon@Sun.COM 	 * Set MSS to the smaller one of both ends of the connection.
52911754SKacheong.Poon@Sun.COM 	 * We should not have called tcp_mss_set() before, but our
53011754SKacheong.Poon@Sun.COM 	 * side of the MSS should have been set to a proper value
53111754SKacheong.Poon@Sun.COM 	 * by tcp_set_destination().  tcp_mss_set() will also set up the
53211754SKacheong.Poon@Sun.COM 	 * STREAM head parameters properly.
53311754SKacheong.Poon@Sun.COM 	 *
53411754SKacheong.Poon@Sun.COM 	 * If we have a larger-than-16-bit window but the other side
53511754SKacheong.Poon@Sun.COM 	 * didn't want to do window scale, tcp_rwnd_set() will take
53611754SKacheong.Poon@Sun.COM 	 * care of that.
53711754SKacheong.Poon@Sun.COM 	 */
53811754SKacheong.Poon@Sun.COM 	tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
53911754SKacheong.Poon@Sun.COM 
54011754SKacheong.Poon@Sun.COM 	/*
54111754SKacheong.Poon@Sun.COM 	 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
54211754SKacheong.Poon@Sun.COM 	 * updated properly.
54311754SKacheong.Poon@Sun.COM 	 */
54411754SKacheong.Poon@Sun.COM 	TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
54511754SKacheong.Poon@Sun.COM }
54611754SKacheong.Poon@Sun.COM 
54711754SKacheong.Poon@Sun.COM /*
54811754SKacheong.Poon@Sun.COM  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
54911754SKacheong.Poon@Sun.COM  * is filled, return as much as we can.  The message passed in may be
55011754SKacheong.Poon@Sun.COM  * multi-part, chained using b_cont.  "start" is the starting sequence
55111754SKacheong.Poon@Sun.COM  * number for this piece.
55211754SKacheong.Poon@Sun.COM  */
55311754SKacheong.Poon@Sun.COM static mblk_t *
tcp_reass(tcp_t * tcp,mblk_t * mp,uint32_t start)55411754SKacheong.Poon@Sun.COM tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
55511754SKacheong.Poon@Sun.COM {
55611754SKacheong.Poon@Sun.COM 	uint32_t	end;
55711754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
55811754SKacheong.Poon@Sun.COM 	mblk_t		*mp2;
55911754SKacheong.Poon@Sun.COM 	mblk_t		*next_mp;
56011754SKacheong.Poon@Sun.COM 	uint32_t	u1;
56111754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
56211754SKacheong.Poon@Sun.COM 
56311754SKacheong.Poon@Sun.COM 
56411754SKacheong.Poon@Sun.COM 	/* Walk through all the new pieces. */
56511754SKacheong.Poon@Sun.COM 	do {
56611754SKacheong.Poon@Sun.COM 		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
56711754SKacheong.Poon@Sun.COM 		    (uintptr_t)INT_MAX);
56811754SKacheong.Poon@Sun.COM 		end = start + (int)(mp->b_wptr - mp->b_rptr);
56911754SKacheong.Poon@Sun.COM 		next_mp = mp->b_cont;
57011754SKacheong.Poon@Sun.COM 		if (start == end) {
57111754SKacheong.Poon@Sun.COM 			/* Empty.  Blast it. */
57211754SKacheong.Poon@Sun.COM 			freeb(mp);
57311754SKacheong.Poon@Sun.COM 			continue;
57411754SKacheong.Poon@Sun.COM 		}
57511754SKacheong.Poon@Sun.COM 		mp->b_cont = NULL;
57611754SKacheong.Poon@Sun.COM 		TCP_REASS_SET_SEQ(mp, start);
57711754SKacheong.Poon@Sun.COM 		TCP_REASS_SET_END(mp, end);
57811754SKacheong.Poon@Sun.COM 		mp1 = tcp->tcp_reass_tail;
57911754SKacheong.Poon@Sun.COM 		if (!mp1) {
58011754SKacheong.Poon@Sun.COM 			tcp->tcp_reass_tail = mp;
58111754SKacheong.Poon@Sun.COM 			tcp->tcp_reass_head = mp;
58211754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
58311754SKacheong.Poon@Sun.COM 			TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
58411754SKacheong.Poon@Sun.COM 			    end - start);
58511754SKacheong.Poon@Sun.COM 			continue;
58611754SKacheong.Poon@Sun.COM 		}
58711754SKacheong.Poon@Sun.COM 		/* New stuff completely beyond tail? */
58811754SKacheong.Poon@Sun.COM 		if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
58911754SKacheong.Poon@Sun.COM 			/* Link it on end. */
59011754SKacheong.Poon@Sun.COM 			mp1->b_cont = mp;
59111754SKacheong.Poon@Sun.COM 			tcp->tcp_reass_tail = mp;
59211754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
59311754SKacheong.Poon@Sun.COM 			TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
59411754SKacheong.Poon@Sun.COM 			    end - start);
59511754SKacheong.Poon@Sun.COM 			continue;
59611754SKacheong.Poon@Sun.COM 		}
59711754SKacheong.Poon@Sun.COM 		mp1 = tcp->tcp_reass_head;
59811754SKacheong.Poon@Sun.COM 		u1 = TCP_REASS_SEQ(mp1);
59911754SKacheong.Poon@Sun.COM 		/* New stuff at the front? */
60011754SKacheong.Poon@Sun.COM 		if (SEQ_LT(start, u1)) {
60111754SKacheong.Poon@Sun.COM 			/* Yes... Check for overlap. */
60211754SKacheong.Poon@Sun.COM 			mp->b_cont = mp1;
60311754SKacheong.Poon@Sun.COM 			tcp->tcp_reass_head = mp;
60411754SKacheong.Poon@Sun.COM 			tcp_reass_elim_overlap(tcp, mp);
60511754SKacheong.Poon@Sun.COM 			continue;
60611754SKacheong.Poon@Sun.COM 		}
60711754SKacheong.Poon@Sun.COM 		/*
60811754SKacheong.Poon@Sun.COM 		 * The new piece fits somewhere between the head and tail.
60911754SKacheong.Poon@Sun.COM 		 * We find our slot, where mp1 precedes us and mp2 trails.
61011754SKacheong.Poon@Sun.COM 		 */
61111754SKacheong.Poon@Sun.COM 		for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
61211754SKacheong.Poon@Sun.COM 			u1 = TCP_REASS_SEQ(mp2);
61311754SKacheong.Poon@Sun.COM 			if (SEQ_LEQ(start, u1))
61411754SKacheong.Poon@Sun.COM 				break;
61511754SKacheong.Poon@Sun.COM 		}
61611754SKacheong.Poon@Sun.COM 		/* Link ourselves in */
61711754SKacheong.Poon@Sun.COM 		mp->b_cont = mp2;
61811754SKacheong.Poon@Sun.COM 		mp1->b_cont = mp;
61911754SKacheong.Poon@Sun.COM 
62011754SKacheong.Poon@Sun.COM 		/* Trim overlap with following mblk(s) first */
62111754SKacheong.Poon@Sun.COM 		tcp_reass_elim_overlap(tcp, mp);
62211754SKacheong.Poon@Sun.COM 
62311754SKacheong.Poon@Sun.COM 		/* Trim overlap with preceding mblk */
62411754SKacheong.Poon@Sun.COM 		tcp_reass_elim_overlap(tcp, mp1);
62511754SKacheong.Poon@Sun.COM 
62611754SKacheong.Poon@Sun.COM 	} while (start = end, mp = next_mp);
62711754SKacheong.Poon@Sun.COM 	mp1 = tcp->tcp_reass_head;
62811754SKacheong.Poon@Sun.COM 	/* Anything ready to go? */
62911754SKacheong.Poon@Sun.COM 	if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt)
63011754SKacheong.Poon@Sun.COM 		return (NULL);
63111754SKacheong.Poon@Sun.COM 	/* Eat what we can off the queue */
63211754SKacheong.Poon@Sun.COM 	for (;;) {
63311754SKacheong.Poon@Sun.COM 		mp = mp1->b_cont;
63411754SKacheong.Poon@Sun.COM 		end = TCP_REASS_END(mp1);
63511754SKacheong.Poon@Sun.COM 		TCP_REASS_SET_SEQ(mp1, 0);
63611754SKacheong.Poon@Sun.COM 		TCP_REASS_SET_END(mp1, 0);
63711754SKacheong.Poon@Sun.COM 		if (!mp) {
63811754SKacheong.Poon@Sun.COM 			tcp->tcp_reass_tail = NULL;
63911754SKacheong.Poon@Sun.COM 			break;
64011754SKacheong.Poon@Sun.COM 		}
64111754SKacheong.Poon@Sun.COM 		if (end != TCP_REASS_SEQ(mp)) {
64211754SKacheong.Poon@Sun.COM 			mp1->b_cont = NULL;
64311754SKacheong.Poon@Sun.COM 			break;
64411754SKacheong.Poon@Sun.COM 		}
64511754SKacheong.Poon@Sun.COM 		mp1 = mp;
64611754SKacheong.Poon@Sun.COM 	}
64711754SKacheong.Poon@Sun.COM 	mp1 = tcp->tcp_reass_head;
64811754SKacheong.Poon@Sun.COM 	tcp->tcp_reass_head = mp;
64911754SKacheong.Poon@Sun.COM 	return (mp1);
65011754SKacheong.Poon@Sun.COM }
65111754SKacheong.Poon@Sun.COM 
65211754SKacheong.Poon@Sun.COM /* Eliminate any overlap that mp may have over later mblks */
65311754SKacheong.Poon@Sun.COM static void
tcp_reass_elim_overlap(tcp_t * tcp,mblk_t * mp)65411754SKacheong.Poon@Sun.COM tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
65511754SKacheong.Poon@Sun.COM {
65611754SKacheong.Poon@Sun.COM 	uint32_t	end;
65711754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
65811754SKacheong.Poon@Sun.COM 	uint32_t	u1;
65911754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
66011754SKacheong.Poon@Sun.COM 
66111754SKacheong.Poon@Sun.COM 	end = TCP_REASS_END(mp);
66211754SKacheong.Poon@Sun.COM 	while ((mp1 = mp->b_cont) != NULL) {
66311754SKacheong.Poon@Sun.COM 		u1 = TCP_REASS_SEQ(mp1);
66411754SKacheong.Poon@Sun.COM 		if (!SEQ_GT(end, u1))
66511754SKacheong.Poon@Sun.COM 			break;
66611754SKacheong.Poon@Sun.COM 		if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) {
66711754SKacheong.Poon@Sun.COM 			mp->b_wptr -= end - u1;
66811754SKacheong.Poon@Sun.COM 			TCP_REASS_SET_END(mp, u1);
66911754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInDataPartDupSegs);
67011754SKacheong.Poon@Sun.COM 			TCPS_UPDATE_MIB(tcps, tcpInDataPartDupBytes,
67111754SKacheong.Poon@Sun.COM 			    end - u1);
67211754SKacheong.Poon@Sun.COM 			break;
67311754SKacheong.Poon@Sun.COM 		}
67411754SKacheong.Poon@Sun.COM 		mp->b_cont = mp1->b_cont;
67511754SKacheong.Poon@Sun.COM 		TCP_REASS_SET_SEQ(mp1, 0);
67611754SKacheong.Poon@Sun.COM 		TCP_REASS_SET_END(mp1, 0);
67711754SKacheong.Poon@Sun.COM 		freeb(mp1);
67811754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
67911754SKacheong.Poon@Sun.COM 		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, end - u1);
68011754SKacheong.Poon@Sun.COM 	}
68111754SKacheong.Poon@Sun.COM 	if (!mp1)
68211754SKacheong.Poon@Sun.COM 		tcp->tcp_reass_tail = mp;
68311754SKacheong.Poon@Sun.COM }
68411754SKacheong.Poon@Sun.COM 
68511754SKacheong.Poon@Sun.COM /*
68611754SKacheong.Poon@Sun.COM  * This function does PAWS protection check. Returns B_TRUE if the
68711754SKacheong.Poon@Sun.COM  * segment passes the PAWS test, else returns B_FALSE.
68811754SKacheong.Poon@Sun.COM  */
68911754SKacheong.Poon@Sun.COM boolean_t
tcp_paws_check(tcp_t * tcp,tcpha_t * tcpha,tcp_opt_t * tcpoptp)69011754SKacheong.Poon@Sun.COM tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp)
69111754SKacheong.Poon@Sun.COM {
69211754SKacheong.Poon@Sun.COM 	uint8_t	flags;
69311754SKacheong.Poon@Sun.COM 	int	options;
69411754SKacheong.Poon@Sun.COM 	uint8_t *up;
69511754SKacheong.Poon@Sun.COM 	conn_t	*connp = tcp->tcp_connp;
69611754SKacheong.Poon@Sun.COM 
69711754SKacheong.Poon@Sun.COM 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
69811754SKacheong.Poon@Sun.COM 	/*
69911754SKacheong.Poon@Sun.COM 	 * If timestamp option is aligned nicely, get values inline,
70011754SKacheong.Poon@Sun.COM 	 * otherwise call general routine to parse.  Only do that
70111754SKacheong.Poon@Sun.COM 	 * if timestamp is the only option.
70211754SKacheong.Poon@Sun.COM 	 */
70311754SKacheong.Poon@Sun.COM 	if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH +
70411754SKacheong.Poon@Sun.COM 	    TCPOPT_REAL_TS_LEN &&
70511754SKacheong.Poon@Sun.COM 	    OK_32PTR((up = ((uint8_t *)tcpha) +
70611754SKacheong.Poon@Sun.COM 	    TCP_MIN_HEADER_LENGTH)) &&
70711754SKacheong.Poon@Sun.COM 	    *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
70811754SKacheong.Poon@Sun.COM 		tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
70911754SKacheong.Poon@Sun.COM 		tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));
71011754SKacheong.Poon@Sun.COM 
71111754SKacheong.Poon@Sun.COM 		options = TCP_OPT_TSTAMP_PRESENT;
71211754SKacheong.Poon@Sun.COM 	} else {
71311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_sack_ok) {
71411754SKacheong.Poon@Sun.COM 			tcpoptp->tcp = tcp;
71511754SKacheong.Poon@Sun.COM 		} else {
71611754SKacheong.Poon@Sun.COM 			tcpoptp->tcp = NULL;
71711754SKacheong.Poon@Sun.COM 		}
71811754SKacheong.Poon@Sun.COM 		options = tcp_parse_options(tcpha, tcpoptp);
71911754SKacheong.Poon@Sun.COM 	}
72011754SKacheong.Poon@Sun.COM 
72111754SKacheong.Poon@Sun.COM 	if (options & TCP_OPT_TSTAMP_PRESENT) {
72211754SKacheong.Poon@Sun.COM 		/*
72311754SKacheong.Poon@Sun.COM 		 * Do PAWS per RFC 1323 section 4.2.  Accept RST
72411754SKacheong.Poon@Sun.COM 		 * regardless of the timestamp, page 18 RFC 1323.bis.
72511754SKacheong.Poon@Sun.COM 		 */
72611754SKacheong.Poon@Sun.COM 		if ((flags & TH_RST) == 0 &&
72711754SKacheong.Poon@Sun.COM 		    TSTMP_LT(tcpoptp->tcp_opt_ts_val,
72811754SKacheong.Poon@Sun.COM 		    tcp->tcp_ts_recent)) {
72912806SGeorge.Shepherd@Sun.COM 			if (LBOLT_FASTPATH64 <
73012806SGeorge.Shepherd@Sun.COM 			    (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
73111754SKacheong.Poon@Sun.COM 				/* This segment is not acceptable. */
73211754SKacheong.Poon@Sun.COM 				return (B_FALSE);
73311754SKacheong.Poon@Sun.COM 			} else {
73411754SKacheong.Poon@Sun.COM 				/*
73511754SKacheong.Poon@Sun.COM 				 * Connection has been idle for
73611754SKacheong.Poon@Sun.COM 				 * too long.  Reset the timestamp
73711754SKacheong.Poon@Sun.COM 				 * and assume the segment is valid.
73811754SKacheong.Poon@Sun.COM 				 */
73911754SKacheong.Poon@Sun.COM 				tcp->tcp_ts_recent =
74011754SKacheong.Poon@Sun.COM 				    tcpoptp->tcp_opt_ts_val;
74111754SKacheong.Poon@Sun.COM 			}
74211754SKacheong.Poon@Sun.COM 		}
74311754SKacheong.Poon@Sun.COM 	} else {
74411754SKacheong.Poon@Sun.COM 		/*
74511754SKacheong.Poon@Sun.COM 		 * If we don't get a timestamp on every packet, we
74611754SKacheong.Poon@Sun.COM 		 * figure we can't really trust 'em, so we stop sending
74711754SKacheong.Poon@Sun.COM 		 * and parsing them.
74811754SKacheong.Poon@Sun.COM 		 */
74911754SKacheong.Poon@Sun.COM 		tcp->tcp_snd_ts_ok = B_FALSE;
75011754SKacheong.Poon@Sun.COM 
75111754SKacheong.Poon@Sun.COM 		connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN;
75211754SKacheong.Poon@Sun.COM 		connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN;
75311754SKacheong.Poon@Sun.COM 		tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4);
75411754SKacheong.Poon@Sun.COM 		/*
75511754SKacheong.Poon@Sun.COM 		 * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid
75611754SKacheong.Poon@Sun.COM 		 * doing a slow start here so as to not to lose on the
75711754SKacheong.Poon@Sun.COM 		 * transfer rate built up so far.
75811754SKacheong.Poon@Sun.COM 		 */
75911754SKacheong.Poon@Sun.COM 		tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN);
76012056SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_sack_ok)
76111754SKacheong.Poon@Sun.COM 			tcp->tcp_max_sack_blk = 4;
76211754SKacheong.Poon@Sun.COM 	}
76311754SKacheong.Poon@Sun.COM 	return (B_TRUE);
76411754SKacheong.Poon@Sun.COM }
76511754SKacheong.Poon@Sun.COM 
76611754SKacheong.Poon@Sun.COM /*
76711754SKacheong.Poon@Sun.COM  * Defense for the SYN attack -
76811754SKacheong.Poon@Sun.COM  * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
76911754SKacheong.Poon@Sun.COM  *    one from the list of droppable eagers. This list is a subset of q0.
77011754SKacheong.Poon@Sun.COM  *    see comments before the definition of MAKE_DROPPABLE().
77111754SKacheong.Poon@Sun.COM  * 2. Don't drop a SYN request before its first timeout. This gives every
77211754SKacheong.Poon@Sun.COM  *    request at least til the first timeout to complete its 3-way handshake.
77311754SKacheong.Poon@Sun.COM  * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
77411754SKacheong.Poon@Sun.COM  *    requests currently on the queue that has timed out. This will be used
77511754SKacheong.Poon@Sun.COM  *    as an indicator of whether an attack is under way, so that appropriate
77611754SKacheong.Poon@Sun.COM  *    actions can be taken. (It's incremented in tcp_timer() and decremented
77711754SKacheong.Poon@Sun.COM  *    either when eager goes into ESTABLISHED, or gets freed up.)
77811754SKacheong.Poon@Sun.COM  * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
77911754SKacheong.Poon@Sun.COM  *    # of timeout drops back to <= q0len/32 => SYN alert off
78011754SKacheong.Poon@Sun.COM  */
78111754SKacheong.Poon@Sun.COM static boolean_t
tcp_drop_q0(tcp_t * tcp)78211754SKacheong.Poon@Sun.COM tcp_drop_q0(tcp_t *tcp)
78311754SKacheong.Poon@Sun.COM {
78411754SKacheong.Poon@Sun.COM 	tcp_t	*eager;
78511754SKacheong.Poon@Sun.COM 	mblk_t	*mp;
78611754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
78711754SKacheong.Poon@Sun.COM 
78811754SKacheong.Poon@Sun.COM 	ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
78911754SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
79011754SKacheong.Poon@Sun.COM 
79111754SKacheong.Poon@Sun.COM 	/* Pick oldest eager from the list of droppable eagers */
79211754SKacheong.Poon@Sun.COM 	eager = tcp->tcp_eager_prev_drop_q0;
79311754SKacheong.Poon@Sun.COM 
79411754SKacheong.Poon@Sun.COM 	/* If list is empty. return B_FALSE */
79511754SKacheong.Poon@Sun.COM 	if (eager == tcp) {
79611754SKacheong.Poon@Sun.COM 		return (B_FALSE);
79711754SKacheong.Poon@Sun.COM 	}
79811754SKacheong.Poon@Sun.COM 
79911754SKacheong.Poon@Sun.COM 	/* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
80011754SKacheong.Poon@Sun.COM 	if ((mp = allocb(0, BPRI_HI)) == NULL)
80111754SKacheong.Poon@Sun.COM 		return (B_FALSE);
80211754SKacheong.Poon@Sun.COM 
80311754SKacheong.Poon@Sun.COM 	/*
80411754SKacheong.Poon@Sun.COM 	 * Take this eager out from the list of droppable eagers since we are
80511754SKacheong.Poon@Sun.COM 	 * going to drop it.
80611754SKacheong.Poon@Sun.COM 	 */
80711754SKacheong.Poon@Sun.COM 	MAKE_UNDROPPABLE(eager);
80811754SKacheong.Poon@Sun.COM 
80911754SKacheong.Poon@Sun.COM 	if (tcp->tcp_connp->conn_debug) {
81011754SKacheong.Poon@Sun.COM 		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
81111754SKacheong.Poon@Sun.COM 		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
81211754SKacheong.Poon@Sun.COM 		    " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
81311754SKacheong.Poon@Sun.COM 		    tcp->tcp_conn_req_cnt_q0,
81411754SKacheong.Poon@Sun.COM 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
81511754SKacheong.Poon@Sun.COM 	}
81611754SKacheong.Poon@Sun.COM 
81711754SKacheong.Poon@Sun.COM 	TCPS_BUMP_MIB(tcps, tcpHalfOpenDrop);
81811754SKacheong.Poon@Sun.COM 
81911754SKacheong.Poon@Sun.COM 	/* Put a reference on the conn as we are enqueueing it in the sqeue */
82011754SKacheong.Poon@Sun.COM 	CONN_INC_REF(eager->tcp_connp);
82111754SKacheong.Poon@Sun.COM 
82211754SKacheong.Poon@Sun.COM 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
82311754SKacheong.Poon@Sun.COM 	    tcp_clean_death_wrapper, eager->tcp_connp, NULL,
82411754SKacheong.Poon@Sun.COM 	    SQ_FILL, SQTAG_TCP_DROP_Q0);
82511754SKacheong.Poon@Sun.COM 
82611754SKacheong.Poon@Sun.COM 	return (B_TRUE);
82711754SKacheong.Poon@Sun.COM }
82811754SKacheong.Poon@Sun.COM 
82911754SKacheong.Poon@Sun.COM /*
83011754SKacheong.Poon@Sun.COM  * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6
83111754SKacheong.Poon@Sun.COM  */
83211754SKacheong.Poon@Sun.COM static mblk_t *
tcp_conn_create_v6(conn_t * lconnp,conn_t * connp,mblk_t * mp,ip_recv_attr_t * ira)83311754SKacheong.Poon@Sun.COM tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
83411754SKacheong.Poon@Sun.COM     ip_recv_attr_t *ira)
83511754SKacheong.Poon@Sun.COM {
83611754SKacheong.Poon@Sun.COM 	tcp_t 		*ltcp = lconnp->conn_tcp;
83711754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
83811754SKacheong.Poon@Sun.COM 	mblk_t		*tpi_mp;
83911754SKacheong.Poon@Sun.COM 	ipha_t		*ipha;
84011754SKacheong.Poon@Sun.COM 	ip6_t		*ip6h;
84111754SKacheong.Poon@Sun.COM 	sin6_t 		sin6;
84211754SKacheong.Poon@Sun.COM 	uint_t		ifindex = ira->ira_ruifindex;
84311754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
84411754SKacheong.Poon@Sun.COM 
84511754SKacheong.Poon@Sun.COM 	if (ira->ira_flags & IRAF_IS_IPV4) {
84611754SKacheong.Poon@Sun.COM 		ipha = (ipha_t *)mp->b_rptr;
84711754SKacheong.Poon@Sun.COM 
84811754SKacheong.Poon@Sun.COM 		connp->conn_ipversion = IPV4_VERSION;
84911754SKacheong.Poon@Sun.COM 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
85011754SKacheong.Poon@Sun.COM 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
85111754SKacheong.Poon@Sun.COM 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
85211754SKacheong.Poon@Sun.COM 
85311754SKacheong.Poon@Sun.COM 		sin6 = sin6_null;
85411754SKacheong.Poon@Sun.COM 		sin6.sin6_addr = connp->conn_faddr_v6;
85511754SKacheong.Poon@Sun.COM 		sin6.sin6_port = connp->conn_fport;
85611754SKacheong.Poon@Sun.COM 		sin6.sin6_family = AF_INET6;
85711754SKacheong.Poon@Sun.COM 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
85811754SKacheong.Poon@Sun.COM 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
85911754SKacheong.Poon@Sun.COM 
86011754SKacheong.Poon@Sun.COM 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
86111754SKacheong.Poon@Sun.COM 			sin6_t	sin6d;
86211754SKacheong.Poon@Sun.COM 
86311754SKacheong.Poon@Sun.COM 			sin6d = sin6_null;
86411754SKacheong.Poon@Sun.COM 			sin6d.sin6_addr = connp->conn_laddr_v6;
86511754SKacheong.Poon@Sun.COM 			sin6d.sin6_port = connp->conn_lport;
86611754SKacheong.Poon@Sun.COM 			sin6d.sin6_family = AF_INET;
86711754SKacheong.Poon@Sun.COM 			tpi_mp = mi_tpi_extconn_ind(NULL,
86811754SKacheong.Poon@Sun.COM 			    (char *)&sin6d, sizeof (sin6_t),
86911754SKacheong.Poon@Sun.COM 			    (char *)&tcp,
87011754SKacheong.Poon@Sun.COM 			    (t_scalar_t)sizeof (intptr_t),
87111754SKacheong.Poon@Sun.COM 			    (char *)&sin6d, sizeof (sin6_t),
87211754SKacheong.Poon@Sun.COM 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
87311754SKacheong.Poon@Sun.COM 		} else {
87411754SKacheong.Poon@Sun.COM 			tpi_mp = mi_tpi_conn_ind(NULL,
87511754SKacheong.Poon@Sun.COM 			    (char *)&sin6, sizeof (sin6_t),
87611754SKacheong.Poon@Sun.COM 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
87711754SKacheong.Poon@Sun.COM 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
87811754SKacheong.Poon@Sun.COM 		}
87911754SKacheong.Poon@Sun.COM 	} else {
88011754SKacheong.Poon@Sun.COM 		ip6h = (ip6_t *)mp->b_rptr;
88111754SKacheong.Poon@Sun.COM 
88211754SKacheong.Poon@Sun.COM 		connp->conn_ipversion = IPV6_VERSION;
88311754SKacheong.Poon@Sun.COM 		connp->conn_laddr_v6 = ip6h->ip6_dst;
88411754SKacheong.Poon@Sun.COM 		connp->conn_faddr_v6 = ip6h->ip6_src;
88511754SKacheong.Poon@Sun.COM 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
88611754SKacheong.Poon@Sun.COM 
88711754SKacheong.Poon@Sun.COM 		sin6 = sin6_null;
88811754SKacheong.Poon@Sun.COM 		sin6.sin6_addr = connp->conn_faddr_v6;
88911754SKacheong.Poon@Sun.COM 		sin6.sin6_port = connp->conn_fport;
89011754SKacheong.Poon@Sun.COM 		sin6.sin6_family = AF_INET6;
89111754SKacheong.Poon@Sun.COM 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
89211754SKacheong.Poon@Sun.COM 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
89311754SKacheong.Poon@Sun.COM 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
89411754SKacheong.Poon@Sun.COM 
89511754SKacheong.Poon@Sun.COM 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
89611754SKacheong.Poon@Sun.COM 			/* Pass up the scope_id of remote addr */
89711754SKacheong.Poon@Sun.COM 			sin6.sin6_scope_id = ifindex;
89811754SKacheong.Poon@Sun.COM 		} else {
89911754SKacheong.Poon@Sun.COM 			sin6.sin6_scope_id = 0;
90011754SKacheong.Poon@Sun.COM 		}
90111754SKacheong.Poon@Sun.COM 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
90211754SKacheong.Poon@Sun.COM 			sin6_t	sin6d;
90311754SKacheong.Poon@Sun.COM 
90411754SKacheong.Poon@Sun.COM 			sin6d = sin6_null;
90511754SKacheong.Poon@Sun.COM 			sin6.sin6_addr = connp->conn_laddr_v6;
90611754SKacheong.Poon@Sun.COM 			sin6d.sin6_port = connp->conn_lport;
90711754SKacheong.Poon@Sun.COM 			sin6d.sin6_family = AF_INET6;
90811754SKacheong.Poon@Sun.COM 			if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6))
90911754SKacheong.Poon@Sun.COM 				sin6d.sin6_scope_id = ifindex;
91011754SKacheong.Poon@Sun.COM 
91111754SKacheong.Poon@Sun.COM 			tpi_mp = mi_tpi_extconn_ind(NULL,
91211754SKacheong.Poon@Sun.COM 			    (char *)&sin6d, sizeof (sin6_t),
91311754SKacheong.Poon@Sun.COM 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
91411754SKacheong.Poon@Sun.COM 			    (char *)&sin6d, sizeof (sin6_t),
91511754SKacheong.Poon@Sun.COM 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
91611754SKacheong.Poon@Sun.COM 		} else {
91711754SKacheong.Poon@Sun.COM 			tpi_mp = mi_tpi_conn_ind(NULL,
91811754SKacheong.Poon@Sun.COM 			    (char *)&sin6, sizeof (sin6_t),
91911754SKacheong.Poon@Sun.COM 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
92011754SKacheong.Poon@Sun.COM 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
92111754SKacheong.Poon@Sun.COM 		}
92211754SKacheong.Poon@Sun.COM 	}
92311754SKacheong.Poon@Sun.COM 
92411754SKacheong.Poon@Sun.COM 	tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
92511754SKacheong.Poon@Sun.COM 	return (tpi_mp);
92611754SKacheong.Poon@Sun.COM }
92711754SKacheong.Poon@Sun.COM 
92811754SKacheong.Poon@Sun.COM /* Handle a SYN on an AF_INET socket */
92911754SKacheong.Poon@Sun.COM static mblk_t *
tcp_conn_create_v4(conn_t * lconnp,conn_t * connp,mblk_t * mp,ip_recv_attr_t * ira)93011754SKacheong.Poon@Sun.COM tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
93111754SKacheong.Poon@Sun.COM     ip_recv_attr_t *ira)
93211754SKacheong.Poon@Sun.COM {
93311754SKacheong.Poon@Sun.COM 	tcp_t 		*ltcp = lconnp->conn_tcp;
93411754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
93511754SKacheong.Poon@Sun.COM 	sin_t		sin;
93611754SKacheong.Poon@Sun.COM 	mblk_t		*tpi_mp = NULL;
93711754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
93811754SKacheong.Poon@Sun.COM 	ipha_t		*ipha;
93911754SKacheong.Poon@Sun.COM 
94011754SKacheong.Poon@Sun.COM 	ASSERT(ira->ira_flags & IRAF_IS_IPV4);
94111754SKacheong.Poon@Sun.COM 	ipha = (ipha_t *)mp->b_rptr;
94211754SKacheong.Poon@Sun.COM 
94311754SKacheong.Poon@Sun.COM 	connp->conn_ipversion = IPV4_VERSION;
94411754SKacheong.Poon@Sun.COM 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
94511754SKacheong.Poon@Sun.COM 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
94611754SKacheong.Poon@Sun.COM 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
94711754SKacheong.Poon@Sun.COM 
94811754SKacheong.Poon@Sun.COM 	sin = sin_null;
94911754SKacheong.Poon@Sun.COM 	sin.sin_addr.s_addr = connp->conn_faddr_v4;
95011754SKacheong.Poon@Sun.COM 	sin.sin_port = connp->conn_fport;
95111754SKacheong.Poon@Sun.COM 	sin.sin_family = AF_INET;
95211754SKacheong.Poon@Sun.COM 	if (lconnp->conn_recv_ancillary.crb_recvdstaddr) {
95311754SKacheong.Poon@Sun.COM 		sin_t	sind;
95411754SKacheong.Poon@Sun.COM 
95511754SKacheong.Poon@Sun.COM 		sind = sin_null;
95611754SKacheong.Poon@Sun.COM 		sind.sin_addr.s_addr = connp->conn_laddr_v4;
95711754SKacheong.Poon@Sun.COM 		sind.sin_port = connp->conn_lport;
95811754SKacheong.Poon@Sun.COM 		sind.sin_family = AF_INET;
95911754SKacheong.Poon@Sun.COM 		tpi_mp = mi_tpi_extconn_ind(NULL,
96011754SKacheong.Poon@Sun.COM 		    (char *)&sind, sizeof (sin_t), (char *)&tcp,
96111754SKacheong.Poon@Sun.COM 		    (t_scalar_t)sizeof (intptr_t), (char *)&sind,
96211754SKacheong.Poon@Sun.COM 		    sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
96311754SKacheong.Poon@Sun.COM 	} else {
96411754SKacheong.Poon@Sun.COM 		tpi_mp = mi_tpi_conn_ind(NULL,
96511754SKacheong.Poon@Sun.COM 		    (char *)&sin, sizeof (sin_t),
96611754SKacheong.Poon@Sun.COM 		    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
96711754SKacheong.Poon@Sun.COM 		    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
96811754SKacheong.Poon@Sun.COM 	}
96911754SKacheong.Poon@Sun.COM 
97011754SKacheong.Poon@Sun.COM 	tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
97111754SKacheong.Poon@Sun.COM 	return (tpi_mp);
97211754SKacheong.Poon@Sun.COM }
97311754SKacheong.Poon@Sun.COM 
97411754SKacheong.Poon@Sun.COM /*
97511754SKacheong.Poon@Sun.COM  * Called via squeue to get on to eager's perimeter. It sends a
97611754SKacheong.Poon@Sun.COM  * TH_RST if eager is in the fanout table. The listener wants the
97711754SKacheong.Poon@Sun.COM  * eager to disappear either by means of tcp_eager_blowoff() or
97811754SKacheong.Poon@Sun.COM  * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
97911754SKacheong.Poon@Sun.COM  * called (via squeue) if the eager cannot be inserted in the
98011754SKacheong.Poon@Sun.COM  * fanout table in tcp_input_listener().
98111754SKacheong.Poon@Sun.COM  */
98211754SKacheong.Poon@Sun.COM /* ARGSUSED */
98311754SKacheong.Poon@Sun.COM void
tcp_eager_kill(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)98411754SKacheong.Poon@Sun.COM tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
98511754SKacheong.Poon@Sun.COM {
98611754SKacheong.Poon@Sun.COM 	conn_t	*econnp = (conn_t *)arg;
98711754SKacheong.Poon@Sun.COM 	tcp_t	*eager = econnp->conn_tcp;
98811754SKacheong.Poon@Sun.COM 	tcp_t	*listener = eager->tcp_listener;
98911754SKacheong.Poon@Sun.COM 
99011754SKacheong.Poon@Sun.COM 	/*
99111754SKacheong.Poon@Sun.COM 	 * We could be called because listener is closing. Since
99211754SKacheong.Poon@Sun.COM 	 * the eager was using listener's queue's, we avoid
99311754SKacheong.Poon@Sun.COM 	 * using the listeners queues from now on.
99411754SKacheong.Poon@Sun.COM 	 */
99511754SKacheong.Poon@Sun.COM 	ASSERT(eager->tcp_detached);
99611754SKacheong.Poon@Sun.COM 	econnp->conn_rq = NULL;
99711754SKacheong.Poon@Sun.COM 	econnp->conn_wq = NULL;
99811754SKacheong.Poon@Sun.COM 
99911754SKacheong.Poon@Sun.COM 	/*
100011754SKacheong.Poon@Sun.COM 	 * An eager's conn_fanout will be NULL if it's a duplicate
100111754SKacheong.Poon@Sun.COM 	 * for an existing 4-tuples in the conn fanout table.
100211754SKacheong.Poon@Sun.COM 	 * We don't want to send an RST out in such case.
100311754SKacheong.Poon@Sun.COM 	 */
100411754SKacheong.Poon@Sun.COM 	if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
100511754SKacheong.Poon@Sun.COM 		tcp_xmit_ctl("tcp_eager_kill, can't wait",
100611754SKacheong.Poon@Sun.COM 		    eager, eager->tcp_snxt, 0, TH_RST);
100711754SKacheong.Poon@Sun.COM 	}
100811754SKacheong.Poon@Sun.COM 
100911754SKacheong.Poon@Sun.COM 	/* We are here because listener wants this eager gone */
101011754SKacheong.Poon@Sun.COM 	if (listener != NULL) {
101111754SKacheong.Poon@Sun.COM 		mutex_enter(&listener->tcp_eager_lock);
101211754SKacheong.Poon@Sun.COM 		tcp_eager_unlink(eager);
101311754SKacheong.Poon@Sun.COM 		if (eager->tcp_tconnind_started) {
101411754SKacheong.Poon@Sun.COM 			/*
101511754SKacheong.Poon@Sun.COM 			 * The eager has sent a conn_ind up to the
101611754SKacheong.Poon@Sun.COM 			 * listener but listener decides to close
101711754SKacheong.Poon@Sun.COM 			 * instead. We need to drop the extra ref
101811754SKacheong.Poon@Sun.COM 			 * placed on eager in tcp_input_data() before
101911754SKacheong.Poon@Sun.COM 			 * sending the conn_ind to listener.
102011754SKacheong.Poon@Sun.COM 			 */
102111754SKacheong.Poon@Sun.COM 			CONN_DEC_REF(econnp);
102211754SKacheong.Poon@Sun.COM 		}
102311754SKacheong.Poon@Sun.COM 		mutex_exit(&listener->tcp_eager_lock);
102411754SKacheong.Poon@Sun.COM 		CONN_DEC_REF(listener->tcp_connp);
102511754SKacheong.Poon@Sun.COM 	}
102611754SKacheong.Poon@Sun.COM 
102711754SKacheong.Poon@Sun.COM 	if (eager->tcp_state != TCPS_CLOSED)
102811754SKacheong.Poon@Sun.COM 		tcp_close_detached(eager);
102911754SKacheong.Poon@Sun.COM }
103011754SKacheong.Poon@Sun.COM 
103111754SKacheong.Poon@Sun.COM /*
103211754SKacheong.Poon@Sun.COM  * Reset any eager connection hanging off this listener marked
103311754SKacheong.Poon@Sun.COM  * with 'seqnum' and then reclaim it's resources.
103411754SKacheong.Poon@Sun.COM  */
103511754SKacheong.Poon@Sun.COM boolean_t
tcp_eager_blowoff(tcp_t * listener,t_scalar_t seqnum)103611754SKacheong.Poon@Sun.COM tcp_eager_blowoff(tcp_t	*listener, t_scalar_t seqnum)
103711754SKacheong.Poon@Sun.COM {
103811754SKacheong.Poon@Sun.COM 	tcp_t	*eager;
103911754SKacheong.Poon@Sun.COM 	mblk_t 	*mp;
104011754SKacheong.Poon@Sun.COM 
104111754SKacheong.Poon@Sun.COM 	eager = listener;
104211754SKacheong.Poon@Sun.COM 	mutex_enter(&listener->tcp_eager_lock);
104311754SKacheong.Poon@Sun.COM 	do {
104411754SKacheong.Poon@Sun.COM 		eager = eager->tcp_eager_next_q;
104511754SKacheong.Poon@Sun.COM 		if (eager == NULL) {
104611754SKacheong.Poon@Sun.COM 			mutex_exit(&listener->tcp_eager_lock);
104711754SKacheong.Poon@Sun.COM 			return (B_FALSE);
104811754SKacheong.Poon@Sun.COM 		}
104911754SKacheong.Poon@Sun.COM 	} while (eager->tcp_conn_req_seqnum != seqnum);
105011754SKacheong.Poon@Sun.COM 
105111754SKacheong.Poon@Sun.COM 	if (eager->tcp_closemp_used) {
105211754SKacheong.Poon@Sun.COM 		mutex_exit(&listener->tcp_eager_lock);
105311754SKacheong.Poon@Sun.COM 		return (B_TRUE);
105411754SKacheong.Poon@Sun.COM 	}
105511754SKacheong.Poon@Sun.COM 	eager->tcp_closemp_used = B_TRUE;
105611754SKacheong.Poon@Sun.COM 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
105711754SKacheong.Poon@Sun.COM 	CONN_INC_REF(eager->tcp_connp);
105811754SKacheong.Poon@Sun.COM 	mutex_exit(&listener->tcp_eager_lock);
105911754SKacheong.Poon@Sun.COM 	mp = &eager->tcp_closemp;
106011754SKacheong.Poon@Sun.COM 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
106111754SKacheong.Poon@Sun.COM 	    eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
106211754SKacheong.Poon@Sun.COM 	return (B_TRUE);
106311754SKacheong.Poon@Sun.COM }
106411754SKacheong.Poon@Sun.COM 
106511754SKacheong.Poon@Sun.COM /*
106611754SKacheong.Poon@Sun.COM  * Reset any eager connection hanging off this listener
106711754SKacheong.Poon@Sun.COM  * and then reclaim it's resources.
106811754SKacheong.Poon@Sun.COM  */
106911754SKacheong.Poon@Sun.COM void
tcp_eager_cleanup(tcp_t * listener,boolean_t q0_only)107011754SKacheong.Poon@Sun.COM tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
107111754SKacheong.Poon@Sun.COM {
107211754SKacheong.Poon@Sun.COM 	tcp_t	*eager;
107311754SKacheong.Poon@Sun.COM 	mblk_t	*mp;
107411754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = listener->tcp_tcps;
107511754SKacheong.Poon@Sun.COM 
107611754SKacheong.Poon@Sun.COM 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
107711754SKacheong.Poon@Sun.COM 
107811754SKacheong.Poon@Sun.COM 	if (!q0_only) {
107911754SKacheong.Poon@Sun.COM 		/* First cleanup q */
108011754SKacheong.Poon@Sun.COM 		TCP_STAT(tcps, tcp_eager_blowoff_q);
108111754SKacheong.Poon@Sun.COM 		eager = listener->tcp_eager_next_q;
108211754SKacheong.Poon@Sun.COM 		while (eager != NULL) {
108311754SKacheong.Poon@Sun.COM 			if (!eager->tcp_closemp_used) {
108411754SKacheong.Poon@Sun.COM 				eager->tcp_closemp_used = B_TRUE;
108511754SKacheong.Poon@Sun.COM 				TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
108611754SKacheong.Poon@Sun.COM 				CONN_INC_REF(eager->tcp_connp);
108711754SKacheong.Poon@Sun.COM 				mp = &eager->tcp_closemp;
108811754SKacheong.Poon@Sun.COM 				SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
108911754SKacheong.Poon@Sun.COM 				    tcp_eager_kill, eager->tcp_connp, NULL,
109011754SKacheong.Poon@Sun.COM 				    SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
109111754SKacheong.Poon@Sun.COM 			}
109211754SKacheong.Poon@Sun.COM 			eager = eager->tcp_eager_next_q;
109311754SKacheong.Poon@Sun.COM 		}
109411754SKacheong.Poon@Sun.COM 	}
109511754SKacheong.Poon@Sun.COM 	/* Then cleanup q0 */
109611754SKacheong.Poon@Sun.COM 	TCP_STAT(tcps, tcp_eager_blowoff_q0);
109711754SKacheong.Poon@Sun.COM 	eager = listener->tcp_eager_next_q0;
109811754SKacheong.Poon@Sun.COM 	while (eager != listener) {
109911754SKacheong.Poon@Sun.COM 		if (!eager->tcp_closemp_used) {
110011754SKacheong.Poon@Sun.COM 			eager->tcp_closemp_used = B_TRUE;
110111754SKacheong.Poon@Sun.COM 			TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
110211754SKacheong.Poon@Sun.COM 			CONN_INC_REF(eager->tcp_connp);
110311754SKacheong.Poon@Sun.COM 			mp = &eager->tcp_closemp;
110411754SKacheong.Poon@Sun.COM 			SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
110511754SKacheong.Poon@Sun.COM 			    tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
110611754SKacheong.Poon@Sun.COM 			    SQTAG_TCP_EAGER_CLEANUP_Q0);
110711754SKacheong.Poon@Sun.COM 		}
110811754SKacheong.Poon@Sun.COM 		eager = eager->tcp_eager_next_q0;
110911754SKacheong.Poon@Sun.COM 	}
111011754SKacheong.Poon@Sun.COM }
111111754SKacheong.Poon@Sun.COM 
111211754SKacheong.Poon@Sun.COM /*
111311754SKacheong.Poon@Sun.COM  * If we are an eager connection hanging off a listener that hasn't
111411754SKacheong.Poon@Sun.COM  * formally accepted the connection yet, get off his list and blow off
111511754SKacheong.Poon@Sun.COM  * any data that we have accumulated.
111611754SKacheong.Poon@Sun.COM  */
111711754SKacheong.Poon@Sun.COM void
tcp_eager_unlink(tcp_t * tcp)111811754SKacheong.Poon@Sun.COM tcp_eager_unlink(tcp_t *tcp)
111911754SKacheong.Poon@Sun.COM {
112011754SKacheong.Poon@Sun.COM 	tcp_t	*listener = tcp->tcp_listener;
112111754SKacheong.Poon@Sun.COM 
112211754SKacheong.Poon@Sun.COM 	ASSERT(listener != NULL);
112311754SKacheong.Poon@Sun.COM 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
112411754SKacheong.Poon@Sun.COM 	if (tcp->tcp_eager_next_q0 != NULL) {
112511754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_eager_prev_q0 != NULL);
112611754SKacheong.Poon@Sun.COM 
112711754SKacheong.Poon@Sun.COM 		/* Remove the eager tcp from q0 */
112811754SKacheong.Poon@Sun.COM 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
112911754SKacheong.Poon@Sun.COM 		    tcp->tcp_eager_prev_q0;
113011754SKacheong.Poon@Sun.COM 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
113111754SKacheong.Poon@Sun.COM 		    tcp->tcp_eager_next_q0;
113211754SKacheong.Poon@Sun.COM 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
113311754SKacheong.Poon@Sun.COM 		listener->tcp_conn_req_cnt_q0--;
113411754SKacheong.Poon@Sun.COM 
113511754SKacheong.Poon@Sun.COM 		tcp->tcp_eager_next_q0 = NULL;
113611754SKacheong.Poon@Sun.COM 		tcp->tcp_eager_prev_q0 = NULL;
113711754SKacheong.Poon@Sun.COM 
113811754SKacheong.Poon@Sun.COM 		/*
113911754SKacheong.Poon@Sun.COM 		 * Take the eager out, if it is in the list of droppable
114011754SKacheong.Poon@Sun.COM 		 * eagers.
114111754SKacheong.Poon@Sun.COM 		 */
114211754SKacheong.Poon@Sun.COM 		MAKE_UNDROPPABLE(tcp);
114311754SKacheong.Poon@Sun.COM 
114411754SKacheong.Poon@Sun.COM 		if (tcp->tcp_syn_rcvd_timeout != 0) {
114511754SKacheong.Poon@Sun.COM 			/* we have timed out before */
114611754SKacheong.Poon@Sun.COM 			ASSERT(listener->tcp_syn_rcvd_timeout > 0);
114711754SKacheong.Poon@Sun.COM 			listener->tcp_syn_rcvd_timeout--;
114811754SKacheong.Poon@Sun.COM 		}
114911754SKacheong.Poon@Sun.COM 	} else {
115011754SKacheong.Poon@Sun.COM 		tcp_t   **tcpp = &listener->tcp_eager_next_q;
115111754SKacheong.Poon@Sun.COM 		tcp_t	*prev = NULL;
115211754SKacheong.Poon@Sun.COM 
115311754SKacheong.Poon@Sun.COM 		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
115411754SKacheong.Poon@Sun.COM 			if (tcpp[0] == tcp) {
115511754SKacheong.Poon@Sun.COM 				if (listener->tcp_eager_last_q == tcp) {
115611754SKacheong.Poon@Sun.COM 					/*
115711754SKacheong.Poon@Sun.COM 					 * If we are unlinking the last
115811754SKacheong.Poon@Sun.COM 					 * element on the list, adjust
115911754SKacheong.Poon@Sun.COM 					 * tail pointer. Set tail pointer
116011754SKacheong.Poon@Sun.COM 					 * to nil when list is empty.
116111754SKacheong.Poon@Sun.COM 					 */
116211754SKacheong.Poon@Sun.COM 					ASSERT(tcp->tcp_eager_next_q == NULL);
116311754SKacheong.Poon@Sun.COM 					if (listener->tcp_eager_last_q ==
116411754SKacheong.Poon@Sun.COM 					    listener->tcp_eager_next_q) {
116511754SKacheong.Poon@Sun.COM 						listener->tcp_eager_last_q =
116611754SKacheong.Poon@Sun.COM 						    NULL;
116711754SKacheong.Poon@Sun.COM 					} else {
116811754SKacheong.Poon@Sun.COM 						/*
116911754SKacheong.Poon@Sun.COM 						 * We won't get here if there
117011754SKacheong.Poon@Sun.COM 						 * is only one eager in the
117111754SKacheong.Poon@Sun.COM 						 * list.
117211754SKacheong.Poon@Sun.COM 						 */
117311754SKacheong.Poon@Sun.COM 						ASSERT(prev != NULL);
117411754SKacheong.Poon@Sun.COM 						listener->tcp_eager_last_q =
117511754SKacheong.Poon@Sun.COM 						    prev;
117611754SKacheong.Poon@Sun.COM 					}
117711754SKacheong.Poon@Sun.COM 				}
117811754SKacheong.Poon@Sun.COM 				tcpp[0] = tcp->tcp_eager_next_q;
117911754SKacheong.Poon@Sun.COM 				tcp->tcp_eager_next_q = NULL;
118011754SKacheong.Poon@Sun.COM 				tcp->tcp_eager_last_q = NULL;
118111754SKacheong.Poon@Sun.COM 				ASSERT(listener->tcp_conn_req_cnt_q > 0);
118211754SKacheong.Poon@Sun.COM 				listener->tcp_conn_req_cnt_q--;
118311754SKacheong.Poon@Sun.COM 				break;
118411754SKacheong.Poon@Sun.COM 			}
118511754SKacheong.Poon@Sun.COM 			prev = tcpp[0];
118611754SKacheong.Poon@Sun.COM 		}
118711754SKacheong.Poon@Sun.COM 	}
118811754SKacheong.Poon@Sun.COM 	tcp->tcp_listener = NULL;
118911754SKacheong.Poon@Sun.COM }
119011754SKacheong.Poon@Sun.COM 
119111754SKacheong.Poon@Sun.COM /* BEGIN CSTYLED */
119211754SKacheong.Poon@Sun.COM /*
119311754SKacheong.Poon@Sun.COM  *
119411754SKacheong.Poon@Sun.COM  * The sockfs ACCEPT path:
119511754SKacheong.Poon@Sun.COM  * =======================
119611754SKacheong.Poon@Sun.COM  *
119711754SKacheong.Poon@Sun.COM  * The eager is now established in its own perimeter as soon as SYN is
119811754SKacheong.Poon@Sun.COM  * received in tcp_input_listener(). When sockfs receives conn_ind, it
119911754SKacheong.Poon@Sun.COM  * completes the accept processing on the acceptor STREAM. The sending
120011754SKacheong.Poon@Sun.COM  * of conn_ind part is common for both sockfs listener and a TLI/XTI
120111754SKacheong.Poon@Sun.COM  * listener but a TLI/XTI listener completes the accept processing
120211754SKacheong.Poon@Sun.COM  * on the listener perimeter.
120311754SKacheong.Poon@Sun.COM  *
120411754SKacheong.Poon@Sun.COM  * Common control flow for 3 way handshake:
120511754SKacheong.Poon@Sun.COM  * ----------------------------------------
120611754SKacheong.Poon@Sun.COM  *
120711754SKacheong.Poon@Sun.COM  * incoming SYN (listener perimeter)	-> tcp_input_listener()
120811754SKacheong.Poon@Sun.COM  *
120911754SKacheong.Poon@Sun.COM  * incoming SYN-ACK-ACK (eager perim) 	-> tcp_input_data()
121011754SKacheong.Poon@Sun.COM  * send T_CONN_IND (listener perim)	-> tcp_send_conn_ind()
121111754SKacheong.Poon@Sun.COM  *
121211754SKacheong.Poon@Sun.COM  * Sockfs ACCEPT Path:
121311754SKacheong.Poon@Sun.COM  * -------------------
121411754SKacheong.Poon@Sun.COM  *
121511754SKacheong.Poon@Sun.COM  * open acceptor stream (tcp_open allocates tcp_tli_accept()
121611754SKacheong.Poon@Sun.COM  * as STREAM entry point)
121711754SKacheong.Poon@Sun.COM  *
121811754SKacheong.Poon@Sun.COM  * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept()
121911754SKacheong.Poon@Sun.COM  *
 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
 * association (we are not behind eager's squeue but sockfs is protecting us
 * and no one knows about this stream yet). The STREAMS entry point q->q_info
 * is changed to point at tcp_wput().
122411754SKacheong.Poon@Sun.COM  *
122511754SKacheong.Poon@Sun.COM  * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to
122611754SKacheong.Poon@Sun.COM  * listener (done on listener's perimeter).
122711754SKacheong.Poon@Sun.COM  *
 * tcp_tli_accept() calls tcp_accept_finish() on the eager's perimeter to
 * finish the accept.
123011754SKacheong.Poon@Sun.COM  *
123111754SKacheong.Poon@Sun.COM  * TLI/XTI client ACCEPT path:
123211754SKacheong.Poon@Sun.COM  * ---------------------------
123311754SKacheong.Poon@Sun.COM  *
123411754SKacheong.Poon@Sun.COM  * soaccept() sends T_CONN_RES on the listener STREAM.
123511754SKacheong.Poon@Sun.COM  *
123611754SKacheong.Poon@Sun.COM  * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
123711754SKacheong.Poon@Sun.COM  * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()).
123811754SKacheong.Poon@Sun.COM  *
123911754SKacheong.Poon@Sun.COM  * Locks:
124011754SKacheong.Poon@Sun.COM  * ======
124111754SKacheong.Poon@Sun.COM  *
 * listener->tcp_eager_lock protects listener->tcp_eager_next_q0 and
 * listener->tcp_eager_next_q.
124411754SKacheong.Poon@Sun.COM  *
124511754SKacheong.Poon@Sun.COM  * Referencing:
124611754SKacheong.Poon@Sun.COM  * ============
124711754SKacheong.Poon@Sun.COM  *
124811754SKacheong.Poon@Sun.COM  * 1) We start out in tcp_input_listener by eager placing a ref on
124911754SKacheong.Poon@Sun.COM  * listener and listener adding eager to listeners->tcp_eager_next_q0.
125011754SKacheong.Poon@Sun.COM  *
125111754SKacheong.Poon@Sun.COM  * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
125211754SKacheong.Poon@Sun.COM  * doing so we place a ref on the eager. This ref is finally dropped at the
125311754SKacheong.Poon@Sun.COM  * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
125411754SKacheong.Poon@Sun.COM  * reference is dropped by the squeue framework.
125511754SKacheong.Poon@Sun.COM  *
125611754SKacheong.Poon@Sun.COM  * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
125711754SKacheong.Poon@Sun.COM  *
 * The reference must be released by the same entity that added the reference.
 * In the above scheme, the eager is the entity that adds and releases the
126011754SKacheong.Poon@Sun.COM  * references. Note that tcp_accept_finish executes in the squeue of the eager
126111754SKacheong.Poon@Sun.COM  * (albeit after it is attached to the acceptor stream). Though 1. executes
126211754SKacheong.Poon@Sun.COM  * in the listener's squeue, the eager is nascent at this point and the
126311754SKacheong.Poon@Sun.COM  * reference can be considered to have been added on behalf of the eager.
126411754SKacheong.Poon@Sun.COM  *
126511754SKacheong.Poon@Sun.COM  * Eager getting a Reset or listener closing:
126611754SKacheong.Poon@Sun.COM  * ==========================================
126711754SKacheong.Poon@Sun.COM  *
126811754SKacheong.Poon@Sun.COM  * Once the listener and eager are linked, the listener never does the unlink.
 * If the listener needs to close, tcp_eager_cleanup() is called which queues
 * a message on each eager's perimeter. The eager then does the unlink, clears
127111754SKacheong.Poon@Sun.COM  * any pointers to the listener's queue and drops the reference to the
127211754SKacheong.Poon@Sun.COM  * listener. The listener waits in tcp_close outside the squeue until its
127311754SKacheong.Poon@Sun.COM  * refcount has dropped to 1. This ensures that the listener has waited for
127411754SKacheong.Poon@Sun.COM  * all eagers to clear their association with the listener.
127511754SKacheong.Poon@Sun.COM  *
127611754SKacheong.Poon@Sun.COM  * Similarly, if eager decides to go away, it can unlink itself and close.
127711754SKacheong.Poon@Sun.COM  * When the T_CONN_RES comes down, we check if eager has closed. Note that
127811754SKacheong.Poon@Sun.COM  * the reference to eager is still valid because of the extra ref we put
127911754SKacheong.Poon@Sun.COM  * in tcp_send_conn_ind.
128011754SKacheong.Poon@Sun.COM  *
128111754SKacheong.Poon@Sun.COM  * Listener can always locate the eager under the protection
128211754SKacheong.Poon@Sun.COM  * of the listener->tcp_eager_lock, and then do a refhold
128311754SKacheong.Poon@Sun.COM  * on the eager during the accept processing.
128411754SKacheong.Poon@Sun.COM  *
128511754SKacheong.Poon@Sun.COM  * The acceptor stream accesses the eager in the accept processing
128611754SKacheong.Poon@Sun.COM  * based on the ref placed on eager before sending T_conn_ind.
128711754SKacheong.Poon@Sun.COM  * The only entity that can negate this refhold is a listener close
128811754SKacheong.Poon@Sun.COM  * which is mutually exclusive with an active acceptor stream.
128911754SKacheong.Poon@Sun.COM  *
129011754SKacheong.Poon@Sun.COM  * Eager's reference on the listener
129111754SKacheong.Poon@Sun.COM  * ===================================
129211754SKacheong.Poon@Sun.COM  *
129311754SKacheong.Poon@Sun.COM  * If the accept happens (even on a closed eager) the eager drops its
129411754SKacheong.Poon@Sun.COM  * reference on the listener at the start of tcp_accept_finish. If the
129511754SKacheong.Poon@Sun.COM  * eager is killed due to an incoming RST before the T_conn_ind is sent up,
129611754SKacheong.Poon@Sun.COM  * the reference is dropped in tcp_closei_local. If the listener closes,
129711754SKacheong.Poon@Sun.COM  * the reference is dropped in tcp_eager_kill. In all cases the reference
129811754SKacheong.Poon@Sun.COM  * is dropped while executing in the eager's context (squeue).
129911754SKacheong.Poon@Sun.COM  */
130011754SKacheong.Poon@Sun.COM /* END CSTYLED */
130111754SKacheong.Poon@Sun.COM 
130211754SKacheong.Poon@Sun.COM /* Process the SYN packet, mp, directed at the listener 'tcp' */
130311754SKacheong.Poon@Sun.COM 
130411754SKacheong.Poon@Sun.COM /*
130511754SKacheong.Poon@Sun.COM  * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
130611754SKacheong.Poon@Sun.COM  * tcp_input_data will not see any packets for listeners since the listener
130711754SKacheong.Poon@Sun.COM  * has conn_recv set to tcp_input_listener.
130811754SKacheong.Poon@Sun.COM  */
130911754SKacheong.Poon@Sun.COM /* ARGSUSED */
131011754SKacheong.Poon@Sun.COM static void
tcp_input_listener(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * ira)131111754SKacheong.Poon@Sun.COM tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
131211754SKacheong.Poon@Sun.COM {
131311754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
131411754SKacheong.Poon@Sun.COM 	uint32_t	seg_seq;
131511754SKacheong.Poon@Sun.COM 	tcp_t		*eager;
131611754SKacheong.Poon@Sun.COM 	int		err;
131711754SKacheong.Poon@Sun.COM 	conn_t		*econnp = NULL;
131811754SKacheong.Poon@Sun.COM 	squeue_t	*new_sqp;
131911754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
132011754SKacheong.Poon@Sun.COM 	uint_t 		ip_hdr_len;
132111754SKacheong.Poon@Sun.COM 	conn_t		*lconnp = (conn_t *)arg;
132211754SKacheong.Poon@Sun.COM 	tcp_t		*listener = lconnp->conn_tcp;
132311754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = listener->tcp_tcps;
132411754SKacheong.Poon@Sun.COM 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
132511754SKacheong.Poon@Sun.COM 	uint_t		flags;
132611754SKacheong.Poon@Sun.COM 	mblk_t		*tpi_mp;
132711754SKacheong.Poon@Sun.COM 	uint_t		ifindex = ira->ira_ruifindex;
132811754SKacheong.Poon@Sun.COM 	boolean_t	tlc_set = B_FALSE;
132911754SKacheong.Poon@Sun.COM 
133011754SKacheong.Poon@Sun.COM 	ip_hdr_len = ira->ira_ip_hdr_length;
133111754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
133211754SKacheong.Poon@Sun.COM 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
133311754SKacheong.Poon@Sun.COM 
133412507SAlan.Maguire@Sun.COM 	DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, lconnp->conn_ixa,
133512507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, listener,
133612507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_tcph_t *, tcpha);
133712507SAlan.Maguire@Sun.COM 
133811754SKacheong.Poon@Sun.COM 	if (!(flags & TH_SYN)) {
133911754SKacheong.Poon@Sun.COM 		if ((flags & TH_RST) || (flags & TH_URG)) {
134011754SKacheong.Poon@Sun.COM 			freemsg(mp);
134111754SKacheong.Poon@Sun.COM 			return;
134211754SKacheong.Poon@Sun.COM 		}
134311754SKacheong.Poon@Sun.COM 		if (flags & TH_ACK) {
134411754SKacheong.Poon@Sun.COM 			/* Note this executes in listener's squeue */
134511754SKacheong.Poon@Sun.COM 			tcp_xmit_listeners_reset(mp, ira, ipst, lconnp);
134611754SKacheong.Poon@Sun.COM 			return;
134711754SKacheong.Poon@Sun.COM 		}
134811754SKacheong.Poon@Sun.COM 
134911754SKacheong.Poon@Sun.COM 		freemsg(mp);
135011754SKacheong.Poon@Sun.COM 		return;
135111754SKacheong.Poon@Sun.COM 	}
135211754SKacheong.Poon@Sun.COM 
135311754SKacheong.Poon@Sun.COM 	if (listener->tcp_state != TCPS_LISTEN)
135411754SKacheong.Poon@Sun.COM 		goto error2;
135511754SKacheong.Poon@Sun.COM 
135611754SKacheong.Poon@Sun.COM 	ASSERT(IPCL_IS_BOUND(lconnp));
135711754SKacheong.Poon@Sun.COM 
135811754SKacheong.Poon@Sun.COM 	mutex_enter(&listener->tcp_eager_lock);
135911754SKacheong.Poon@Sun.COM 
136011754SKacheong.Poon@Sun.COM 	/*
136111754SKacheong.Poon@Sun.COM 	 * The system is under memory pressure, so we need to do our part
136211754SKacheong.Poon@Sun.COM 	 * to relieve the pressure.  So we only accept new request if there
136311754SKacheong.Poon@Sun.COM 	 * is nothing waiting to be accepted or waiting to complete the 3-way
136411754SKacheong.Poon@Sun.COM 	 * handshake.  This means that busy listener will not get too many
136511754SKacheong.Poon@Sun.COM 	 * new requests which they cannot handle in time while non-busy
136611754SKacheong.Poon@Sun.COM 	 * listener is still functioning properly.
136711754SKacheong.Poon@Sun.COM 	 */
136811754SKacheong.Poon@Sun.COM 	if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 ||
136911754SKacheong.Poon@Sun.COM 	    listener->tcp_conn_req_cnt_q0 > 0)) {
137011754SKacheong.Poon@Sun.COM 		mutex_exit(&listener->tcp_eager_lock);
137111754SKacheong.Poon@Sun.COM 		TCP_STAT(tcps, tcp_listen_mem_drop);
137211754SKacheong.Poon@Sun.COM 		goto error2;
137311754SKacheong.Poon@Sun.COM 	}
137411754SKacheong.Poon@Sun.COM 
137511754SKacheong.Poon@Sun.COM 	if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) {
137611754SKacheong.Poon@Sun.COM 		mutex_exit(&listener->tcp_eager_lock);
137711754SKacheong.Poon@Sun.COM 		TCP_STAT(tcps, tcp_listendrop);
137811754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpListenDrop);
137911754SKacheong.Poon@Sun.COM 		if (lconnp->conn_debug) {
138011754SKacheong.Poon@Sun.COM 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
138111754SKacheong.Poon@Sun.COM 			    "tcp_input_listener: listen backlog (max=%d) "
138211754SKacheong.Poon@Sun.COM 			    "overflow (%d pending) on %s",
138311754SKacheong.Poon@Sun.COM 			    listener->tcp_conn_req_max,
138411754SKacheong.Poon@Sun.COM 			    listener->tcp_conn_req_cnt_q,
138511754SKacheong.Poon@Sun.COM 			    tcp_display(listener, NULL, DISP_PORT_ONLY));
138611754SKacheong.Poon@Sun.COM 		}
138711754SKacheong.Poon@Sun.COM 		goto error2;
138811754SKacheong.Poon@Sun.COM 	}
138911754SKacheong.Poon@Sun.COM 
139011754SKacheong.Poon@Sun.COM 	if (listener->tcp_conn_req_cnt_q0 >=
139111754SKacheong.Poon@Sun.COM 	    listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
139211754SKacheong.Poon@Sun.COM 		/*
139311754SKacheong.Poon@Sun.COM 		 * Q0 is full. Drop a pending half-open req from the queue
139411754SKacheong.Poon@Sun.COM 		 * to make room for the new SYN req. Also mark the time we
139511754SKacheong.Poon@Sun.COM 		 * drop a SYN.
139611754SKacheong.Poon@Sun.COM 		 *
139711754SKacheong.Poon@Sun.COM 		 * A more aggressive defense against SYN attack will
139811754SKacheong.Poon@Sun.COM 		 * be to set the "tcp_syn_defense" flag now.
139911754SKacheong.Poon@Sun.COM 		 */
140011754SKacheong.Poon@Sun.COM 		TCP_STAT(tcps, tcp_listendropq0);
140111754SKacheong.Poon@Sun.COM 		listener->tcp_last_rcv_lbolt = ddi_get_lbolt64();
140211754SKacheong.Poon@Sun.COM 		if (!tcp_drop_q0(listener)) {
140311754SKacheong.Poon@Sun.COM 			mutex_exit(&listener->tcp_eager_lock);
140411754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpListenDropQ0);
140511754SKacheong.Poon@Sun.COM 			if (lconnp->conn_debug) {
140611754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
140711754SKacheong.Poon@Sun.COM 				    "tcp_input_listener: listen half-open "
140811754SKacheong.Poon@Sun.COM 				    "queue (max=%d) full (%d pending) on %s",
140911754SKacheong.Poon@Sun.COM 				    tcps->tcps_conn_req_max_q0,
141011754SKacheong.Poon@Sun.COM 				    listener->tcp_conn_req_cnt_q0,
141111754SKacheong.Poon@Sun.COM 				    tcp_display(listener, NULL,
141211754SKacheong.Poon@Sun.COM 				    DISP_PORT_ONLY));
141311754SKacheong.Poon@Sun.COM 			}
141411754SKacheong.Poon@Sun.COM 			goto error2;
141511754SKacheong.Poon@Sun.COM 		}
141611754SKacheong.Poon@Sun.COM 	}
141711754SKacheong.Poon@Sun.COM 
141811754SKacheong.Poon@Sun.COM 	/*
141911754SKacheong.Poon@Sun.COM 	 * Enforce the limit set on the number of connections per listener.
142011754SKacheong.Poon@Sun.COM 	 * Note that tlc_cnt starts with 1.  So need to add 1 to tlc_max
142111754SKacheong.Poon@Sun.COM 	 * for comparison.
142211754SKacheong.Poon@Sun.COM 	 */
142311754SKacheong.Poon@Sun.COM 	if (listener->tcp_listen_cnt != NULL) {
142411754SKacheong.Poon@Sun.COM 		tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt;
142511754SKacheong.Poon@Sun.COM 		int64_t now;
142611754SKacheong.Poon@Sun.COM 
142711754SKacheong.Poon@Sun.COM 		if (atomic_add_32_nv(&tlc->tlc_cnt, 1) > tlc->tlc_max + 1) {
142811754SKacheong.Poon@Sun.COM 			mutex_exit(&listener->tcp_eager_lock);
142911754SKacheong.Poon@Sun.COM 			now = ddi_get_lbolt64();
143011754SKacheong.Poon@Sun.COM 			atomic_add_32(&tlc->tlc_cnt, -1);
143111754SKacheong.Poon@Sun.COM 			TCP_STAT(tcps, tcp_listen_cnt_drop);
143211754SKacheong.Poon@Sun.COM 			tlc->tlc_drop++;
143311754SKacheong.Poon@Sun.COM 			if (now - tlc->tlc_report_time >
143411754SKacheong.Poon@Sun.COM 			    MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) {
143511754SKacheong.Poon@Sun.COM 				zcmn_err(lconnp->conn_zoneid, CE_WARN,
143611754SKacheong.Poon@Sun.COM 				    "Listener (port %d) connection max (%u) "
143711754SKacheong.Poon@Sun.COM 				    "reached: %u attempts dropped total\n",
143811754SKacheong.Poon@Sun.COM 				    ntohs(listener->tcp_connp->conn_lport),
143911754SKacheong.Poon@Sun.COM 				    tlc->tlc_max, tlc->tlc_drop);
144011754SKacheong.Poon@Sun.COM 				tlc->tlc_report_time = now;
144111754SKacheong.Poon@Sun.COM 			}
144211754SKacheong.Poon@Sun.COM 			goto error2;
144311754SKacheong.Poon@Sun.COM 		}
144411754SKacheong.Poon@Sun.COM 		tlc_set = B_TRUE;
144511754SKacheong.Poon@Sun.COM 	}
144611754SKacheong.Poon@Sun.COM 
144711754SKacheong.Poon@Sun.COM 	mutex_exit(&listener->tcp_eager_lock);
144811754SKacheong.Poon@Sun.COM 
144911754SKacheong.Poon@Sun.COM 	/*
145011754SKacheong.Poon@Sun.COM 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
145111754SKacheong.Poon@Sun.COM 	 * or based on the ring (for packets from GLD). Otherwise it is
145211754SKacheong.Poon@Sun.COM 	 * set based on lbolt i.e., a somewhat random number.
145311754SKacheong.Poon@Sun.COM 	 */
145411754SKacheong.Poon@Sun.COM 	ASSERT(ira->ira_sqp != NULL);
145511754SKacheong.Poon@Sun.COM 	new_sqp = ira->ira_sqp;
145611754SKacheong.Poon@Sun.COM 
145711754SKacheong.Poon@Sun.COM 	econnp = (conn_t *)tcp_get_conn(arg2, tcps);
145811754SKacheong.Poon@Sun.COM 	if (econnp == NULL)
145911754SKacheong.Poon@Sun.COM 		goto error2;
146011754SKacheong.Poon@Sun.COM 
146111754SKacheong.Poon@Sun.COM 	ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
146211754SKacheong.Poon@Sun.COM 	econnp->conn_sqp = new_sqp;
146311754SKacheong.Poon@Sun.COM 	econnp->conn_initial_sqp = new_sqp;
146411754SKacheong.Poon@Sun.COM 	econnp->conn_ixa->ixa_sqp = new_sqp;
146511754SKacheong.Poon@Sun.COM 
146611754SKacheong.Poon@Sun.COM 	econnp->conn_fport = tcpha->tha_lport;
146711754SKacheong.Poon@Sun.COM 	econnp->conn_lport = tcpha->tha_fport;
146811754SKacheong.Poon@Sun.COM 
146911754SKacheong.Poon@Sun.COM 	err = conn_inherit_parent(lconnp, econnp);
147011754SKacheong.Poon@Sun.COM 	if (err != 0)
147111754SKacheong.Poon@Sun.COM 		goto error3;
147211754SKacheong.Poon@Sun.COM 
147311754SKacheong.Poon@Sun.COM 	/* We already know the laddr of the new connection is ours */
147411754SKacheong.Poon@Sun.COM 	econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
147511754SKacheong.Poon@Sun.COM 
147611754SKacheong.Poon@Sun.COM 	ASSERT(OK_32PTR(mp->b_rptr));
147711754SKacheong.Poon@Sun.COM 	ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
147811754SKacheong.Poon@Sun.COM 	    IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
147911754SKacheong.Poon@Sun.COM 
148011754SKacheong.Poon@Sun.COM 	if (lconnp->conn_family == AF_INET) {
148111754SKacheong.Poon@Sun.COM 		ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
148211754SKacheong.Poon@Sun.COM 		tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira);
148311754SKacheong.Poon@Sun.COM 	} else {
148411754SKacheong.Poon@Sun.COM 		tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira);
148511754SKacheong.Poon@Sun.COM 	}
148611754SKacheong.Poon@Sun.COM 
148711754SKacheong.Poon@Sun.COM 	if (tpi_mp == NULL)
148811754SKacheong.Poon@Sun.COM 		goto error3;
148911754SKacheong.Poon@Sun.COM 
149011754SKacheong.Poon@Sun.COM 	eager = econnp->conn_tcp;
149111754SKacheong.Poon@Sun.COM 	eager->tcp_detached = B_TRUE;
149211754SKacheong.Poon@Sun.COM 	SOCK_CONNID_INIT(eager->tcp_connid);
149311754SKacheong.Poon@Sun.COM 
149412544SKacheong.Poon@Sun.COM 	/*
149512544SKacheong.Poon@Sun.COM 	 * Initialize the eager's tcp_t and inherit some parameters from
149612544SKacheong.Poon@Sun.COM 	 * the listener.
149712544SKacheong.Poon@Sun.COM 	 */
149812544SKacheong.Poon@Sun.COM 	tcp_init_values(eager, listener);
149911754SKacheong.Poon@Sun.COM 
150011754SKacheong.Poon@Sun.COM 	ASSERT((econnp->conn_ixa->ixa_flags &
150111754SKacheong.Poon@Sun.COM 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
150211754SKacheong.Poon@Sun.COM 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) ==
150311754SKacheong.Poon@Sun.COM 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
150411754SKacheong.Poon@Sun.COM 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO));
150511754SKacheong.Poon@Sun.COM 
150611754SKacheong.Poon@Sun.COM 	if (!tcps->tcps_dev_flow_ctl)
150711754SKacheong.Poon@Sun.COM 		econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
150811754SKacheong.Poon@Sun.COM 
150911754SKacheong.Poon@Sun.COM 	/* Prepare for diffing against previous packets */
151011754SKacheong.Poon@Sun.COM 	eager->tcp_recvifindex = 0;
151111754SKacheong.Poon@Sun.COM 	eager->tcp_recvhops = 0xffffffffU;
151211754SKacheong.Poon@Sun.COM 
151311754SKacheong.Poon@Sun.COM 	if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) {
151411754SKacheong.Poon@Sun.COM 		if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) ||
151511754SKacheong.Poon@Sun.COM 		    IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) {
151611754SKacheong.Poon@Sun.COM 			econnp->conn_incoming_ifindex = ifindex;
151711754SKacheong.Poon@Sun.COM 			econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
151811754SKacheong.Poon@Sun.COM 			econnp->conn_ixa->ixa_scopeid = ifindex;
151911754SKacheong.Poon@Sun.COM 		}
152011754SKacheong.Poon@Sun.COM 	}
152111754SKacheong.Poon@Sun.COM 
152211754SKacheong.Poon@Sun.COM 	if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) ==
152311754SKacheong.Poon@Sun.COM 	    (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) &&
152411754SKacheong.Poon@Sun.COM 	    tcps->tcps_rev_src_routes) {
152511754SKacheong.Poon@Sun.COM 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
152611754SKacheong.Poon@Sun.COM 		ip_pkt_t *ipp = &econnp->conn_xmit_ipp;
152711754SKacheong.Poon@Sun.COM 
152811754SKacheong.Poon@Sun.COM 		/* Source routing option copyover (reverse it) */
152911754SKacheong.Poon@Sun.COM 		err = ip_find_hdr_v4(ipha, ipp, B_TRUE);
153011754SKacheong.Poon@Sun.COM 		if (err != 0) {
153111754SKacheong.Poon@Sun.COM 			freemsg(tpi_mp);
153211754SKacheong.Poon@Sun.COM 			goto error3;
153311754SKacheong.Poon@Sun.COM 		}
153411754SKacheong.Poon@Sun.COM 		ip_pkt_source_route_reverse_v4(ipp);
153511754SKacheong.Poon@Sun.COM 	}
153611754SKacheong.Poon@Sun.COM 
153711754SKacheong.Poon@Sun.COM 	ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL);
153811754SKacheong.Poon@Sun.COM 	ASSERT(!eager->tcp_tconnind_started);
153911754SKacheong.Poon@Sun.COM 	/*
154011754SKacheong.Poon@Sun.COM 	 * If the SYN came with a credential, it's a loopback packet or a
154111754SKacheong.Poon@Sun.COM 	 * labeled packet; attach the credential to the TPI message.
154211754SKacheong.Poon@Sun.COM 	 */
154311754SKacheong.Poon@Sun.COM 	if (ira->ira_cred != NULL)
154411754SKacheong.Poon@Sun.COM 		mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid);
154511754SKacheong.Poon@Sun.COM 
154611754SKacheong.Poon@Sun.COM 	eager->tcp_conn.tcp_eager_conn_ind = tpi_mp;
154712643SAnders.Persson@Sun.COM 	ASSERT(eager->tcp_ordrel_mp == NULL);
154812643SAnders.Persson@Sun.COM 
154911754SKacheong.Poon@Sun.COM 	/* Inherit the listener's non-STREAMS flag */
155011754SKacheong.Poon@Sun.COM 	if (IPCL_IS_NONSTR(lconnp)) {
155111754SKacheong.Poon@Sun.COM 		econnp->conn_flags |= IPCL_NONSTR;
155212643SAnders.Persson@Sun.COM 		/* All non-STREAMS tcp_ts are sockets */
155312643SAnders.Persson@Sun.COM 		eager->tcp_issocket = B_TRUE;
155412643SAnders.Persson@Sun.COM 	} else {
155511754SKacheong.Poon@Sun.COM 		/*
155611754SKacheong.Poon@Sun.COM 		 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that
155711754SKacheong.Poon@Sun.COM 		 * at close time, we will always have that to send up.
155811754SKacheong.Poon@Sun.COM 		 * Otherwise, we need to do special handling in case the
155911754SKacheong.Poon@Sun.COM 		 * allocation fails at that time.
156011754SKacheong.Poon@Sun.COM 		 */
156111754SKacheong.Poon@Sun.COM 		if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
156211754SKacheong.Poon@Sun.COM 			goto error3;
156311754SKacheong.Poon@Sun.COM 	}
156411754SKacheong.Poon@Sun.COM 	/*
156511754SKacheong.Poon@Sun.COM 	 * Now that the IP addresses and ports are setup in econnp we
156611754SKacheong.Poon@Sun.COM 	 * can do the IPsec policy work.
156711754SKacheong.Poon@Sun.COM 	 */
156811754SKacheong.Poon@Sun.COM 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
156911754SKacheong.Poon@Sun.COM 		if (lconnp->conn_policy != NULL) {
157011754SKacheong.Poon@Sun.COM 			/*
157111754SKacheong.Poon@Sun.COM 			 * Inherit the policy from the listener; use
157211754SKacheong.Poon@Sun.COM 			 * actions from ira
157311754SKacheong.Poon@Sun.COM 			 */
157411754SKacheong.Poon@Sun.COM 			if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) {
157511754SKacheong.Poon@Sun.COM 				CONN_DEC_REF(econnp);
157611754SKacheong.Poon@Sun.COM 				freemsg(mp);
157711754SKacheong.Poon@Sun.COM 				goto error3;
157811754SKacheong.Poon@Sun.COM 			}
157911754SKacheong.Poon@Sun.COM 		}
158011754SKacheong.Poon@Sun.COM 	}
158111754SKacheong.Poon@Sun.COM 
158211754SKacheong.Poon@Sun.COM 	/*
158311754SKacheong.Poon@Sun.COM 	 * tcp_set_destination() may set tcp_rwnd according to the route
158411754SKacheong.Poon@Sun.COM 	 * metrics. If it does not, the eager's receive window will be set
158511754SKacheong.Poon@Sun.COM 	 * to the listener's receive window later in this function.
158611754SKacheong.Poon@Sun.COM 	 */
158711754SKacheong.Poon@Sun.COM 	eager->tcp_rwnd = 0;
158811754SKacheong.Poon@Sun.COM 
158911754SKacheong.Poon@Sun.COM 	if (is_system_labeled()) {
159011754SKacheong.Poon@Sun.COM 		ip_xmit_attr_t *ixa = econnp->conn_ixa;
159111754SKacheong.Poon@Sun.COM 
159211754SKacheong.Poon@Sun.COM 		ASSERT(ira->ira_tsl != NULL);
159311754SKacheong.Poon@Sun.COM 		/* Discard any old label */
159411754SKacheong.Poon@Sun.COM 		if (ixa->ixa_free_flags & IXA_FREE_TSL) {
159511754SKacheong.Poon@Sun.COM 			ASSERT(ixa->ixa_tsl != NULL);
159611754SKacheong.Poon@Sun.COM 			label_rele(ixa->ixa_tsl);
159711754SKacheong.Poon@Sun.COM 			ixa->ixa_free_flags &= ~IXA_FREE_TSL;
159811754SKacheong.Poon@Sun.COM 			ixa->ixa_tsl = NULL;
159911754SKacheong.Poon@Sun.COM 		}
160011754SKacheong.Poon@Sun.COM 		if ((lconnp->conn_mlp_type != mlptSingle ||
160111754SKacheong.Poon@Sun.COM 		    lconnp->conn_mac_mode != CONN_MAC_DEFAULT) &&
160211754SKacheong.Poon@Sun.COM 		    ira->ira_tsl != NULL) {
160311754SKacheong.Poon@Sun.COM 			/*
160411754SKacheong.Poon@Sun.COM 			 * If this is an MLP connection or a MAC-Exempt
160511754SKacheong.Poon@Sun.COM 			 * connection with an unlabeled node, packets are to be
160611754SKacheong.Poon@Sun.COM 			 * exchanged using the security label of the received
160711754SKacheong.Poon@Sun.COM 			 * SYN packet instead of the server application's label.
160811754SKacheong.Poon@Sun.COM 			 * tsol_check_dest called from ip_set_destination
160911754SKacheong.Poon@Sun.COM 			 * might later update TSF_UNLABELED by replacing
161011754SKacheong.Poon@Sun.COM 			 * ixa_tsl with a new label.
161111754SKacheong.Poon@Sun.COM 			 */
161211754SKacheong.Poon@Sun.COM 			label_hold(ira->ira_tsl);
161311754SKacheong.Poon@Sun.COM 			ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
161411754SKacheong.Poon@Sun.COM 			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
161511754SKacheong.Poon@Sun.COM 			    econnp, ts_label_t *, ixa->ixa_tsl)
161611754SKacheong.Poon@Sun.COM 		} else {
161711754SKacheong.Poon@Sun.COM 			ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
161811754SKacheong.Poon@Sun.COM 			DTRACE_PROBE2(syn_accept, conn_t *,
161911754SKacheong.Poon@Sun.COM 			    econnp, ts_label_t *, ixa->ixa_tsl)
162011754SKacheong.Poon@Sun.COM 		}
162111754SKacheong.Poon@Sun.COM 		/*
162211754SKacheong.Poon@Sun.COM 		 * conn_connect() called from tcp_set_destination will verify
162311754SKacheong.Poon@Sun.COM 		 * the destination is allowed to receive packets at the
162411754SKacheong.Poon@Sun.COM 		 * security label of the SYN-ACK we are generating. As part of
162511754SKacheong.Poon@Sun.COM 		 * that, tsol_check_dest() may create a new effective label for
162611754SKacheong.Poon@Sun.COM 		 * this connection.
162711754SKacheong.Poon@Sun.COM 		 * Finally conn_connect() will call conn_update_label.
162811754SKacheong.Poon@Sun.COM 		 * All that remains for TCP to do is to call
162911754SKacheong.Poon@Sun.COM 		 * conn_build_hdr_template which is done as part of
163011754SKacheong.Poon@Sun.COM 		 * tcp_set_destination.
163111754SKacheong.Poon@Sun.COM 		 */
163211754SKacheong.Poon@Sun.COM 	}
163311754SKacheong.Poon@Sun.COM 
163411754SKacheong.Poon@Sun.COM 	/*
163511754SKacheong.Poon@Sun.COM 	 * Since we will clear tcp_listener before we clear tcp_detached
163611754SKacheong.Poon@Sun.COM 	 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
163712643SAnders.Persson@Sun.COM 	 * so we can tell a TCP_IS_DETACHED_NONEAGER apart.
163811754SKacheong.Poon@Sun.COM 	 */
163911754SKacheong.Poon@Sun.COM 	eager->tcp_hard_binding = B_TRUE;
164011754SKacheong.Poon@Sun.COM 
164111754SKacheong.Poon@Sun.COM 	tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
164211754SKacheong.Poon@Sun.COM 	    TCP_BIND_HASH(econnp->conn_lport)], eager, 0);
164311754SKacheong.Poon@Sun.COM 
164411754SKacheong.Poon@Sun.COM 	CL_INET_CONNECT(econnp, B_FALSE, err);
164511754SKacheong.Poon@Sun.COM 	if (err != 0) {
164611754SKacheong.Poon@Sun.COM 		tcp_bind_hash_remove(eager);
164711754SKacheong.Poon@Sun.COM 		goto error3;
164811754SKacheong.Poon@Sun.COM 	}
164911754SKacheong.Poon@Sun.COM 
165011754SKacheong.Poon@Sun.COM 	SOCK_CONNID_BUMP(eager->tcp_connid);
165111754SKacheong.Poon@Sun.COM 
165211754SKacheong.Poon@Sun.COM 	/*
165311754SKacheong.Poon@Sun.COM 	 * Adapt our mss, ttl, ... based on the remote address.
165411754SKacheong.Poon@Sun.COM 	 */
165511754SKacheong.Poon@Sun.COM 
165611754SKacheong.Poon@Sun.COM 	if (tcp_set_destination(eager) != 0) {
165711754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpAttemptFails);
165811754SKacheong.Poon@Sun.COM 		/* Undo the bind_hash_insert */
165911754SKacheong.Poon@Sun.COM 		tcp_bind_hash_remove(eager);
166011754SKacheong.Poon@Sun.COM 		goto error3;
166111754SKacheong.Poon@Sun.COM 	}
166211754SKacheong.Poon@Sun.COM 
166311754SKacheong.Poon@Sun.COM 	/* Process all TCP options. */
166411754SKacheong.Poon@Sun.COM 	tcp_process_options(eager, tcpha);
166511754SKacheong.Poon@Sun.COM 
166611754SKacheong.Poon@Sun.COM 	/* Is the other end ECN capable? */
166711754SKacheong.Poon@Sun.COM 	if (tcps->tcps_ecn_permitted >= 1 &&
166811754SKacheong.Poon@Sun.COM 	    (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
166911754SKacheong.Poon@Sun.COM 		eager->tcp_ecn_ok = B_TRUE;
167011754SKacheong.Poon@Sun.COM 	}
167111754SKacheong.Poon@Sun.COM 
167211754SKacheong.Poon@Sun.COM 	/*
167311754SKacheong.Poon@Sun.COM 	 * The listener's conn_rcvbuf should be the default window size or a
167411754SKacheong.Poon@Sun.COM 	 * window size changed via SO_RCVBUF option. First round up the
167511754SKacheong.Poon@Sun.COM 	 * eager's tcp_rwnd to the nearest MSS. Then find out the window
167611754SKacheong.Poon@Sun.COM 	 * scale option value if needed. Call tcp_rwnd_set() to finish the
167711754SKacheong.Poon@Sun.COM 	 * setting.
167811754SKacheong.Poon@Sun.COM 	 *
167911754SKacheong.Poon@Sun.COM 	 * Note if there is a rpipe metric associated with the remote host,
168011754SKacheong.Poon@Sun.COM 	 * we should not inherit receive window size from listener.
168111754SKacheong.Poon@Sun.COM 	 */
168211754SKacheong.Poon@Sun.COM 	eager->tcp_rwnd = MSS_ROUNDUP(
168311754SKacheong.Poon@Sun.COM 	    (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf :
168411754SKacheong.Poon@Sun.COM 	    eager->tcp_rwnd), eager->tcp_mss);
168511754SKacheong.Poon@Sun.COM 	if (eager->tcp_snd_ws_ok)
168611754SKacheong.Poon@Sun.COM 		tcp_set_ws_value(eager);
168711754SKacheong.Poon@Sun.COM 	/*
168811754SKacheong.Poon@Sun.COM 	 * Note that this is the only place tcp_rwnd_set() is called for
168911754SKacheong.Poon@Sun.COM 	 * accepting a connection.  We need to call it here instead of
169011754SKacheong.Poon@Sun.COM 	 * after the 3-way handshake because we need to tell the other
169111754SKacheong.Poon@Sun.COM 	 * side our rwnd in the SYN-ACK segment.
169211754SKacheong.Poon@Sun.COM 	 */
169311754SKacheong.Poon@Sun.COM 	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);
169411754SKacheong.Poon@Sun.COM 
169511754SKacheong.Poon@Sun.COM 	ASSERT(eager->tcp_connp->conn_rcvbuf != 0 &&
169611754SKacheong.Poon@Sun.COM 	    eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd);
169711754SKacheong.Poon@Sun.COM 
169811754SKacheong.Poon@Sun.COM 	ASSERT(econnp->conn_rcvbuf != 0 &&
169911754SKacheong.Poon@Sun.COM 	    econnp->conn_rcvbuf == eager->tcp_rwnd);
170011754SKacheong.Poon@Sun.COM 
170111754SKacheong.Poon@Sun.COM 	/* Put a ref on the listener for the eager. */
170211754SKacheong.Poon@Sun.COM 	CONN_INC_REF(lconnp);
170311754SKacheong.Poon@Sun.COM 	mutex_enter(&listener->tcp_eager_lock);
170411754SKacheong.Poon@Sun.COM 	listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
170511754SKacheong.Poon@Sun.COM 	eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
170611754SKacheong.Poon@Sun.COM 	listener->tcp_eager_next_q0 = eager;
170711754SKacheong.Poon@Sun.COM 	eager->tcp_eager_prev_q0 = listener;
170811754SKacheong.Poon@Sun.COM 
170911754SKacheong.Poon@Sun.COM 	/* Set tcp_listener before adding it to tcp_conn_fanout */
171011754SKacheong.Poon@Sun.COM 	eager->tcp_listener = listener;
171111754SKacheong.Poon@Sun.COM 	eager->tcp_saved_listener = listener;
171211754SKacheong.Poon@Sun.COM 
171311754SKacheong.Poon@Sun.COM 	/*
171411754SKacheong.Poon@Sun.COM 	 * Set tcp_listen_cnt so that when the connection is done, the counter
171511754SKacheong.Poon@Sun.COM 	 * is decremented.
171611754SKacheong.Poon@Sun.COM 	 */
171711754SKacheong.Poon@Sun.COM 	eager->tcp_listen_cnt = listener->tcp_listen_cnt;
171811754SKacheong.Poon@Sun.COM 
171911754SKacheong.Poon@Sun.COM 	/*
172011754SKacheong.Poon@Sun.COM 	 * Tag this detached tcp vector for later retrieval
172111754SKacheong.Poon@Sun.COM 	 * by our listener client in tcp_accept().
172211754SKacheong.Poon@Sun.COM 	 */
172311754SKacheong.Poon@Sun.COM 	eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum;
172411754SKacheong.Poon@Sun.COM 	listener->tcp_conn_req_cnt_q0++;
172511754SKacheong.Poon@Sun.COM 	if (++listener->tcp_conn_req_seqnum == -1) {
172611754SKacheong.Poon@Sun.COM 		/*
172711754SKacheong.Poon@Sun.COM 		 * -1 is "special" and defined in TPI as something
172811754SKacheong.Poon@Sun.COM 		 * that should never be used in T_CONN_IND
172911754SKacheong.Poon@Sun.COM 		 */
173011754SKacheong.Poon@Sun.COM 		++listener->tcp_conn_req_seqnum;
173111754SKacheong.Poon@Sun.COM 	}
173211754SKacheong.Poon@Sun.COM 	mutex_exit(&listener->tcp_eager_lock);
173311754SKacheong.Poon@Sun.COM 
173411754SKacheong.Poon@Sun.COM 	if (listener->tcp_syn_defense) {
173511754SKacheong.Poon@Sun.COM 		/* Don't drop the SYN that comes from a good IP source */
173611754SKacheong.Poon@Sun.COM 		ipaddr_t *addr_cache;
173711754SKacheong.Poon@Sun.COM 
173811754SKacheong.Poon@Sun.COM 		addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
173911754SKacheong.Poon@Sun.COM 		if (addr_cache != NULL && econnp->conn_faddr_v4 ==
174011754SKacheong.Poon@Sun.COM 		    addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) {
174111754SKacheong.Poon@Sun.COM 			eager->tcp_dontdrop = B_TRUE;
174211754SKacheong.Poon@Sun.COM 		}
174311754SKacheong.Poon@Sun.COM 	}
174411754SKacheong.Poon@Sun.COM 
174511754SKacheong.Poon@Sun.COM 	/*
174611754SKacheong.Poon@Sun.COM 	 * We need to insert the eager in its own perimeter but as soon
174711754SKacheong.Poon@Sun.COM 	 * as we do that, we expose the eager to the classifier and
174811754SKacheong.Poon@Sun.COM 	 * should not touch any field outside the eager's perimeter.
174911754SKacheong.Poon@Sun.COM 	 * So do all the work necessary before inserting the eager
175011754SKacheong.Poon@Sun.COM 	 * in its own perimeter. Be optimistic that conn_connect()
175111754SKacheong.Poon@Sun.COM 	 * will succeed but undo everything if it fails.
175211754SKacheong.Poon@Sun.COM 	 */
175311754SKacheong.Poon@Sun.COM 	seg_seq = ntohl(tcpha->tha_seq);
175411754SKacheong.Poon@Sun.COM 	eager->tcp_irs = seg_seq;
175511754SKacheong.Poon@Sun.COM 	eager->tcp_rack = seg_seq;
175611754SKacheong.Poon@Sun.COM 	eager->tcp_rnxt = seg_seq + 1;
175711754SKacheong.Poon@Sun.COM 	eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt);
175811754SKacheong.Poon@Sun.COM 	TCPS_BUMP_MIB(tcps, tcpPassiveOpens);
175911754SKacheong.Poon@Sun.COM 	eager->tcp_state = TCPS_SYN_RCVD;
176012507SAlan.Maguire@Sun.COM 	DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
176112507SAlan.Maguire@Sun.COM 	    econnp->conn_ixa, void, NULL, tcp_t *, eager, void, NULL,
176212507SAlan.Maguire@Sun.COM 	    int32_t, TCPS_LISTEN);
176312507SAlan.Maguire@Sun.COM 
176411754SKacheong.Poon@Sun.COM 	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
176511754SKacheong.Poon@Sun.COM 	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
176611754SKacheong.Poon@Sun.COM 	if (mp1 == NULL) {
176711754SKacheong.Poon@Sun.COM 		/*
176811754SKacheong.Poon@Sun.COM 		 * Increment the ref count as we are going to
176911754SKacheong.Poon@Sun.COM 		 * enqueue an mp in the squeue
177011754SKacheong.Poon@Sun.COM 		 */
177111754SKacheong.Poon@Sun.COM 		CONN_INC_REF(econnp);
177211754SKacheong.Poon@Sun.COM 		goto error;
177311754SKacheong.Poon@Sun.COM 	}
177411754SKacheong.Poon@Sun.COM 
177511754SKacheong.Poon@Sun.COM 	/*
177611754SKacheong.Poon@Sun.COM 	 * We need to start the rto timer. In normal case, we start
177711754SKacheong.Poon@Sun.COM 	 * the timer after sending the packet on the wire (or at
177811754SKacheong.Poon@Sun.COM 	 * least believing that packet was sent by waiting for
177911754SKacheong.Poon@Sun.COM 	 * conn_ip_output() to return). Since this is the first packet
178011754SKacheong.Poon@Sun.COM 	 * being sent on the wire for the eager, our initial tcp_rto
178111754SKacheong.Poon@Sun.COM 	 * is at least tcp_rexmit_interval_min which is a fairly
178211754SKacheong.Poon@Sun.COM 	 * large value to allow the algorithm to adjust slowly to large
178311754SKacheong.Poon@Sun.COM 	 * fluctuations of RTT during first few transmissions.
178411754SKacheong.Poon@Sun.COM 	 *
178511754SKacheong.Poon@Sun.COM 	 * Starting the timer first and then sending the packet in this
178611754SKacheong.Poon@Sun.COM 	 * case shouldn't make much difference since tcp_rexmit_interval_min
178711754SKacheong.Poon@Sun.COM 	 * is of the order of several 100ms and starting the timer
178811754SKacheong.Poon@Sun.COM 	 * first and then sending the packet will result in difference
178911754SKacheong.Poon@Sun.COM 	 * of few micro seconds.
179011754SKacheong.Poon@Sun.COM 	 *
179111754SKacheong.Poon@Sun.COM 	 * Without this optimization, we are forced to hold the fanout
179211754SKacheong.Poon@Sun.COM 	 * lock across the ipcl_bind_insert() and sending the packet
179311754SKacheong.Poon@Sun.COM 	 * so that we don't race against an incoming packet (maybe RST)
179411754SKacheong.Poon@Sun.COM 	 * for this eager.
179511754SKacheong.Poon@Sun.COM 	 *
179611754SKacheong.Poon@Sun.COM 	 * It is necessary to acquire an extra reference on the eager
179711754SKacheong.Poon@Sun.COM 	 * at this point and hold it until after tcp_send_data() to
179811754SKacheong.Poon@Sun.COM 	 * ensure against an eager close race.
179911754SKacheong.Poon@Sun.COM 	 */
180011754SKacheong.Poon@Sun.COM 
180111754SKacheong.Poon@Sun.COM 	CONN_INC_REF(econnp);
180211754SKacheong.Poon@Sun.COM 
180311754SKacheong.Poon@Sun.COM 	TCP_TIMER_RESTART(eager, eager->tcp_rto);
180411754SKacheong.Poon@Sun.COM 
180511754SKacheong.Poon@Sun.COM 	/*
180611754SKacheong.Poon@Sun.COM 	 * Insert the eager in its own perimeter now. We are ready to deal
180711754SKacheong.Poon@Sun.COM 	 * with any packets on eager.
180811754SKacheong.Poon@Sun.COM 	 */
180911754SKacheong.Poon@Sun.COM 	if (ipcl_conn_insert(econnp) != 0)
181011754SKacheong.Poon@Sun.COM 		goto error;
181111754SKacheong.Poon@Sun.COM 
181211754SKacheong.Poon@Sun.COM 	ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
181311754SKacheong.Poon@Sun.COM 	freemsg(mp);
181411754SKacheong.Poon@Sun.COM 	/*
181511754SKacheong.Poon@Sun.COM 	 * Send the SYN-ACK. Use the right squeue so that conn_ixa is
181611754SKacheong.Poon@Sun.COM 	 * only used by one thread at a time.
181711754SKacheong.Poon@Sun.COM 	 */
181811754SKacheong.Poon@Sun.COM 	if (econnp->conn_sqp == lconnp->conn_sqp) {
181912507SAlan.Maguire@Sun.COM 		DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *,
182012507SAlan.Maguire@Sun.COM 		    econnp->conn_ixa, __dtrace_tcp_void_ip_t *, mp1->b_rptr,
182112507SAlan.Maguire@Sun.COM 		    tcp_t *, eager, __dtrace_tcp_tcph_t *,
182212507SAlan.Maguire@Sun.COM 		    &mp1->b_rptr[econnp->conn_ixa->ixa_ip_hdr_length]);
182311754SKacheong.Poon@Sun.COM 		(void) conn_ip_output(mp1, econnp->conn_ixa);
182411754SKacheong.Poon@Sun.COM 		CONN_DEC_REF(econnp);
182511754SKacheong.Poon@Sun.COM 	} else {
182611754SKacheong.Poon@Sun.COM 		SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack,
182711754SKacheong.Poon@Sun.COM 		    econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK);
182811754SKacheong.Poon@Sun.COM 	}
182911754SKacheong.Poon@Sun.COM 	return;
183011754SKacheong.Poon@Sun.COM error:
183111754SKacheong.Poon@Sun.COM 	freemsg(mp1);
183211754SKacheong.Poon@Sun.COM 	eager->tcp_closemp_used = B_TRUE;
183311754SKacheong.Poon@Sun.COM 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
183411754SKacheong.Poon@Sun.COM 	mp1 = &eager->tcp_closemp;
183511754SKacheong.Poon@Sun.COM 	SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
183611754SKacheong.Poon@Sun.COM 	    econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
183711754SKacheong.Poon@Sun.COM 
183811754SKacheong.Poon@Sun.COM 	/*
183911754SKacheong.Poon@Sun.COM 	 * If a connection already exists, send the mp to that connection so
184011754SKacheong.Poon@Sun.COM 	 * that it can be appropriately dealt with.
184111754SKacheong.Poon@Sun.COM 	 */
184211754SKacheong.Poon@Sun.COM 	ipst = tcps->tcps_netstack->netstack_ip;
184311754SKacheong.Poon@Sun.COM 
184411754SKacheong.Poon@Sun.COM 	if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
184511754SKacheong.Poon@Sun.COM 		if (!IPCL_IS_CONNECTED(econnp)) {
184611754SKacheong.Poon@Sun.COM 			/*
184711754SKacheong.Poon@Sun.COM 			 * Something bad happened. ipcl_conn_insert()
184811754SKacheong.Poon@Sun.COM 			 * failed because a connection already existed
184911754SKacheong.Poon@Sun.COM 			 * in connected hash but we can't find it
185011754SKacheong.Poon@Sun.COM 			 * anymore (someone blew it away). Just
185111754SKacheong.Poon@Sun.COM 			 * free this message and hopefully remote
185211754SKacheong.Poon@Sun.COM 			 * will retransmit at which time the SYN can be
185311754SKacheong.Poon@Sun.COM 			 * treated as a new connection or dealt with
185411754SKacheong.Poon@Sun.COM 			 * a TH_RST if a connection already exists.
185511754SKacheong.Poon@Sun.COM 			 */
185611754SKacheong.Poon@Sun.COM 			CONN_DEC_REF(econnp);
185711754SKacheong.Poon@Sun.COM 			freemsg(mp);
185811754SKacheong.Poon@Sun.COM 		} else {
185911754SKacheong.Poon@Sun.COM 			SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
186011754SKacheong.Poon@Sun.COM 			    econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
186111754SKacheong.Poon@Sun.COM 		}
186211754SKacheong.Poon@Sun.COM 	} else {
186311754SKacheong.Poon@Sun.COM 		/* Nobody wants this packet */
186411754SKacheong.Poon@Sun.COM 		freemsg(mp);
186511754SKacheong.Poon@Sun.COM 	}
186611754SKacheong.Poon@Sun.COM 	return;
186711754SKacheong.Poon@Sun.COM error3:
186811754SKacheong.Poon@Sun.COM 	CONN_DEC_REF(econnp);
186911754SKacheong.Poon@Sun.COM error2:
187011754SKacheong.Poon@Sun.COM 	freemsg(mp);
187111754SKacheong.Poon@Sun.COM 	if (tlc_set)
187211754SKacheong.Poon@Sun.COM 		atomic_add_32(&listener->tcp_listen_cnt->tlc_cnt, -1);
187311754SKacheong.Poon@Sun.COM }
187411754SKacheong.Poon@Sun.COM 
187511754SKacheong.Poon@Sun.COM /*
187611754SKacheong.Poon@Sun.COM  * In an ideal case of vertical partition in NUMA architecture, its
187711754SKacheong.Poon@Sun.COM  * beneficial to have the listener and all the incoming connections
187811754SKacheong.Poon@Sun.COM  * tied to the same squeue. The other constraint is that incoming
187911754SKacheong.Poon@Sun.COM  * connections should be tied to the squeue attached to interrupted
188011754SKacheong.Poon@Sun.COM  * CPU for obvious locality reason so this leaves the listener to
188111754SKacheong.Poon@Sun.COM  * be tied to the same squeue. Our only problem is that when listener
188211754SKacheong.Poon@Sun.COM  * is binding, the CPU that will get interrupted by the NIC whose
188311754SKacheong.Poon@Sun.COM  * IP address the listener is binding to is not even known. So
188411754SKacheong.Poon@Sun.COM  * the code below allows us to change that binding at the time the
188511754SKacheong.Poon@Sun.COM  * CPU is interrupted by virtue of incoming connection's squeue.
188611754SKacheong.Poon@Sun.COM  *
188711754SKacheong.Poon@Sun.COM  * This is usefull only in case of a listener bound to a specific IP
188811754SKacheong.Poon@Sun.COM  * address. For other kind of listeners, they get bound the
188911754SKacheong.Poon@Sun.COM  * very first time and there is no attempt to rebind them.
189011754SKacheong.Poon@Sun.COM  */
189111754SKacheong.Poon@Sun.COM void
tcp_input_listener_unbound(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * ira)189211754SKacheong.Poon@Sun.COM tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
189311754SKacheong.Poon@Sun.COM     ip_recv_attr_t *ira)
189411754SKacheong.Poon@Sun.COM {
189511754SKacheong.Poon@Sun.COM 	conn_t		*connp = (conn_t *)arg;
189611754SKacheong.Poon@Sun.COM 	squeue_t	*sqp = (squeue_t *)arg2;
189711754SKacheong.Poon@Sun.COM 	squeue_t	*new_sqp;
189811754SKacheong.Poon@Sun.COM 	uint32_t	conn_flags;
189911754SKacheong.Poon@Sun.COM 
190011754SKacheong.Poon@Sun.COM 	/*
190111754SKacheong.Poon@Sun.COM 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
190211754SKacheong.Poon@Sun.COM 	 * or based on the ring (for packets from GLD). Otherwise it is
190311754SKacheong.Poon@Sun.COM 	 * set based on lbolt i.e., a somewhat random number.
190411754SKacheong.Poon@Sun.COM 	 */
190511754SKacheong.Poon@Sun.COM 	ASSERT(ira->ira_sqp != NULL);
190611754SKacheong.Poon@Sun.COM 	new_sqp = ira->ira_sqp;
190711754SKacheong.Poon@Sun.COM 
190811754SKacheong.Poon@Sun.COM 	if (connp->conn_fanout == NULL)
190911754SKacheong.Poon@Sun.COM 		goto done;
191011754SKacheong.Poon@Sun.COM 
191111754SKacheong.Poon@Sun.COM 	if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
191211754SKacheong.Poon@Sun.COM 		mutex_enter(&connp->conn_fanout->connf_lock);
191311754SKacheong.Poon@Sun.COM 		mutex_enter(&connp->conn_lock);
191411754SKacheong.Poon@Sun.COM 		/*
191511754SKacheong.Poon@Sun.COM 		 * No one from read or write side can access us now
191611754SKacheong.Poon@Sun.COM 		 * except for already queued packets on this squeue.
191711754SKacheong.Poon@Sun.COM 		 * But since we haven't changed the squeue yet, they
191811754SKacheong.Poon@Sun.COM 		 * can't execute. If they are processed after we have
191911754SKacheong.Poon@Sun.COM 		 * changed the squeue, they are sent back to the
192011754SKacheong.Poon@Sun.COM 		 * correct squeue down below.
192111754SKacheong.Poon@Sun.COM 		 * But a listner close can race with processing of
192211754SKacheong.Poon@Sun.COM 		 * incoming SYN. If incoming SYN processing changes
192311754SKacheong.Poon@Sun.COM 		 * the squeue then the listener close which is waiting
192411754SKacheong.Poon@Sun.COM 		 * to enter the squeue would operate on the wrong
192511754SKacheong.Poon@Sun.COM 		 * squeue. Hence we don't change the squeue here unless
192611754SKacheong.Poon@Sun.COM 		 * the refcount is exactly the minimum refcount. The
192711754SKacheong.Poon@Sun.COM 		 * minimum refcount of 4 is counted as - 1 each for
192811754SKacheong.Poon@Sun.COM 		 * TCP and IP, 1 for being in the classifier hash, and
192911754SKacheong.Poon@Sun.COM 		 * 1 for the mblk being processed.
193011754SKacheong.Poon@Sun.COM 		 */
193111754SKacheong.Poon@Sun.COM 
193211754SKacheong.Poon@Sun.COM 		if (connp->conn_ref != 4 ||
193311754SKacheong.Poon@Sun.COM 		    connp->conn_tcp->tcp_state != TCPS_LISTEN) {
193411754SKacheong.Poon@Sun.COM 			mutex_exit(&connp->conn_lock);
193511754SKacheong.Poon@Sun.COM 			mutex_exit(&connp->conn_fanout->connf_lock);
193611754SKacheong.Poon@Sun.COM 			goto done;
193711754SKacheong.Poon@Sun.COM 		}
193811754SKacheong.Poon@Sun.COM 		if (connp->conn_sqp != new_sqp) {
193911754SKacheong.Poon@Sun.COM 			while (connp->conn_sqp != new_sqp)
194011754SKacheong.Poon@Sun.COM 				(void) casptr(&connp->conn_sqp, sqp, new_sqp);
194111754SKacheong.Poon@Sun.COM 			/* No special MT issues for outbound ixa_sqp hint */
194211754SKacheong.Poon@Sun.COM 			connp->conn_ixa->ixa_sqp = new_sqp;
194311754SKacheong.Poon@Sun.COM 		}
194411754SKacheong.Poon@Sun.COM 
194511754SKacheong.Poon@Sun.COM 		do {
194611754SKacheong.Poon@Sun.COM 			conn_flags = connp->conn_flags;
194711754SKacheong.Poon@Sun.COM 			conn_flags |= IPCL_FULLY_BOUND;
194811754SKacheong.Poon@Sun.COM 			(void) cas32(&connp->conn_flags, connp->conn_flags,
194911754SKacheong.Poon@Sun.COM 			    conn_flags);
195011754SKacheong.Poon@Sun.COM 		} while (!(connp->conn_flags & IPCL_FULLY_BOUND));
195111754SKacheong.Poon@Sun.COM 
195211754SKacheong.Poon@Sun.COM 		mutex_exit(&connp->conn_fanout->connf_lock);
195311754SKacheong.Poon@Sun.COM 		mutex_exit(&connp->conn_lock);
195411754SKacheong.Poon@Sun.COM 
195511754SKacheong.Poon@Sun.COM 		/*
195611754SKacheong.Poon@Sun.COM 		 * Assume we have picked a good squeue for the listener. Make
195711754SKacheong.Poon@Sun.COM 		 * subsequent SYNs not try to change the squeue.
195811754SKacheong.Poon@Sun.COM 		 */
195911754SKacheong.Poon@Sun.COM 		connp->conn_recv = tcp_input_listener;
196011754SKacheong.Poon@Sun.COM 	}
196111754SKacheong.Poon@Sun.COM 
196211754SKacheong.Poon@Sun.COM done:
196311754SKacheong.Poon@Sun.COM 	if (connp->conn_sqp != sqp) {
196411754SKacheong.Poon@Sun.COM 		CONN_INC_REF(connp);
196511754SKacheong.Poon@Sun.COM 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
196611754SKacheong.Poon@Sun.COM 		    ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
196711754SKacheong.Poon@Sun.COM 	} else {
196811754SKacheong.Poon@Sun.COM 		tcp_input_listener(connp, mp, sqp, ira);
196911754SKacheong.Poon@Sun.COM 	}
197011754SKacheong.Poon@Sun.COM }
197111754SKacheong.Poon@Sun.COM 
197211754SKacheong.Poon@Sun.COM /*
197311754SKacheong.Poon@Sun.COM  * Send up all messages queued on tcp_rcv_list.
197411754SKacheong.Poon@Sun.COM  */
197511754SKacheong.Poon@Sun.COM uint_t
tcp_rcv_drain(tcp_t * tcp)197611754SKacheong.Poon@Sun.COM tcp_rcv_drain(tcp_t *tcp)
197711754SKacheong.Poon@Sun.COM {
197811754SKacheong.Poon@Sun.COM 	mblk_t *mp;
197911754SKacheong.Poon@Sun.COM 	uint_t ret = 0;
198011754SKacheong.Poon@Sun.COM #ifdef DEBUG
198111754SKacheong.Poon@Sun.COM 	uint_t cnt = 0;
198211754SKacheong.Poon@Sun.COM #endif
198311754SKacheong.Poon@Sun.COM 	queue_t	*q = tcp->tcp_connp->conn_rq;
198411754SKacheong.Poon@Sun.COM 
198511754SKacheong.Poon@Sun.COM 	/* Can't drain on an eager connection */
198611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_listener != NULL)
198711754SKacheong.Poon@Sun.COM 		return (ret);
198811754SKacheong.Poon@Sun.COM 
198911754SKacheong.Poon@Sun.COM 	/* Can't be a non-STREAMS connection */
199011754SKacheong.Poon@Sun.COM 	ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
199111754SKacheong.Poon@Sun.COM 
199211754SKacheong.Poon@Sun.COM 	/* No need for the push timer now. */
199311754SKacheong.Poon@Sun.COM 	if (tcp->tcp_push_tid != 0) {
199411754SKacheong.Poon@Sun.COM 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
199511754SKacheong.Poon@Sun.COM 		tcp->tcp_push_tid = 0;
199611754SKacheong.Poon@Sun.COM 	}
199711754SKacheong.Poon@Sun.COM 
199811754SKacheong.Poon@Sun.COM 	/*
199911754SKacheong.Poon@Sun.COM 	 * Handle two cases here: we are currently fused or we were
200011754SKacheong.Poon@Sun.COM 	 * previously fused and have some urgent data to be delivered
200111754SKacheong.Poon@Sun.COM 	 * upstream.  The latter happens because we either ran out of
200211754SKacheong.Poon@Sun.COM 	 * memory or were detached and therefore sending the SIGURG was
200311754SKacheong.Poon@Sun.COM 	 * deferred until this point.  In either case we pass control
200411754SKacheong.Poon@Sun.COM 	 * over to tcp_fuse_rcv_drain() since it may need to complete
200511754SKacheong.Poon@Sun.COM 	 * some work.
200611754SKacheong.Poon@Sun.COM 	 */
200711754SKacheong.Poon@Sun.COM 	if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) {
200811754SKacheong.Poon@Sun.COM 		if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL :
200911754SKacheong.Poon@Sun.COM 		    &tcp->tcp_fused_sigurg_mp))
201011754SKacheong.Poon@Sun.COM 			return (ret);
201111754SKacheong.Poon@Sun.COM 	}
201211754SKacheong.Poon@Sun.COM 
201311754SKacheong.Poon@Sun.COM 	while ((mp = tcp->tcp_rcv_list) != NULL) {
201411754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_list = mp->b_next;
201511754SKacheong.Poon@Sun.COM 		mp->b_next = NULL;
201611754SKacheong.Poon@Sun.COM #ifdef DEBUG
201711754SKacheong.Poon@Sun.COM 		cnt += msgdsize(mp);
201811754SKacheong.Poon@Sun.COM #endif
201911754SKacheong.Poon@Sun.COM 		putnext(q, mp);
202011754SKacheong.Poon@Sun.COM 	}
202111754SKacheong.Poon@Sun.COM #ifdef DEBUG
202211754SKacheong.Poon@Sun.COM 	ASSERT(cnt == tcp->tcp_rcv_cnt);
202311754SKacheong.Poon@Sun.COM #endif
202411754SKacheong.Poon@Sun.COM 	tcp->tcp_rcv_last_head = NULL;
202511754SKacheong.Poon@Sun.COM 	tcp->tcp_rcv_last_tail = NULL;
202611754SKacheong.Poon@Sun.COM 	tcp->tcp_rcv_cnt = 0;
202711754SKacheong.Poon@Sun.COM 
202811754SKacheong.Poon@Sun.COM 	if (canputnext(q))
202911754SKacheong.Poon@Sun.COM 		return (tcp_rwnd_reopen(tcp));
203011754SKacheong.Poon@Sun.COM 
203111754SKacheong.Poon@Sun.COM 	return (ret);
203211754SKacheong.Poon@Sun.COM }
203311754SKacheong.Poon@Sun.COM 
203411754SKacheong.Poon@Sun.COM /*
203511754SKacheong.Poon@Sun.COM  * Queue data on tcp_rcv_list which is a b_next chain.
203611754SKacheong.Poon@Sun.COM  * tcp_rcv_last_head/tail is the last element of this chain.
203711754SKacheong.Poon@Sun.COM  * Each element of the chain is a b_cont chain.
203811754SKacheong.Poon@Sun.COM  *
203911754SKacheong.Poon@Sun.COM  * M_DATA messages are added to the current element.
204011754SKacheong.Poon@Sun.COM  * Other messages are added as new (b_next) elements.
204111754SKacheong.Poon@Sun.COM  */
204211754SKacheong.Poon@Sun.COM void
tcp_rcv_enqueue(tcp_t * tcp,mblk_t * mp,uint_t seg_len,cred_t * cr)204311754SKacheong.Poon@Sun.COM tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr)
204411754SKacheong.Poon@Sun.COM {
204511754SKacheong.Poon@Sun.COM 	ASSERT(seg_len == msgdsize(mp));
204611754SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL);
204711754SKacheong.Poon@Sun.COM 
204811754SKacheong.Poon@Sun.COM 	if (is_system_labeled()) {
204911754SKacheong.Poon@Sun.COM 		ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL);
205011754SKacheong.Poon@Sun.COM 		/*
205111754SKacheong.Poon@Sun.COM 		 * Provide for protocols above TCP such as RPC. NOPID leaves
205211754SKacheong.Poon@Sun.COM 		 * db_cpid unchanged.
205311754SKacheong.Poon@Sun.COM 		 * The cred could have already been set.
205411754SKacheong.Poon@Sun.COM 		 */
205511754SKacheong.Poon@Sun.COM 		if (cr != NULL)
205611754SKacheong.Poon@Sun.COM 			mblk_setcred(mp, cr, NOPID);
205711754SKacheong.Poon@Sun.COM 	}
205811754SKacheong.Poon@Sun.COM 
205911754SKacheong.Poon@Sun.COM 	if (tcp->tcp_rcv_list == NULL) {
206011754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_rcv_last_head == NULL);
206111754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_list = mp;
206211754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_last_head = mp;
206311754SKacheong.Poon@Sun.COM 	} else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) {
206411754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_last_tail->b_cont = mp;
206511754SKacheong.Poon@Sun.COM 	} else {
206611754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_last_head->b_next = mp;
206711754SKacheong.Poon@Sun.COM 		tcp->tcp_rcv_last_head = mp;
206811754SKacheong.Poon@Sun.COM 	}
206911754SKacheong.Poon@Sun.COM 
207011754SKacheong.Poon@Sun.COM 	while (mp->b_cont)
207111754SKacheong.Poon@Sun.COM 		mp = mp->b_cont;
207211754SKacheong.Poon@Sun.COM 
207311754SKacheong.Poon@Sun.COM 	tcp->tcp_rcv_last_tail = mp;
207411754SKacheong.Poon@Sun.COM 	tcp->tcp_rcv_cnt += seg_len;
207511754SKacheong.Poon@Sun.COM 	tcp->tcp_rwnd -= seg_len;
207611754SKacheong.Poon@Sun.COM }
207711754SKacheong.Poon@Sun.COM 
207811754SKacheong.Poon@Sun.COM /* Generate an ACK-only (no data) segment for a TCP endpoint */
207911754SKacheong.Poon@Sun.COM mblk_t *
tcp_ack_mp(tcp_t * tcp)208011754SKacheong.Poon@Sun.COM tcp_ack_mp(tcp_t *tcp)
208111754SKacheong.Poon@Sun.COM {
208211754SKacheong.Poon@Sun.COM 	uint32_t	seq_no;
208311754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
208411754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
208511754SKacheong.Poon@Sun.COM 
208611754SKacheong.Poon@Sun.COM 	/*
208711754SKacheong.Poon@Sun.COM 	 * There are a few cases to be considered while setting the sequence no.
208811754SKacheong.Poon@Sun.COM 	 * Essentially, we can come here while processing an unacceptable pkt
208911754SKacheong.Poon@Sun.COM 	 * in the TCPS_SYN_RCVD state, in which case we set the sequence number
209011754SKacheong.Poon@Sun.COM 	 * to snxt (per RFC 793), note the swnd wouldn't have been set yet.
209111754SKacheong.Poon@Sun.COM 	 * If we are here for a zero window probe, stick with suna. In all
209211754SKacheong.Poon@Sun.COM 	 * other cases, we check if suna + swnd encompasses snxt and set
209311754SKacheong.Poon@Sun.COM 	 * the sequence number to snxt, if so. If snxt falls outside the
209411754SKacheong.Poon@Sun.COM 	 * window (the receiver probably shrunk its window), we will go with
209511754SKacheong.Poon@Sun.COM 	 * suna + swnd, otherwise the sequence no will be unacceptable to the
209611754SKacheong.Poon@Sun.COM 	 * receiver.
209711754SKacheong.Poon@Sun.COM 	 */
209811754SKacheong.Poon@Sun.COM 	if (tcp->tcp_zero_win_probe) {
209911754SKacheong.Poon@Sun.COM 		seq_no = tcp->tcp_suna;
210011754SKacheong.Poon@Sun.COM 	} else if (tcp->tcp_state == TCPS_SYN_RCVD) {
210111754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_swnd == 0);
210211754SKacheong.Poon@Sun.COM 		seq_no = tcp->tcp_snxt;
210311754SKacheong.Poon@Sun.COM 	} else {
210411754SKacheong.Poon@Sun.COM 		seq_no = SEQ_GT(tcp->tcp_snxt,
210511754SKacheong.Poon@Sun.COM 		    (tcp->tcp_suna + tcp->tcp_swnd)) ?
210611754SKacheong.Poon@Sun.COM 		    (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt;
210711754SKacheong.Poon@Sun.COM 	}
210811754SKacheong.Poon@Sun.COM 
210911754SKacheong.Poon@Sun.COM 	if (tcp->tcp_valid_bits) {
211011754SKacheong.Poon@Sun.COM 		/*
211111754SKacheong.Poon@Sun.COM 		 * For the complex case where we have to send some
211211754SKacheong.Poon@Sun.COM 		 * controls (FIN or SYN), let tcp_xmit_mp do it.
211311754SKacheong.Poon@Sun.COM 		 */
211411754SKacheong.Poon@Sun.COM 		return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE,
211511754SKacheong.Poon@Sun.COM 		    NULL, B_FALSE));
211611754SKacheong.Poon@Sun.COM 	} else {
211711754SKacheong.Poon@Sun.COM 		/* Generate a simple ACK */
211811754SKacheong.Poon@Sun.COM 		int	data_length;
211911754SKacheong.Poon@Sun.COM 		uchar_t	*rptr;
212011754SKacheong.Poon@Sun.COM 		tcpha_t	*tcpha;
212111754SKacheong.Poon@Sun.COM 		mblk_t	*mp1;
212211754SKacheong.Poon@Sun.COM 		int32_t	total_hdr_len;
212311754SKacheong.Poon@Sun.COM 		int32_t	tcp_hdr_len;
212411754SKacheong.Poon@Sun.COM 		int32_t	num_sack_blk = 0;
212511754SKacheong.Poon@Sun.COM 		int32_t sack_opt_len;
212611754SKacheong.Poon@Sun.COM 		ip_xmit_attr_t *ixa = connp->conn_ixa;
212711754SKacheong.Poon@Sun.COM 
212811754SKacheong.Poon@Sun.COM 		/*
212911754SKacheong.Poon@Sun.COM 		 * Allocate space for TCP + IP headers
213011754SKacheong.Poon@Sun.COM 		 * and link-level header
213111754SKacheong.Poon@Sun.COM 		 */
213211754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
213311754SKacheong.Poon@Sun.COM 			num_sack_blk = MIN(tcp->tcp_max_sack_blk,
213411754SKacheong.Poon@Sun.COM 			    tcp->tcp_num_sack_blk);
213511754SKacheong.Poon@Sun.COM 			sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
213611754SKacheong.Poon@Sun.COM 			    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
213711754SKacheong.Poon@Sun.COM 			total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len;
213811754SKacheong.Poon@Sun.COM 			tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len;
213911754SKacheong.Poon@Sun.COM 		} else {
214011754SKacheong.Poon@Sun.COM 			total_hdr_len = connp->conn_ht_iphc_len;
214111754SKacheong.Poon@Sun.COM 			tcp_hdr_len = connp->conn_ht_ulp_len;
214211754SKacheong.Poon@Sun.COM 		}
214311754SKacheong.Poon@Sun.COM 		mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED);
214411754SKacheong.Poon@Sun.COM 		if (!mp1)
214511754SKacheong.Poon@Sun.COM 			return (NULL);
214611754SKacheong.Poon@Sun.COM 
214711754SKacheong.Poon@Sun.COM 		/* Update the latest receive window size in TCP header. */
214811754SKacheong.Poon@Sun.COM 		tcp->tcp_tcpha->tha_win =
214911754SKacheong.Poon@Sun.COM 		    htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
215011754SKacheong.Poon@Sun.COM 		/* copy in prototype TCP + IP header */
215111754SKacheong.Poon@Sun.COM 		rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
215211754SKacheong.Poon@Sun.COM 		mp1->b_rptr = rptr;
215311754SKacheong.Poon@Sun.COM 		mp1->b_wptr = rptr + total_hdr_len;
215411754SKacheong.Poon@Sun.COM 		bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
215511754SKacheong.Poon@Sun.COM 
215611754SKacheong.Poon@Sun.COM 		tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
215711754SKacheong.Poon@Sun.COM 
215811754SKacheong.Poon@Sun.COM 		/* Set the TCP sequence number. */
215911754SKacheong.Poon@Sun.COM 		tcpha->tha_seq = htonl(seq_no);
216011754SKacheong.Poon@Sun.COM 
216111754SKacheong.Poon@Sun.COM 		/* Set up the TCP flag field. */
216211754SKacheong.Poon@Sun.COM 		tcpha->tha_flags = (uchar_t)TH_ACK;
216311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_echo_on)
216411754SKacheong.Poon@Sun.COM 			tcpha->tha_flags |= TH_ECE;
216511754SKacheong.Poon@Sun.COM 
216611754SKacheong.Poon@Sun.COM 		tcp->tcp_rack = tcp->tcp_rnxt;
216711754SKacheong.Poon@Sun.COM 		tcp->tcp_rack_cnt = 0;
216811754SKacheong.Poon@Sun.COM 
216911754SKacheong.Poon@Sun.COM 		/* fill in timestamp option if in use */
217011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_ts_ok) {
217111754SKacheong.Poon@Sun.COM 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
217211754SKacheong.Poon@Sun.COM 
217311754SKacheong.Poon@Sun.COM 			U32_TO_BE32(llbolt,
217411754SKacheong.Poon@Sun.COM 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
217511754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tcp->tcp_ts_recent,
217611754SKacheong.Poon@Sun.COM 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
217711754SKacheong.Poon@Sun.COM 		}
217811754SKacheong.Poon@Sun.COM 
217911754SKacheong.Poon@Sun.COM 		/* Fill in SACK options */
218011754SKacheong.Poon@Sun.COM 		if (num_sack_blk > 0) {
218111754SKacheong.Poon@Sun.COM 			uchar_t *wptr = (uchar_t *)tcpha +
218211754SKacheong.Poon@Sun.COM 			    connp->conn_ht_ulp_len;
218311754SKacheong.Poon@Sun.COM 			sack_blk_t *tmp;
218411754SKacheong.Poon@Sun.COM 			int32_t	i;
218511754SKacheong.Poon@Sun.COM 
218611754SKacheong.Poon@Sun.COM 			wptr[0] = TCPOPT_NOP;
218711754SKacheong.Poon@Sun.COM 			wptr[1] = TCPOPT_NOP;
218811754SKacheong.Poon@Sun.COM 			wptr[2] = TCPOPT_SACK;
218911754SKacheong.Poon@Sun.COM 			wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
219011754SKacheong.Poon@Sun.COM 			    sizeof (sack_blk_t);
219111754SKacheong.Poon@Sun.COM 			wptr += TCPOPT_REAL_SACK_LEN;
219211754SKacheong.Poon@Sun.COM 
219311754SKacheong.Poon@Sun.COM 			tmp = tcp->tcp_sack_list;
219411754SKacheong.Poon@Sun.COM 			for (i = 0; i < num_sack_blk; i++) {
219511754SKacheong.Poon@Sun.COM 				U32_TO_BE32(tmp[i].begin, wptr);
219611754SKacheong.Poon@Sun.COM 				wptr += sizeof (tcp_seq);
219711754SKacheong.Poon@Sun.COM 				U32_TO_BE32(tmp[i].end, wptr);
219811754SKacheong.Poon@Sun.COM 				wptr += sizeof (tcp_seq);
219911754SKacheong.Poon@Sun.COM 			}
220011754SKacheong.Poon@Sun.COM 			tcpha->tha_offset_and_reserved +=
220111754SKacheong.Poon@Sun.COM 			    ((num_sack_blk * 2 + 1) << 4);
220211754SKacheong.Poon@Sun.COM 		}
220311754SKacheong.Poon@Sun.COM 
220411754SKacheong.Poon@Sun.COM 		ixa->ixa_pktlen = total_hdr_len;
220511754SKacheong.Poon@Sun.COM 
220611754SKacheong.Poon@Sun.COM 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
220711754SKacheong.Poon@Sun.COM 			((ipha_t *)rptr)->ipha_length = htons(total_hdr_len);
220811754SKacheong.Poon@Sun.COM 		} else {
220911754SKacheong.Poon@Sun.COM 			ip6_t *ip6 = (ip6_t *)rptr;
221011754SKacheong.Poon@Sun.COM 
221111754SKacheong.Poon@Sun.COM 			ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
221211754SKacheong.Poon@Sun.COM 		}
221311754SKacheong.Poon@Sun.COM 
221411754SKacheong.Poon@Sun.COM 		/*
221511754SKacheong.Poon@Sun.COM 		 * Prime pump for checksum calculation in IP.  Include the
221611754SKacheong.Poon@Sun.COM 		 * adjustment for a source route if any.
221711754SKacheong.Poon@Sun.COM 		 */
221811754SKacheong.Poon@Sun.COM 		data_length = tcp_hdr_len + connp->conn_sum;
221911754SKacheong.Poon@Sun.COM 		data_length = (data_length >> 16) + (data_length & 0xFFFF);
222011754SKacheong.Poon@Sun.COM 		tcpha->tha_sum = htons(data_length);
222111754SKacheong.Poon@Sun.COM 
222211754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ip_forward_progress) {
222311754SKacheong.Poon@Sun.COM 			tcp->tcp_ip_forward_progress = B_FALSE;
222411754SKacheong.Poon@Sun.COM 			connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
222511754SKacheong.Poon@Sun.COM 		} else {
222611754SKacheong.Poon@Sun.COM 			connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
222711754SKacheong.Poon@Sun.COM 		}
222811754SKacheong.Poon@Sun.COM 		return (mp1);
222911754SKacheong.Poon@Sun.COM 	}
223011754SKacheong.Poon@Sun.COM }
223111754SKacheong.Poon@Sun.COM 
223211754SKacheong.Poon@Sun.COM /*
223311754SKacheong.Poon@Sun.COM  * Handle M_DATA messages from IP. It is called directly from IP via
223411754SKacheong.Poon@Sun.COM  * squeue for received IP packets.
223511754SKacheong.Poon@Sun.COM  *
223611754SKacheong.Poon@Sun.COM  * The first argument is always the connp/tcp to which the mp belongs.
223711754SKacheong.Poon@Sun.COM  * There are no exceptions to this rule. The caller has already put
223811754SKacheong.Poon@Sun.COM  * a reference on this connp/tcp and once tcp_input_data() returns,
223911754SKacheong.Poon@Sun.COM  * the squeue will do the refrele.
224011754SKacheong.Poon@Sun.COM  *
224111754SKacheong.Poon@Sun.COM  * The TH_SYN for the listener directly go to tcp_input_listener via
224211754SKacheong.Poon@Sun.COM  * squeue. ICMP errors go directly to tcp_icmp_input().
224311754SKacheong.Poon@Sun.COM  *
224411754SKacheong.Poon@Sun.COM  * sqp: NULL = recursive, sqp != NULL means called from squeue
224511754SKacheong.Poon@Sun.COM  */
224611754SKacheong.Poon@Sun.COM void
tcp_input_data(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * ira)224711754SKacheong.Poon@Sun.COM tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
224811754SKacheong.Poon@Sun.COM {
224911754SKacheong.Poon@Sun.COM 	int32_t		bytes_acked;
225011754SKacheong.Poon@Sun.COM 	int32_t		gap;
225111754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
225211754SKacheong.Poon@Sun.COM 	uint_t		flags;
225311754SKacheong.Poon@Sun.COM 	uint32_t	new_swnd = 0;
225411754SKacheong.Poon@Sun.COM 	uchar_t		*iphdr;
225511754SKacheong.Poon@Sun.COM 	uchar_t		*rptr;
225611754SKacheong.Poon@Sun.COM 	int32_t		rgap;
225711754SKacheong.Poon@Sun.COM 	uint32_t	seg_ack;
225811754SKacheong.Poon@Sun.COM 	int		seg_len;
225911754SKacheong.Poon@Sun.COM 	uint_t		ip_hdr_len;
226011754SKacheong.Poon@Sun.COM 	uint32_t	seg_seq;
226111754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
226211754SKacheong.Poon@Sun.COM 	int		urp;
226311754SKacheong.Poon@Sun.COM 	tcp_opt_t	tcpopt;
226411754SKacheong.Poon@Sun.COM 	ip_pkt_t	ipp;
226511754SKacheong.Poon@Sun.COM 	boolean_t	ofo_seg = B_FALSE; /* Out of order segment */
226611754SKacheong.Poon@Sun.COM 	uint32_t	cwnd;
226711754SKacheong.Poon@Sun.COM 	uint32_t	add;
226811754SKacheong.Poon@Sun.COM 	int		npkt;
226911754SKacheong.Poon@Sun.COM 	int		mss;
227011754SKacheong.Poon@Sun.COM 	conn_t		*connp = (conn_t *)arg;
227111754SKacheong.Poon@Sun.COM 	squeue_t	*sqp = (squeue_t *)arg2;
227211754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
227311754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
227411754SKacheong.Poon@Sun.COM 
227511754SKacheong.Poon@Sun.COM 	/*
227611754SKacheong.Poon@Sun.COM 	 * RST from fused tcp loopback peer should trigger an unfuse.
227711754SKacheong.Poon@Sun.COM 	 */
227811754SKacheong.Poon@Sun.COM 	if (tcp->tcp_fused) {
227911754SKacheong.Poon@Sun.COM 		TCP_STAT(tcps, tcp_fusion_aborted);
228011754SKacheong.Poon@Sun.COM 		tcp_unfuse(tcp);
228111754SKacheong.Poon@Sun.COM 	}
228211754SKacheong.Poon@Sun.COM 
228311754SKacheong.Poon@Sun.COM 	iphdr = mp->b_rptr;
228411754SKacheong.Poon@Sun.COM 	rptr = mp->b_rptr;
228511754SKacheong.Poon@Sun.COM 	ASSERT(OK_32PTR(rptr));
228611754SKacheong.Poon@Sun.COM 
228711754SKacheong.Poon@Sun.COM 	ip_hdr_len = ira->ira_ip_hdr_length;
228811754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_all != 0) {
228911754SKacheong.Poon@Sun.COM 		/*
229011754SKacheong.Poon@Sun.COM 		 * Record packet information in the ip_pkt_t
229111754SKacheong.Poon@Sun.COM 		 */
229211754SKacheong.Poon@Sun.COM 		ipp.ipp_fields = 0;
229311754SKacheong.Poon@Sun.COM 		if (ira->ira_flags & IRAF_IS_IPV4) {
229411754SKacheong.Poon@Sun.COM 			(void) ip_find_hdr_v4((ipha_t *)rptr, &ipp,
229511754SKacheong.Poon@Sun.COM 			    B_FALSE);
229611754SKacheong.Poon@Sun.COM 		} else {
229711754SKacheong.Poon@Sun.COM 			uint8_t nexthdrp;
229811754SKacheong.Poon@Sun.COM 
229911754SKacheong.Poon@Sun.COM 			/*
230011754SKacheong.Poon@Sun.COM 			 * IPv6 packets can only be received by applications
230111754SKacheong.Poon@Sun.COM 			 * that are prepared to receive IPv6 addresses.
230211754SKacheong.Poon@Sun.COM 			 * The IP fanout must ensure this.
230311754SKacheong.Poon@Sun.COM 			 */
230411754SKacheong.Poon@Sun.COM 			ASSERT(connp->conn_family == AF_INET6);
230511754SKacheong.Poon@Sun.COM 
230611754SKacheong.Poon@Sun.COM 			(void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp,
230711754SKacheong.Poon@Sun.COM 			    &nexthdrp);
230811754SKacheong.Poon@Sun.COM 			ASSERT(nexthdrp == IPPROTO_TCP);
230911754SKacheong.Poon@Sun.COM 
231011754SKacheong.Poon@Sun.COM 			/* Could have caused a pullup? */
231111754SKacheong.Poon@Sun.COM 			iphdr = mp->b_rptr;
231211754SKacheong.Poon@Sun.COM 			rptr = mp->b_rptr;
231311754SKacheong.Poon@Sun.COM 		}
231411754SKacheong.Poon@Sun.COM 	}
231511754SKacheong.Poon@Sun.COM 	ASSERT(DB_TYPE(mp) == M_DATA);
231611754SKacheong.Poon@Sun.COM 	ASSERT(mp->b_next == NULL);
231711754SKacheong.Poon@Sun.COM 
231811754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
231911754SKacheong.Poon@Sun.COM 	seg_seq = ntohl(tcpha->tha_seq);
232011754SKacheong.Poon@Sun.COM 	seg_ack = ntohl(tcpha->tha_ack);
232111754SKacheong.Poon@Sun.COM 	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
232211754SKacheong.Poon@Sun.COM 	seg_len = (int)(mp->b_wptr - rptr) -
232311754SKacheong.Poon@Sun.COM 	    (ip_hdr_len + TCP_HDR_LENGTH(tcpha));
232411754SKacheong.Poon@Sun.COM 	if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) {
232511754SKacheong.Poon@Sun.COM 		do {
232611754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
232711754SKacheong.Poon@Sun.COM 			    (uintptr_t)INT_MAX);
232811754SKacheong.Poon@Sun.COM 			seg_len += (int)(mp1->b_wptr - mp1->b_rptr);
232911754SKacheong.Poon@Sun.COM 		} while ((mp1 = mp1->b_cont) != NULL &&
233011754SKacheong.Poon@Sun.COM 		    mp1->b_datap->db_type == M_DATA);
233111754SKacheong.Poon@Sun.COM 	}
233211754SKacheong.Poon@Sun.COM 
233312507SAlan.Maguire@Sun.COM 	DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
233412507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_void_ip_t *, iphdr, tcp_t *, tcp,
233512507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_tcph_t *, tcpha);
233612507SAlan.Maguire@Sun.COM 
233711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_state == TCPS_TIME_WAIT) {
233811754SKacheong.Poon@Sun.COM 		tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
233911754SKacheong.Poon@Sun.COM 		    seg_len, tcpha, ira);
234011754SKacheong.Poon@Sun.COM 		return;
234111754SKacheong.Poon@Sun.COM 	}
234211754SKacheong.Poon@Sun.COM 
234311754SKacheong.Poon@Sun.COM 	if (sqp != NULL) {
234411754SKacheong.Poon@Sun.COM 		/*
234511754SKacheong.Poon@Sun.COM 		 * This is the correct place to update tcp_last_recv_time. Note
234611754SKacheong.Poon@Sun.COM 		 * that it is also updated for tcp structure that belongs to
234711754SKacheong.Poon@Sun.COM 		 * global and listener queues which do not really need updating.
234811754SKacheong.Poon@Sun.COM 		 * But that should not cause any harm.  And it is updated for
234911754SKacheong.Poon@Sun.COM 		 * all kinds of incoming segments, not only for data segments.
235011754SKacheong.Poon@Sun.COM 		 */
235111754SKacheong.Poon@Sun.COM 		tcp->tcp_last_recv_time = LBOLT_FASTPATH;
235211754SKacheong.Poon@Sun.COM 	}
235311754SKacheong.Poon@Sun.COM 
235411754SKacheong.Poon@Sun.COM 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
235511754SKacheong.Poon@Sun.COM 
235611754SKacheong.Poon@Sun.COM 	BUMP_LOCAL(tcp->tcp_ibsegs);
235711754SKacheong.Poon@Sun.COM 	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
235811754SKacheong.Poon@Sun.COM 
235911754SKacheong.Poon@Sun.COM 	if ((flags & TH_URG) && sqp != NULL) {
236011754SKacheong.Poon@Sun.COM 		/*
236111754SKacheong.Poon@Sun.COM 		 * TCP can't handle urgent pointers that arrive before
236211754SKacheong.Poon@Sun.COM 		 * the connection has been accept()ed since it can't
236311754SKacheong.Poon@Sun.COM 		 * buffer OOB data.  Discard segment if this happens.
236411754SKacheong.Poon@Sun.COM 		 *
236511754SKacheong.Poon@Sun.COM 		 * We can't just rely on a non-null tcp_listener to indicate
236611754SKacheong.Poon@Sun.COM 		 * that the accept() has completed since unlinking of the
236711754SKacheong.Poon@Sun.COM 		 * eager and completion of the accept are not atomic.
236811754SKacheong.Poon@Sun.COM 		 * tcp_detached, when it is not set (B_FALSE) indicates
236911754SKacheong.Poon@Sun.COM 		 * that the accept() has completed.
237011754SKacheong.Poon@Sun.COM 		 *
237111754SKacheong.Poon@Sun.COM 		 * Nor can it reassemble urgent pointers, so discard
237211754SKacheong.Poon@Sun.COM 		 * if it's not the next segment expected.
237311754SKacheong.Poon@Sun.COM 		 *
237411754SKacheong.Poon@Sun.COM 		 * Otherwise, collapse chain into one mblk (discard if
237511754SKacheong.Poon@Sun.COM 		 * that fails).  This makes sure the headers, retransmitted
237611754SKacheong.Poon@Sun.COM 		 * data, and new data all are in the same mblk.
237711754SKacheong.Poon@Sun.COM 		 */
237811754SKacheong.Poon@Sun.COM 		ASSERT(mp != NULL);
237911754SKacheong.Poon@Sun.COM 		if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
238011754SKacheong.Poon@Sun.COM 			freemsg(mp);
238111754SKacheong.Poon@Sun.COM 			return;
238211754SKacheong.Poon@Sun.COM 		}
238311754SKacheong.Poon@Sun.COM 		/* Update pointers into message */
238411754SKacheong.Poon@Sun.COM 		iphdr = rptr = mp->b_rptr;
238511754SKacheong.Poon@Sun.COM 		tcpha = (tcpha_t *)&rptr[ip_hdr_len];
238611754SKacheong.Poon@Sun.COM 		if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
238711754SKacheong.Poon@Sun.COM 			/*
238811754SKacheong.Poon@Sun.COM 			 * Since we can't handle any data with this urgent
238911754SKacheong.Poon@Sun.COM 			 * pointer that is out of sequence, we expunge
239011754SKacheong.Poon@Sun.COM 			 * the data.  This allows us to still register
239111754SKacheong.Poon@Sun.COM 			 * the urgent mark and generate the M_PCSIG,
239211754SKacheong.Poon@Sun.COM 			 * which we can do.
239311754SKacheong.Poon@Sun.COM 			 */
239411754SKacheong.Poon@Sun.COM 			mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
239511754SKacheong.Poon@Sun.COM 			seg_len = 0;
239611754SKacheong.Poon@Sun.COM 		}
239711754SKacheong.Poon@Sun.COM 	}
239811754SKacheong.Poon@Sun.COM 
239911754SKacheong.Poon@Sun.COM 	switch (tcp->tcp_state) {
240011754SKacheong.Poon@Sun.COM 	case TCPS_SYN_SENT:
240111754SKacheong.Poon@Sun.COM 		if (connp->conn_final_sqp == NULL &&
240211754SKacheong.Poon@Sun.COM 		    tcp_outbound_squeue_switch && sqp != NULL) {
240311754SKacheong.Poon@Sun.COM 			ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
240411754SKacheong.Poon@Sun.COM 			connp->conn_final_sqp = sqp;
240511754SKacheong.Poon@Sun.COM 			if (connp->conn_final_sqp != connp->conn_sqp) {
240611754SKacheong.Poon@Sun.COM 				DTRACE_PROBE1(conn__final__sqp__switch,
240711754SKacheong.Poon@Sun.COM 				    conn_t *, connp);
240811754SKacheong.Poon@Sun.COM 				CONN_INC_REF(connp);
240911754SKacheong.Poon@Sun.COM 				SQUEUE_SWITCH(connp, connp->conn_final_sqp);
241011754SKacheong.Poon@Sun.COM 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
241111754SKacheong.Poon@Sun.COM 				    tcp_input_data, connp, ira, ip_squeue_flag,
241211754SKacheong.Poon@Sun.COM 				    SQTAG_CONNECT_FINISH);
241311754SKacheong.Poon@Sun.COM 				return;
241411754SKacheong.Poon@Sun.COM 			}
241511754SKacheong.Poon@Sun.COM 			DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp);
241611754SKacheong.Poon@Sun.COM 		}
241711754SKacheong.Poon@Sun.COM 		if (flags & TH_ACK) {
241811754SKacheong.Poon@Sun.COM 			/*
241911754SKacheong.Poon@Sun.COM 			 * Note that our stack cannot send data before a
242011754SKacheong.Poon@Sun.COM 			 * connection is established, therefore the
242111754SKacheong.Poon@Sun.COM 			 * following check is valid.  Otherwise, it has
242211754SKacheong.Poon@Sun.COM 			 * to be changed.
242311754SKacheong.Poon@Sun.COM 			 */
242411754SKacheong.Poon@Sun.COM 			if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
242511754SKacheong.Poon@Sun.COM 			    SEQ_GT(seg_ack, tcp->tcp_snxt)) {
242611754SKacheong.Poon@Sun.COM 				freemsg(mp);
242711754SKacheong.Poon@Sun.COM 				if (flags & TH_RST)
242811754SKacheong.Poon@Sun.COM 					return;
242911754SKacheong.Poon@Sun.COM 				tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
243011754SKacheong.Poon@Sun.COM 				    tcp, seg_ack, 0, TH_RST);
243111754SKacheong.Poon@Sun.COM 				return;
243211754SKacheong.Poon@Sun.COM 			}
243311754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_suna + 1 == seg_ack);
243411754SKacheong.Poon@Sun.COM 		}
243511754SKacheong.Poon@Sun.COM 		if (flags & TH_RST) {
243612507SAlan.Maguire@Sun.COM 			if (flags & TH_ACK) {
243712507SAlan.Maguire@Sun.COM 				DTRACE_TCP5(connect__refused, mblk_t *, NULL,
243812507SAlan.Maguire@Sun.COM 				    ip_xmit_attr_t *, connp->conn_ixa,
243912507SAlan.Maguire@Sun.COM 				    void_ip_t *, iphdr, tcp_t *, tcp,
244012507SAlan.Maguire@Sun.COM 				    tcph_t *, tcpha);
244112507SAlan.Maguire@Sun.COM 				(void) tcp_clean_death(tcp, ECONNREFUSED);
244212507SAlan.Maguire@Sun.COM 			}
244311754SKacheong.Poon@Sun.COM 			freemsg(mp);
244411754SKacheong.Poon@Sun.COM 			return;
244511754SKacheong.Poon@Sun.COM 		}
244611754SKacheong.Poon@Sun.COM 		if (!(flags & TH_SYN)) {
244711754SKacheong.Poon@Sun.COM 			freemsg(mp);
244811754SKacheong.Poon@Sun.COM 			return;
244911754SKacheong.Poon@Sun.COM 		}
245011754SKacheong.Poon@Sun.COM 
245111754SKacheong.Poon@Sun.COM 		/* Process all TCP options. */
245211754SKacheong.Poon@Sun.COM 		tcp_process_options(tcp, tcpha);
245311754SKacheong.Poon@Sun.COM 		/*
245411754SKacheong.Poon@Sun.COM 		 * The following changes our rwnd to be a multiple of the
245511754SKacheong.Poon@Sun.COM 		 * MIN(peer MSS, our MSS) for performance reason.
245611754SKacheong.Poon@Sun.COM 		 */
245711754SKacheong.Poon@Sun.COM 		(void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf,
245811754SKacheong.Poon@Sun.COM 		    tcp->tcp_mss));
245911754SKacheong.Poon@Sun.COM 
246011754SKacheong.Poon@Sun.COM 		/* Is the other end ECN capable? */
246111754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_ok) {
246211754SKacheong.Poon@Sun.COM 			if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) {
246311754SKacheong.Poon@Sun.COM 				tcp->tcp_ecn_ok = B_FALSE;
246411754SKacheong.Poon@Sun.COM 			}
246511754SKacheong.Poon@Sun.COM 		}
246611754SKacheong.Poon@Sun.COM 		/*
246711754SKacheong.Poon@Sun.COM 		 * Clear ECN flags because it may interfere with later
246811754SKacheong.Poon@Sun.COM 		 * processing.
246911754SKacheong.Poon@Sun.COM 		 */
247011754SKacheong.Poon@Sun.COM 		flags &= ~(TH_ECE|TH_CWR);
247111754SKacheong.Poon@Sun.COM 
247211754SKacheong.Poon@Sun.COM 		tcp->tcp_irs = seg_seq;
247311754SKacheong.Poon@Sun.COM 		tcp->tcp_rack = seg_seq;
247411754SKacheong.Poon@Sun.COM 		tcp->tcp_rnxt = seg_seq + 1;
247511754SKacheong.Poon@Sun.COM 		tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
247611754SKacheong.Poon@Sun.COM 		if (!TCP_IS_DETACHED(tcp)) {
247711754SKacheong.Poon@Sun.COM 			/* Allocate room for SACK options if needed. */
247811754SKacheong.Poon@Sun.COM 			connp->conn_wroff = connp->conn_ht_iphc_len;
247911754SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_sack_ok)
248011754SKacheong.Poon@Sun.COM 				connp->conn_wroff += TCPOPT_MAX_SACK_LEN;
248111754SKacheong.Poon@Sun.COM 			if (!tcp->tcp_loopback)
248211754SKacheong.Poon@Sun.COM 				connp->conn_wroff += tcps->tcps_wroff_xtra;
248311754SKacheong.Poon@Sun.COM 
248411754SKacheong.Poon@Sun.COM 			(void) proto_set_tx_wroff(connp->conn_rq, connp,
248511754SKacheong.Poon@Sun.COM 			    connp->conn_wroff);
248611754SKacheong.Poon@Sun.COM 		}
248711754SKacheong.Poon@Sun.COM 		if (flags & TH_ACK) {
248811754SKacheong.Poon@Sun.COM 			/*
248911754SKacheong.Poon@Sun.COM 			 * If we can't get the confirmation upstream, pretend
249011754SKacheong.Poon@Sun.COM 			 * we didn't even see this one.
249111754SKacheong.Poon@Sun.COM 			 *
249211754SKacheong.Poon@Sun.COM 			 * XXX: how can we pretend we didn't see it if we
249311754SKacheong.Poon@Sun.COM 			 * have updated rnxt et. al.
249411754SKacheong.Poon@Sun.COM 			 *
249511754SKacheong.Poon@Sun.COM 			 * For loopback we defer sending up the T_CONN_CON
249611754SKacheong.Poon@Sun.COM 			 * until after some checks below.
249711754SKacheong.Poon@Sun.COM 			 */
249811754SKacheong.Poon@Sun.COM 			mp1 = NULL;
249911754SKacheong.Poon@Sun.COM 			/*
250011754SKacheong.Poon@Sun.COM 			 * tcp_sendmsg() checks tcp_state without entering
250111754SKacheong.Poon@Sun.COM 			 * the squeue so tcp_state should be updated before
250212507SAlan.Maguire@Sun.COM 			 * sending up connection confirmation.  Probe the
250312507SAlan.Maguire@Sun.COM 			 * state change below when we are sure the connection
250412507SAlan.Maguire@Sun.COM 			 * confirmation has been sent.
250511754SKacheong.Poon@Sun.COM 			 */
250611754SKacheong.Poon@Sun.COM 			tcp->tcp_state = TCPS_ESTABLISHED;
250711754SKacheong.Poon@Sun.COM 			if (!tcp_conn_con(tcp, iphdr, mp,
250811754SKacheong.Poon@Sun.COM 			    tcp->tcp_loopback ? &mp1 : NULL, ira)) {
250911754SKacheong.Poon@Sun.COM 				tcp->tcp_state = TCPS_SYN_SENT;
251011754SKacheong.Poon@Sun.COM 				freemsg(mp);
251111754SKacheong.Poon@Sun.COM 				return;
251211754SKacheong.Poon@Sun.COM 			}
251311754SKacheong.Poon@Sun.COM 			TCPS_CONN_INC(tcps);
251411754SKacheong.Poon@Sun.COM 			/* SYN was acked - making progress */
251511754SKacheong.Poon@Sun.COM 			tcp->tcp_ip_forward_progress = B_TRUE;
251611754SKacheong.Poon@Sun.COM 
251711754SKacheong.Poon@Sun.COM 			/* One for the SYN */
251811754SKacheong.Poon@Sun.COM 			tcp->tcp_suna = tcp->tcp_iss + 1;
251911754SKacheong.Poon@Sun.COM 			tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
252011754SKacheong.Poon@Sun.COM 
252111754SKacheong.Poon@Sun.COM 			/*
252211754SKacheong.Poon@Sun.COM 			 * If SYN was retransmitted, need to reset all
252311754SKacheong.Poon@Sun.COM 			 * retransmission info.  This is because this
252411754SKacheong.Poon@Sun.COM 			 * segment will be treated as a dup ACK.
252511754SKacheong.Poon@Sun.COM 			 */
252611754SKacheong.Poon@Sun.COM 			if (tcp->tcp_rexmit) {
252711754SKacheong.Poon@Sun.COM 				tcp->tcp_rexmit = B_FALSE;
252811754SKacheong.Poon@Sun.COM 				tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
252911754SKacheong.Poon@Sun.COM 				tcp->tcp_rexmit_max = tcp->tcp_snxt;
253011754SKacheong.Poon@Sun.COM 				tcp->tcp_snd_burst = tcp->tcp_localnet ?
253111754SKacheong.Poon@Sun.COM 				    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
253211754SKacheong.Poon@Sun.COM 				tcp->tcp_ms_we_have_waited = 0;
253311754SKacheong.Poon@Sun.COM 
253411754SKacheong.Poon@Sun.COM 				/*
253511754SKacheong.Poon@Sun.COM 				 * Set tcp_cwnd back to 1 MSS, per
253611754SKacheong.Poon@Sun.COM 				 * recommendation from
253711754SKacheong.Poon@Sun.COM 				 * draft-floyd-incr-init-win-01.txt,
253811754SKacheong.Poon@Sun.COM 				 * Increasing TCP's Initial Window.
253911754SKacheong.Poon@Sun.COM 				 */
254011754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd = tcp->tcp_mss;
254111754SKacheong.Poon@Sun.COM 			}
254211754SKacheong.Poon@Sun.COM 
254311754SKacheong.Poon@Sun.COM 			tcp->tcp_swl1 = seg_seq;
254411754SKacheong.Poon@Sun.COM 			tcp->tcp_swl2 = seg_ack;
254511754SKacheong.Poon@Sun.COM 
254611754SKacheong.Poon@Sun.COM 			new_swnd = ntohs(tcpha->tha_win);
254711754SKacheong.Poon@Sun.COM 			tcp->tcp_swnd = new_swnd;
254811754SKacheong.Poon@Sun.COM 			if (new_swnd > tcp->tcp_max_swnd)
254911754SKacheong.Poon@Sun.COM 				tcp->tcp_max_swnd = new_swnd;
255011754SKacheong.Poon@Sun.COM 
255111754SKacheong.Poon@Sun.COM 			/*
255211754SKacheong.Poon@Sun.COM 			 * Always send the three-way handshake ack immediately
255311754SKacheong.Poon@Sun.COM 			 * in order to make the connection complete as soon as
255411754SKacheong.Poon@Sun.COM 			 * possible on the accepting host.
255511754SKacheong.Poon@Sun.COM 			 */
255611754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED;
255711754SKacheong.Poon@Sun.COM 
255811754SKacheong.Poon@Sun.COM 			/*
255912507SAlan.Maguire@Sun.COM 			 * Trace connect-established here.
256012507SAlan.Maguire@Sun.COM 			 */
256112507SAlan.Maguire@Sun.COM 			DTRACE_TCP5(connect__established, mblk_t *, NULL,
256212507SAlan.Maguire@Sun.COM 			    ip_xmit_attr_t *, tcp->tcp_connp->conn_ixa,
256312507SAlan.Maguire@Sun.COM 			    void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcpha);
256412507SAlan.Maguire@Sun.COM 
256512507SAlan.Maguire@Sun.COM 			/* Trace change from SYN_SENT -> ESTABLISHED here */
256612507SAlan.Maguire@Sun.COM 			DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
256712507SAlan.Maguire@Sun.COM 			    connp->conn_ixa, void, NULL, tcp_t *, tcp,
256812507SAlan.Maguire@Sun.COM 			    void, NULL, int32_t, TCPS_SYN_SENT);
256912507SAlan.Maguire@Sun.COM 
257012507SAlan.Maguire@Sun.COM 			/*
257111754SKacheong.Poon@Sun.COM 			 * Special case for loopback.  At this point we have
257211754SKacheong.Poon@Sun.COM 			 * received SYN-ACK from the remote endpoint.  In
257311754SKacheong.Poon@Sun.COM 			 * order to ensure that both endpoints reach the
257411754SKacheong.Poon@Sun.COM 			 * fused state prior to any data exchange, the final
257511754SKacheong.Poon@Sun.COM 			 * ACK needs to be sent before we indicate T_CONN_CON
257611754SKacheong.Poon@Sun.COM 			 * to the module upstream.
257711754SKacheong.Poon@Sun.COM 			 */
257811754SKacheong.Poon@Sun.COM 			if (tcp->tcp_loopback) {
257911754SKacheong.Poon@Sun.COM 				mblk_t *ack_mp;
258011754SKacheong.Poon@Sun.COM 
258111754SKacheong.Poon@Sun.COM 				ASSERT(!tcp->tcp_unfusable);
258211754SKacheong.Poon@Sun.COM 				ASSERT(mp1 != NULL);
258311754SKacheong.Poon@Sun.COM 				/*
258411754SKacheong.Poon@Sun.COM 				 * For loopback, we always get a pure SYN-ACK
258511754SKacheong.Poon@Sun.COM 				 * and only need to send back the final ACK
258611754SKacheong.Poon@Sun.COM 				 * with no data (this is because the other
258711754SKacheong.Poon@Sun.COM 				 * tcp is ours and we don't do T/TCP).  This
258811754SKacheong.Poon@Sun.COM 				 * final ACK triggers the passive side to
258911754SKacheong.Poon@Sun.COM 				 * perform fusion in ESTABLISHED state.
259011754SKacheong.Poon@Sun.COM 				 */
259111754SKacheong.Poon@Sun.COM 				if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
259211754SKacheong.Poon@Sun.COM 					if (tcp->tcp_ack_tid != 0) {
259311754SKacheong.Poon@Sun.COM 						(void) TCP_TIMER_CANCEL(tcp,
259411754SKacheong.Poon@Sun.COM 						    tcp->tcp_ack_tid);
259511754SKacheong.Poon@Sun.COM 						tcp->tcp_ack_tid = 0;
259611754SKacheong.Poon@Sun.COM 					}
259711754SKacheong.Poon@Sun.COM 					tcp_send_data(tcp, ack_mp);
259811754SKacheong.Poon@Sun.COM 					BUMP_LOCAL(tcp->tcp_obsegs);
259911754SKacheong.Poon@Sun.COM 					TCPS_BUMP_MIB(tcps, tcpOutAck);
260011754SKacheong.Poon@Sun.COM 
260111754SKacheong.Poon@Sun.COM 					if (!IPCL_IS_NONSTR(connp)) {
260211754SKacheong.Poon@Sun.COM 						/* Send up T_CONN_CON */
260311754SKacheong.Poon@Sun.COM 						if (ira->ira_cred != NULL) {
260411754SKacheong.Poon@Sun.COM 							mblk_setcred(mp1,
260511754SKacheong.Poon@Sun.COM 							    ira->ira_cred,
260611754SKacheong.Poon@Sun.COM 							    ira->ira_cpid);
260711754SKacheong.Poon@Sun.COM 						}
260811754SKacheong.Poon@Sun.COM 						putnext(connp->conn_rq, mp1);
260911754SKacheong.Poon@Sun.COM 					} else {
261011754SKacheong.Poon@Sun.COM 						(*connp->conn_upcalls->
261111754SKacheong.Poon@Sun.COM 						    su_connected)
261211754SKacheong.Poon@Sun.COM 						    (connp->conn_upper_handle,
261311754SKacheong.Poon@Sun.COM 						    tcp->tcp_connid,
261411754SKacheong.Poon@Sun.COM 						    ira->ira_cred,
261511754SKacheong.Poon@Sun.COM 						    ira->ira_cpid);
261611754SKacheong.Poon@Sun.COM 						freemsg(mp1);
261711754SKacheong.Poon@Sun.COM 					}
261811754SKacheong.Poon@Sun.COM 
261911754SKacheong.Poon@Sun.COM 					freemsg(mp);
262011754SKacheong.Poon@Sun.COM 					return;
262111754SKacheong.Poon@Sun.COM 				}
262211754SKacheong.Poon@Sun.COM 				/*
262311754SKacheong.Poon@Sun.COM 				 * Forget fusion; we need to handle more
262411754SKacheong.Poon@Sun.COM 				 * complex cases below.  Send the deferred
262511754SKacheong.Poon@Sun.COM 				 * T_CONN_CON message upstream and proceed
262611754SKacheong.Poon@Sun.COM 				 * as usual.  Mark this tcp as not capable
262711754SKacheong.Poon@Sun.COM 				 * of fusion.
262811754SKacheong.Poon@Sun.COM 				 */
262911754SKacheong.Poon@Sun.COM 				TCP_STAT(tcps, tcp_fusion_unfusable);
263011754SKacheong.Poon@Sun.COM 				tcp->tcp_unfusable = B_TRUE;
263111754SKacheong.Poon@Sun.COM 				if (!IPCL_IS_NONSTR(connp)) {
263211754SKacheong.Poon@Sun.COM 					if (ira->ira_cred != NULL) {
263311754SKacheong.Poon@Sun.COM 						mblk_setcred(mp1, ira->ira_cred,
263411754SKacheong.Poon@Sun.COM 						    ira->ira_cpid);
263511754SKacheong.Poon@Sun.COM 					}
263611754SKacheong.Poon@Sun.COM 					putnext(connp->conn_rq, mp1);
263711754SKacheong.Poon@Sun.COM 				} else {
263811754SKacheong.Poon@Sun.COM 					(*connp->conn_upcalls->su_connected)
263911754SKacheong.Poon@Sun.COM 					    (connp->conn_upper_handle,
264011754SKacheong.Poon@Sun.COM 					    tcp->tcp_connid, ira->ira_cred,
264111754SKacheong.Poon@Sun.COM 					    ira->ira_cpid);
264211754SKacheong.Poon@Sun.COM 					freemsg(mp1);
264311754SKacheong.Poon@Sun.COM 				}
264411754SKacheong.Poon@Sun.COM 			}
264511754SKacheong.Poon@Sun.COM 
264611754SKacheong.Poon@Sun.COM 			/*
264711754SKacheong.Poon@Sun.COM 			 * Check to see if there is data to be sent.  If
264811754SKacheong.Poon@Sun.COM 			 * yes, set the transmit flag.  Then check to see
264911754SKacheong.Poon@Sun.COM 			 * if received data processing needs to be done.
265011754SKacheong.Poon@Sun.COM 			 * If not, go straight to xmit_check.  This short
265111754SKacheong.Poon@Sun.COM 			 * cut is OK as we don't support T/TCP.
265211754SKacheong.Poon@Sun.COM 			 */
265311754SKacheong.Poon@Sun.COM 			if (tcp->tcp_unsent)
265411754SKacheong.Poon@Sun.COM 				flags |= TH_XMIT_NEEDED;
265511754SKacheong.Poon@Sun.COM 
265611754SKacheong.Poon@Sun.COM 			if (seg_len == 0 && !(flags & TH_URG)) {
265711754SKacheong.Poon@Sun.COM 				freemsg(mp);
265811754SKacheong.Poon@Sun.COM 				goto xmit_check;
265911754SKacheong.Poon@Sun.COM 			}
266011754SKacheong.Poon@Sun.COM 
266111754SKacheong.Poon@Sun.COM 			flags &= ~TH_SYN;
266211754SKacheong.Poon@Sun.COM 			seg_seq++;
266311754SKacheong.Poon@Sun.COM 			break;
266411754SKacheong.Poon@Sun.COM 		}
266511754SKacheong.Poon@Sun.COM 		tcp->tcp_state = TCPS_SYN_RCVD;
266612507SAlan.Maguire@Sun.COM 		DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
266712507SAlan.Maguire@Sun.COM 		    connp->conn_ixa, void_ip_t *, NULL, tcp_t *, tcp,
266812507SAlan.Maguire@Sun.COM 		    tcph_t *, NULL, int32_t, TCPS_SYN_SENT);
266911754SKacheong.Poon@Sun.COM 		mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
267011754SKacheong.Poon@Sun.COM 		    NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
267111754SKacheong.Poon@Sun.COM 		if (mp1 != NULL) {
267211754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, mp1);
267311754SKacheong.Poon@Sun.COM 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
267411754SKacheong.Poon@Sun.COM 		}
267511754SKacheong.Poon@Sun.COM 		freemsg(mp);
267611754SKacheong.Poon@Sun.COM 		return;
267711754SKacheong.Poon@Sun.COM 	case TCPS_SYN_RCVD:
267811754SKacheong.Poon@Sun.COM 		if (flags & TH_ACK) {
267913008SKacheong.Poon@Sun.COM 			uint32_t pinit_wnd;
268013008SKacheong.Poon@Sun.COM 
268111754SKacheong.Poon@Sun.COM 			/*
268211754SKacheong.Poon@Sun.COM 			 * In this state, a SYN|ACK packet is either bogus
268311754SKacheong.Poon@Sun.COM 			 * because the other side must be ACKing our SYN which
268411754SKacheong.Poon@Sun.COM 			 * indicates it has seen the ACK for their SYN and
268511754SKacheong.Poon@Sun.COM 			 * shouldn't retransmit it or we're crossing SYNs
268611754SKacheong.Poon@Sun.COM 			 * on active open.
268711754SKacheong.Poon@Sun.COM 			 */
268811754SKacheong.Poon@Sun.COM 			if ((flags & TH_SYN) && !tcp->tcp_active_open) {
268911754SKacheong.Poon@Sun.COM 				freemsg(mp);
269011754SKacheong.Poon@Sun.COM 				tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn",
269111754SKacheong.Poon@Sun.COM 				    tcp, seg_ack, 0, TH_RST);
269211754SKacheong.Poon@Sun.COM 				return;
269311754SKacheong.Poon@Sun.COM 			}
269411754SKacheong.Poon@Sun.COM 			/*
269511754SKacheong.Poon@Sun.COM 			 * NOTE: RFC 793 pg. 72 says this should be
269611754SKacheong.Poon@Sun.COM 			 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt
269711754SKacheong.Poon@Sun.COM 			 * but that would mean we have an ack that ignored
269811754SKacheong.Poon@Sun.COM 			 * our SYN.
269911754SKacheong.Poon@Sun.COM 			 */
270011754SKacheong.Poon@Sun.COM 			if (SEQ_LEQ(seg_ack, tcp->tcp_suna) ||
270111754SKacheong.Poon@Sun.COM 			    SEQ_GT(seg_ack, tcp->tcp_snxt)) {
270211754SKacheong.Poon@Sun.COM 				freemsg(mp);
270311754SKacheong.Poon@Sun.COM 				tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack",
270411754SKacheong.Poon@Sun.COM 				    tcp, seg_ack, 0, TH_RST);
270511754SKacheong.Poon@Sun.COM 				return;
270611754SKacheong.Poon@Sun.COM 			}
270711754SKacheong.Poon@Sun.COM 			/*
270811754SKacheong.Poon@Sun.COM 			 * No sane TCP stack will send such a small window
270911754SKacheong.Poon@Sun.COM 			 * without receiving any data.  Just drop this invalid
271011754SKacheong.Poon@Sun.COM 			 * ACK.  We also shorten the abort timeout in case
271111754SKacheong.Poon@Sun.COM 			 * this is an attack.
271211754SKacheong.Poon@Sun.COM 			 */
271313008SKacheong.Poon@Sun.COM 			pinit_wnd = ntohs(tcpha->tha_win) << tcp->tcp_snd_ws;
271413008SKacheong.Poon@Sun.COM 			if (pinit_wnd < tcp->tcp_mss &&
271513008SKacheong.Poon@Sun.COM 			    pinit_wnd < tcp_init_wnd_chk) {
271611754SKacheong.Poon@Sun.COM 				freemsg(mp);
271711754SKacheong.Poon@Sun.COM 				TCP_STAT(tcps, tcp_zwin_ack_syn);
271811754SKacheong.Poon@Sun.COM 				tcp->tcp_second_ctimer_threshold =
271911754SKacheong.Poon@Sun.COM 				    tcp_early_abort * SECONDS;
272011754SKacheong.Poon@Sun.COM 				return;
272111754SKacheong.Poon@Sun.COM 			}
272211754SKacheong.Poon@Sun.COM 		}
272311754SKacheong.Poon@Sun.COM 		break;
272411754SKacheong.Poon@Sun.COM 	case TCPS_LISTEN:
272511754SKacheong.Poon@Sun.COM 		/*
272611754SKacheong.Poon@Sun.COM 		 * Only a TLI listener can come through this path when an
272711754SKacheong.Poon@Sun.COM 		 * acceptor is going back to be a listener and a packet
272811754SKacheong.Poon@Sun.COM 		 * for the acceptor hits the classifier. For a socket
272911754SKacheong.Poon@Sun.COM 		 * listener, this can never happen because a listener
273011754SKacheong.Poon@Sun.COM 		 * can never accept connection on itself and hence a
273111754SKacheong.Poon@Sun.COM 		 * socket acceptor can not go back to being a listener.
273211754SKacheong.Poon@Sun.COM 		 */
273311754SKacheong.Poon@Sun.COM 		ASSERT(!TCP_IS_SOCKET(tcp));
273411754SKacheong.Poon@Sun.COM 		/*FALLTHRU*/
273511754SKacheong.Poon@Sun.COM 	case TCPS_CLOSED:
273611754SKacheong.Poon@Sun.COM 	case TCPS_BOUND: {
273711754SKacheong.Poon@Sun.COM 		conn_t	*new_connp;
273811754SKacheong.Poon@Sun.COM 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
273911754SKacheong.Poon@Sun.COM 
274011754SKacheong.Poon@Sun.COM 		/*
274111754SKacheong.Poon@Sun.COM 		 * Don't accept any input on a closed tcp as this TCP logically
274211754SKacheong.Poon@Sun.COM 		 * does not exist on the system. Don't proceed further with
274311754SKacheong.Poon@Sun.COM 		 * this TCP. For instance, this packet could trigger another
274411754SKacheong.Poon@Sun.COM 		 * close of this tcp which would be disastrous for tcp_refcnt.
274511754SKacheong.Poon@Sun.COM 		 * tcp_close_detached / tcp_clean_death / tcp_closei_local must
274611754SKacheong.Poon@Sun.COM 		 * be called at most once on a TCP. In this case we need to
274711754SKacheong.Poon@Sun.COM 		 * refeed the packet into the classifier and figure out where
274811754SKacheong.Poon@Sun.COM 		 * the packet should go.
274911754SKacheong.Poon@Sun.COM 		 */
275011754SKacheong.Poon@Sun.COM 		new_connp = ipcl_classify(mp, ira, ipst);
275111754SKacheong.Poon@Sun.COM 		if (new_connp != NULL) {
275211754SKacheong.Poon@Sun.COM 			/* Drops ref on new_connp */
275311754SKacheong.Poon@Sun.COM 			tcp_reinput(new_connp, mp, ira, ipst);
275411754SKacheong.Poon@Sun.COM 			return;
275511754SKacheong.Poon@Sun.COM 		}
275611754SKacheong.Poon@Sun.COM 		/* We failed to classify. For now just drop the packet */
275711754SKacheong.Poon@Sun.COM 		freemsg(mp);
275811754SKacheong.Poon@Sun.COM 		return;
275911754SKacheong.Poon@Sun.COM 	}
276011754SKacheong.Poon@Sun.COM 	case TCPS_IDLE:
276111754SKacheong.Poon@Sun.COM 		/*
276211754SKacheong.Poon@Sun.COM 		 * Handle the case where the tcp_clean_death() has happened
276311754SKacheong.Poon@Sun.COM 		 * on a connection (application hasn't closed yet) but a packet
276411754SKacheong.Poon@Sun.COM 		 * was already queued on squeue before tcp_clean_death()
276511754SKacheong.Poon@Sun.COM 		 * was processed. Calling tcp_clean_death() twice on same
276611754SKacheong.Poon@Sun.COM 		 * connection can result in weird behaviour.
276711754SKacheong.Poon@Sun.COM 		 */
276811754SKacheong.Poon@Sun.COM 		freemsg(mp);
276911754SKacheong.Poon@Sun.COM 		return;
277011754SKacheong.Poon@Sun.COM 	default:
277111754SKacheong.Poon@Sun.COM 		break;
277211754SKacheong.Poon@Sun.COM 	}
277311754SKacheong.Poon@Sun.COM 
277411754SKacheong.Poon@Sun.COM 	/*
277511754SKacheong.Poon@Sun.COM 	 * Already on the correct queue/perimeter.
277611754SKacheong.Poon@Sun.COM 	 * If this is a detached connection and not an eager
277711754SKacheong.Poon@Sun.COM 	 * connection hanging off a listener then new data
277811754SKacheong.Poon@Sun.COM 	 * (past the FIN) will cause a reset.
277911754SKacheong.Poon@Sun.COM 	 * We do a special check here where it
278011754SKacheong.Poon@Sun.COM 	 * is out of the main line, rather than check
278111754SKacheong.Poon@Sun.COM 	 * if we are detached every time we see new
278211754SKacheong.Poon@Sun.COM 	 * data down below.
278311754SKacheong.Poon@Sun.COM 	 */
278411754SKacheong.Poon@Sun.COM 	if (TCP_IS_DETACHED_NONEAGER(tcp) &&
278511754SKacheong.Poon@Sun.COM 	    (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) {
278611754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpInClosed);
278711754SKacheong.Poon@Sun.COM 		DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
278811754SKacheong.Poon@Sun.COM 		freemsg(mp);
278911754SKacheong.Poon@Sun.COM 		tcp_xmit_ctl("new data when detached", tcp,
279011754SKacheong.Poon@Sun.COM 		    tcp->tcp_snxt, 0, TH_RST);
279111754SKacheong.Poon@Sun.COM 		(void) tcp_clean_death(tcp, EPROTO);
279211754SKacheong.Poon@Sun.COM 		return;
279311754SKacheong.Poon@Sun.COM 	}
279411754SKacheong.Poon@Sun.COM 
279511754SKacheong.Poon@Sun.COM 	mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
279611754SKacheong.Poon@Sun.COM 	urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION;
279711754SKacheong.Poon@Sun.COM 	new_swnd = ntohs(tcpha->tha_win) <<
279811754SKacheong.Poon@Sun.COM 	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
279911754SKacheong.Poon@Sun.COM 
280011754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ts_ok) {
280111754SKacheong.Poon@Sun.COM 		if (!tcp_paws_check(tcp, tcpha, &tcpopt)) {
280211754SKacheong.Poon@Sun.COM 			/*
280311754SKacheong.Poon@Sun.COM 			 * This segment is not acceptable.
280411754SKacheong.Poon@Sun.COM 			 * Drop it and send back an ACK.
280511754SKacheong.Poon@Sun.COM 			 */
280611754SKacheong.Poon@Sun.COM 			freemsg(mp);
280711754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED;
280811754SKacheong.Poon@Sun.COM 			goto ack_check;
280911754SKacheong.Poon@Sun.COM 		}
281011754SKacheong.Poon@Sun.COM 	} else if (tcp->tcp_snd_sack_ok) {
281111754SKacheong.Poon@Sun.COM 		tcpopt.tcp = tcp;
281211754SKacheong.Poon@Sun.COM 		/*
281311754SKacheong.Poon@Sun.COM 		 * SACK info is already updated in tcp_parse_options.  Ignore
281411754SKacheong.Poon@Sun.COM 		 * all other TCP options...
281511754SKacheong.Poon@Sun.COM 		 */
281611754SKacheong.Poon@Sun.COM 		(void) tcp_parse_options(tcpha, &tcpopt);
281711754SKacheong.Poon@Sun.COM 	}
281811754SKacheong.Poon@Sun.COM try_again:;
281911754SKacheong.Poon@Sun.COM 	mss = tcp->tcp_mss;
282011754SKacheong.Poon@Sun.COM 	gap = seg_seq - tcp->tcp_rnxt;
282111754SKacheong.Poon@Sun.COM 	rgap = tcp->tcp_rwnd - (gap + seg_len);
282211754SKacheong.Poon@Sun.COM 	/*
282311754SKacheong.Poon@Sun.COM 	 * gap is the amount of sequence space between what we expect to see
282411754SKacheong.Poon@Sun.COM 	 * and what we got for seg_seq.  A positive value for gap means
282511754SKacheong.Poon@Sun.COM 	 * something got lost.  A negative value means we got some old stuff.
282611754SKacheong.Poon@Sun.COM 	 */
282711754SKacheong.Poon@Sun.COM 	if (gap < 0) {
282811754SKacheong.Poon@Sun.COM 		/* Old stuff present.  Is the SYN in there? */
282911754SKacheong.Poon@Sun.COM 		if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) &&
283011754SKacheong.Poon@Sun.COM 		    (seg_len != 0)) {
283111754SKacheong.Poon@Sun.COM 			flags &= ~TH_SYN;
283211754SKacheong.Poon@Sun.COM 			seg_seq++;
283311754SKacheong.Poon@Sun.COM 			urp--;
283411754SKacheong.Poon@Sun.COM 			/* Recompute the gaps after noting the SYN. */
283511754SKacheong.Poon@Sun.COM 			goto try_again;
283611754SKacheong.Poon@Sun.COM 		}
283711754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
283811754SKacheong.Poon@Sun.COM 		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
283911754SKacheong.Poon@Sun.COM 		    (seg_len > -gap ? -gap : seg_len));
284011754SKacheong.Poon@Sun.COM 		/* Remove the old stuff from seg_len. */
284111754SKacheong.Poon@Sun.COM 		seg_len += gap;
284211754SKacheong.Poon@Sun.COM 		/*
284311754SKacheong.Poon@Sun.COM 		 * Anything left?
284411754SKacheong.Poon@Sun.COM 		 * Make sure to check for unack'd FIN when rest of data
284511754SKacheong.Poon@Sun.COM 		 * has been previously ack'd.
284611754SKacheong.Poon@Sun.COM 		 */
284711754SKacheong.Poon@Sun.COM 		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
284811754SKacheong.Poon@Sun.COM 			/*
284911754SKacheong.Poon@Sun.COM 			 * Resets are only valid if they lie within our offered
285011754SKacheong.Poon@Sun.COM 			 * window.  If the RST bit is set, we just ignore this
285111754SKacheong.Poon@Sun.COM 			 * segment.
285211754SKacheong.Poon@Sun.COM 			 */
285311754SKacheong.Poon@Sun.COM 			if (flags & TH_RST) {
285411754SKacheong.Poon@Sun.COM 				freemsg(mp);
285511754SKacheong.Poon@Sun.COM 				return;
285611754SKacheong.Poon@Sun.COM 			}
285711754SKacheong.Poon@Sun.COM 
285811754SKacheong.Poon@Sun.COM 			/*
285911754SKacheong.Poon@Sun.COM 			 * The arrival of dup data packets indicates that we
286011754SKacheong.Poon@Sun.COM 			 * may have postponed an ack for too long, or the other
286111754SKacheong.Poon@Sun.COM 			 * side's RTT estimate is out of shape. Start acking
286211754SKacheong.Poon@Sun.COM 			 * more often.
286311754SKacheong.Poon@Sun.COM 			 */
286411754SKacheong.Poon@Sun.COM 			if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) &&
286511754SKacheong.Poon@Sun.COM 			    tcp->tcp_rack_cnt >= 1 &&
286611754SKacheong.Poon@Sun.COM 			    tcp->tcp_rack_abs_max > 2) {
286711754SKacheong.Poon@Sun.COM 				tcp->tcp_rack_abs_max--;
286811754SKacheong.Poon@Sun.COM 			}
286911754SKacheong.Poon@Sun.COM 			tcp->tcp_rack_cur_max = 1;
287011754SKacheong.Poon@Sun.COM 
287111754SKacheong.Poon@Sun.COM 			/*
287211754SKacheong.Poon@Sun.COM 			 * This segment is "unacceptable".  None of its
287311754SKacheong.Poon@Sun.COM 			 * sequence space lies within our advertized window.
287411754SKacheong.Poon@Sun.COM 			 *
287511754SKacheong.Poon@Sun.COM 			 * Adjust seg_len to the original value for tracing.
287611754SKacheong.Poon@Sun.COM 			 */
287711754SKacheong.Poon@Sun.COM 			seg_len -= gap;
287811754SKacheong.Poon@Sun.COM 			if (connp->conn_debug) {
287911754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
288011754SKacheong.Poon@Sun.COM 				    "tcp_rput: unacceptable, gap %d, rgap %d, "
288111754SKacheong.Poon@Sun.COM 				    "flags 0x%x, seg_seq %u, seg_ack %u, "
288211754SKacheong.Poon@Sun.COM 				    "seg_len %d, rnxt %u, snxt %u, %s",
288311754SKacheong.Poon@Sun.COM 				    gap, rgap, flags, seg_seq, seg_ack,
288411754SKacheong.Poon@Sun.COM 				    seg_len, tcp->tcp_rnxt, tcp->tcp_snxt,
288511754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL,
288611754SKacheong.Poon@Sun.COM 				    DISP_ADDR_AND_PORT));
288711754SKacheong.Poon@Sun.COM 			}
288811754SKacheong.Poon@Sun.COM 
288911754SKacheong.Poon@Sun.COM 			/*
289011754SKacheong.Poon@Sun.COM 			 * Arrange to send an ACK in response to the
289111754SKacheong.Poon@Sun.COM 			 * unacceptable segment per RFC 793 page 69. There
289211754SKacheong.Poon@Sun.COM 			 * is only one small difference between ours and the
289311754SKacheong.Poon@Sun.COM 			 * acceptability test in the RFC - we accept ACK-only
289411754SKacheong.Poon@Sun.COM 			 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK
289511754SKacheong.Poon@Sun.COM 			 * will be generated.
289611754SKacheong.Poon@Sun.COM 			 *
289711754SKacheong.Poon@Sun.COM 			 * Note that we have to ACK an ACK-only packet at least
289811754SKacheong.Poon@Sun.COM 			 * for stacks that send 0-length keep-alives with
289911754SKacheong.Poon@Sun.COM 			 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122,
290011754SKacheong.Poon@Sun.COM 			 * section 4.2.3.6. As long as we don't ever generate
290111754SKacheong.Poon@Sun.COM 			 * an unacceptable packet in response to an incoming
290211754SKacheong.Poon@Sun.COM 			 * packet that is unacceptable, it should not cause
290311754SKacheong.Poon@Sun.COM 			 * "ACK wars".
290411754SKacheong.Poon@Sun.COM 			 */
290511754SKacheong.Poon@Sun.COM 			flags |=  TH_ACK_NEEDED;
290611754SKacheong.Poon@Sun.COM 
290711754SKacheong.Poon@Sun.COM 			/*
290811754SKacheong.Poon@Sun.COM 			 * Continue processing this segment in order to use the
290911754SKacheong.Poon@Sun.COM 			 * ACK information it contains, but skip all other
291011754SKacheong.Poon@Sun.COM 			 * sequence-number processing.	Processing the ACK
291111754SKacheong.Poon@Sun.COM 			 * information is necessary in order to
291211754SKacheong.Poon@Sun.COM 			 * re-synchronize connections that may have lost
291311754SKacheong.Poon@Sun.COM 			 * synchronization.
291411754SKacheong.Poon@Sun.COM 			 *
291511754SKacheong.Poon@Sun.COM 			 * We clear seg_len and flag fields related to
291611754SKacheong.Poon@Sun.COM 			 * sequence number processing as they are not
291711754SKacheong.Poon@Sun.COM 			 * to be trusted for an unacceptable segment.
291811754SKacheong.Poon@Sun.COM 			 */
291911754SKacheong.Poon@Sun.COM 			seg_len = 0;
292011754SKacheong.Poon@Sun.COM 			flags &= ~(TH_SYN | TH_FIN | TH_URG);
292111754SKacheong.Poon@Sun.COM 			goto process_ack;
292211754SKacheong.Poon@Sun.COM 		}
292311754SKacheong.Poon@Sun.COM 
292411754SKacheong.Poon@Sun.COM 		/* Fix seg_seq, and chew the gap off the front. */
292511754SKacheong.Poon@Sun.COM 		seg_seq = tcp->tcp_rnxt;
292611754SKacheong.Poon@Sun.COM 		urp += gap;
292711754SKacheong.Poon@Sun.COM 		do {
292811754SKacheong.Poon@Sun.COM 			mblk_t	*mp2;
292911754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
293011754SKacheong.Poon@Sun.COM 			    (uintptr_t)UINT_MAX);
293111754SKacheong.Poon@Sun.COM 			gap += (uint_t)(mp->b_wptr - mp->b_rptr);
293211754SKacheong.Poon@Sun.COM 			if (gap > 0) {
293311754SKacheong.Poon@Sun.COM 				mp->b_rptr = mp->b_wptr - gap;
293411754SKacheong.Poon@Sun.COM 				break;
293511754SKacheong.Poon@Sun.COM 			}
293611754SKacheong.Poon@Sun.COM 			mp2 = mp;
293711754SKacheong.Poon@Sun.COM 			mp = mp->b_cont;
293811754SKacheong.Poon@Sun.COM 			freeb(mp2);
293911754SKacheong.Poon@Sun.COM 		} while (gap < 0);
294011754SKacheong.Poon@Sun.COM 		/*
294111754SKacheong.Poon@Sun.COM 		 * If the urgent data has already been acknowledged, we
294211754SKacheong.Poon@Sun.COM 		 * should ignore TH_URG below
294311754SKacheong.Poon@Sun.COM 		 */
294411754SKacheong.Poon@Sun.COM 		if (urp < 0)
294511754SKacheong.Poon@Sun.COM 			flags &= ~TH_URG;
294611754SKacheong.Poon@Sun.COM 	}
294711754SKacheong.Poon@Sun.COM 	/*
294811754SKacheong.Poon@Sun.COM 	 * rgap is the amount of stuff received out of window.  A negative
294911754SKacheong.Poon@Sun.COM 	 * value is the amount out of window.
295011754SKacheong.Poon@Sun.COM 	 */
295111754SKacheong.Poon@Sun.COM 	if (rgap < 0) {
295211754SKacheong.Poon@Sun.COM 		mblk_t	*mp2;
295311754SKacheong.Poon@Sun.COM 
295411754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rwnd == 0) {
295511754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInWinProbe);
295611754SKacheong.Poon@Sun.COM 		} else {
295711754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
295811754SKacheong.Poon@Sun.COM 			TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
295911754SKacheong.Poon@Sun.COM 		}
296011754SKacheong.Poon@Sun.COM 
296111754SKacheong.Poon@Sun.COM 		/*
296211754SKacheong.Poon@Sun.COM 		 * seg_len does not include the FIN, so if more than
296311754SKacheong.Poon@Sun.COM 		 * just the FIN is out of window, we act like we don't
296411754SKacheong.Poon@Sun.COM 		 * see it.  (If just the FIN is out of window, rgap
296511754SKacheong.Poon@Sun.COM 		 * will be zero and we will go ahead and acknowledge
296611754SKacheong.Poon@Sun.COM 		 * the FIN.)
296711754SKacheong.Poon@Sun.COM 		 */
296811754SKacheong.Poon@Sun.COM 		flags &= ~TH_FIN;
296911754SKacheong.Poon@Sun.COM 
297011754SKacheong.Poon@Sun.COM 		/* Fix seg_len and make sure there is something left. */
297111754SKacheong.Poon@Sun.COM 		seg_len += rgap;
297211754SKacheong.Poon@Sun.COM 		if (seg_len <= 0) {
297311754SKacheong.Poon@Sun.COM 			/*
297411754SKacheong.Poon@Sun.COM 			 * Resets are only valid if they lie within our offered
297511754SKacheong.Poon@Sun.COM 			 * window.  If the RST bit is set, we just ignore this
297611754SKacheong.Poon@Sun.COM 			 * segment.
297711754SKacheong.Poon@Sun.COM 			 */
297811754SKacheong.Poon@Sun.COM 			if (flags & TH_RST) {
297911754SKacheong.Poon@Sun.COM 				freemsg(mp);
298011754SKacheong.Poon@Sun.COM 				return;
298111754SKacheong.Poon@Sun.COM 			}
298211754SKacheong.Poon@Sun.COM 
298311754SKacheong.Poon@Sun.COM 			/* Per RFC 793, we need to send back an ACK. */
298411754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED;
298511754SKacheong.Poon@Sun.COM 
298611754SKacheong.Poon@Sun.COM 			/*
298711754SKacheong.Poon@Sun.COM 			 * Send SIGURG as soon as possible i.e. even
298811754SKacheong.Poon@Sun.COM 			 * if the TH_URG was delivered in a window probe
298911754SKacheong.Poon@Sun.COM 			 * packet (which will be unacceptable).
299011754SKacheong.Poon@Sun.COM 			 *
299111754SKacheong.Poon@Sun.COM 			 * We generate a signal if none has been generated
299211754SKacheong.Poon@Sun.COM 			 * for this connection or if this is a new urgent
299311754SKacheong.Poon@Sun.COM 			 * byte. Also send a zero-length "unmarked" message
299411754SKacheong.Poon@Sun.COM 			 * to inform SIOCATMARK that this is not the mark.
299511754SKacheong.Poon@Sun.COM 			 *
299611754SKacheong.Poon@Sun.COM 			 * tcp_urp_last_valid is cleared when the T_exdata_ind
299711754SKacheong.Poon@Sun.COM 			 * is sent up. This plus the check for old data
299811754SKacheong.Poon@Sun.COM 			 * (gap >= 0) handles the wraparound of the sequence
299911754SKacheong.Poon@Sun.COM 			 * number space without having to always track the
300011754SKacheong.Poon@Sun.COM 			 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks
300111754SKacheong.Poon@Sun.COM 			 * this max in its rcv_up variable).
300211754SKacheong.Poon@Sun.COM 			 *
300311754SKacheong.Poon@Sun.COM 			 * This prevents duplicate SIGURGS due to a "late"
300411754SKacheong.Poon@Sun.COM 			 * zero-window probe when the T_EXDATA_IND has already
300511754SKacheong.Poon@Sun.COM 			 * been sent up.
300611754SKacheong.Poon@Sun.COM 			 */
300711754SKacheong.Poon@Sun.COM 			if ((flags & TH_URG) &&
300811754SKacheong.Poon@Sun.COM 			    (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq,
300911754SKacheong.Poon@Sun.COM 			    tcp->tcp_urp_last))) {
301011754SKacheong.Poon@Sun.COM 				if (IPCL_IS_NONSTR(connp)) {
301111754SKacheong.Poon@Sun.COM 					if (!TCP_IS_DETACHED(tcp)) {
301211754SKacheong.Poon@Sun.COM 						(*connp->conn_upcalls->
301311754SKacheong.Poon@Sun.COM 						    su_signal_oob)
301411754SKacheong.Poon@Sun.COM 						    (connp->conn_upper_handle,
301511754SKacheong.Poon@Sun.COM 						    urp);
301611754SKacheong.Poon@Sun.COM 					}
301711754SKacheong.Poon@Sun.COM 				} else {
301811754SKacheong.Poon@Sun.COM 					mp1 = allocb(0, BPRI_MED);
301911754SKacheong.Poon@Sun.COM 					if (mp1 == NULL) {
302011754SKacheong.Poon@Sun.COM 						freemsg(mp);
302111754SKacheong.Poon@Sun.COM 						return;
302211754SKacheong.Poon@Sun.COM 					}
302311754SKacheong.Poon@Sun.COM 					if (!TCP_IS_DETACHED(tcp) &&
302411754SKacheong.Poon@Sun.COM 					    !putnextctl1(connp->conn_rq,
302511754SKacheong.Poon@Sun.COM 					    M_PCSIG, SIGURG)) {
302611754SKacheong.Poon@Sun.COM 						/* Try again on the rexmit. */
302711754SKacheong.Poon@Sun.COM 						freemsg(mp1);
302811754SKacheong.Poon@Sun.COM 						freemsg(mp);
302911754SKacheong.Poon@Sun.COM 						return;
303011754SKacheong.Poon@Sun.COM 					}
303111754SKacheong.Poon@Sun.COM 					/*
303211754SKacheong.Poon@Sun.COM 					 * If the next byte would be the mark
303311754SKacheong.Poon@Sun.COM 					 * then mark with MARKNEXT else mark
303411754SKacheong.Poon@Sun.COM 					 * with NOTMARKNEXT.
303511754SKacheong.Poon@Sun.COM 					 */
303611754SKacheong.Poon@Sun.COM 					if (gap == 0 && urp == 0)
303711754SKacheong.Poon@Sun.COM 						mp1->b_flag |= MSGMARKNEXT;
303811754SKacheong.Poon@Sun.COM 					else
303911754SKacheong.Poon@Sun.COM 						mp1->b_flag |= MSGNOTMARKNEXT;
304011754SKacheong.Poon@Sun.COM 					freemsg(tcp->tcp_urp_mark_mp);
304111754SKacheong.Poon@Sun.COM 					tcp->tcp_urp_mark_mp = mp1;
304211754SKacheong.Poon@Sun.COM 					flags |= TH_SEND_URP_MARK;
304311754SKacheong.Poon@Sun.COM 				}
304411754SKacheong.Poon@Sun.COM 				tcp->tcp_urp_last_valid = B_TRUE;
304511754SKacheong.Poon@Sun.COM 				tcp->tcp_urp_last = urp + seg_seq;
304611754SKacheong.Poon@Sun.COM 			}
304711754SKacheong.Poon@Sun.COM 			/*
304811754SKacheong.Poon@Sun.COM 			 * If this is a zero window probe, continue to
304911754SKacheong.Poon@Sun.COM 			 * process the ACK part.  But we need to set seg_len
305011754SKacheong.Poon@Sun.COM 			 * to 0 to avoid data processing.  Otherwise just
305111754SKacheong.Poon@Sun.COM 			 * drop the segment and send back an ACK.
305211754SKacheong.Poon@Sun.COM 			 */
305311754SKacheong.Poon@Sun.COM 			if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) {
305411754SKacheong.Poon@Sun.COM 				flags &= ~(TH_SYN | TH_URG);
305511754SKacheong.Poon@Sun.COM 				seg_len = 0;
305611754SKacheong.Poon@Sun.COM 				goto process_ack;
305711754SKacheong.Poon@Sun.COM 			} else {
305811754SKacheong.Poon@Sun.COM 				freemsg(mp);
305911754SKacheong.Poon@Sun.COM 				goto ack_check;
306011754SKacheong.Poon@Sun.COM 			}
306111754SKacheong.Poon@Sun.COM 		}
306211754SKacheong.Poon@Sun.COM 		/* Pitch out of window stuff off the end. */
306311754SKacheong.Poon@Sun.COM 		rgap = seg_len;
306411754SKacheong.Poon@Sun.COM 		mp2 = mp;
306511754SKacheong.Poon@Sun.COM 		do {
306611754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
306711754SKacheong.Poon@Sun.COM 			    (uintptr_t)INT_MAX);
306811754SKacheong.Poon@Sun.COM 			rgap -= (int)(mp2->b_wptr - mp2->b_rptr);
306911754SKacheong.Poon@Sun.COM 			if (rgap < 0) {
307011754SKacheong.Poon@Sun.COM 				mp2->b_wptr += rgap;
307111754SKacheong.Poon@Sun.COM 				if ((mp1 = mp2->b_cont) != NULL) {
307211754SKacheong.Poon@Sun.COM 					mp2->b_cont = NULL;
307311754SKacheong.Poon@Sun.COM 					freemsg(mp1);
307411754SKacheong.Poon@Sun.COM 				}
307511754SKacheong.Poon@Sun.COM 				break;
307611754SKacheong.Poon@Sun.COM 			}
307711754SKacheong.Poon@Sun.COM 		} while ((mp2 = mp2->b_cont) != NULL);
307811754SKacheong.Poon@Sun.COM 	}
307911754SKacheong.Poon@Sun.COM ok:;
308011754SKacheong.Poon@Sun.COM 	/*
308111754SKacheong.Poon@Sun.COM 	 * TCP should check ECN info for segments inside the window only.
308211754SKacheong.Poon@Sun.COM 	 * Therefore the check should be done here.
308311754SKacheong.Poon@Sun.COM 	 */
308411754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ecn_ok) {
308511754SKacheong.Poon@Sun.COM 		if (flags & TH_CWR) {
308611754SKacheong.Poon@Sun.COM 			tcp->tcp_ecn_echo_on = B_FALSE;
308711754SKacheong.Poon@Sun.COM 		}
308811754SKacheong.Poon@Sun.COM 		/*
308911754SKacheong.Poon@Sun.COM 		 * Note that both ECN_CE and CWR can be set in the
309011754SKacheong.Poon@Sun.COM 		 * same segment.  In this case, we once again turn
309111754SKacheong.Poon@Sun.COM 		 * on ECN_ECHO.
309211754SKacheong.Poon@Sun.COM 		 */
309311754SKacheong.Poon@Sun.COM 		if (connp->conn_ipversion == IPV4_VERSION) {
309411754SKacheong.Poon@Sun.COM 			uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service;
309511754SKacheong.Poon@Sun.COM 
309611754SKacheong.Poon@Sun.COM 			if ((tos & IPH_ECN_CE) == IPH_ECN_CE) {
309711754SKacheong.Poon@Sun.COM 				tcp->tcp_ecn_echo_on = B_TRUE;
309811754SKacheong.Poon@Sun.COM 			}
309911754SKacheong.Poon@Sun.COM 		} else {
310011754SKacheong.Poon@Sun.COM 			uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf;
310111754SKacheong.Poon@Sun.COM 
310211754SKacheong.Poon@Sun.COM 			if ((vcf & htonl(IPH_ECN_CE << 20)) ==
310311754SKacheong.Poon@Sun.COM 			    htonl(IPH_ECN_CE << 20)) {
310411754SKacheong.Poon@Sun.COM 				tcp->tcp_ecn_echo_on = B_TRUE;
310511754SKacheong.Poon@Sun.COM 			}
310611754SKacheong.Poon@Sun.COM 		}
310711754SKacheong.Poon@Sun.COM 	}
310811754SKacheong.Poon@Sun.COM 
310911754SKacheong.Poon@Sun.COM 	/*
311011754SKacheong.Poon@Sun.COM 	 * Check whether we can update tcp_ts_recent.  This test is
311111754SKacheong.Poon@Sun.COM 	 * NOT the one in RFC 1323 3.4.  It is from Braden, 1993, "TCP
311211754SKacheong.Poon@Sun.COM 	 * Extensions for High Performance: An Update", Internet Draft.
311311754SKacheong.Poon@Sun.COM 	 */
311411754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ts_ok &&
311511754SKacheong.Poon@Sun.COM 	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
311611754SKacheong.Poon@Sun.COM 	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
311711754SKacheong.Poon@Sun.COM 		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
311811754SKacheong.Poon@Sun.COM 		tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64;
311911754SKacheong.Poon@Sun.COM 	}
312011754SKacheong.Poon@Sun.COM 
312111754SKacheong.Poon@Sun.COM 	if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) {
312211754SKacheong.Poon@Sun.COM 		/*
312311754SKacheong.Poon@Sun.COM 		 * FIN in an out of order segment.  We record this in
312411754SKacheong.Poon@Sun.COM 		 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq.
312511754SKacheong.Poon@Sun.COM 		 * Clear the FIN so that any check on FIN flag will fail.
312611754SKacheong.Poon@Sun.COM 		 * Remember that FIN also counts in the sequence number
312711754SKacheong.Poon@Sun.COM 		 * space.  So we need to ack out of order FIN only segments.
312811754SKacheong.Poon@Sun.COM 		 */
312911754SKacheong.Poon@Sun.COM 		if (flags & TH_FIN) {
313011754SKacheong.Poon@Sun.COM 			tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID;
313111754SKacheong.Poon@Sun.COM 			tcp->tcp_ofo_fin_seq = seg_seq + seg_len;
313211754SKacheong.Poon@Sun.COM 			flags &= ~TH_FIN;
313311754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED;
313411754SKacheong.Poon@Sun.COM 		}
313511754SKacheong.Poon@Sun.COM 		if (seg_len > 0) {
313611754SKacheong.Poon@Sun.COM 			/* Fill in the SACK blk list. */
313711754SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_sack_ok) {
313811754SKacheong.Poon@Sun.COM 				tcp_sack_insert(tcp->tcp_sack_list,
313911754SKacheong.Poon@Sun.COM 				    seg_seq, seg_seq + seg_len,
314011754SKacheong.Poon@Sun.COM 				    &(tcp->tcp_num_sack_blk));
314111754SKacheong.Poon@Sun.COM 			}
314211754SKacheong.Poon@Sun.COM 
314311754SKacheong.Poon@Sun.COM 			/*
314411754SKacheong.Poon@Sun.COM 			 * Attempt reassembly and see if we have something
314511754SKacheong.Poon@Sun.COM 			 * ready to go.
314611754SKacheong.Poon@Sun.COM 			 */
314711754SKacheong.Poon@Sun.COM 			mp = tcp_reass(tcp, mp, seg_seq);
314811754SKacheong.Poon@Sun.COM 			/* Always ack out of order packets */
314911754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED | TH_PUSH;
315011754SKacheong.Poon@Sun.COM 			if (mp) {
315111754SKacheong.Poon@Sun.COM 				ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
315211754SKacheong.Poon@Sun.COM 				    (uintptr_t)INT_MAX);
315311754SKacheong.Poon@Sun.COM 				seg_len = mp->b_cont ? msgdsize(mp) :
315411754SKacheong.Poon@Sun.COM 				    (int)(mp->b_wptr - mp->b_rptr);
315511754SKacheong.Poon@Sun.COM 				seg_seq = tcp->tcp_rnxt;
315611754SKacheong.Poon@Sun.COM 				/*
315711754SKacheong.Poon@Sun.COM 				 * A gap is filled and the seq num and len
315811754SKacheong.Poon@Sun.COM 				 * of the gap match that of a previously
315911754SKacheong.Poon@Sun.COM 				 * received FIN, put the FIN flag back in.
316011754SKacheong.Poon@Sun.COM 				 */
316111754SKacheong.Poon@Sun.COM 				if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
316211754SKacheong.Poon@Sun.COM 				    seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
316311754SKacheong.Poon@Sun.COM 					flags |= TH_FIN;
316411754SKacheong.Poon@Sun.COM 					tcp->tcp_valid_bits &=
316511754SKacheong.Poon@Sun.COM 					    ~TCP_OFO_FIN_VALID;
316611754SKacheong.Poon@Sun.COM 				}
316711754SKacheong.Poon@Sun.COM 				if (tcp->tcp_reass_tid != 0) {
316811754SKacheong.Poon@Sun.COM 					(void) TCP_TIMER_CANCEL(tcp,
316911754SKacheong.Poon@Sun.COM 					    tcp->tcp_reass_tid);
317011754SKacheong.Poon@Sun.COM 					/*
317111754SKacheong.Poon@Sun.COM 					 * Restart the timer if there is still
317211754SKacheong.Poon@Sun.COM 					 * data in the reassembly queue.
317311754SKacheong.Poon@Sun.COM 					 */
317411754SKacheong.Poon@Sun.COM 					if (tcp->tcp_reass_head != NULL) {
317511754SKacheong.Poon@Sun.COM 						tcp->tcp_reass_tid = TCP_TIMER(
317611754SKacheong.Poon@Sun.COM 						    tcp, tcp_reass_timer,
317712056SKacheong.Poon@Sun.COM 						    tcps->tcps_reass_timeout);
317811754SKacheong.Poon@Sun.COM 					} else {
317911754SKacheong.Poon@Sun.COM 						tcp->tcp_reass_tid = 0;
318011754SKacheong.Poon@Sun.COM 					}
318111754SKacheong.Poon@Sun.COM 				}
318211754SKacheong.Poon@Sun.COM 			} else {
318311754SKacheong.Poon@Sun.COM 				/*
318411754SKacheong.Poon@Sun.COM 				 * Keep going even with NULL mp.
318511754SKacheong.Poon@Sun.COM 				 * There may be a useful ACK or something else
318611754SKacheong.Poon@Sun.COM 				 * we don't want to miss.
318711754SKacheong.Poon@Sun.COM 				 *
318811754SKacheong.Poon@Sun.COM 				 * But TCP should not perform fast retransmit
318911754SKacheong.Poon@Sun.COM 				 * because of the ack number.  TCP uses
319011754SKacheong.Poon@Sun.COM 				 * seg_len == 0 to determine if it is a pure
319111754SKacheong.Poon@Sun.COM 				 * ACK.  And this is not a pure ACK.
319211754SKacheong.Poon@Sun.COM 				 */
319311754SKacheong.Poon@Sun.COM 				seg_len = 0;
319411754SKacheong.Poon@Sun.COM 				ofo_seg = B_TRUE;
319511754SKacheong.Poon@Sun.COM 
319611754SKacheong.Poon@Sun.COM 				if (tcps->tcps_reass_timeout != 0 &&
319711754SKacheong.Poon@Sun.COM 				    tcp->tcp_reass_tid == 0) {
319811754SKacheong.Poon@Sun.COM 					tcp->tcp_reass_tid = TCP_TIMER(tcp,
319912056SKacheong.Poon@Sun.COM 					    tcp_reass_timer,
320012056SKacheong.Poon@Sun.COM 					    tcps->tcps_reass_timeout);
320111754SKacheong.Poon@Sun.COM 				}
320211754SKacheong.Poon@Sun.COM 			}
320311754SKacheong.Poon@Sun.COM 		}
320411754SKacheong.Poon@Sun.COM 	} else if (seg_len > 0) {
320511754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
320611754SKacheong.Poon@Sun.COM 		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
320711754SKacheong.Poon@Sun.COM 		/*
320811754SKacheong.Poon@Sun.COM 		 * If an out of order FIN was received before, and the seq
320911754SKacheong.Poon@Sun.COM 		 * num and len of the new segment match that of the FIN,
321011754SKacheong.Poon@Sun.COM 		 * put the FIN flag back in.
321111754SKacheong.Poon@Sun.COM 		 */
321211754SKacheong.Poon@Sun.COM 		if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
321311754SKacheong.Poon@Sun.COM 		    seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
321411754SKacheong.Poon@Sun.COM 			flags |= TH_FIN;
321511754SKacheong.Poon@Sun.COM 			tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
321611754SKacheong.Poon@Sun.COM 		}
321711754SKacheong.Poon@Sun.COM 	}
321811754SKacheong.Poon@Sun.COM 	if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
321911754SKacheong.Poon@Sun.COM 	if (flags & TH_RST) {
322011754SKacheong.Poon@Sun.COM 		freemsg(mp);
322111754SKacheong.Poon@Sun.COM 		switch (tcp->tcp_state) {
322211754SKacheong.Poon@Sun.COM 		case TCPS_SYN_RCVD:
322311754SKacheong.Poon@Sun.COM 			(void) tcp_clean_death(tcp, ECONNREFUSED);
322411754SKacheong.Poon@Sun.COM 			break;
322511754SKacheong.Poon@Sun.COM 		case TCPS_ESTABLISHED:
322611754SKacheong.Poon@Sun.COM 		case TCPS_FIN_WAIT_1:
322711754SKacheong.Poon@Sun.COM 		case TCPS_FIN_WAIT_2:
322811754SKacheong.Poon@Sun.COM 		case TCPS_CLOSE_WAIT:
322911754SKacheong.Poon@Sun.COM 			(void) tcp_clean_death(tcp, ECONNRESET);
323011754SKacheong.Poon@Sun.COM 			break;
323111754SKacheong.Poon@Sun.COM 		case TCPS_CLOSING:
323211754SKacheong.Poon@Sun.COM 		case TCPS_LAST_ACK:
323311754SKacheong.Poon@Sun.COM 			(void) tcp_clean_death(tcp, 0);
323411754SKacheong.Poon@Sun.COM 			break;
323511754SKacheong.Poon@Sun.COM 		default:
323611754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
323711754SKacheong.Poon@Sun.COM 			(void) tcp_clean_death(tcp, ENXIO);
323811754SKacheong.Poon@Sun.COM 			break;
323911754SKacheong.Poon@Sun.COM 		}
324011754SKacheong.Poon@Sun.COM 		return;
324111754SKacheong.Poon@Sun.COM 	}
324211754SKacheong.Poon@Sun.COM 	if (flags & TH_SYN) {
324311754SKacheong.Poon@Sun.COM 		/*
324411754SKacheong.Poon@Sun.COM 		 * See RFC 793, Page 71
324511754SKacheong.Poon@Sun.COM 		 *
324611754SKacheong.Poon@Sun.COM 		 * The seq number must be in the window as it should
324711754SKacheong.Poon@Sun.COM 		 * be "fixed" above.  If it is outside window, it should
324811754SKacheong.Poon@Sun.COM 		 * be already rejected.  Note that we allow seg_seq to be
324911754SKacheong.Poon@Sun.COM 		 * rnxt + rwnd because we want to accept 0 window probe.
325011754SKacheong.Poon@Sun.COM 		 */
325111754SKacheong.Poon@Sun.COM 		ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) &&
325211754SKacheong.Poon@Sun.COM 		    SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
325311754SKacheong.Poon@Sun.COM 		freemsg(mp);
325411754SKacheong.Poon@Sun.COM 		/*
325511754SKacheong.Poon@Sun.COM 		 * If the ACK flag is not set, just use our snxt as the
325611754SKacheong.Poon@Sun.COM 		 * seq number of the RST segment.
325711754SKacheong.Poon@Sun.COM 		 */
325811754SKacheong.Poon@Sun.COM 		if (!(flags & TH_ACK)) {
325911754SKacheong.Poon@Sun.COM 			seg_ack = tcp->tcp_snxt;
326011754SKacheong.Poon@Sun.COM 		}
326111754SKacheong.Poon@Sun.COM 		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
326211754SKacheong.Poon@Sun.COM 		    TH_RST|TH_ACK);
326311754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
326411754SKacheong.Poon@Sun.COM 		(void) tcp_clean_death(tcp, ECONNRESET);
326511754SKacheong.Poon@Sun.COM 		return;
326611754SKacheong.Poon@Sun.COM 	}
326711754SKacheong.Poon@Sun.COM 	/*
326811754SKacheong.Poon@Sun.COM 	 * urp could be -1 when the urp field in the packet is 0
326911754SKacheong.Poon@Sun.COM 	 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
327011754SKacheong.Poon@Sun.COM 	 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
327111754SKacheong.Poon@Sun.COM 	 */
327211754SKacheong.Poon@Sun.COM 	if (flags & TH_URG && urp >= 0) {
327311754SKacheong.Poon@Sun.COM 		if (!tcp->tcp_urp_last_valid ||
327411754SKacheong.Poon@Sun.COM 		    SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
327511754SKacheong.Poon@Sun.COM 			/*
327611754SKacheong.Poon@Sun.COM 			 * Non-STREAMS sockets handle the urgent data a little
327711754SKacheong.Poon@Sun.COM 			 * differently from STREAMS based sockets. There is no
327811754SKacheong.Poon@Sun.COM 			 * need to mark any mblks with the MSG{NOT,}MARKNEXT
327911754SKacheong.Poon@Sun.COM 			 * flags to keep SIOCATMARK happy. Instead a
328011754SKacheong.Poon@Sun.COM 			 * su_signal_oob upcall is made to update the mark.
328111754SKacheong.Poon@Sun.COM 			 * Neither is a T_EXDATA_IND mblk needed to be
328211754SKacheong.Poon@Sun.COM 			 * prepended to the urgent data. The urgent data is
328311754SKacheong.Poon@Sun.COM 			 * delivered using the su_recv upcall, where we set
328411754SKacheong.Poon@Sun.COM 			 * the MSG_OOB flag to indicate that it is urg data.
328511754SKacheong.Poon@Sun.COM 			 *
328611754SKacheong.Poon@Sun.COM 			 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
328711754SKacheong.Poon@Sun.COM 			 * are used by non-STREAMS sockets.
328811754SKacheong.Poon@Sun.COM 			 */
328911754SKacheong.Poon@Sun.COM 			if (IPCL_IS_NONSTR(connp)) {
329011754SKacheong.Poon@Sun.COM 				if (!TCP_IS_DETACHED(tcp)) {
329111754SKacheong.Poon@Sun.COM 					(*connp->conn_upcalls->su_signal_oob)
329211754SKacheong.Poon@Sun.COM 					    (connp->conn_upper_handle, urp);
329311754SKacheong.Poon@Sun.COM 				}
329411754SKacheong.Poon@Sun.COM 			} else {
329511754SKacheong.Poon@Sun.COM 				/*
329611754SKacheong.Poon@Sun.COM 				 * If we haven't generated the signal yet for
329711754SKacheong.Poon@Sun.COM 				 * this urgent pointer value, do it now.  Also,
329811754SKacheong.Poon@Sun.COM 				 * send up a zero-length M_DATA indicating
329911754SKacheong.Poon@Sun.COM 				 * whether or not this is the mark. The latter
330011754SKacheong.Poon@Sun.COM 				 * is not needed when a T_EXDATA_IND is sent up.
330111754SKacheong.Poon@Sun.COM 				 * However, if there are allocation failures
330211754SKacheong.Poon@Sun.COM 				 * this code relies on the sender retransmitting
330311754SKacheong.Poon@Sun.COM 				 * and the socket code for determining the mark
330411754SKacheong.Poon@Sun.COM 				 * should not block waiting for the peer to
330511754SKacheong.Poon@Sun.COM 				 * transmit. Thus, for simplicity we always
330611754SKacheong.Poon@Sun.COM 				 * send up the mark indication.
330711754SKacheong.Poon@Sun.COM 				 */
330811754SKacheong.Poon@Sun.COM 				mp1 = allocb(0, BPRI_MED);
330911754SKacheong.Poon@Sun.COM 				if (mp1 == NULL) {
331011754SKacheong.Poon@Sun.COM 					freemsg(mp);
331111754SKacheong.Poon@Sun.COM 					return;
331211754SKacheong.Poon@Sun.COM 				}
331311754SKacheong.Poon@Sun.COM 				if (!TCP_IS_DETACHED(tcp) &&
331411754SKacheong.Poon@Sun.COM 				    !putnextctl1(connp->conn_rq, M_PCSIG,
331511754SKacheong.Poon@Sun.COM 				    SIGURG)) {
331611754SKacheong.Poon@Sun.COM 					/* Try again on the rexmit. */
331711754SKacheong.Poon@Sun.COM 					freemsg(mp1);
331811754SKacheong.Poon@Sun.COM 					freemsg(mp);
331911754SKacheong.Poon@Sun.COM 					return;
332011754SKacheong.Poon@Sun.COM 				}
332111754SKacheong.Poon@Sun.COM 				/*
332211754SKacheong.Poon@Sun.COM 				 * Mark with NOTMARKNEXT for now.
332311754SKacheong.Poon@Sun.COM 				 * The code below will change this to MARKNEXT
332411754SKacheong.Poon@Sun.COM 				 * if we are at the mark.
332511754SKacheong.Poon@Sun.COM 				 *
332611754SKacheong.Poon@Sun.COM 				 * If there are allocation failures (e.g. in
332711754SKacheong.Poon@Sun.COM 				 * dupmsg below) the next time tcp_input_data
332811754SKacheong.Poon@Sun.COM 				 * sees the urgent segment it will send up the
332911754SKacheong.Poon@Sun.COM 				 * MSGMARKNEXT message.
333011754SKacheong.Poon@Sun.COM 				 */
333111754SKacheong.Poon@Sun.COM 				mp1->b_flag |= MSGNOTMARKNEXT;
333211754SKacheong.Poon@Sun.COM 				freemsg(tcp->tcp_urp_mark_mp);
333311754SKacheong.Poon@Sun.COM 				tcp->tcp_urp_mark_mp = mp1;
333411754SKacheong.Poon@Sun.COM 				flags |= TH_SEND_URP_MARK;
333511754SKacheong.Poon@Sun.COM #ifdef DEBUG
333611754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
333711754SKacheong.Poon@Sun.COM 				    "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
333811754SKacheong.Poon@Sun.COM 				    "last %x, %s",
333911754SKacheong.Poon@Sun.COM 				    seg_seq, urp, tcp->tcp_urp_last,
334011754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL, DISP_PORT_ONLY));
334111754SKacheong.Poon@Sun.COM #endif /* DEBUG */
334211754SKacheong.Poon@Sun.COM 			}
334311754SKacheong.Poon@Sun.COM 			tcp->tcp_urp_last_valid = B_TRUE;
334411754SKacheong.Poon@Sun.COM 			tcp->tcp_urp_last = urp + seg_seq;
334511754SKacheong.Poon@Sun.COM 		} else if (tcp->tcp_urp_mark_mp != NULL) {
334611754SKacheong.Poon@Sun.COM 			/*
334711754SKacheong.Poon@Sun.COM 			 * An allocation failure prevented the previous
334811754SKacheong.Poon@Sun.COM 			 * tcp_input_data from sending up the allocated
334911754SKacheong.Poon@Sun.COM 			 * MSG*MARKNEXT message - send it up this time
335011754SKacheong.Poon@Sun.COM 			 * around.
335111754SKacheong.Poon@Sun.COM 			 */
335211754SKacheong.Poon@Sun.COM 			flags |= TH_SEND_URP_MARK;
335311754SKacheong.Poon@Sun.COM 		}
335411754SKacheong.Poon@Sun.COM 
335511754SKacheong.Poon@Sun.COM 		/*
335611754SKacheong.Poon@Sun.COM 		 * If the urgent byte is in this segment, make sure that it is
335711754SKacheong.Poon@Sun.COM 		 * all by itself.  This makes it much easier to deal with the
335811754SKacheong.Poon@Sun.COM 		 * possibility of an allocation failure on the T_exdata_ind.
335911754SKacheong.Poon@Sun.COM 		 * Note that seg_len is the number of bytes in the segment, and
336011754SKacheong.Poon@Sun.COM 		 * urp is the offset into the segment of the urgent byte.
336111754SKacheong.Poon@Sun.COM 		 * urp < seg_len means that the urgent byte is in this segment.
336211754SKacheong.Poon@Sun.COM 		 */
336311754SKacheong.Poon@Sun.COM 		if (urp < seg_len) {
336411754SKacheong.Poon@Sun.COM 			if (seg_len != 1) {
336511754SKacheong.Poon@Sun.COM 				uint32_t  tmp_rnxt;
336611754SKacheong.Poon@Sun.COM 				/*
336711754SKacheong.Poon@Sun.COM 				 * Break it up and feed it back in.
336811754SKacheong.Poon@Sun.COM 				 * Re-attach the IP header.
336911754SKacheong.Poon@Sun.COM 				 */
337011754SKacheong.Poon@Sun.COM 				mp->b_rptr = iphdr;
337111754SKacheong.Poon@Sun.COM 				if (urp > 0) {
337211754SKacheong.Poon@Sun.COM 					/*
337311754SKacheong.Poon@Sun.COM 					 * There is stuff before the urgent
337411754SKacheong.Poon@Sun.COM 					 * byte.
337511754SKacheong.Poon@Sun.COM 					 */
337611754SKacheong.Poon@Sun.COM 					mp1 = dupmsg(mp);
337711754SKacheong.Poon@Sun.COM 					if (!mp1) {
337811754SKacheong.Poon@Sun.COM 						/*
337911754SKacheong.Poon@Sun.COM 						 * Trim from urgent byte on.
338011754SKacheong.Poon@Sun.COM 						 * The rest will come back.
338111754SKacheong.Poon@Sun.COM 						 */
338211754SKacheong.Poon@Sun.COM 						(void) adjmsg(mp,
338311754SKacheong.Poon@Sun.COM 						    urp - seg_len);
338411754SKacheong.Poon@Sun.COM 						tcp_input_data(connp,
338511754SKacheong.Poon@Sun.COM 						    mp, NULL, ira);
338611754SKacheong.Poon@Sun.COM 						return;
338711754SKacheong.Poon@Sun.COM 					}
338811754SKacheong.Poon@Sun.COM 					(void) adjmsg(mp1, urp - seg_len);
338911754SKacheong.Poon@Sun.COM 					/* Feed this piece back in. */
339011754SKacheong.Poon@Sun.COM 					tmp_rnxt = tcp->tcp_rnxt;
339111754SKacheong.Poon@Sun.COM 					tcp_input_data(connp, mp1, NULL, ira);
339211754SKacheong.Poon@Sun.COM 					/*
339311754SKacheong.Poon@Sun.COM 					 * If the data passed back in was not
339411754SKacheong.Poon@Sun.COM 					 * processed (ie: bad ACK) sending
339511754SKacheong.Poon@Sun.COM 					 * the remainder back in will cause a
339611754SKacheong.Poon@Sun.COM 					 * loop. In this case, drop the
339711754SKacheong.Poon@Sun.COM 					 * packet and let the sender try
339811754SKacheong.Poon@Sun.COM 					 * sending a good packet.
339911754SKacheong.Poon@Sun.COM 					 */
340011754SKacheong.Poon@Sun.COM 					if (tmp_rnxt == tcp->tcp_rnxt) {
340111754SKacheong.Poon@Sun.COM 						freemsg(mp);
340211754SKacheong.Poon@Sun.COM 						return;
340311754SKacheong.Poon@Sun.COM 					}
340411754SKacheong.Poon@Sun.COM 				}
340511754SKacheong.Poon@Sun.COM 				if (urp != seg_len - 1) {
340611754SKacheong.Poon@Sun.COM 					uint32_t  tmp_rnxt;
340711754SKacheong.Poon@Sun.COM 					/*
340811754SKacheong.Poon@Sun.COM 					 * There is stuff after the urgent
340911754SKacheong.Poon@Sun.COM 					 * byte.
341011754SKacheong.Poon@Sun.COM 					 */
341111754SKacheong.Poon@Sun.COM 					mp1 = dupmsg(mp);
341211754SKacheong.Poon@Sun.COM 					if (!mp1) {
341311754SKacheong.Poon@Sun.COM 						/*
341411754SKacheong.Poon@Sun.COM 						 * Trim everything beyond the
341511754SKacheong.Poon@Sun.COM 						 * urgent byte.  The rest will
341611754SKacheong.Poon@Sun.COM 						 * come back.
341711754SKacheong.Poon@Sun.COM 						 */
341811754SKacheong.Poon@Sun.COM 						(void) adjmsg(mp,
341911754SKacheong.Poon@Sun.COM 						    urp + 1 - seg_len);
342011754SKacheong.Poon@Sun.COM 						tcp_input_data(connp,
342111754SKacheong.Poon@Sun.COM 						    mp, NULL, ira);
342211754SKacheong.Poon@Sun.COM 						return;
342311754SKacheong.Poon@Sun.COM 					}
342411754SKacheong.Poon@Sun.COM 					(void) adjmsg(mp1, urp + 1 - seg_len);
342511754SKacheong.Poon@Sun.COM 					tmp_rnxt = tcp->tcp_rnxt;
342611754SKacheong.Poon@Sun.COM 					tcp_input_data(connp, mp1, NULL, ira);
342711754SKacheong.Poon@Sun.COM 					/*
342811754SKacheong.Poon@Sun.COM 					 * If the data passed back in was not
342911754SKacheong.Poon@Sun.COM 					 * processed (ie: bad ACK) sending
343011754SKacheong.Poon@Sun.COM 					 * the remainder back in will cause a
343111754SKacheong.Poon@Sun.COM 					 * loop. In this case, drop the
343211754SKacheong.Poon@Sun.COM 					 * packet and let the sender try
343311754SKacheong.Poon@Sun.COM 					 * sending a good packet.
343411754SKacheong.Poon@Sun.COM 					 */
343511754SKacheong.Poon@Sun.COM 					if (tmp_rnxt == tcp->tcp_rnxt) {
343611754SKacheong.Poon@Sun.COM 						freemsg(mp);
343711754SKacheong.Poon@Sun.COM 						return;
343811754SKacheong.Poon@Sun.COM 					}
343911754SKacheong.Poon@Sun.COM 				}
344011754SKacheong.Poon@Sun.COM 				tcp_input_data(connp, mp, NULL, ira);
344111754SKacheong.Poon@Sun.COM 				return;
344211754SKacheong.Poon@Sun.COM 			}
344311754SKacheong.Poon@Sun.COM 			/*
344411754SKacheong.Poon@Sun.COM 			 * This segment contains only the urgent byte.  We
344511754SKacheong.Poon@Sun.COM 			 * have to allocate the T_exdata_ind, if we can.
344611754SKacheong.Poon@Sun.COM 			 */
344711754SKacheong.Poon@Sun.COM 			if (IPCL_IS_NONSTR(connp)) {
344811754SKacheong.Poon@Sun.COM 				int error;
344911754SKacheong.Poon@Sun.COM 
345011754SKacheong.Poon@Sun.COM 				(*connp->conn_upcalls->su_recv)
345111754SKacheong.Poon@Sun.COM 				    (connp->conn_upper_handle, mp, seg_len,
345211754SKacheong.Poon@Sun.COM 				    MSG_OOB, &error, NULL);
345311754SKacheong.Poon@Sun.COM 				/*
345411754SKacheong.Poon@Sun.COM 				 * We should never be in middle of a
345511754SKacheong.Poon@Sun.COM 				 * fallback, the squeue guarantees that.
345611754SKacheong.Poon@Sun.COM 				 */
345711754SKacheong.Poon@Sun.COM 				ASSERT(error != EOPNOTSUPP);
345811754SKacheong.Poon@Sun.COM 				mp = NULL;
345911754SKacheong.Poon@Sun.COM 				goto update_ack;
346011754SKacheong.Poon@Sun.COM 			} else if (!tcp->tcp_urp_mp) {
346111754SKacheong.Poon@Sun.COM 				struct T_exdata_ind *tei;
346211754SKacheong.Poon@Sun.COM 				mp1 = allocb(sizeof (struct T_exdata_ind),
346311754SKacheong.Poon@Sun.COM 				    BPRI_MED);
346411754SKacheong.Poon@Sun.COM 				if (!mp1) {
346511754SKacheong.Poon@Sun.COM 					/*
346611754SKacheong.Poon@Sun.COM 					 * Sigh... It'll be back.
346711754SKacheong.Poon@Sun.COM 					 * Generate any MSG*MARK message now.
346811754SKacheong.Poon@Sun.COM 					 */
346911754SKacheong.Poon@Sun.COM 					freemsg(mp);
347011754SKacheong.Poon@Sun.COM 					seg_len = 0;
347111754SKacheong.Poon@Sun.COM 					if (flags & TH_SEND_URP_MARK) {
347211754SKacheong.Poon@Sun.COM 
347311754SKacheong.Poon@Sun.COM 
347411754SKacheong.Poon@Sun.COM 						ASSERT(tcp->tcp_urp_mark_mp);
347511754SKacheong.Poon@Sun.COM 						tcp->tcp_urp_mark_mp->b_flag &=
347611754SKacheong.Poon@Sun.COM 						    ~MSGNOTMARKNEXT;
347711754SKacheong.Poon@Sun.COM 						tcp->tcp_urp_mark_mp->b_flag |=
347811754SKacheong.Poon@Sun.COM 						    MSGMARKNEXT;
347911754SKacheong.Poon@Sun.COM 					}
348011754SKacheong.Poon@Sun.COM 					goto ack_check;
348111754SKacheong.Poon@Sun.COM 				}
348211754SKacheong.Poon@Sun.COM 				mp1->b_datap->db_type = M_PROTO;
348311754SKacheong.Poon@Sun.COM 				tei = (struct T_exdata_ind *)mp1->b_rptr;
348411754SKacheong.Poon@Sun.COM 				tei->PRIM_type = T_EXDATA_IND;
348511754SKacheong.Poon@Sun.COM 				tei->MORE_flag = 0;
348611754SKacheong.Poon@Sun.COM 				mp1->b_wptr = (uchar_t *)&tei[1];
348711754SKacheong.Poon@Sun.COM 				tcp->tcp_urp_mp = mp1;
348811754SKacheong.Poon@Sun.COM #ifdef DEBUG
348911754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
349011754SKacheong.Poon@Sun.COM 				    "tcp_rput: allocated exdata_ind %s",
349111754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL,
349211754SKacheong.Poon@Sun.COM 				    DISP_PORT_ONLY));
349311754SKacheong.Poon@Sun.COM #endif /* DEBUG */
349411754SKacheong.Poon@Sun.COM 				/*
349511754SKacheong.Poon@Sun.COM 				 * There is no need to send a separate MSG*MARK
349611754SKacheong.Poon@Sun.COM 				 * message since the T_EXDATA_IND will be sent
349711754SKacheong.Poon@Sun.COM 				 * now.
349811754SKacheong.Poon@Sun.COM 				 */
349911754SKacheong.Poon@Sun.COM 				flags &= ~TH_SEND_URP_MARK;
350011754SKacheong.Poon@Sun.COM 				freemsg(tcp->tcp_urp_mark_mp);
350111754SKacheong.Poon@Sun.COM 				tcp->tcp_urp_mark_mp = NULL;
350211754SKacheong.Poon@Sun.COM 			}
350311754SKacheong.Poon@Sun.COM 			/*
350411754SKacheong.Poon@Sun.COM 			 * Now we are all set.  On the next putnext upstream,
350511754SKacheong.Poon@Sun.COM 			 * tcp_urp_mp will be non-NULL and will get prepended
350611754SKacheong.Poon@Sun.COM 			 * to what has to be this piece containing the urgent
350711754SKacheong.Poon@Sun.COM 			 * byte.  If for any reason we abort this segment below,
350811754SKacheong.Poon@Sun.COM 			 * if it comes back, we will have this ready, or it
350911754SKacheong.Poon@Sun.COM 			 * will get blown off in close.
351011754SKacheong.Poon@Sun.COM 			 */
351111754SKacheong.Poon@Sun.COM 		} else if (urp == seg_len) {
351211754SKacheong.Poon@Sun.COM 			/*
351311754SKacheong.Poon@Sun.COM 			 * The urgent byte is the next byte after this sequence
351411754SKacheong.Poon@Sun.COM 			 * number. If this endpoint is non-STREAMS, then there
351511754SKacheong.Poon@Sun.COM 			 * is nothing to do here since the socket has already
351611754SKacheong.Poon@Sun.COM 			 * been notified about the urg pointer by the
351711754SKacheong.Poon@Sun.COM 			 * su_signal_oob call above.
351811754SKacheong.Poon@Sun.COM 			 *
351911754SKacheong.Poon@Sun.COM 			 * In case of STREAMS, some more work might be needed.
352011754SKacheong.Poon@Sun.COM 			 * If there is data it is marked with MSGMARKNEXT and
352111754SKacheong.Poon@Sun.COM 			 * any tcp_urp_mark_mp is discarded since it is not
352211754SKacheong.Poon@Sun.COM 			 * needed. Otherwise, if the code above just allocated
352311754SKacheong.Poon@Sun.COM 			 * a zero-length tcp_urp_mark_mp message, that message
352411754SKacheong.Poon@Sun.COM 			 * is tagged with MSGMARKNEXT. Sending up these
352511754SKacheong.Poon@Sun.COM 			 * MSGMARKNEXT messages makes SIOCATMARK work correctly
352611754SKacheong.Poon@Sun.COM 			 * even though the T_EXDATA_IND will not be sent up
352711754SKacheong.Poon@Sun.COM 			 * until the urgent byte arrives.
352811754SKacheong.Poon@Sun.COM 			 */
352911754SKacheong.Poon@Sun.COM 			if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
353011754SKacheong.Poon@Sun.COM 				if (seg_len != 0) {
353111754SKacheong.Poon@Sun.COM 					flags |= TH_MARKNEXT_NEEDED;
353211754SKacheong.Poon@Sun.COM 					freemsg(tcp->tcp_urp_mark_mp);
353311754SKacheong.Poon@Sun.COM 					tcp->tcp_urp_mark_mp = NULL;
353411754SKacheong.Poon@Sun.COM 					flags &= ~TH_SEND_URP_MARK;
353511754SKacheong.Poon@Sun.COM 				} else if (tcp->tcp_urp_mark_mp != NULL) {
353611754SKacheong.Poon@Sun.COM 					flags |= TH_SEND_URP_MARK;
353711754SKacheong.Poon@Sun.COM 					tcp->tcp_urp_mark_mp->b_flag &=
353811754SKacheong.Poon@Sun.COM 					    ~MSGNOTMARKNEXT;
353911754SKacheong.Poon@Sun.COM 					tcp->tcp_urp_mark_mp->b_flag |=
354011754SKacheong.Poon@Sun.COM 					    MSGMARKNEXT;
354111754SKacheong.Poon@Sun.COM 				}
354211754SKacheong.Poon@Sun.COM 			}
354311754SKacheong.Poon@Sun.COM #ifdef DEBUG
354411754SKacheong.Poon@Sun.COM 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
354511754SKacheong.Poon@Sun.COM 			    "tcp_rput: AT MARK, len %d, flags 0x%x, %s",
354611754SKacheong.Poon@Sun.COM 			    seg_len, flags,
354711754SKacheong.Poon@Sun.COM 			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
354811754SKacheong.Poon@Sun.COM #endif /* DEBUG */
354911754SKacheong.Poon@Sun.COM 		}
355011754SKacheong.Poon@Sun.COM #ifdef DEBUG
355111754SKacheong.Poon@Sun.COM 		else {
355211754SKacheong.Poon@Sun.COM 			/* Data left until we hit mark */
355311754SKacheong.Poon@Sun.COM 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
355411754SKacheong.Poon@Sun.COM 			    "tcp_rput: URP %d bytes left, %s",
355511754SKacheong.Poon@Sun.COM 			    urp - seg_len, tcp_display(tcp, NULL,
355611754SKacheong.Poon@Sun.COM 			    DISP_PORT_ONLY));
355711754SKacheong.Poon@Sun.COM 		}
355811754SKacheong.Poon@Sun.COM #endif /* DEBUG */
355911754SKacheong.Poon@Sun.COM 	}
356011754SKacheong.Poon@Sun.COM 
356111754SKacheong.Poon@Sun.COM process_ack:
356211754SKacheong.Poon@Sun.COM 	if (!(flags & TH_ACK)) {
356311754SKacheong.Poon@Sun.COM 		freemsg(mp);
356411754SKacheong.Poon@Sun.COM 		goto xmit_check;
356511754SKacheong.Poon@Sun.COM 	}
356611754SKacheong.Poon@Sun.COM 	}
356711754SKacheong.Poon@Sun.COM 	bytes_acked = (int)(seg_ack - tcp->tcp_suna);
356811754SKacheong.Poon@Sun.COM 
356911754SKacheong.Poon@Sun.COM 	if (bytes_acked > 0)
357011754SKacheong.Poon@Sun.COM 		tcp->tcp_ip_forward_progress = B_TRUE;
357111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_state == TCPS_SYN_RCVD) {
357212643SAnders.Persson@Sun.COM 		/*
357312643SAnders.Persson@Sun.COM 		 * tcp_sendmsg() checks tcp_state without entering
357412643SAnders.Persson@Sun.COM 		 * the squeue so tcp_state should be updated before
357512643SAnders.Persson@Sun.COM 		 * sending up a connection confirmation or a new
357612643SAnders.Persson@Sun.COM 		 * connection indication.
357712643SAnders.Persson@Sun.COM 		 */
357812643SAnders.Persson@Sun.COM 		tcp->tcp_state = TCPS_ESTABLISHED;
357912643SAnders.Persson@Sun.COM 
358012643SAnders.Persson@Sun.COM 		/*
358112643SAnders.Persson@Sun.COM 		 * We are seeing the final ack in the three way
358212643SAnders.Persson@Sun.COM 		 * hand shake of an active open'ed connection
358312643SAnders.Persson@Sun.COM 		 * so we must send up a T_CONN_CON
358412643SAnders.Persson@Sun.COM 		 */
358512643SAnders.Persson@Sun.COM 		if (tcp->tcp_active_open) {
358612643SAnders.Persson@Sun.COM 			if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
358712643SAnders.Persson@Sun.COM 				freemsg(mp);
358812643SAnders.Persson@Sun.COM 				tcp->tcp_state = TCPS_SYN_RCVD;
358912643SAnders.Persson@Sun.COM 				return;
359012643SAnders.Persson@Sun.COM 			}
359112643SAnders.Persson@Sun.COM 			/*
359212643SAnders.Persson@Sun.COM 			 * Don't fuse the loopback endpoints for
359312643SAnders.Persson@Sun.COM 			 * simultaneous active opens.
359412643SAnders.Persson@Sun.COM 			 */
359512643SAnders.Persson@Sun.COM 			if (tcp->tcp_loopback) {
359612643SAnders.Persson@Sun.COM 				TCP_STAT(tcps, tcp_fusion_unfusable);
359712643SAnders.Persson@Sun.COM 				tcp->tcp_unfusable = B_TRUE;
359812643SAnders.Persson@Sun.COM 			}
359912643SAnders.Persson@Sun.COM 			/*
360012643SAnders.Persson@Sun.COM 			 * For simultaneous active open, trace receipt of final
360112643SAnders.Persson@Sun.COM 			 * ACK as tcp:::connect-established.
360212643SAnders.Persson@Sun.COM 			 */
360312643SAnders.Persson@Sun.COM 			DTRACE_TCP5(connect__established, mblk_t *, NULL,
360412643SAnders.Persson@Sun.COM 			    ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
360512643SAnders.Persson@Sun.COM 			    iphdr, tcp_t *, tcp, tcph_t *, tcpha);
360612643SAnders.Persson@Sun.COM 		} else if (IPCL_IS_NONSTR(connp)) {
360712643SAnders.Persson@Sun.COM 			/*
360812643SAnders.Persson@Sun.COM 			 * 3-way handshake has completed, so notify socket
360912643SAnders.Persson@Sun.COM 			 * of the new connection.
361012643SAnders.Persson@Sun.COM 			 *
361112643SAnders.Persson@Sun.COM 			 * We are here means eager is fine but it can
361212643SAnders.Persson@Sun.COM 			 * get a TH_RST at any point between now and till
361312643SAnders.Persson@Sun.COM 			 * accept completes and disappear. We need to
361412643SAnders.Persson@Sun.COM 			 * ensure that reference to eager is valid after
361512643SAnders.Persson@Sun.COM 			 * we get out of eager's perimeter. So we do
361612643SAnders.Persson@Sun.COM 			 * an extra refhold.
361712643SAnders.Persson@Sun.COM 			 */
361812643SAnders.Persson@Sun.COM 			CONN_INC_REF(connp);
361912643SAnders.Persson@Sun.COM 
362012643SAnders.Persson@Sun.COM 			if (!tcp_newconn_notify(tcp, ira)) {
3621*13062SAnders.Persson@Sun.COM 				/*
3622*13062SAnders.Persson@Sun.COM 				 * The state-change probe for SYN_RCVD ->
3623*13062SAnders.Persson@Sun.COM 				 * ESTABLISHED has not fired yet. We reset
3624*13062SAnders.Persson@Sun.COM 				 * the state to SYN_RCVD so that future
3625*13062SAnders.Persson@Sun.COM 				 * state-change probes report correct state
3626*13062SAnders.Persson@Sun.COM 				 * transitions.
3627*13062SAnders.Persson@Sun.COM 				 */
3628*13062SAnders.Persson@Sun.COM 				tcp->tcp_state = TCPS_SYN_RCVD;
362912643SAnders.Persson@Sun.COM 				freemsg(mp);
363012643SAnders.Persson@Sun.COM 				/* notification did not go up, so drop ref */
363112643SAnders.Persson@Sun.COM 				CONN_DEC_REF(connp);
3632*13062SAnders.Persson@Sun.COM 				/* ... and close the eager */
3633*13062SAnders.Persson@Sun.COM 				ASSERT(TCP_IS_DETACHED(tcp));
3634*13062SAnders.Persson@Sun.COM 				(void) tcp_close_detached(tcp);
363512643SAnders.Persson@Sun.COM 				return;
363612643SAnders.Persson@Sun.COM 			}
363712643SAnders.Persson@Sun.COM 			/*
363812643SAnders.Persson@Sun.COM 			 * For passive open, trace receipt of final ACK as
363912643SAnders.Persson@Sun.COM 			 * tcp:::accept-established.
364012643SAnders.Persson@Sun.COM 			 */
364112643SAnders.Persson@Sun.COM 			DTRACE_TCP5(accept__established, mlbk_t *, NULL,
364212643SAnders.Persson@Sun.COM 			    ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
364312643SAnders.Persson@Sun.COM 			    iphdr, tcp_t *, tcp, tcph_t *, tcpha);
364412644SAnders.Persson@Sun.COM 		} else {
364512643SAnders.Persson@Sun.COM 			/*
364612643SAnders.Persson@Sun.COM 			 * 3-way handshake complete - this is a STREAMS based
364712643SAnders.Persson@Sun.COM 			 * socket, so pass up the T_CONN_IND.
364812643SAnders.Persson@Sun.COM 			 */
364911754SKacheong.Poon@Sun.COM 			tcp_t	*listener = tcp->tcp_listener;
365011754SKacheong.Poon@Sun.COM 			mblk_t	*mp = tcp->tcp_conn.tcp_eager_conn_ind;
365111754SKacheong.Poon@Sun.COM 
365211754SKacheong.Poon@Sun.COM 			tcp->tcp_tconnind_started = B_TRUE;
365311754SKacheong.Poon@Sun.COM 			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
365412643SAnders.Persson@Sun.COM 			ASSERT(mp != NULL);
365511754SKacheong.Poon@Sun.COM 			/*
365611754SKacheong.Poon@Sun.COM 			 * We are here means eager is fine but it can
365711754SKacheong.Poon@Sun.COM 			 * get a TH_RST at any point between now and till
365811754SKacheong.Poon@Sun.COM 			 * accept completes and disappear. We need to
365911754SKacheong.Poon@Sun.COM 			 * ensure that reference to eager is valid after
366011754SKacheong.Poon@Sun.COM 			 * we get out of eager's perimeter. So we do
366111754SKacheong.Poon@Sun.COM 			 * an extra refhold.
366211754SKacheong.Poon@Sun.COM 			 */
366311754SKacheong.Poon@Sun.COM 			CONN_INC_REF(connp);
366411754SKacheong.Poon@Sun.COM 
366511754SKacheong.Poon@Sun.COM 			/*
366611754SKacheong.Poon@Sun.COM 			 * The listener also exists because of the refhold
366711754SKacheong.Poon@Sun.COM 			 * done in tcp_input_listener. It's possible that it
366811754SKacheong.Poon@Sun.COM 			 * might have closed. We will check that once we
366911754SKacheong.Poon@Sun.COM 			 * get inside listeners context.
367011754SKacheong.Poon@Sun.COM 			 */
367111754SKacheong.Poon@Sun.COM 			CONN_INC_REF(listener->tcp_connp);
367211754SKacheong.Poon@Sun.COM 			if (listener->tcp_connp->conn_sqp ==
367311754SKacheong.Poon@Sun.COM 			    connp->conn_sqp) {
367411754SKacheong.Poon@Sun.COM 				/*
367511754SKacheong.Poon@Sun.COM 				 * We optimize by not calling an SQUEUE_ENTER
367611754SKacheong.Poon@Sun.COM 				 * on the listener since we know that the
367711754SKacheong.Poon@Sun.COM 				 * listener and eager squeues are the same.
367811754SKacheong.Poon@Sun.COM 				 * We are able to make this check safely only
367911754SKacheong.Poon@Sun.COM 				 * because neither the eager nor the listener
368011754SKacheong.Poon@Sun.COM 				 * can change its squeue. Only an active connect
368111754SKacheong.Poon@Sun.COM 				 * can change its squeue
368211754SKacheong.Poon@Sun.COM 				 */
368311754SKacheong.Poon@Sun.COM 				tcp_send_conn_ind(listener->tcp_connp, mp,
368411754SKacheong.Poon@Sun.COM 				    listener->tcp_connp->conn_sqp);
368511754SKacheong.Poon@Sun.COM 				CONN_DEC_REF(listener->tcp_connp);
368611754SKacheong.Poon@Sun.COM 			} else if (!tcp->tcp_loopback) {
368711754SKacheong.Poon@Sun.COM 				SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
368811754SKacheong.Poon@Sun.COM 				    mp, tcp_send_conn_ind,
368911754SKacheong.Poon@Sun.COM 				    listener->tcp_connp, NULL, SQ_FILL,
369011754SKacheong.Poon@Sun.COM 				    SQTAG_TCP_CONN_IND);
369111754SKacheong.Poon@Sun.COM 			} else {
369211754SKacheong.Poon@Sun.COM 				SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
369311754SKacheong.Poon@Sun.COM 				    mp, tcp_send_conn_ind,
369412438SGeorge.Shepherd@Sun.COM 				    listener->tcp_connp, NULL, SQ_NODRAIN,
369511754SKacheong.Poon@Sun.COM 				    SQTAG_TCP_CONN_IND);
369611754SKacheong.Poon@Sun.COM 			}
369712507SAlan.Maguire@Sun.COM 			/*
369812507SAlan.Maguire@Sun.COM 			 * For passive open, trace receipt of final ACK as
369912507SAlan.Maguire@Sun.COM 			 * tcp:::accept-established.
370012507SAlan.Maguire@Sun.COM 			 */
370112507SAlan.Maguire@Sun.COM 			DTRACE_TCP5(accept__established, mlbk_t *, NULL,
370212507SAlan.Maguire@Sun.COM 			    ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
370312507SAlan.Maguire@Sun.COM 			    iphdr, tcp_t *, tcp, tcph_t *, tcpha);
370411754SKacheong.Poon@Sun.COM 		}
370511754SKacheong.Poon@Sun.COM 		TCPS_CONN_INC(tcps);
370611754SKacheong.Poon@Sun.COM 
370711754SKacheong.Poon@Sun.COM 		tcp->tcp_suna = tcp->tcp_iss + 1;	/* One for the SYN */
370811754SKacheong.Poon@Sun.COM 		bytes_acked--;
370911754SKacheong.Poon@Sun.COM 		/* SYN was acked - making progress */
371011754SKacheong.Poon@Sun.COM 		tcp->tcp_ip_forward_progress = B_TRUE;
371111754SKacheong.Poon@Sun.COM 
371211754SKacheong.Poon@Sun.COM 		/*
371311754SKacheong.Poon@Sun.COM 		 * If SYN was retransmitted, need to reset all
371411754SKacheong.Poon@Sun.COM 		 * retransmission info as this segment will be
371511754SKacheong.Poon@Sun.COM 		 * treated as a dup ACK.
371611754SKacheong.Poon@Sun.COM 		 */
371711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rexmit) {
371811754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit = B_FALSE;
371911754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
372011754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit_max = tcp->tcp_snxt;
372111754SKacheong.Poon@Sun.COM 			tcp->tcp_snd_burst = tcp->tcp_localnet ?
372211754SKacheong.Poon@Sun.COM 			    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
372311754SKacheong.Poon@Sun.COM 			tcp->tcp_ms_we_have_waited = 0;
372411754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd = mss;
372511754SKacheong.Poon@Sun.COM 		}
372611754SKacheong.Poon@Sun.COM 
372711754SKacheong.Poon@Sun.COM 		/*
372811754SKacheong.Poon@Sun.COM 		 * We set the send window to zero here.
372911754SKacheong.Poon@Sun.COM 		 * This is needed if there is data to be
373011754SKacheong.Poon@Sun.COM 		 * processed already on the queue.
373111754SKacheong.Poon@Sun.COM 		 * Later (at swnd_update label), the
373211754SKacheong.Poon@Sun.COM 		 * "new_swnd > tcp_swnd" condition is satisfied and
373311754SKacheong.Poon@Sun.COM 		 * the XMIT_NEEDED flag is set in the current
373411754SKacheong.Poon@Sun.COM 		 * (SYN_RCVD) state. This ensures tcp_wput_data() is
373511754SKacheong.Poon@Sun.COM 		 * called if there is already data on queue in
373611754SKacheong.Poon@Sun.COM 		 * this state.
373711754SKacheong.Poon@Sun.COM 		 */
373811754SKacheong.Poon@Sun.COM 		tcp->tcp_swnd = 0;
373911754SKacheong.Poon@Sun.COM 
374011754SKacheong.Poon@Sun.COM 		if (new_swnd > tcp->tcp_max_swnd)
374111754SKacheong.Poon@Sun.COM 			tcp->tcp_max_swnd = new_swnd;
374211754SKacheong.Poon@Sun.COM 		tcp->tcp_swl1 = seg_seq;
374311754SKacheong.Poon@Sun.COM 		tcp->tcp_swl2 = seg_ack;
374411754SKacheong.Poon@Sun.COM 		tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
374511754SKacheong.Poon@Sun.COM 
374612507SAlan.Maguire@Sun.COM 		/* Trace change from SYN_RCVD -> ESTABLISHED here */
374712507SAlan.Maguire@Sun.COM 		DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
374812507SAlan.Maguire@Sun.COM 		    connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
374912507SAlan.Maguire@Sun.COM 		    int32_t, TCPS_SYN_RCVD);
375012507SAlan.Maguire@Sun.COM 
375111754SKacheong.Poon@Sun.COM 		/* Fuse when both sides are in ESTABLISHED state */
375211754SKacheong.Poon@Sun.COM 		if (tcp->tcp_loopback && do_tcp_fusion)
375311754SKacheong.Poon@Sun.COM 			tcp_fuse(tcp, iphdr, tcpha);
375411754SKacheong.Poon@Sun.COM 
375511754SKacheong.Poon@Sun.COM 	}
375611754SKacheong.Poon@Sun.COM 	/* This code follows 4.4BSD-Lite2 mostly. */
375711754SKacheong.Poon@Sun.COM 	if (bytes_acked < 0)
375811754SKacheong.Poon@Sun.COM 		goto est;
375911754SKacheong.Poon@Sun.COM 
376011754SKacheong.Poon@Sun.COM 	/*
376111754SKacheong.Poon@Sun.COM 	 * If TCP is ECN capable and the congestion experience bit is
376211754SKacheong.Poon@Sun.COM 	 * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
376311754SKacheong.Poon@Sun.COM 	 * done once per window (or more loosely, per RTT).
376411754SKacheong.Poon@Sun.COM 	 */
376511754SKacheong.Poon@Sun.COM 	if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
376611754SKacheong.Poon@Sun.COM 		tcp->tcp_cwr = B_FALSE;
376711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
376811754SKacheong.Poon@Sun.COM 		if (!tcp->tcp_cwr) {
376911754SKacheong.Poon@Sun.COM 			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
377011754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
377111754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd = npkt * mss;
377211754SKacheong.Poon@Sun.COM 			/*
377311754SKacheong.Poon@Sun.COM 			 * If the cwnd is 0, use the timer to clock out
377411754SKacheong.Poon@Sun.COM 			 * new segments.  This is required by the ECN spec.
377511754SKacheong.Poon@Sun.COM 			 */
377611754SKacheong.Poon@Sun.COM 			if (npkt == 0) {
377711754SKacheong.Poon@Sun.COM 				TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
377811754SKacheong.Poon@Sun.COM 				/*
377911754SKacheong.Poon@Sun.COM 				 * This makes sure that when the ACK comes
378011754SKacheong.Poon@Sun.COM 				 * back, we will increase tcp_cwnd by 1 MSS.
378111754SKacheong.Poon@Sun.COM 				 */
378211754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd_cnt = 0;
378311754SKacheong.Poon@Sun.COM 			}
378411754SKacheong.Poon@Sun.COM 			tcp->tcp_cwr = B_TRUE;
378511754SKacheong.Poon@Sun.COM 			/*
378611754SKacheong.Poon@Sun.COM 			 * This marks the end of the current window of in
378711754SKacheong.Poon@Sun.COM 			 * flight data.  That is why we don't use
378811754SKacheong.Poon@Sun.COM 			 * tcp_suna + tcp_swnd.  Only data in flight can
378911754SKacheong.Poon@Sun.COM 			 * provide ECN info.
379011754SKacheong.Poon@Sun.COM 			 */
379111754SKacheong.Poon@Sun.COM 			tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
379211754SKacheong.Poon@Sun.COM 			tcp->tcp_ecn_cwr_sent = B_FALSE;
379311754SKacheong.Poon@Sun.COM 		}
379411754SKacheong.Poon@Sun.COM 	}
379511754SKacheong.Poon@Sun.COM 
379611754SKacheong.Poon@Sun.COM 	mp1 = tcp->tcp_xmit_head;
379711754SKacheong.Poon@Sun.COM 	if (bytes_acked == 0) {
379811754SKacheong.Poon@Sun.COM 		if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
379911754SKacheong.Poon@Sun.COM 			int dupack_cnt;
380011754SKacheong.Poon@Sun.COM 
380111754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInDupAck);
380211754SKacheong.Poon@Sun.COM 			/*
380311754SKacheong.Poon@Sun.COM 			 * Fast retransmit.  When we have seen exactly three
380411754SKacheong.Poon@Sun.COM 			 * identical ACKs while we have unacked data
380511754SKacheong.Poon@Sun.COM 			 * outstanding we take it as a hint that our peer
380611754SKacheong.Poon@Sun.COM 			 * dropped something.
380711754SKacheong.Poon@Sun.COM 			 *
380811754SKacheong.Poon@Sun.COM 			 * If TCP is retransmitting, don't do fast retransmit.
380911754SKacheong.Poon@Sun.COM 			 */
381011754SKacheong.Poon@Sun.COM 			if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
381111754SKacheong.Poon@Sun.COM 			    ! tcp->tcp_rexmit) {
381211754SKacheong.Poon@Sun.COM 				/* Do Limited Transmit */
381311754SKacheong.Poon@Sun.COM 				if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
381411754SKacheong.Poon@Sun.COM 				    tcps->tcps_dupack_fast_retransmit) {
381511754SKacheong.Poon@Sun.COM 					/*
381611754SKacheong.Poon@Sun.COM 					 * RFC 3042
381711754SKacheong.Poon@Sun.COM 					 *
381811754SKacheong.Poon@Sun.COM 					 * What we need to do is temporarily
381911754SKacheong.Poon@Sun.COM 					 * increase tcp_cwnd so that new
382011754SKacheong.Poon@Sun.COM 					 * data can be sent if it is allowed
382111754SKacheong.Poon@Sun.COM 					 * by the receive window (tcp_rwnd).
382211754SKacheong.Poon@Sun.COM 					 * tcp_wput_data() will take care of
382311754SKacheong.Poon@Sun.COM 					 * the rest.
382411754SKacheong.Poon@Sun.COM 					 *
382511754SKacheong.Poon@Sun.COM 					 * If the connection is SACK capable,
382611754SKacheong.Poon@Sun.COM 					 * only do limited xmit when there
382711754SKacheong.Poon@Sun.COM 					 * is SACK info.
382811754SKacheong.Poon@Sun.COM 					 *
382911754SKacheong.Poon@Sun.COM 					 * Note how tcp_cwnd is incremented.
383011754SKacheong.Poon@Sun.COM 					 * The first dup ACK will increase
383111754SKacheong.Poon@Sun.COM 					 * it by 1 MSS.  The second dup ACK
383211754SKacheong.Poon@Sun.COM 					 * will increase it by 2 MSS.  This
383311754SKacheong.Poon@Sun.COM 					 * means that only 1 new segment will
383411754SKacheong.Poon@Sun.COM 					 * be sent for each dup ACK.
383511754SKacheong.Poon@Sun.COM 					 */
383611754SKacheong.Poon@Sun.COM 					if (tcp->tcp_unsent > 0 &&
383711754SKacheong.Poon@Sun.COM 					    (!tcp->tcp_snd_sack_ok ||
383811754SKacheong.Poon@Sun.COM 					    (tcp->tcp_snd_sack_ok &&
383911754SKacheong.Poon@Sun.COM 					    tcp->tcp_notsack_list != NULL))) {
384011754SKacheong.Poon@Sun.COM 						tcp->tcp_cwnd += mss <<
384111754SKacheong.Poon@Sun.COM 						    (tcp->tcp_dupack_cnt - 1);
384211754SKacheong.Poon@Sun.COM 						flags |= TH_LIMIT_XMIT;
384311754SKacheong.Poon@Sun.COM 					}
384411754SKacheong.Poon@Sun.COM 				} else if (dupack_cnt ==
384511754SKacheong.Poon@Sun.COM 				    tcps->tcps_dupack_fast_retransmit) {
384611754SKacheong.Poon@Sun.COM 
384711754SKacheong.Poon@Sun.COM 				/*
384811754SKacheong.Poon@Sun.COM 				 * If we have reduced tcp_ssthresh
384911754SKacheong.Poon@Sun.COM 				 * because of ECN, do not reduce it again
385011754SKacheong.Poon@Sun.COM 				 * unless it is already one window of data
385111754SKacheong.Poon@Sun.COM 				 * away.  After one window of data, tcp_cwr
385211754SKacheong.Poon@Sun.COM 				 * should then be cleared.  Note that
385311754SKacheong.Poon@Sun.COM 				 * for non ECN capable connection, tcp_cwr
385411754SKacheong.Poon@Sun.COM 				 * should always be false.
385511754SKacheong.Poon@Sun.COM 				 *
385611754SKacheong.Poon@Sun.COM 				 * Adjust cwnd since the duplicate
385711754SKacheong.Poon@Sun.COM 				 * ack indicates that a packet was
385811754SKacheong.Poon@Sun.COM 				 * dropped (due to congestion.)
385911754SKacheong.Poon@Sun.COM 				 */
386011754SKacheong.Poon@Sun.COM 				if (!tcp->tcp_cwr) {
386111754SKacheong.Poon@Sun.COM 					npkt = ((tcp->tcp_snxt -
386211754SKacheong.Poon@Sun.COM 					    tcp->tcp_suna) >> 1) / mss;
386311754SKacheong.Poon@Sun.COM 					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
386411754SKacheong.Poon@Sun.COM 					    mss;
386511754SKacheong.Poon@Sun.COM 					tcp->tcp_cwnd = (npkt +
386611754SKacheong.Poon@Sun.COM 					    tcp->tcp_dupack_cnt) * mss;
386711754SKacheong.Poon@Sun.COM 				}
386811754SKacheong.Poon@Sun.COM 				if (tcp->tcp_ecn_ok) {
386911754SKacheong.Poon@Sun.COM 					tcp->tcp_cwr = B_TRUE;
387011754SKacheong.Poon@Sun.COM 					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
387111754SKacheong.Poon@Sun.COM 					tcp->tcp_ecn_cwr_sent = B_FALSE;
387211754SKacheong.Poon@Sun.COM 				}
387311754SKacheong.Poon@Sun.COM 
387411754SKacheong.Poon@Sun.COM 				/*
387511754SKacheong.Poon@Sun.COM 				 * We do Hoe's algorithm.  Refer to her
387611754SKacheong.Poon@Sun.COM 				 * paper "Improving the Start-up Behavior
387711754SKacheong.Poon@Sun.COM 				 * of a Congestion Control Scheme for TCP,"
387811754SKacheong.Poon@Sun.COM 				 * appeared in SIGCOMM'96.
387911754SKacheong.Poon@Sun.COM 				 *
388011754SKacheong.Poon@Sun.COM 				 * Save highest seq no we have sent so far.
388111754SKacheong.Poon@Sun.COM 				 * Be careful about the invisible FIN byte.
388211754SKacheong.Poon@Sun.COM 				 */
388311754SKacheong.Poon@Sun.COM 				if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
388411754SKacheong.Poon@Sun.COM 				    (tcp->tcp_unsent == 0)) {
388511754SKacheong.Poon@Sun.COM 					tcp->tcp_rexmit_max = tcp->tcp_fss;
388611754SKacheong.Poon@Sun.COM 				} else {
388711754SKacheong.Poon@Sun.COM 					tcp->tcp_rexmit_max = tcp->tcp_snxt;
388811754SKacheong.Poon@Sun.COM 				}
388911754SKacheong.Poon@Sun.COM 
389011754SKacheong.Poon@Sun.COM 				/*
389111754SKacheong.Poon@Sun.COM 				 * Do not allow bursty traffic during
389211754SKacheong.Poon@Sun.COM 				 * fast recovery.  Refer to Fall and Floyd's
389311754SKacheong.Poon@Sun.COM 				 * paper "Simulation-based Comparisons of
389411754SKacheong.Poon@Sun.COM 				 * Tahoe, Reno and SACK TCP" (in CCR?)
389511754SKacheong.Poon@Sun.COM 				 * This is a best current practise.
389611754SKacheong.Poon@Sun.COM 				 */
389711754SKacheong.Poon@Sun.COM 				tcp->tcp_snd_burst = TCP_CWND_SS;
389811754SKacheong.Poon@Sun.COM 
389911754SKacheong.Poon@Sun.COM 				/*
390011754SKacheong.Poon@Sun.COM 				 * For SACK:
390111754SKacheong.Poon@Sun.COM 				 * Calculate tcp_pipe, which is the
390211754SKacheong.Poon@Sun.COM 				 * estimated number of bytes in
390311754SKacheong.Poon@Sun.COM 				 * network.
390411754SKacheong.Poon@Sun.COM 				 *
390511754SKacheong.Poon@Sun.COM 				 * tcp_fack is the highest sack'ed seq num
390611754SKacheong.Poon@Sun.COM 				 * TCP has received.
390711754SKacheong.Poon@Sun.COM 				 *
390811754SKacheong.Poon@Sun.COM 				 * tcp_pipe is explained in the above quoted
390911754SKacheong.Poon@Sun.COM 				 * Fall and Floyd's paper.  tcp_fack is
391011754SKacheong.Poon@Sun.COM 				 * explained in Mathis and Mahdavi's
391111754SKacheong.Poon@Sun.COM 				 * "Forward Acknowledgment: Refining TCP
391211754SKacheong.Poon@Sun.COM 				 * Congestion Control" in SIGCOMM '96.
391311754SKacheong.Poon@Sun.COM 				 */
391411754SKacheong.Poon@Sun.COM 				if (tcp->tcp_snd_sack_ok) {
391511754SKacheong.Poon@Sun.COM 					if (tcp->tcp_notsack_list != NULL) {
391611754SKacheong.Poon@Sun.COM 						tcp->tcp_pipe = tcp->tcp_snxt -
391711754SKacheong.Poon@Sun.COM 						    tcp->tcp_fack;
391811754SKacheong.Poon@Sun.COM 						tcp->tcp_sack_snxt = seg_ack;
391911754SKacheong.Poon@Sun.COM 						flags |= TH_NEED_SACK_REXMIT;
392011754SKacheong.Poon@Sun.COM 					} else {
392111754SKacheong.Poon@Sun.COM 						/*
392211754SKacheong.Poon@Sun.COM 						 * Always initialize tcp_pipe
392311754SKacheong.Poon@Sun.COM 						 * even though we don't have
392411754SKacheong.Poon@Sun.COM 						 * any SACK info.  If later
392511754SKacheong.Poon@Sun.COM 						 * we get SACK info and
392611754SKacheong.Poon@Sun.COM 						 * tcp_pipe is not initialized,
392711754SKacheong.Poon@Sun.COM 						 * funny things will happen.
392811754SKacheong.Poon@Sun.COM 						 */
392911754SKacheong.Poon@Sun.COM 						tcp->tcp_pipe =
393011754SKacheong.Poon@Sun.COM 						    tcp->tcp_cwnd_ssthresh;
393111754SKacheong.Poon@Sun.COM 					}
393211754SKacheong.Poon@Sun.COM 				} else {
393311754SKacheong.Poon@Sun.COM 					flags |= TH_REXMIT_NEEDED;
393411754SKacheong.Poon@Sun.COM 				} /* tcp_snd_sack_ok */
393511754SKacheong.Poon@Sun.COM 
393611754SKacheong.Poon@Sun.COM 				} else {
393711754SKacheong.Poon@Sun.COM 					/*
393811754SKacheong.Poon@Sun.COM 					 * Here we perform congestion
393911754SKacheong.Poon@Sun.COM 					 * avoidance, but NOT slow start.
394011754SKacheong.Poon@Sun.COM 					 * This is known as the Fast
394111754SKacheong.Poon@Sun.COM 					 * Recovery Algorithm.
394211754SKacheong.Poon@Sun.COM 					 */
394311754SKacheong.Poon@Sun.COM 					if (tcp->tcp_snd_sack_ok &&
394411754SKacheong.Poon@Sun.COM 					    tcp->tcp_notsack_list != NULL) {
394511754SKacheong.Poon@Sun.COM 						flags |= TH_NEED_SACK_REXMIT;
394611754SKacheong.Poon@Sun.COM 						tcp->tcp_pipe -= mss;
394711754SKacheong.Poon@Sun.COM 						if (tcp->tcp_pipe < 0)
394811754SKacheong.Poon@Sun.COM 							tcp->tcp_pipe = 0;
394911754SKacheong.Poon@Sun.COM 					} else {
395011754SKacheong.Poon@Sun.COM 					/*
395111754SKacheong.Poon@Sun.COM 					 * We know that one more packet has
395211754SKacheong.Poon@Sun.COM 					 * left the pipe thus we can update
395311754SKacheong.Poon@Sun.COM 					 * cwnd.
395411754SKacheong.Poon@Sun.COM 					 */
395511754SKacheong.Poon@Sun.COM 					cwnd = tcp->tcp_cwnd + mss;
395611754SKacheong.Poon@Sun.COM 					if (cwnd > tcp->tcp_cwnd_max)
395711754SKacheong.Poon@Sun.COM 						cwnd = tcp->tcp_cwnd_max;
395811754SKacheong.Poon@Sun.COM 					tcp->tcp_cwnd = cwnd;
395911754SKacheong.Poon@Sun.COM 					if (tcp->tcp_unsent > 0)
396011754SKacheong.Poon@Sun.COM 						flags |= TH_XMIT_NEEDED;
396111754SKacheong.Poon@Sun.COM 					}
396211754SKacheong.Poon@Sun.COM 				}
396311754SKacheong.Poon@Sun.COM 			}
396411754SKacheong.Poon@Sun.COM 		} else if (tcp->tcp_zero_win_probe) {
396511754SKacheong.Poon@Sun.COM 			/*
396611754SKacheong.Poon@Sun.COM 			 * If the window has opened, need to arrange
396711754SKacheong.Poon@Sun.COM 			 * to send additional data.
396811754SKacheong.Poon@Sun.COM 			 */
396911754SKacheong.Poon@Sun.COM 			if (new_swnd != 0) {
397011754SKacheong.Poon@Sun.COM 				/* tcp_suna != tcp_snxt */
397111754SKacheong.Poon@Sun.COM 				/* Packet contains a window update */
397211754SKacheong.Poon@Sun.COM 				TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
397311754SKacheong.Poon@Sun.COM 				tcp->tcp_zero_win_probe = 0;
397411754SKacheong.Poon@Sun.COM 				tcp->tcp_timer_backoff = 0;
397511754SKacheong.Poon@Sun.COM 				tcp->tcp_ms_we_have_waited = 0;
397611754SKacheong.Poon@Sun.COM 
397711754SKacheong.Poon@Sun.COM 				/*
397811754SKacheong.Poon@Sun.COM 				 * Transmit starting with tcp_suna since
397911754SKacheong.Poon@Sun.COM 				 * the one byte probe is not ack'ed.
398011754SKacheong.Poon@Sun.COM 				 * If TCP has sent more than one identical
398111754SKacheong.Poon@Sun.COM 				 * probe, tcp_rexmit will be set.  That means
398211754SKacheong.Poon@Sun.COM 				 * tcp_ss_rexmit() will send out the one
398311754SKacheong.Poon@Sun.COM 				 * byte along with new data.  Otherwise,
398411754SKacheong.Poon@Sun.COM 				 * fake the retransmission.
398511754SKacheong.Poon@Sun.COM 				 */
398611754SKacheong.Poon@Sun.COM 				flags |= TH_XMIT_NEEDED;
398711754SKacheong.Poon@Sun.COM 				if (!tcp->tcp_rexmit) {
398811754SKacheong.Poon@Sun.COM 					tcp->tcp_rexmit = B_TRUE;
398911754SKacheong.Poon@Sun.COM 					tcp->tcp_dupack_cnt = 0;
399011754SKacheong.Poon@Sun.COM 					tcp->tcp_rexmit_nxt = tcp->tcp_suna;
399111754SKacheong.Poon@Sun.COM 					tcp->tcp_rexmit_max = tcp->tcp_suna + 1;
399211754SKacheong.Poon@Sun.COM 				}
399311754SKacheong.Poon@Sun.COM 			}
399411754SKacheong.Poon@Sun.COM 		}
399511754SKacheong.Poon@Sun.COM 		goto swnd_update;
399611754SKacheong.Poon@Sun.COM 	}
399711754SKacheong.Poon@Sun.COM 
399811754SKacheong.Poon@Sun.COM 	/*
399911754SKacheong.Poon@Sun.COM 	 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73.
400011754SKacheong.Poon@Sun.COM 	 * If the ACK value acks something that we have not yet sent, it might
400111754SKacheong.Poon@Sun.COM 	 * be an old duplicate segment.  Send an ACK to re-synchronize the
400211754SKacheong.Poon@Sun.COM 	 * other side.
400311754SKacheong.Poon@Sun.COM 	 * Note: reset in response to unacceptable ACK in SYN_RECEIVE
400411754SKacheong.Poon@Sun.COM 	 * state is handled above, so we can always just drop the segment and
400511754SKacheong.Poon@Sun.COM 	 * send an ACK here.
400611754SKacheong.Poon@Sun.COM 	 *
400711754SKacheong.Poon@Sun.COM 	 * In the case where the peer shrinks the window, we see the new window
400811754SKacheong.Poon@Sun.COM 	 * update, but all the data sent previously is queued up by the peer.
400911754SKacheong.Poon@Sun.COM 	 * To account for this, in tcp_process_shrunk_swnd(), the sequence
401011754SKacheong.Poon@Sun.COM 	 * number, which was already sent, and within window, is recorded.
401111754SKacheong.Poon@Sun.COM 	 * tcp_snxt is then updated.
401211754SKacheong.Poon@Sun.COM 	 *
401311754SKacheong.Poon@Sun.COM 	 * If the window has previously shrunk, and an ACK for data not yet
401411754SKacheong.Poon@Sun.COM 	 * sent, according to tcp_snxt is received, it may still be valid. If
401511754SKacheong.Poon@Sun.COM 	 * the ACK is for data within the window at the time the window was
401611754SKacheong.Poon@Sun.COM 	 * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to
401711754SKacheong.Poon@Sun.COM 	 * the sequence number ACK'ed.
401811754SKacheong.Poon@Sun.COM 	 *
401911754SKacheong.Poon@Sun.COM 	 * If the ACK covers all the data sent at the time the window was
402011754SKacheong.Poon@Sun.COM 	 * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE.
402111754SKacheong.Poon@Sun.COM 	 *
402211754SKacheong.Poon@Sun.COM 	 * Should we send ACKs in response to ACK only segments?
402311754SKacheong.Poon@Sun.COM 	 */
402411754SKacheong.Poon@Sun.COM 
402511754SKacheong.Poon@Sun.COM 	if (SEQ_GT(seg_ack, tcp->tcp_snxt)) {
402611754SKacheong.Poon@Sun.COM 		if ((tcp->tcp_is_wnd_shrnk) &&
402711754SKacheong.Poon@Sun.COM 		    (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) {
402811754SKacheong.Poon@Sun.COM 			uint32_t data_acked_ahead_snxt;
402911754SKacheong.Poon@Sun.COM 
403011754SKacheong.Poon@Sun.COM 			data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt;
403111754SKacheong.Poon@Sun.COM 			tcp_update_xmit_tail(tcp, seg_ack);
403211754SKacheong.Poon@Sun.COM 			tcp->tcp_unsent -= data_acked_ahead_snxt;
403311754SKacheong.Poon@Sun.COM 		} else {
403411754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpInAckUnsent);
403511754SKacheong.Poon@Sun.COM 			/* drop the received segment */
403611754SKacheong.Poon@Sun.COM 			freemsg(mp);
403711754SKacheong.Poon@Sun.COM 
403811754SKacheong.Poon@Sun.COM 			/*
403911754SKacheong.Poon@Sun.COM 			 * Send back an ACK.  If tcp_drop_ack_unsent_cnt is
404011754SKacheong.Poon@Sun.COM 			 * greater than 0, check if the number of such
404111754SKacheong.Poon@Sun.COM 			 * bogus ACKs is greater than that count.  If yes,
404211754SKacheong.Poon@Sun.COM 			 * don't send back any ACK.  This prevents TCP from
404311754SKacheong.Poon@Sun.COM 			 * getting into an ACK storm if somehow an attacker
404411754SKacheong.Poon@Sun.COM 			 * successfully spoofs an acceptable segment to our
404511754SKacheong.Poon@Sun.COM 			 * peer.  If this continues (count > 2 X threshold),
404611754SKacheong.Poon@Sun.COM 			 * we should abort this connection.
404711754SKacheong.Poon@Sun.COM 			 */
404811754SKacheong.Poon@Sun.COM 			if (tcp_drop_ack_unsent_cnt > 0 &&
404911754SKacheong.Poon@Sun.COM 			    ++tcp->tcp_in_ack_unsent >
405011754SKacheong.Poon@Sun.COM 			    tcp_drop_ack_unsent_cnt) {
405111754SKacheong.Poon@Sun.COM 				TCP_STAT(tcps, tcp_in_ack_unsent_drop);
405211754SKacheong.Poon@Sun.COM 				if (tcp->tcp_in_ack_unsent > 2 *
405311754SKacheong.Poon@Sun.COM 				    tcp_drop_ack_unsent_cnt) {
405411754SKacheong.Poon@Sun.COM 					(void) tcp_clean_death(tcp, EPROTO);
405511754SKacheong.Poon@Sun.COM 				}
405611754SKacheong.Poon@Sun.COM 				return;
405711754SKacheong.Poon@Sun.COM 			}
405811754SKacheong.Poon@Sun.COM 			mp = tcp_ack_mp(tcp);
405911754SKacheong.Poon@Sun.COM 			if (mp != NULL) {
406011754SKacheong.Poon@Sun.COM 				BUMP_LOCAL(tcp->tcp_obsegs);
406111754SKacheong.Poon@Sun.COM 				TCPS_BUMP_MIB(tcps, tcpOutAck);
406211754SKacheong.Poon@Sun.COM 				tcp_send_data(tcp, mp);
406311754SKacheong.Poon@Sun.COM 			}
406411754SKacheong.Poon@Sun.COM 			return;
406511754SKacheong.Poon@Sun.COM 		}
406611754SKacheong.Poon@Sun.COM 	} else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
406711754SKacheong.Poon@Sun.COM 	    tcp->tcp_snxt_shrunk)) {
406811754SKacheong.Poon@Sun.COM 			tcp->tcp_is_wnd_shrnk = B_FALSE;
406911754SKacheong.Poon@Sun.COM 	}
407011754SKacheong.Poon@Sun.COM 
407111754SKacheong.Poon@Sun.COM 	/*
407211754SKacheong.Poon@Sun.COM 	 * TCP gets a new ACK, update the notsack'ed list to delete those
407311754SKacheong.Poon@Sun.COM 	 * blocks that are covered by this ACK.
407411754SKacheong.Poon@Sun.COM 	 */
407511754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
407611754SKacheong.Poon@Sun.COM 		tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
407711754SKacheong.Poon@Sun.COM 		    &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
407811754SKacheong.Poon@Sun.COM 	}
407911754SKacheong.Poon@Sun.COM 
408011754SKacheong.Poon@Sun.COM 	/*
408111754SKacheong.Poon@Sun.COM 	 * If we got an ACK after fast retransmit, check to see
408211754SKacheong.Poon@Sun.COM 	 * if it is a partial ACK.  If it is not and the congestion
408311754SKacheong.Poon@Sun.COM 	 * window was inflated to account for the other side's
408411754SKacheong.Poon@Sun.COM 	 * cached packets, retract it.  If it is, do Hoe's algorithm.
408511754SKacheong.Poon@Sun.COM 	 */
408611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
408711754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_rexmit == B_FALSE);
408811754SKacheong.Poon@Sun.COM 		if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
408911754SKacheong.Poon@Sun.COM 			tcp->tcp_dupack_cnt = 0;
409011754SKacheong.Poon@Sun.COM 			/*
409111754SKacheong.Poon@Sun.COM 			 * Restore the orig tcp_cwnd_ssthresh after
409211754SKacheong.Poon@Sun.COM 			 * fast retransmit phase.
409311754SKacheong.Poon@Sun.COM 			 */
409411754SKacheong.Poon@Sun.COM 			if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
409511754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
409611754SKacheong.Poon@Sun.COM 			}
409711754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit_max = seg_ack;
409811754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd_cnt = 0;
409911754SKacheong.Poon@Sun.COM 			tcp->tcp_snd_burst = tcp->tcp_localnet ?
410011754SKacheong.Poon@Sun.COM 			    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
410111754SKacheong.Poon@Sun.COM 
410211754SKacheong.Poon@Sun.COM 			/*
410311754SKacheong.Poon@Sun.COM 			 * Remove all notsack info to avoid confusion with
410411754SKacheong.Poon@Sun.COM 			 * the next fast retransmit/recovery phase.
410511754SKacheong.Poon@Sun.COM 			 */
410612056SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_sack_ok) {
410711754SKacheong.Poon@Sun.COM 				TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
410811754SKacheong.Poon@Sun.COM 				    tcp);
410911754SKacheong.Poon@Sun.COM 			}
411011754SKacheong.Poon@Sun.COM 		} else {
411111754SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_sack_ok &&
411211754SKacheong.Poon@Sun.COM 			    tcp->tcp_notsack_list != NULL) {
411311754SKacheong.Poon@Sun.COM 				flags |= TH_NEED_SACK_REXMIT;
411411754SKacheong.Poon@Sun.COM 				tcp->tcp_pipe -= mss;
411511754SKacheong.Poon@Sun.COM 				if (tcp->tcp_pipe < 0)
411611754SKacheong.Poon@Sun.COM 					tcp->tcp_pipe = 0;
411711754SKacheong.Poon@Sun.COM 			} else {
411811754SKacheong.Poon@Sun.COM 				/*
411911754SKacheong.Poon@Sun.COM 				 * Hoe's algorithm:
412011754SKacheong.Poon@Sun.COM 				 *
412111754SKacheong.Poon@Sun.COM 				 * Retransmit the unack'ed segment and
412211754SKacheong.Poon@Sun.COM 				 * restart fast recovery.  Note that we
412311754SKacheong.Poon@Sun.COM 				 * need to scale back tcp_cwnd to the
412411754SKacheong.Poon@Sun.COM 				 * original value when we started fast
412511754SKacheong.Poon@Sun.COM 				 * recovery.  This is to prevent overly
412611754SKacheong.Poon@Sun.COM 				 * aggressive behaviour in sending new
412711754SKacheong.Poon@Sun.COM 				 * segments.
412811754SKacheong.Poon@Sun.COM 				 */
412911754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
413011754SKacheong.Poon@Sun.COM 				    tcps->tcps_dupack_fast_retransmit * mss;
413111754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
413211754SKacheong.Poon@Sun.COM 				flags |= TH_REXMIT_NEEDED;
413311754SKacheong.Poon@Sun.COM 			}
413411754SKacheong.Poon@Sun.COM 		}
413511754SKacheong.Poon@Sun.COM 	} else {
413611754SKacheong.Poon@Sun.COM 		tcp->tcp_dupack_cnt = 0;
413711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rexmit) {
413811754SKacheong.Poon@Sun.COM 			/*
413911754SKacheong.Poon@Sun.COM 			 * TCP is retransmitting.  If the ACK ack's all
414011754SKacheong.Poon@Sun.COM 			 * outstanding data, update tcp_rexmit_max and
414111754SKacheong.Poon@Sun.COM 			 * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
414211754SKacheong.Poon@Sun.COM 			 * to the correct value.
414311754SKacheong.Poon@Sun.COM 			 *
414411754SKacheong.Poon@Sun.COM 			 * Note that SEQ_LEQ() is used.  This is to avoid
414511754SKacheong.Poon@Sun.COM 			 * unnecessary fast retransmit caused by dup ACKs
414611754SKacheong.Poon@Sun.COM 			 * received when TCP does slow start retransmission
414711754SKacheong.Poon@Sun.COM 			 * after a time out.  During this phase, TCP may
414811754SKacheong.Poon@Sun.COM 			 * send out segments which are already received.
414911754SKacheong.Poon@Sun.COM 			 * This causes dup ACKs to be sent back.
415011754SKacheong.Poon@Sun.COM 			 */
415111754SKacheong.Poon@Sun.COM 			if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
415211754SKacheong.Poon@Sun.COM 				if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
415311754SKacheong.Poon@Sun.COM 					tcp->tcp_rexmit_nxt = seg_ack;
415411754SKacheong.Poon@Sun.COM 				}
415511754SKacheong.Poon@Sun.COM 				if (seg_ack != tcp->tcp_rexmit_max) {
415611754SKacheong.Poon@Sun.COM 					flags |= TH_XMIT_NEEDED;
415711754SKacheong.Poon@Sun.COM 				}
415811754SKacheong.Poon@Sun.COM 			} else {
415911754SKacheong.Poon@Sun.COM 				tcp->tcp_rexmit = B_FALSE;
416011754SKacheong.Poon@Sun.COM 				tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
416111754SKacheong.Poon@Sun.COM 				tcp->tcp_snd_burst = tcp->tcp_localnet ?
416211754SKacheong.Poon@Sun.COM 				    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
416311754SKacheong.Poon@Sun.COM 			}
416411754SKacheong.Poon@Sun.COM 			tcp->tcp_ms_we_have_waited = 0;
416511754SKacheong.Poon@Sun.COM 		}
416611754SKacheong.Poon@Sun.COM 	}
416711754SKacheong.Poon@Sun.COM 
416811754SKacheong.Poon@Sun.COM 	TCPS_BUMP_MIB(tcps, tcpInAckSegs);
416911754SKacheong.Poon@Sun.COM 	TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked);
417011754SKacheong.Poon@Sun.COM 	tcp->tcp_suna = seg_ack;
417111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_zero_win_probe != 0) {
417211754SKacheong.Poon@Sun.COM 		tcp->tcp_zero_win_probe = 0;
417311754SKacheong.Poon@Sun.COM 		tcp->tcp_timer_backoff = 0;
417411754SKacheong.Poon@Sun.COM 	}
417511754SKacheong.Poon@Sun.COM 
417611754SKacheong.Poon@Sun.COM 	/*
417711754SKacheong.Poon@Sun.COM 	 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
417811754SKacheong.Poon@Sun.COM 	 * Note that it cannot be the SYN being ack'ed.  The code flow
417911754SKacheong.Poon@Sun.COM 	 * will not reach here.
418011754SKacheong.Poon@Sun.COM 	 */
418111754SKacheong.Poon@Sun.COM 	if (mp1 == NULL) {
418211754SKacheong.Poon@Sun.COM 		goto fin_acked;
418311754SKacheong.Poon@Sun.COM 	}
418411754SKacheong.Poon@Sun.COM 
418511754SKacheong.Poon@Sun.COM 	/*
418611754SKacheong.Poon@Sun.COM 	 * Update the congestion window.
418711754SKacheong.Poon@Sun.COM 	 *
418811754SKacheong.Poon@Sun.COM 	 * If TCP is not ECN capable or TCP is ECN capable but the
418911754SKacheong.Poon@Sun.COM 	 * congestion experience bit is not set, increase the tcp_cwnd as
419011754SKacheong.Poon@Sun.COM 	 * usual.
419111754SKacheong.Poon@Sun.COM 	 */
419211754SKacheong.Poon@Sun.COM 	if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
419311754SKacheong.Poon@Sun.COM 		cwnd = tcp->tcp_cwnd;
419411754SKacheong.Poon@Sun.COM 		add = mss;
419511754SKacheong.Poon@Sun.COM 
419611754SKacheong.Poon@Sun.COM 		if (cwnd >= tcp->tcp_cwnd_ssthresh) {
419711754SKacheong.Poon@Sun.COM 			/*
419811754SKacheong.Poon@Sun.COM 			 * This is to prevent an increase of less than 1 MSS of
419911754SKacheong.Poon@Sun.COM 			 * tcp_cwnd.  With partial increase, tcp_wput_data()
420011754SKacheong.Poon@Sun.COM 			 * may send out tinygrams in order to preserve mblk
420111754SKacheong.Poon@Sun.COM 			 * boundaries.
420211754SKacheong.Poon@Sun.COM 			 *
420311754SKacheong.Poon@Sun.COM 			 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
420411754SKacheong.Poon@Sun.COM 			 * decrementing it by 1 MSS for every ACK, tcp_cwnd is
420511754SKacheong.Poon@Sun.COM 			 * increased by 1 MSS every RTT.
420611754SKacheong.Poon@Sun.COM 			 */
420711754SKacheong.Poon@Sun.COM 			if (tcp->tcp_cwnd_cnt <= 0) {
420811754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd_cnt = cwnd + add;
420911754SKacheong.Poon@Sun.COM 			} else {
421011754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd_cnt -= add;
421111754SKacheong.Poon@Sun.COM 				add = 0;
421211754SKacheong.Poon@Sun.COM 			}
421311754SKacheong.Poon@Sun.COM 		}
421411754SKacheong.Poon@Sun.COM 		tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
421511754SKacheong.Poon@Sun.COM 	}
421611754SKacheong.Poon@Sun.COM 
421711754SKacheong.Poon@Sun.COM 	/* See if the latest urgent data has been acknowledged */
421811754SKacheong.Poon@Sun.COM 	if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
421911754SKacheong.Poon@Sun.COM 	    SEQ_GT(seg_ack, tcp->tcp_urg))
422011754SKacheong.Poon@Sun.COM 		tcp->tcp_valid_bits &= ~TCP_URG_VALID;
422111754SKacheong.Poon@Sun.COM 
422211754SKacheong.Poon@Sun.COM 	/* Can we update the RTT estimates? */
422311754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ts_ok) {
422411754SKacheong.Poon@Sun.COM 		/* Ignore zero timestamp echo-reply. */
422511754SKacheong.Poon@Sun.COM 		if (tcpopt.tcp_opt_ts_ecr != 0) {
422611754SKacheong.Poon@Sun.COM 			tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
422711754SKacheong.Poon@Sun.COM 			    (int32_t)tcpopt.tcp_opt_ts_ecr);
422811754SKacheong.Poon@Sun.COM 		}
422911754SKacheong.Poon@Sun.COM 
423011754SKacheong.Poon@Sun.COM 		/* If needed, restart the timer. */
423111754SKacheong.Poon@Sun.COM 		if (tcp->tcp_set_timer == 1) {
423211754SKacheong.Poon@Sun.COM 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
423311754SKacheong.Poon@Sun.COM 			tcp->tcp_set_timer = 0;
423411754SKacheong.Poon@Sun.COM 		}
423511754SKacheong.Poon@Sun.COM 		/*
423611754SKacheong.Poon@Sun.COM 		 * Update tcp_csuna in case the other side stops sending
423711754SKacheong.Poon@Sun.COM 		 * us timestamps.
423811754SKacheong.Poon@Sun.COM 		 */
423911754SKacheong.Poon@Sun.COM 		tcp->tcp_csuna = tcp->tcp_snxt;
424011754SKacheong.Poon@Sun.COM 	} else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
424111754SKacheong.Poon@Sun.COM 		/*
424211754SKacheong.Poon@Sun.COM 		 * An ACK sequence we haven't seen before, so get the RTT
424311754SKacheong.Poon@Sun.COM 		 * and update the RTO. But first check if the timestamp is
424411754SKacheong.Poon@Sun.COM 		 * valid to use.
424511754SKacheong.Poon@Sun.COM 		 */
424611754SKacheong.Poon@Sun.COM 		if ((mp1->b_next != NULL) &&
424711754SKacheong.Poon@Sun.COM 		    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
424811754SKacheong.Poon@Sun.COM 			tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
424911754SKacheong.Poon@Sun.COM 			    (int32_t)(intptr_t)mp1->b_prev);
425011754SKacheong.Poon@Sun.COM 		else
425111754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
425211754SKacheong.Poon@Sun.COM 
425311754SKacheong.Poon@Sun.COM 		/* Remember the last sequence to be ACKed */
425411754SKacheong.Poon@Sun.COM 		tcp->tcp_csuna = seg_ack;
425511754SKacheong.Poon@Sun.COM 		if (tcp->tcp_set_timer == 1) {
425611754SKacheong.Poon@Sun.COM 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
425711754SKacheong.Poon@Sun.COM 			tcp->tcp_set_timer = 0;
425811754SKacheong.Poon@Sun.COM 		}
425911754SKacheong.Poon@Sun.COM 	} else {
426011754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
426111754SKacheong.Poon@Sun.COM 	}
426211754SKacheong.Poon@Sun.COM 
426311754SKacheong.Poon@Sun.COM 	/* Eat acknowledged bytes off the xmit queue. */
426411754SKacheong.Poon@Sun.COM 	for (;;) {
426511754SKacheong.Poon@Sun.COM 		mblk_t	*mp2;
426611754SKacheong.Poon@Sun.COM 		uchar_t	*wptr;
426711754SKacheong.Poon@Sun.COM 
426811754SKacheong.Poon@Sun.COM 		wptr = mp1->b_wptr;
426911754SKacheong.Poon@Sun.COM 		ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
427011754SKacheong.Poon@Sun.COM 		bytes_acked -= (int)(wptr - mp1->b_rptr);
427111754SKacheong.Poon@Sun.COM 		if (bytes_acked < 0) {
427211754SKacheong.Poon@Sun.COM 			mp1->b_rptr = wptr + bytes_acked;
427311754SKacheong.Poon@Sun.COM 			/*
427411754SKacheong.Poon@Sun.COM 			 * Set a new timestamp if all the bytes timed by the
427511754SKacheong.Poon@Sun.COM 			 * old timestamp have been ack'ed.
427611754SKacheong.Poon@Sun.COM 			 */
427711754SKacheong.Poon@Sun.COM 			if (SEQ_GT(seg_ack,
427811754SKacheong.Poon@Sun.COM 			    (uint32_t)(uintptr_t)(mp1->b_next))) {
427911754SKacheong.Poon@Sun.COM 				mp1->b_prev =
428011754SKacheong.Poon@Sun.COM 				    (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
428111754SKacheong.Poon@Sun.COM 				mp1->b_next = NULL;
428211754SKacheong.Poon@Sun.COM 			}
428311754SKacheong.Poon@Sun.COM 			break;
428411754SKacheong.Poon@Sun.COM 		}
428511754SKacheong.Poon@Sun.COM 		mp1->b_next = NULL;
428611754SKacheong.Poon@Sun.COM 		mp1->b_prev = NULL;
428711754SKacheong.Poon@Sun.COM 		mp2 = mp1;
428811754SKacheong.Poon@Sun.COM 		mp1 = mp1->b_cont;
428911754SKacheong.Poon@Sun.COM 
429011754SKacheong.Poon@Sun.COM 		/*
429111754SKacheong.Poon@Sun.COM 		 * This notification is required for some zero-copy
429211754SKacheong.Poon@Sun.COM 		 * clients to maintain a copy semantic. After the data
429311754SKacheong.Poon@Sun.COM 		 * is ack'ed, client is safe to modify or reuse the buffer.
429411754SKacheong.Poon@Sun.COM 		 */
429511754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_zcopy_aware &&
429611754SKacheong.Poon@Sun.COM 		    (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
429711754SKacheong.Poon@Sun.COM 			tcp_zcopy_notify(tcp);
429811754SKacheong.Poon@Sun.COM 		freeb(mp2);
429911754SKacheong.Poon@Sun.COM 		if (bytes_acked == 0) {
430011754SKacheong.Poon@Sun.COM 			if (mp1 == NULL) {
430111754SKacheong.Poon@Sun.COM 				/* Everything is ack'ed, clear the tail. */
430211754SKacheong.Poon@Sun.COM 				tcp->tcp_xmit_tail = NULL;
430311754SKacheong.Poon@Sun.COM 				/*
430411754SKacheong.Poon@Sun.COM 				 * Cancel the timer unless we are still
430511754SKacheong.Poon@Sun.COM 				 * waiting for an ACK for the FIN packet.
430611754SKacheong.Poon@Sun.COM 				 */
430711754SKacheong.Poon@Sun.COM 				if (tcp->tcp_timer_tid != 0 &&
430811754SKacheong.Poon@Sun.COM 				    tcp->tcp_snxt == tcp->tcp_suna) {
430911754SKacheong.Poon@Sun.COM 					(void) TCP_TIMER_CANCEL(tcp,
431011754SKacheong.Poon@Sun.COM 					    tcp->tcp_timer_tid);
431111754SKacheong.Poon@Sun.COM 					tcp->tcp_timer_tid = 0;
431211754SKacheong.Poon@Sun.COM 				}
431311754SKacheong.Poon@Sun.COM 				goto pre_swnd_update;
431411754SKacheong.Poon@Sun.COM 			}
431511754SKacheong.Poon@Sun.COM 			if (mp2 != tcp->tcp_xmit_tail)
431611754SKacheong.Poon@Sun.COM 				break;
431711754SKacheong.Poon@Sun.COM 			tcp->tcp_xmit_tail = mp1;
431811754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
431911754SKacheong.Poon@Sun.COM 			    (uintptr_t)INT_MAX);
432011754SKacheong.Poon@Sun.COM 			tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
432111754SKacheong.Poon@Sun.COM 			    mp1->b_rptr);
432211754SKacheong.Poon@Sun.COM 			break;
432311754SKacheong.Poon@Sun.COM 		}
432411754SKacheong.Poon@Sun.COM 		if (mp1 == NULL) {
432511754SKacheong.Poon@Sun.COM 			/*
432611754SKacheong.Poon@Sun.COM 			 * More was acked but there is nothing more
432711754SKacheong.Poon@Sun.COM 			 * outstanding.  This means that the FIN was
432811754SKacheong.Poon@Sun.COM 			 * just acked or that we're talking to a clown.
432911754SKacheong.Poon@Sun.COM 			 */
433011754SKacheong.Poon@Sun.COM fin_acked:
433111754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_fin_sent);
433211754SKacheong.Poon@Sun.COM 			tcp->tcp_xmit_tail = NULL;
433311754SKacheong.Poon@Sun.COM 			if (tcp->tcp_fin_sent) {
433411754SKacheong.Poon@Sun.COM 				/* FIN was acked - making progress */
433511754SKacheong.Poon@Sun.COM 				if (!tcp->tcp_fin_acked)
433611754SKacheong.Poon@Sun.COM 					tcp->tcp_ip_forward_progress = B_TRUE;
433711754SKacheong.Poon@Sun.COM 				tcp->tcp_fin_acked = B_TRUE;
433811754SKacheong.Poon@Sun.COM 				if (tcp->tcp_linger_tid != 0 &&
433911754SKacheong.Poon@Sun.COM 				    TCP_TIMER_CANCEL(tcp,
434011754SKacheong.Poon@Sun.COM 				    tcp->tcp_linger_tid) >= 0) {
434111754SKacheong.Poon@Sun.COM 					tcp_stop_lingering(tcp);
434211754SKacheong.Poon@Sun.COM 					freemsg(mp);
434311754SKacheong.Poon@Sun.COM 					mp = NULL;
434411754SKacheong.Poon@Sun.COM 				}
434511754SKacheong.Poon@Sun.COM 			} else {
434611754SKacheong.Poon@Sun.COM 				/*
434711754SKacheong.Poon@Sun.COM 				 * We should never get here because
434811754SKacheong.Poon@Sun.COM 				 * we have already checked that the
434911754SKacheong.Poon@Sun.COM 				 * number of bytes ack'ed should be
435011754SKacheong.Poon@Sun.COM 				 * smaller than or equal to what we
435111754SKacheong.Poon@Sun.COM 				 * have sent so far (it is the
435211754SKacheong.Poon@Sun.COM 				 * acceptability check of the ACK).
435311754SKacheong.Poon@Sun.COM 				 * We can only get here if the send
435411754SKacheong.Poon@Sun.COM 				 * queue is corrupted.
435511754SKacheong.Poon@Sun.COM 				 *
435611754SKacheong.Poon@Sun.COM 				 * Terminate the connection and
435711754SKacheong.Poon@Sun.COM 				 * panic the system.  It is better
435811754SKacheong.Poon@Sun.COM 				 * for us to panic instead of
435911754SKacheong.Poon@Sun.COM 				 * continuing to avoid other disaster.
436011754SKacheong.Poon@Sun.COM 				 */
436111754SKacheong.Poon@Sun.COM 				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
436211754SKacheong.Poon@Sun.COM 				    tcp->tcp_rnxt, TH_RST|TH_ACK);
436311754SKacheong.Poon@Sun.COM 				panic("Memory corruption "
436411754SKacheong.Poon@Sun.COM 				    "detected for connection %s.",
436511754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL,
436611754SKacheong.Poon@Sun.COM 				    DISP_ADDR_AND_PORT));
436711754SKacheong.Poon@Sun.COM 				/*NOTREACHED*/
436811754SKacheong.Poon@Sun.COM 			}
436911754SKacheong.Poon@Sun.COM 			goto pre_swnd_update;
437011754SKacheong.Poon@Sun.COM 		}
437111754SKacheong.Poon@Sun.COM 		ASSERT(mp2 != tcp->tcp_xmit_tail);
437211754SKacheong.Poon@Sun.COM 	}
437311754SKacheong.Poon@Sun.COM 	if (tcp->tcp_unsent) {
437411754SKacheong.Poon@Sun.COM 		flags |= TH_XMIT_NEEDED;
437511754SKacheong.Poon@Sun.COM 	}
437611754SKacheong.Poon@Sun.COM pre_swnd_update:
437711754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_head = mp1;
437811754SKacheong.Poon@Sun.COM swnd_update:
437911754SKacheong.Poon@Sun.COM 	/*
438011754SKacheong.Poon@Sun.COM 	 * The following check is different from most other implementations.
438111754SKacheong.Poon@Sun.COM 	 * For bi-directional transfer, when segments are dropped, the
438211754SKacheong.Poon@Sun.COM 	 * "normal" check will not accept a window update in those
438311754SKacheong.Poon@Sun.COM 	 * retransmitted segments.  Failing to do that, TCP may send out
438411754SKacheong.Poon@Sun.COM 	 * segments which are outside receiver's window.  As TCP accepts
438511754SKacheong.Poon@Sun.COM 	 * the ack in those retransmitted segments, if the window update in
438611754SKacheong.Poon@Sun.COM 	 * the same segment is not accepted, TCP will incorrectly calculate
438711754SKacheong.Poon@Sun.COM 	 * that it can send more segments.  This can create a deadlock
438811754SKacheong.Poon@Sun.COM 	 * with the receiver if its window becomes zero.
438911754SKacheong.Poon@Sun.COM 	 */
439011754SKacheong.Poon@Sun.COM 	if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
439111754SKacheong.Poon@Sun.COM 	    SEQ_LT(tcp->tcp_swl1, seg_seq) ||
439211754SKacheong.Poon@Sun.COM 	    (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
439311754SKacheong.Poon@Sun.COM 		/*
439411754SKacheong.Poon@Sun.COM 		 * The criteria for update is:
439511754SKacheong.Poon@Sun.COM 		 *
439611754SKacheong.Poon@Sun.COM 		 * 1. the segment acknowledges some data.  Or
439711754SKacheong.Poon@Sun.COM 		 * 2. the segment is new, i.e. it has a higher seq num. Or
439811754SKacheong.Poon@Sun.COM 		 * 3. the segment is not old and the advertised window is
439911754SKacheong.Poon@Sun.COM 		 * larger than the previous advertised window.
440011754SKacheong.Poon@Sun.COM 		 */
440111754SKacheong.Poon@Sun.COM 		if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
440211754SKacheong.Poon@Sun.COM 			flags |= TH_XMIT_NEEDED;
440311754SKacheong.Poon@Sun.COM 		tcp->tcp_swnd = new_swnd;
440411754SKacheong.Poon@Sun.COM 		if (new_swnd > tcp->tcp_max_swnd)
440511754SKacheong.Poon@Sun.COM 			tcp->tcp_max_swnd = new_swnd;
440611754SKacheong.Poon@Sun.COM 		tcp->tcp_swl1 = seg_seq;
440711754SKacheong.Poon@Sun.COM 		tcp->tcp_swl2 = seg_ack;
440811754SKacheong.Poon@Sun.COM 	}
440911754SKacheong.Poon@Sun.COM est:
441011754SKacheong.Poon@Sun.COM 	if (tcp->tcp_state > TCPS_ESTABLISHED) {
441111754SKacheong.Poon@Sun.COM 
441211754SKacheong.Poon@Sun.COM 		switch (tcp->tcp_state) {
441311754SKacheong.Poon@Sun.COM 		case TCPS_FIN_WAIT_1:
441411754SKacheong.Poon@Sun.COM 			if (tcp->tcp_fin_acked) {
441511754SKacheong.Poon@Sun.COM 				tcp->tcp_state = TCPS_FIN_WAIT_2;
441612507SAlan.Maguire@Sun.COM 				DTRACE_TCP6(state__change, void, NULL,
441712507SAlan.Maguire@Sun.COM 				    ip_xmit_attr_t *, connp->conn_ixa,
441812507SAlan.Maguire@Sun.COM 				    void, NULL, tcp_t *, tcp, void, NULL,
441912507SAlan.Maguire@Sun.COM 				    int32_t, TCPS_FIN_WAIT_1);
442011754SKacheong.Poon@Sun.COM 				/*
442111754SKacheong.Poon@Sun.COM 				 * We implement the non-standard BSD/SunOS
442211754SKacheong.Poon@Sun.COM 				 * FIN_WAIT_2 flushing algorithm.
442311754SKacheong.Poon@Sun.COM 				 * If there is no user attached to this
442411754SKacheong.Poon@Sun.COM 				 * TCP endpoint, then this TCP struct
442511754SKacheong.Poon@Sun.COM 				 * could hang around forever in FIN_WAIT_2
442611754SKacheong.Poon@Sun.COM 				 * state if the peer forgets to send us
442711754SKacheong.Poon@Sun.COM 				 * a FIN.  To prevent this, we wait only
442811754SKacheong.Poon@Sun.COM 				 * 2*MSL (a convenient time value) for
442911754SKacheong.Poon@Sun.COM 				 * the FIN to arrive.  If it doesn't show up,
443011754SKacheong.Poon@Sun.COM 				 * we flush the TCP endpoint.  This algorithm,
443111754SKacheong.Poon@Sun.COM 				 * though a violation of RFC-793, has worked
443211754SKacheong.Poon@Sun.COM 				 * for over 10 years in BSD systems.
443311754SKacheong.Poon@Sun.COM 				 * Note: SunOS 4.x waits 675 seconds before
443411754SKacheong.Poon@Sun.COM 				 * flushing the FIN_WAIT_2 connection.
443511754SKacheong.Poon@Sun.COM 				 */
443611754SKacheong.Poon@Sun.COM 				TCP_TIMER_RESTART(tcp,
443712544SKacheong.Poon@Sun.COM 				    tcp->tcp_fin_wait_2_flush_interval);
443811754SKacheong.Poon@Sun.COM 			}
443911754SKacheong.Poon@Sun.COM 			break;
444011754SKacheong.Poon@Sun.COM 		case TCPS_FIN_WAIT_2:
444111754SKacheong.Poon@Sun.COM 			break;	/* Shutdown hook? */
444211754SKacheong.Poon@Sun.COM 		case TCPS_LAST_ACK:
444311754SKacheong.Poon@Sun.COM 			freemsg(mp);
444411754SKacheong.Poon@Sun.COM 			if (tcp->tcp_fin_acked) {
444511754SKacheong.Poon@Sun.COM 				(void) tcp_clean_death(tcp, 0);
444611754SKacheong.Poon@Sun.COM 				return;
444711754SKacheong.Poon@Sun.COM 			}
444811754SKacheong.Poon@Sun.COM 			goto xmit_check;
444911754SKacheong.Poon@Sun.COM 		case TCPS_CLOSING:
445012507SAlan.Maguire@Sun.COM 			if (tcp->tcp_fin_acked) {
445111754SKacheong.Poon@Sun.COM 				SET_TIME_WAIT(tcps, tcp, connp);
445212507SAlan.Maguire@Sun.COM 				DTRACE_TCP6(state__change, void, NULL,
445312507SAlan.Maguire@Sun.COM 				    ip_xmit_attr_t *, connp->conn_ixa, void,
445412507SAlan.Maguire@Sun.COM 				    NULL, tcp_t *, tcp, void, NULL, int32_t,
445512507SAlan.Maguire@Sun.COM 				    TCPS_CLOSING);
445612507SAlan.Maguire@Sun.COM 			}
445711754SKacheong.Poon@Sun.COM 			/*FALLTHRU*/
445811754SKacheong.Poon@Sun.COM 		case TCPS_CLOSE_WAIT:
445911754SKacheong.Poon@Sun.COM 			freemsg(mp);
446011754SKacheong.Poon@Sun.COM 			goto xmit_check;
446111754SKacheong.Poon@Sun.COM 		default:
446211754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
446311754SKacheong.Poon@Sun.COM 			break;
446411754SKacheong.Poon@Sun.COM 		}
446511754SKacheong.Poon@Sun.COM 	}
446611754SKacheong.Poon@Sun.COM 	if (flags & TH_FIN) {
446711754SKacheong.Poon@Sun.COM 		/* Make sure we ack the fin */
446811754SKacheong.Poon@Sun.COM 		flags |= TH_ACK_NEEDED;
446911754SKacheong.Poon@Sun.COM 		if (!tcp->tcp_fin_rcvd) {
447011754SKacheong.Poon@Sun.COM 			tcp->tcp_fin_rcvd = B_TRUE;
447111754SKacheong.Poon@Sun.COM 			tcp->tcp_rnxt++;
447211754SKacheong.Poon@Sun.COM 			tcpha = tcp->tcp_tcpha;
447311754SKacheong.Poon@Sun.COM 			tcpha->tha_ack = htonl(tcp->tcp_rnxt);
447411754SKacheong.Poon@Sun.COM 
447511754SKacheong.Poon@Sun.COM 			/*
447612643SAnders.Persson@Sun.COM 			 * Generate the ordrel_ind at the end unless the
447712643SAnders.Persson@Sun.COM 			 * conn is detached or it is a STREAMS based eager.
447812643SAnders.Persson@Sun.COM 			 * In the eager case we defer the notification until
447912643SAnders.Persson@Sun.COM 			 * tcp_accept_finish has run.
448011754SKacheong.Poon@Sun.COM 			 */
448112643SAnders.Persson@Sun.COM 			if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) ||
448212643SAnders.Persson@Sun.COM 			    (tcp->tcp_listener == NULL &&
448312643SAnders.Persson@Sun.COM 			    !tcp->tcp_hard_binding)))
448411754SKacheong.Poon@Sun.COM 				flags |= TH_ORDREL_NEEDED;
448511754SKacheong.Poon@Sun.COM 			switch (tcp->tcp_state) {
448611754SKacheong.Poon@Sun.COM 			case TCPS_SYN_RCVD:
448712507SAlan.Maguire@Sun.COM 				tcp->tcp_state = TCPS_CLOSE_WAIT;
448812507SAlan.Maguire@Sun.COM 				DTRACE_TCP6(state__change, void, NULL,
448912507SAlan.Maguire@Sun.COM 				    ip_xmit_attr_t *, connp->conn_ixa,
449012507SAlan.Maguire@Sun.COM 				    void, NULL, tcp_t *, tcp, void, NULL,
449112507SAlan.Maguire@Sun.COM 				    int32_t, TCPS_SYN_RCVD);
449212507SAlan.Maguire@Sun.COM 				/* Keepalive? */
449312507SAlan.Maguire@Sun.COM 				break;
449411754SKacheong.Poon@Sun.COM 			case TCPS_ESTABLISHED:
449511754SKacheong.Poon@Sun.COM 				tcp->tcp_state = TCPS_CLOSE_WAIT;
449612507SAlan.Maguire@Sun.COM 				DTRACE_TCP6(state__change, void, NULL,
449712507SAlan.Maguire@Sun.COM 				    ip_xmit_attr_t *, connp->conn_ixa,
449812507SAlan.Maguire@Sun.COM 				    void, NULL, tcp_t *, tcp, void, NULL,
449912507SAlan.Maguire@Sun.COM 				    int32_t, TCPS_ESTABLISHED);
450011754SKacheong.Poon@Sun.COM 				/* Keepalive? */
450111754SKacheong.Poon@Sun.COM 				break;
450211754SKacheong.Poon@Sun.COM 			case TCPS_FIN_WAIT_1:
450311754SKacheong.Poon@Sun.COM 				if (!tcp->tcp_fin_acked) {
450411754SKacheong.Poon@Sun.COM 					tcp->tcp_state = TCPS_CLOSING;
450512507SAlan.Maguire@Sun.COM 					DTRACE_TCP6(state__change, void, NULL,
450612507SAlan.Maguire@Sun.COM 					    ip_xmit_attr_t *, connp->conn_ixa,
450712507SAlan.Maguire@Sun.COM 					    void, NULL, tcp_t *, tcp, void,
450812507SAlan.Maguire@Sun.COM 					    NULL, int32_t, TCPS_FIN_WAIT_1);
450911754SKacheong.Poon@Sun.COM 					break;
451011754SKacheong.Poon@Sun.COM 				}
451111754SKacheong.Poon@Sun.COM 				/* FALLTHRU */
451211754SKacheong.Poon@Sun.COM 			case TCPS_FIN_WAIT_2:
451311754SKacheong.Poon@Sun.COM 				SET_TIME_WAIT(tcps, tcp, connp);
451412507SAlan.Maguire@Sun.COM 				DTRACE_TCP6(state__change, void, NULL,
451512507SAlan.Maguire@Sun.COM 				    ip_xmit_attr_t *, connp->conn_ixa, void,
451612507SAlan.Maguire@Sun.COM 				    NULL, tcp_t *, tcp, void, NULL, int32_t,
451712507SAlan.Maguire@Sun.COM 				    TCPS_FIN_WAIT_2);
451811754SKacheong.Poon@Sun.COM 				if (seg_len) {
451911754SKacheong.Poon@Sun.COM 					/*
452011754SKacheong.Poon@Sun.COM 					 * implies data piggybacked on FIN.
452111754SKacheong.Poon@Sun.COM 					 * break to handle data.
452211754SKacheong.Poon@Sun.COM 					 */
452311754SKacheong.Poon@Sun.COM 					break;
452411754SKacheong.Poon@Sun.COM 				}
452511754SKacheong.Poon@Sun.COM 				freemsg(mp);
452611754SKacheong.Poon@Sun.COM 				goto ack_check;
452711754SKacheong.Poon@Sun.COM 			}
452811754SKacheong.Poon@Sun.COM 		}
452911754SKacheong.Poon@Sun.COM 	}
453011754SKacheong.Poon@Sun.COM 	if (mp == NULL)
453111754SKacheong.Poon@Sun.COM 		goto xmit_check;
453211754SKacheong.Poon@Sun.COM 	if (seg_len == 0) {
453311754SKacheong.Poon@Sun.COM 		freemsg(mp);
453411754SKacheong.Poon@Sun.COM 		goto xmit_check;
453511754SKacheong.Poon@Sun.COM 	}
453611754SKacheong.Poon@Sun.COM 	if (mp->b_rptr == mp->b_wptr) {
453711754SKacheong.Poon@Sun.COM 		/*
453811754SKacheong.Poon@Sun.COM 		 * The header has been consumed, so we remove the
453911754SKacheong.Poon@Sun.COM 		 * zero-length mblk here.
454011754SKacheong.Poon@Sun.COM 		 */
454111754SKacheong.Poon@Sun.COM 		mp1 = mp;
454211754SKacheong.Poon@Sun.COM 		mp = mp->b_cont;
454311754SKacheong.Poon@Sun.COM 		freeb(mp1);
454411754SKacheong.Poon@Sun.COM 	}
454511754SKacheong.Poon@Sun.COM update_ack:
454611754SKacheong.Poon@Sun.COM 	tcpha = tcp->tcp_tcpha;
454711754SKacheong.Poon@Sun.COM 	tcp->tcp_rack_cnt++;
454811754SKacheong.Poon@Sun.COM 	{
454911754SKacheong.Poon@Sun.COM 		uint32_t cur_max;
455011754SKacheong.Poon@Sun.COM 
455111754SKacheong.Poon@Sun.COM 		cur_max = tcp->tcp_rack_cur_max;
455211754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rack_cnt >= cur_max) {
455311754SKacheong.Poon@Sun.COM 			/*
455411754SKacheong.Poon@Sun.COM 			 * We have more unacked data than we should - send
455511754SKacheong.Poon@Sun.COM 			 * an ACK now.
455611754SKacheong.Poon@Sun.COM 			 */
455711754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED;
455811754SKacheong.Poon@Sun.COM 			cur_max++;
455911754SKacheong.Poon@Sun.COM 			if (cur_max > tcp->tcp_rack_abs_max)
456011754SKacheong.Poon@Sun.COM 				tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max;
456111754SKacheong.Poon@Sun.COM 			else
456211754SKacheong.Poon@Sun.COM 				tcp->tcp_rack_cur_max = cur_max;
456311754SKacheong.Poon@Sun.COM 		} else if (TCP_IS_DETACHED(tcp)) {
456411754SKacheong.Poon@Sun.COM 			/* We don't have an ACK timer for detached TCP. */
456511754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_NEEDED;
456611754SKacheong.Poon@Sun.COM 		} else if (seg_len < mss) {
456711754SKacheong.Poon@Sun.COM 			/*
456811754SKacheong.Poon@Sun.COM 			 * If we get a segment that is less than an mss, and we
456911754SKacheong.Poon@Sun.COM 			 * already have unacknowledged data, and the amount
457011754SKacheong.Poon@Sun.COM 			 * unacknowledged is not a multiple of mss, then we
457111754SKacheong.Poon@Sun.COM 			 * better generate an ACK now.  Otherwise, this may be
457211754SKacheong.Poon@Sun.COM 			 * the tail piece of a transaction, and we would rather
457311754SKacheong.Poon@Sun.COM 			 * wait for the response.
457411754SKacheong.Poon@Sun.COM 			 */
457511754SKacheong.Poon@Sun.COM 			uint32_t udif;
457611754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <=
457711754SKacheong.Poon@Sun.COM 			    (uintptr_t)INT_MAX);
457811754SKacheong.Poon@Sun.COM 			udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack);
457911754SKacheong.Poon@Sun.COM 			if (udif && (udif % mss))
458011754SKacheong.Poon@Sun.COM 				flags |= TH_ACK_NEEDED;
458111754SKacheong.Poon@Sun.COM 			else
458211754SKacheong.Poon@Sun.COM 				flags |= TH_ACK_TIMER_NEEDED;
458311754SKacheong.Poon@Sun.COM 		} else {
458411754SKacheong.Poon@Sun.COM 			/* Start delayed ack timer */
458511754SKacheong.Poon@Sun.COM 			flags |= TH_ACK_TIMER_NEEDED;
458611754SKacheong.Poon@Sun.COM 		}
458711754SKacheong.Poon@Sun.COM 	}
458811754SKacheong.Poon@Sun.COM 	tcp->tcp_rnxt += seg_len;
458911754SKacheong.Poon@Sun.COM 	tcpha->tha_ack = htonl(tcp->tcp_rnxt);
459011754SKacheong.Poon@Sun.COM 
459111754SKacheong.Poon@Sun.COM 	if (mp == NULL)
459211754SKacheong.Poon@Sun.COM 		goto xmit_check;
459311754SKacheong.Poon@Sun.COM 
459411754SKacheong.Poon@Sun.COM 	/* Update SACK list */
459511754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
459611754SKacheong.Poon@Sun.COM 		tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
459711754SKacheong.Poon@Sun.COM 		    &(tcp->tcp_num_sack_blk));
459811754SKacheong.Poon@Sun.COM 	}
459911754SKacheong.Poon@Sun.COM 
460011754SKacheong.Poon@Sun.COM 	if (tcp->tcp_urp_mp) {
460111754SKacheong.Poon@Sun.COM 		tcp->tcp_urp_mp->b_cont = mp;
460211754SKacheong.Poon@Sun.COM 		mp = tcp->tcp_urp_mp;
460311754SKacheong.Poon@Sun.COM 		tcp->tcp_urp_mp = NULL;
460411754SKacheong.Poon@Sun.COM 		/* Ready for a new signal. */
460511754SKacheong.Poon@Sun.COM 		tcp->tcp_urp_last_valid = B_FALSE;
460611754SKacheong.Poon@Sun.COM #ifdef DEBUG
460711754SKacheong.Poon@Sun.COM 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
460811754SKacheong.Poon@Sun.COM 		    "tcp_rput: sending exdata_ind %s",
460911754SKacheong.Poon@Sun.COM 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
461011754SKacheong.Poon@Sun.COM #endif /* DEBUG */
461111754SKacheong.Poon@Sun.COM 	}
461211754SKacheong.Poon@Sun.COM 
461311754SKacheong.Poon@Sun.COM 	/*
461411754SKacheong.Poon@Sun.COM 	 * Check for ancillary data changes compared to last segment.
461511754SKacheong.Poon@Sun.COM 	 */
461611754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_all != 0) {
461711754SKacheong.Poon@Sun.COM 		mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira);
461811754SKacheong.Poon@Sun.COM 		if (mp == NULL)
461911754SKacheong.Poon@Sun.COM 			return;
462011754SKacheong.Poon@Sun.COM 	}
462111754SKacheong.Poon@Sun.COM 
462212643SAnders.Persson@Sun.COM 	if (IPCL_IS_NONSTR(connp)) {
462311754SKacheong.Poon@Sun.COM 		/*
462411754SKacheong.Poon@Sun.COM 		 * Non-STREAMS socket
462511754SKacheong.Poon@Sun.COM 		 */
462611754SKacheong.Poon@Sun.COM 		boolean_t push = flags & (TH_PUSH|TH_FIN);
462711754SKacheong.Poon@Sun.COM 		int error;
462811754SKacheong.Poon@Sun.COM 
462911754SKacheong.Poon@Sun.COM 		if ((*connp->conn_upcalls->su_recv)(
463011754SKacheong.Poon@Sun.COM 		    connp->conn_upper_handle,
463111754SKacheong.Poon@Sun.COM 		    mp, seg_len, 0, &error, &push) <= 0) {
463211754SKacheong.Poon@Sun.COM 			/*
463311754SKacheong.Poon@Sun.COM 			 * We should never be in middle of a
463411754SKacheong.Poon@Sun.COM 			 * fallback, the squeue guarantees that.
463511754SKacheong.Poon@Sun.COM 			 */
463611754SKacheong.Poon@Sun.COM 			ASSERT(error != EOPNOTSUPP);
463711754SKacheong.Poon@Sun.COM 			if (error == ENOSPC)
463811754SKacheong.Poon@Sun.COM 				tcp->tcp_rwnd -= seg_len;
463911754SKacheong.Poon@Sun.COM 		} else if (push) {
464011754SKacheong.Poon@Sun.COM 			/* PUSH bit set and sockfs is not flow controlled */
464111754SKacheong.Poon@Sun.COM 			flags |= tcp_rwnd_reopen(tcp);
464211754SKacheong.Poon@Sun.COM 		}
464312643SAnders.Persson@Sun.COM 	} else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
464412643SAnders.Persson@Sun.COM 		/*
464512643SAnders.Persson@Sun.COM 		 * Side queue inbound data until the accept happens.
464612643SAnders.Persson@Sun.COM 		 * tcp_accept/tcp_rput drains this when the accept happens.
464712643SAnders.Persson@Sun.COM 		 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
464812643SAnders.Persson@Sun.COM 		 * T_EXDATA_IND) it is queued on b_next.
464912643SAnders.Persson@Sun.COM 		 * XXX Make urgent data use this. Requires:
465012643SAnders.Persson@Sun.COM 		 *	Removing tcp_listener check for TH_URG
465112643SAnders.Persson@Sun.COM 		 *	Making M_PCPROTO and MARK messages skip the eager case
465212643SAnders.Persson@Sun.COM 		 */
465312643SAnders.Persson@Sun.COM 
465412644SAnders.Persson@Sun.COM 		tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
465511754SKacheong.Poon@Sun.COM 	} else {
465612643SAnders.Persson@Sun.COM 		/* Active STREAMS socket */
465711754SKacheong.Poon@Sun.COM 		if (mp->b_datap->db_type != M_DATA ||
465811754SKacheong.Poon@Sun.COM 		    (flags & TH_MARKNEXT_NEEDED)) {
465911754SKacheong.Poon@Sun.COM 			if (tcp->tcp_rcv_list != NULL) {
466011754SKacheong.Poon@Sun.COM 				flags |= tcp_rcv_drain(tcp);
466111754SKacheong.Poon@Sun.COM 			}
466211754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_rcv_list == NULL ||
466311754SKacheong.Poon@Sun.COM 			    tcp->tcp_fused_sigurg);
466411754SKacheong.Poon@Sun.COM 
466511754SKacheong.Poon@Sun.COM 			if (flags & TH_MARKNEXT_NEEDED) {
466611754SKacheong.Poon@Sun.COM #ifdef DEBUG
466711754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
466811754SKacheong.Poon@Sun.COM 				    "tcp_rput: sending MSGMARKNEXT %s",
466911754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL,
467011754SKacheong.Poon@Sun.COM 				    DISP_PORT_ONLY));
467111754SKacheong.Poon@Sun.COM #endif /* DEBUG */
467211754SKacheong.Poon@Sun.COM 				mp->b_flag |= MSGMARKNEXT;
467311754SKacheong.Poon@Sun.COM 				flags &= ~TH_MARKNEXT_NEEDED;
467411754SKacheong.Poon@Sun.COM 			}
467511754SKacheong.Poon@Sun.COM 
467612644SAnders.Persson@Sun.COM 			if (is_system_labeled())
467712644SAnders.Persson@Sun.COM 				tcp_setcred_data(mp, ira);
467812644SAnders.Persson@Sun.COM 
467912644SAnders.Persson@Sun.COM 			putnext(connp->conn_rq, mp);
468012644SAnders.Persson@Sun.COM 			if (!canputnext(connp->conn_rq))
468112644SAnders.Persson@Sun.COM 				tcp->tcp_rwnd -= seg_len;
468211754SKacheong.Poon@Sun.COM 		} else if ((flags & (TH_PUSH|TH_FIN)) ||
468311754SKacheong.Poon@Sun.COM 		    tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) {
468411754SKacheong.Poon@Sun.COM 			if (tcp->tcp_rcv_list != NULL) {
468511754SKacheong.Poon@Sun.COM 				/*
468611754SKacheong.Poon@Sun.COM 				 * Enqueue the new segment first and then
468711754SKacheong.Poon@Sun.COM 				 * call tcp_rcv_drain() to send all data
468811754SKacheong.Poon@Sun.COM 				 * up.  The other way to do this is to
468911754SKacheong.Poon@Sun.COM 				 * send all queued data up and then call
469011754SKacheong.Poon@Sun.COM 				 * putnext() to send the new segment up.
469111754SKacheong.Poon@Sun.COM 				 * This way can remove the else part later
469211754SKacheong.Poon@Sun.COM 				 * on.
469311754SKacheong.Poon@Sun.COM 				 *
469411754SKacheong.Poon@Sun.COM 				 * We don't do this to avoid one more call to
469511754SKacheong.Poon@Sun.COM 				 * canputnext() as tcp_rcv_drain() needs to
469611754SKacheong.Poon@Sun.COM 				 * call canputnext().
469711754SKacheong.Poon@Sun.COM 				 */
469811754SKacheong.Poon@Sun.COM 				tcp_rcv_enqueue(tcp, mp, seg_len,
469911754SKacheong.Poon@Sun.COM 				    ira->ira_cred);
470011754SKacheong.Poon@Sun.COM 				flags |= tcp_rcv_drain(tcp);
470111754SKacheong.Poon@Sun.COM 			} else {
470211754SKacheong.Poon@Sun.COM 				if (is_system_labeled())
470311754SKacheong.Poon@Sun.COM 					tcp_setcred_data(mp, ira);
470411754SKacheong.Poon@Sun.COM 
470511754SKacheong.Poon@Sun.COM 				putnext(connp->conn_rq, mp);
470611754SKacheong.Poon@Sun.COM 				if (!canputnext(connp->conn_rq))
470711754SKacheong.Poon@Sun.COM 					tcp->tcp_rwnd -= seg_len;
470811754SKacheong.Poon@Sun.COM 			}
470911754SKacheong.Poon@Sun.COM 		} else {
471011754SKacheong.Poon@Sun.COM 			/*
471111754SKacheong.Poon@Sun.COM 			 * Enqueue all packets when processing an mblk
471211754SKacheong.Poon@Sun.COM 			 * from the co queue and also enqueue normal packets.
471311754SKacheong.Poon@Sun.COM 			 */
471411754SKacheong.Poon@Sun.COM 			tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
471511754SKacheong.Poon@Sun.COM 		}
471611754SKacheong.Poon@Sun.COM 		/*
471711754SKacheong.Poon@Sun.COM 		 * Make sure the timer is running if we have data waiting
471811754SKacheong.Poon@Sun.COM 		 * for a push bit. This provides resiliency against
471911754SKacheong.Poon@Sun.COM 		 * implementations that do not correctly generate push bits.
472011754SKacheong.Poon@Sun.COM 		 */
472111754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) {
472211754SKacheong.Poon@Sun.COM 			/*
472311754SKacheong.Poon@Sun.COM 			 * The connection may be closed at this point, so don't
472411754SKacheong.Poon@Sun.COM 			 * do anything for a detached tcp.
472511754SKacheong.Poon@Sun.COM 			 */
472611754SKacheong.Poon@Sun.COM 			if (!TCP_IS_DETACHED(tcp))
472711754SKacheong.Poon@Sun.COM 				tcp->tcp_push_tid = TCP_TIMER(tcp,
472811754SKacheong.Poon@Sun.COM 				    tcp_push_timer,
472912056SKacheong.Poon@Sun.COM 				    tcps->tcps_push_timer_interval);
473011754SKacheong.Poon@Sun.COM 		}
473111754SKacheong.Poon@Sun.COM 	}
473211754SKacheong.Poon@Sun.COM 
473311754SKacheong.Poon@Sun.COM xmit_check:
473411754SKacheong.Poon@Sun.COM 	/* Is there anything left to do? */
473511754SKacheong.Poon@Sun.COM 	ASSERT(!(flags & TH_MARKNEXT_NEEDED));
473611754SKacheong.Poon@Sun.COM 	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
473711754SKacheong.Poon@Sun.COM 	    TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
473811754SKacheong.Poon@Sun.COM 	    TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
473911754SKacheong.Poon@Sun.COM 		goto done;
474011754SKacheong.Poon@Sun.COM 
474111754SKacheong.Poon@Sun.COM 	/* Any transmit work to do and a non-zero window? */
474211754SKacheong.Poon@Sun.COM 	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
474311754SKacheong.Poon@Sun.COM 	    TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
474411754SKacheong.Poon@Sun.COM 		if (flags & TH_REXMIT_NEEDED) {
474511754SKacheong.Poon@Sun.COM 			uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
474611754SKacheong.Poon@Sun.COM 
474711754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
474811754SKacheong.Poon@Sun.COM 			if (snd_size > mss)
474911754SKacheong.Poon@Sun.COM 				snd_size = mss;
475011754SKacheong.Poon@Sun.COM 			if (snd_size > tcp->tcp_swnd)
475111754SKacheong.Poon@Sun.COM 				snd_size = tcp->tcp_swnd;
475211754SKacheong.Poon@Sun.COM 			mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
475311754SKacheong.Poon@Sun.COM 			    NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
475411754SKacheong.Poon@Sun.COM 			    B_TRUE);
475511754SKacheong.Poon@Sun.COM 
475611754SKacheong.Poon@Sun.COM 			if (mp1 != NULL) {
475711754SKacheong.Poon@Sun.COM 				tcp->tcp_xmit_head->b_prev =
475811754SKacheong.Poon@Sun.COM 				    (mblk_t *)LBOLT_FASTPATH;
475911754SKacheong.Poon@Sun.COM 				tcp->tcp_csuna = tcp->tcp_snxt;
476011754SKacheong.Poon@Sun.COM 				TCPS_BUMP_MIB(tcps, tcpRetransSegs);
476111754SKacheong.Poon@Sun.COM 				TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
476211754SKacheong.Poon@Sun.COM 				    snd_size);
476311754SKacheong.Poon@Sun.COM 				tcp_send_data(tcp, mp1);
476411754SKacheong.Poon@Sun.COM 			}
476511754SKacheong.Poon@Sun.COM 		}
476611754SKacheong.Poon@Sun.COM 		if (flags & TH_NEED_SACK_REXMIT) {
476711754SKacheong.Poon@Sun.COM 			tcp_sack_rexmit(tcp, &flags);
476811754SKacheong.Poon@Sun.COM 		}
476911754SKacheong.Poon@Sun.COM 		/*
477011754SKacheong.Poon@Sun.COM 		 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
477111754SKacheong.Poon@Sun.COM 		 * out new segment.  Note that tcp_rexmit should not be
477211754SKacheong.Poon@Sun.COM 		 * set, otherwise TH_LIMIT_XMIT should not be set.
477311754SKacheong.Poon@Sun.COM 		 */
477411754SKacheong.Poon@Sun.COM 		if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
477511754SKacheong.Poon@Sun.COM 			if (!tcp->tcp_rexmit) {
477611754SKacheong.Poon@Sun.COM 				tcp_wput_data(tcp, NULL, B_FALSE);
477711754SKacheong.Poon@Sun.COM 			} else {
477811754SKacheong.Poon@Sun.COM 				tcp_ss_rexmit(tcp);
477911754SKacheong.Poon@Sun.COM 			}
478011754SKacheong.Poon@Sun.COM 		}
478111754SKacheong.Poon@Sun.COM 		/*
478211754SKacheong.Poon@Sun.COM 		 * Adjust tcp_cwnd back to normal value after sending
478311754SKacheong.Poon@Sun.COM 		 * new data segments.
478411754SKacheong.Poon@Sun.COM 		 */
478511754SKacheong.Poon@Sun.COM 		if (flags & TH_LIMIT_XMIT) {
478611754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
478711754SKacheong.Poon@Sun.COM 			/*
478811754SKacheong.Poon@Sun.COM 			 * This will restart the timer.  Restarting the
478911754SKacheong.Poon@Sun.COM 			 * timer is used to avoid a timeout before the
479011754SKacheong.Poon@Sun.COM 			 * limited transmitted segment's ACK gets back.
479111754SKacheong.Poon@Sun.COM 			 */
479211754SKacheong.Poon@Sun.COM 			if (tcp->tcp_xmit_head != NULL)
479311754SKacheong.Poon@Sun.COM 				tcp->tcp_xmit_head->b_prev =
479411754SKacheong.Poon@Sun.COM 				    (mblk_t *)LBOLT_FASTPATH;
479511754SKacheong.Poon@Sun.COM 		}
479611754SKacheong.Poon@Sun.COM 
479711754SKacheong.Poon@Sun.COM 		/* Anything more to do? */
479811754SKacheong.Poon@Sun.COM 		if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
479911754SKacheong.Poon@Sun.COM 		    TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
480011754SKacheong.Poon@Sun.COM 			goto done;
480111754SKacheong.Poon@Sun.COM 	}
480211754SKacheong.Poon@Sun.COM ack_check:
480311754SKacheong.Poon@Sun.COM 	if (flags & TH_SEND_URP_MARK) {
480411754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_urp_mark_mp);
480511754SKacheong.Poon@Sun.COM 		ASSERT(!IPCL_IS_NONSTR(connp));
480611754SKacheong.Poon@Sun.COM 		/*
480711754SKacheong.Poon@Sun.COM 		 * Send up any queued data and then send the mark message
480811754SKacheong.Poon@Sun.COM 		 */
480911754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rcv_list != NULL) {
481011754SKacheong.Poon@Sun.COM 			flags |= tcp_rcv_drain(tcp);
481111754SKacheong.Poon@Sun.COM 
481211754SKacheong.Poon@Sun.COM 		}
481311754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
481411754SKacheong.Poon@Sun.COM 		mp1 = tcp->tcp_urp_mark_mp;
481511754SKacheong.Poon@Sun.COM 		tcp->tcp_urp_mark_mp = NULL;
481611754SKacheong.Poon@Sun.COM 		if (is_system_labeled())
481711754SKacheong.Poon@Sun.COM 			tcp_setcred_data(mp1, ira);
481811754SKacheong.Poon@Sun.COM 
481911754SKacheong.Poon@Sun.COM 		putnext(connp->conn_rq, mp1);
482011754SKacheong.Poon@Sun.COM #ifdef DEBUG
482111754SKacheong.Poon@Sun.COM 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
482211754SKacheong.Poon@Sun.COM 		    "tcp_rput: sending zero-length %s %s",
482311754SKacheong.Poon@Sun.COM 		    ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
482411754SKacheong.Poon@Sun.COM 		    "MSGNOTMARKNEXT"),
482511754SKacheong.Poon@Sun.COM 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
482611754SKacheong.Poon@Sun.COM #endif /* DEBUG */
482711754SKacheong.Poon@Sun.COM 		flags &= ~TH_SEND_URP_MARK;
482811754SKacheong.Poon@Sun.COM 	}
482911754SKacheong.Poon@Sun.COM 	if (flags & TH_ACK_NEEDED) {
483011754SKacheong.Poon@Sun.COM 		/*
483111754SKacheong.Poon@Sun.COM 		 * Time to send an ack for some reason.
483211754SKacheong.Poon@Sun.COM 		 */
483311754SKacheong.Poon@Sun.COM 		mp1 = tcp_ack_mp(tcp);
483411754SKacheong.Poon@Sun.COM 
483511754SKacheong.Poon@Sun.COM 		if (mp1 != NULL) {
483611754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, mp1);
483711754SKacheong.Poon@Sun.COM 			BUMP_LOCAL(tcp->tcp_obsegs);
483811754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpOutAck);
483911754SKacheong.Poon@Sun.COM 		}
484011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ack_tid != 0) {
484111754SKacheong.Poon@Sun.COM 			(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
484211754SKacheong.Poon@Sun.COM 			tcp->tcp_ack_tid = 0;
484311754SKacheong.Poon@Sun.COM 		}
484411754SKacheong.Poon@Sun.COM 	}
484511754SKacheong.Poon@Sun.COM 	if (flags & TH_ACK_TIMER_NEEDED) {
484611754SKacheong.Poon@Sun.COM 		/*
484711754SKacheong.Poon@Sun.COM 		 * Arrange for deferred ACK or push wait timeout.
484811754SKacheong.Poon@Sun.COM 		 * Start timer if it is not already running.
484911754SKacheong.Poon@Sun.COM 		 */
485011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ack_tid == 0) {
485111754SKacheong.Poon@Sun.COM 			tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
485212056SKacheong.Poon@Sun.COM 			    tcp->tcp_localnet ?
485312056SKacheong.Poon@Sun.COM 			    tcps->tcps_local_dack_interval :
485412056SKacheong.Poon@Sun.COM 			    tcps->tcps_deferred_ack_interval);
485511754SKacheong.Poon@Sun.COM 		}
485611754SKacheong.Poon@Sun.COM 	}
485711754SKacheong.Poon@Sun.COM 	if (flags & TH_ORDREL_NEEDED) {
485811754SKacheong.Poon@Sun.COM 		/*
485912643SAnders.Persson@Sun.COM 		 * Notify upper layer about an orderly release. If this is
486012643SAnders.Persson@Sun.COM 		 * a non-STREAMS socket, then just make an upcall. For STREAMS
486112643SAnders.Persson@Sun.COM 		 * we send up an ordrel_ind, unless this is an eager, in which
486212643SAnders.Persson@Sun.COM 		 * case the ordrel will be sent when tcp_accept_finish runs.
486312643SAnders.Persson@Sun.COM 		 * Note that for non-STREAMS we make an upcall even if it is an
486412643SAnders.Persson@Sun.COM 		 * eager, because we have an upper handle to send it to.
486511754SKacheong.Poon@Sun.COM 		 */
486612643SAnders.Persson@Sun.COM 		ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL);
486711754SKacheong.Poon@Sun.COM 		ASSERT(!tcp->tcp_detached);
486811754SKacheong.Poon@Sun.COM 
486911754SKacheong.Poon@Sun.COM 		if (IPCL_IS_NONSTR(connp)) {
487011754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_ordrel_mp == NULL);
487111754SKacheong.Poon@Sun.COM 			tcp->tcp_ordrel_done = B_TRUE;
487211754SKacheong.Poon@Sun.COM 			(*connp->conn_upcalls->su_opctl)
487311754SKacheong.Poon@Sun.COM 			    (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0);
487411754SKacheong.Poon@Sun.COM 			goto done;
487511754SKacheong.Poon@Sun.COM 		}
487611754SKacheong.Poon@Sun.COM 
487711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rcv_list != NULL) {
487811754SKacheong.Poon@Sun.COM 			/*
487911754SKacheong.Poon@Sun.COM 			 * Push any mblk(s) enqueued from co processing.
488011754SKacheong.Poon@Sun.COM 			 */
488111754SKacheong.Poon@Sun.COM 			flags |= tcp_rcv_drain(tcp);
488211754SKacheong.Poon@Sun.COM 		}
488311754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
488411754SKacheong.Poon@Sun.COM 
488511754SKacheong.Poon@Sun.COM 		mp1 = tcp->tcp_ordrel_mp;
488611754SKacheong.Poon@Sun.COM 		tcp->tcp_ordrel_mp = NULL;
488711754SKacheong.Poon@Sun.COM 		tcp->tcp_ordrel_done = B_TRUE;
488811754SKacheong.Poon@Sun.COM 		putnext(connp->conn_rq, mp1);
488911754SKacheong.Poon@Sun.COM 	}
489011754SKacheong.Poon@Sun.COM done:
489111754SKacheong.Poon@Sun.COM 	ASSERT(!(flags & TH_MARKNEXT_NEEDED));
489211754SKacheong.Poon@Sun.COM }
489311754SKacheong.Poon@Sun.COM 
489411754SKacheong.Poon@Sun.COM /*
489511754SKacheong.Poon@Sun.COM  * Attach ancillary data to a received TCP segments for the
489611754SKacheong.Poon@Sun.COM  * ancillary pieces requested by the application that are
489711754SKacheong.Poon@Sun.COM  * different than they were in the previous data segment.
489811754SKacheong.Poon@Sun.COM  *
489911754SKacheong.Poon@Sun.COM  * Save the "current" values once memory allocation is ok so that
490011754SKacheong.Poon@Sun.COM  * when memory allocation fails we can just wait for the next data segment.
490111754SKacheong.Poon@Sun.COM  */
490211754SKacheong.Poon@Sun.COM static mblk_t *
tcp_input_add_ancillary(tcp_t * tcp,mblk_t * mp,ip_pkt_t * ipp,ip_recv_attr_t * ira)490311754SKacheong.Poon@Sun.COM tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
490411754SKacheong.Poon@Sun.COM     ip_recv_attr_t *ira)
490511754SKacheong.Poon@Sun.COM {
490611754SKacheong.Poon@Sun.COM 	struct T_optdata_ind *todi;
490711754SKacheong.Poon@Sun.COM 	int optlen;
490811754SKacheong.Poon@Sun.COM 	uchar_t *optptr;
490911754SKacheong.Poon@Sun.COM 	struct T_opthdr *toh;
491011754SKacheong.Poon@Sun.COM 	crb_t addflag;	/* Which pieces to add */
491111754SKacheong.Poon@Sun.COM 	mblk_t *mp1;
491211754SKacheong.Poon@Sun.COM 	conn_t	*connp = tcp->tcp_connp;
491311754SKacheong.Poon@Sun.COM 
491411754SKacheong.Poon@Sun.COM 	optlen = 0;
491511754SKacheong.Poon@Sun.COM 	addflag.crb_all = 0;
491611754SKacheong.Poon@Sun.COM 	/* If app asked for pktinfo and the index has changed ... */
491711754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_ip_recvpktinfo &&
491811754SKacheong.Poon@Sun.COM 	    ira->ira_ruifindex != tcp->tcp_recvifindex) {
491911754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) +
492011754SKacheong.Poon@Sun.COM 		    sizeof (struct in6_pktinfo);
492111754SKacheong.Poon@Sun.COM 		addflag.crb_ip_recvpktinfo = 1;
492211754SKacheong.Poon@Sun.COM 	}
492311754SKacheong.Poon@Sun.COM 	/* If app asked for hoplimit and it has changed ... */
492411754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit &&
492511754SKacheong.Poon@Sun.COM 	    ipp->ipp_hoplimit != tcp->tcp_recvhops) {
492611754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
492711754SKacheong.Poon@Sun.COM 		addflag.crb_ipv6_recvhoplimit = 1;
492811754SKacheong.Poon@Sun.COM 	}
492911754SKacheong.Poon@Sun.COM 	/* If app asked for tclass and it has changed ... */
493011754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_ipv6_recvtclass &&
493111754SKacheong.Poon@Sun.COM 	    ipp->ipp_tclass != tcp->tcp_recvtclass) {
493211754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
493311754SKacheong.Poon@Sun.COM 		addflag.crb_ipv6_recvtclass = 1;
493411754SKacheong.Poon@Sun.COM 	}
493511754SKacheong.Poon@Sun.COM 	/*
493611754SKacheong.Poon@Sun.COM 	 * If app asked for hopbyhop headers and it has changed ...
493711754SKacheong.Poon@Sun.COM 	 * For security labels, note that (1) security labels can't change on
493811754SKacheong.Poon@Sun.COM 	 * a connected socket at all, (2) we're connected to at most one peer,
493911754SKacheong.Poon@Sun.COM 	 * (3) if anything changes, then it must be some other extra option.
494011754SKacheong.Poon@Sun.COM 	 */
494111754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts &&
494211754SKacheong.Poon@Sun.COM 	    ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen,
494311754SKacheong.Poon@Sun.COM 	    (ipp->ipp_fields & IPPF_HOPOPTS),
494411754SKacheong.Poon@Sun.COM 	    ipp->ipp_hopopts, ipp->ipp_hopoptslen)) {
494511754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen;
494611754SKacheong.Poon@Sun.COM 		addflag.crb_ipv6_recvhopopts = 1;
494711754SKacheong.Poon@Sun.COM 		if (!ip_allocbuf((void **)&tcp->tcp_hopopts,
494811754SKacheong.Poon@Sun.COM 		    &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS),
494911754SKacheong.Poon@Sun.COM 		    ipp->ipp_hopopts, ipp->ipp_hopoptslen))
495011754SKacheong.Poon@Sun.COM 			return (mp);
495111754SKacheong.Poon@Sun.COM 	}
495211754SKacheong.Poon@Sun.COM 	/* If app asked for dst headers before routing headers ... */
495311754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts &&
495411754SKacheong.Poon@Sun.COM 	    ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen,
495511754SKacheong.Poon@Sun.COM 	    (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
495611754SKacheong.Poon@Sun.COM 	    ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) {
495711754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) +
495811754SKacheong.Poon@Sun.COM 		    ipp->ipp_rthdrdstoptslen;
495911754SKacheong.Poon@Sun.COM 		addflag.crb_ipv6_recvrthdrdstopts = 1;
496011754SKacheong.Poon@Sun.COM 		if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts,
496111754SKacheong.Poon@Sun.COM 		    &tcp->tcp_rthdrdstoptslen,
496211754SKacheong.Poon@Sun.COM 		    (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
496311754SKacheong.Poon@Sun.COM 		    ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen))
496411754SKacheong.Poon@Sun.COM 			return (mp);
496511754SKacheong.Poon@Sun.COM 	}
496611754SKacheong.Poon@Sun.COM 	/* If app asked for routing headers and it has changed ... */
496711754SKacheong.Poon@Sun.COM 	if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr &&
496811754SKacheong.Poon@Sun.COM 	    ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen,
496911754SKacheong.Poon@Sun.COM 	    (ipp->ipp_fields & IPPF_RTHDR),
497011754SKacheong.Poon@Sun.COM 	    ipp->ipp_rthdr, ipp->ipp_rthdrlen)) {
497111754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen;
497211754SKacheong.Poon@Sun.COM 		addflag.crb_ipv6_recvrthdr = 1;
497311754SKacheong.Poon@Sun.COM 		if (!ip_allocbuf((void **)&tcp->tcp_rthdr,
497411754SKacheong.Poon@Sun.COM 		    &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR),
497511754SKacheong.Poon@Sun.COM 		    ipp->ipp_rthdr, ipp->ipp_rthdrlen))
497611754SKacheong.Poon@Sun.COM 			return (mp);
497711754SKacheong.Poon@Sun.COM 	}
497811754SKacheong.Poon@Sun.COM 	/* If app asked for dest headers and it has changed ... */
497911754SKacheong.Poon@Sun.COM 	if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts ||
498011754SKacheong.Poon@Sun.COM 	    connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) &&
498111754SKacheong.Poon@Sun.COM 	    ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen,
498211754SKacheong.Poon@Sun.COM 	    (ipp->ipp_fields & IPPF_DSTOPTS),
498311754SKacheong.Poon@Sun.COM 	    ipp->ipp_dstopts, ipp->ipp_dstoptslen)) {
498411754SKacheong.Poon@Sun.COM 		optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen;
498511754SKacheong.Poon@Sun.COM 		addflag.crb_ipv6_recvdstopts = 1;
498611754SKacheong.Poon@Sun.COM 		if (!ip_allocbuf((void **)&tcp->tcp_dstopts,
498711754SKacheong.Poon@Sun.COM 		    &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS),
498811754SKacheong.Poon@Sun.COM 		    ipp->ipp_dstopts, ipp->ipp_dstoptslen))
498911754SKacheong.Poon@Sun.COM 			return (mp);
499011754SKacheong.Poon@Sun.COM 	}
499111754SKacheong.Poon@Sun.COM 
499211754SKacheong.Poon@Sun.COM 	if (optlen == 0) {
499311754SKacheong.Poon@Sun.COM 		/* Nothing to add */
499411754SKacheong.Poon@Sun.COM 		return (mp);
499511754SKacheong.Poon@Sun.COM 	}
499611754SKacheong.Poon@Sun.COM 	mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED);
499711754SKacheong.Poon@Sun.COM 	if (mp1 == NULL) {
499811754SKacheong.Poon@Sun.COM 		/*
499911754SKacheong.Poon@Sun.COM 		 * Defer sending ancillary data until the next TCP segment
500011754SKacheong.Poon@Sun.COM 		 * arrives.
500111754SKacheong.Poon@Sun.COM 		 */
500211754SKacheong.Poon@Sun.COM 		return (mp);
500311754SKacheong.Poon@Sun.COM 	}
500411754SKacheong.Poon@Sun.COM 	mp1->b_cont = mp;
500511754SKacheong.Poon@Sun.COM 	mp = mp1;
500611754SKacheong.Poon@Sun.COM 	mp->b_wptr += sizeof (*todi) + optlen;
500711754SKacheong.Poon@Sun.COM 	mp->b_datap->db_type = M_PROTO;
500811754SKacheong.Poon@Sun.COM 	todi = (struct T_optdata_ind *)mp->b_rptr;
500911754SKacheong.Poon@Sun.COM 	todi->PRIM_type = T_OPTDATA_IND;
501011754SKacheong.Poon@Sun.COM 	todi->DATA_flag = 1;	/* MORE data */
501111754SKacheong.Poon@Sun.COM 	todi->OPT_length = optlen;
501211754SKacheong.Poon@Sun.COM 	todi->OPT_offset = sizeof (*todi);
501311754SKacheong.Poon@Sun.COM 	optptr = (uchar_t *)&todi[1];
501411754SKacheong.Poon@Sun.COM 	/*
501511754SKacheong.Poon@Sun.COM 	 * If app asked for pktinfo and the index has changed ...
501611754SKacheong.Poon@Sun.COM 	 * Note that the local address never changes for the connection.
501711754SKacheong.Poon@Sun.COM 	 */
501811754SKacheong.Poon@Sun.COM 	if (addflag.crb_ip_recvpktinfo) {
501911754SKacheong.Poon@Sun.COM 		struct in6_pktinfo *pkti;
502011754SKacheong.Poon@Sun.COM 		uint_t ifindex;
502111754SKacheong.Poon@Sun.COM 
502211754SKacheong.Poon@Sun.COM 		ifindex = ira->ira_ruifindex;
502311754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
502411754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
502511754SKacheong.Poon@Sun.COM 		toh->name = IPV6_PKTINFO;
502611754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + sizeof (*pkti);
502711754SKacheong.Poon@Sun.COM 		toh->status = 0;
502811754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
502911754SKacheong.Poon@Sun.COM 		pkti = (struct in6_pktinfo *)optptr;
503011754SKacheong.Poon@Sun.COM 		pkti->ipi6_addr = connp->conn_laddr_v6;
503111754SKacheong.Poon@Sun.COM 		pkti->ipi6_ifindex = ifindex;
503211754SKacheong.Poon@Sun.COM 		optptr += sizeof (*pkti);
503311754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
503411754SKacheong.Poon@Sun.COM 		/* Save as "last" value */
503511754SKacheong.Poon@Sun.COM 		tcp->tcp_recvifindex = ifindex;
503611754SKacheong.Poon@Sun.COM 	}
503711754SKacheong.Poon@Sun.COM 	/* If app asked for hoplimit and it has changed ... */
503811754SKacheong.Poon@Sun.COM 	if (addflag.crb_ipv6_recvhoplimit) {
503911754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
504011754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
504111754SKacheong.Poon@Sun.COM 		toh->name = IPV6_HOPLIMIT;
504211754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + sizeof (uint_t);
504311754SKacheong.Poon@Sun.COM 		toh->status = 0;
504411754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
504511754SKacheong.Poon@Sun.COM 		*(uint_t *)optptr = ipp->ipp_hoplimit;
504611754SKacheong.Poon@Sun.COM 		optptr += sizeof (uint_t);
504711754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
504811754SKacheong.Poon@Sun.COM 		/* Save as "last" value */
504911754SKacheong.Poon@Sun.COM 		tcp->tcp_recvhops = ipp->ipp_hoplimit;
505011754SKacheong.Poon@Sun.COM 	}
505111754SKacheong.Poon@Sun.COM 	/* If app asked for tclass and it has changed ... */
505211754SKacheong.Poon@Sun.COM 	if (addflag.crb_ipv6_recvtclass) {
505311754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
505411754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
505511754SKacheong.Poon@Sun.COM 		toh->name = IPV6_TCLASS;
505611754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + sizeof (uint_t);
505711754SKacheong.Poon@Sun.COM 		toh->status = 0;
505811754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
505911754SKacheong.Poon@Sun.COM 		*(uint_t *)optptr = ipp->ipp_tclass;
506011754SKacheong.Poon@Sun.COM 		optptr += sizeof (uint_t);
506111754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
506211754SKacheong.Poon@Sun.COM 		/* Save as "last" value */
506311754SKacheong.Poon@Sun.COM 		tcp->tcp_recvtclass = ipp->ipp_tclass;
506411754SKacheong.Poon@Sun.COM 	}
506511754SKacheong.Poon@Sun.COM 	if (addflag.crb_ipv6_recvhopopts) {
506611754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
506711754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
506811754SKacheong.Poon@Sun.COM 		toh->name = IPV6_HOPOPTS;
506911754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + ipp->ipp_hopoptslen;
507011754SKacheong.Poon@Sun.COM 		toh->status = 0;
507111754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
507211754SKacheong.Poon@Sun.COM 		bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen);
507311754SKacheong.Poon@Sun.COM 		optptr += ipp->ipp_hopoptslen;
507411754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
507511754SKacheong.Poon@Sun.COM 		/* Save as last value */
507611754SKacheong.Poon@Sun.COM 		ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen,
507711754SKacheong.Poon@Sun.COM 		    (ipp->ipp_fields & IPPF_HOPOPTS),
507811754SKacheong.Poon@Sun.COM 		    ipp->ipp_hopopts, ipp->ipp_hopoptslen);
507911754SKacheong.Poon@Sun.COM 	}
508011754SKacheong.Poon@Sun.COM 	if (addflag.crb_ipv6_recvrthdrdstopts) {
508111754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
508211754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
508311754SKacheong.Poon@Sun.COM 		toh->name = IPV6_RTHDRDSTOPTS;
508411754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen;
508511754SKacheong.Poon@Sun.COM 		toh->status = 0;
508611754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
508711754SKacheong.Poon@Sun.COM 		bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen);
508811754SKacheong.Poon@Sun.COM 		optptr += ipp->ipp_rthdrdstoptslen;
508911754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
509011754SKacheong.Poon@Sun.COM 		/* Save as last value */
509111754SKacheong.Poon@Sun.COM 		ip_savebuf((void **)&tcp->tcp_rthdrdstopts,
509211754SKacheong.Poon@Sun.COM 		    &tcp->tcp_rthdrdstoptslen,
509311754SKacheong.Poon@Sun.COM 		    (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
509411754SKacheong.Poon@Sun.COM 		    ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
509511754SKacheong.Poon@Sun.COM 	}
509611754SKacheong.Poon@Sun.COM 	if (addflag.crb_ipv6_recvrthdr) {
509711754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
509811754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
509911754SKacheong.Poon@Sun.COM 		toh->name = IPV6_RTHDR;
510011754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + ipp->ipp_rthdrlen;
510111754SKacheong.Poon@Sun.COM 		toh->status = 0;
510211754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
510311754SKacheong.Poon@Sun.COM 		bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen);
510411754SKacheong.Poon@Sun.COM 		optptr += ipp->ipp_rthdrlen;
510511754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
510611754SKacheong.Poon@Sun.COM 		/* Save as last value */
510711754SKacheong.Poon@Sun.COM 		ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen,
510811754SKacheong.Poon@Sun.COM 		    (ipp->ipp_fields & IPPF_RTHDR),
510911754SKacheong.Poon@Sun.COM 		    ipp->ipp_rthdr, ipp->ipp_rthdrlen);
511011754SKacheong.Poon@Sun.COM 	}
511111754SKacheong.Poon@Sun.COM 	if (addflag.crb_ipv6_recvdstopts) {
511211754SKacheong.Poon@Sun.COM 		toh = (struct T_opthdr *)optptr;
511311754SKacheong.Poon@Sun.COM 		toh->level = IPPROTO_IPV6;
511411754SKacheong.Poon@Sun.COM 		toh->name = IPV6_DSTOPTS;
511511754SKacheong.Poon@Sun.COM 		toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
511611754SKacheong.Poon@Sun.COM 		toh->status = 0;
511711754SKacheong.Poon@Sun.COM 		optptr += sizeof (*toh);
511811754SKacheong.Poon@Sun.COM 		bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
511911754SKacheong.Poon@Sun.COM 		optptr += ipp->ipp_dstoptslen;
512011754SKacheong.Poon@Sun.COM 		ASSERT(OK_32PTR(optptr));
512111754SKacheong.Poon@Sun.COM 		/* Save as last value */
512211754SKacheong.Poon@Sun.COM 		ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
512311754SKacheong.Poon@Sun.COM 		    (ipp->ipp_fields & IPPF_DSTOPTS),
512411754SKacheong.Poon@Sun.COM 		    ipp->ipp_dstopts, ipp->ipp_dstoptslen);
512511754SKacheong.Poon@Sun.COM 	}
512611754SKacheong.Poon@Sun.COM 	ASSERT(optptr == mp->b_wptr);
512711754SKacheong.Poon@Sun.COM 	return (mp);
512811754SKacheong.Poon@Sun.COM }
512911754SKacheong.Poon@Sun.COM 
513011754SKacheong.Poon@Sun.COM /*
 * Lower bound applied to the smoothed mean deviation in the RTO
 * calculation.
 * NOTE(review): presumably expressed in milliseconds, like the
 * TICK_TO_MSEC() sample used by tcp_set_rto() -- confirm at the use site.
 */
513111754SKacheong.Poon@Sun.COM #define	TCP_SD_MIN	400
513211754SKacheong.Poon@Sun.COM 
513311754SKacheong.Poon@Sun.COM /*
513411754SKacheong.Poon@Sun.COM  * Set RTO for this connection.  The formula is from Jacobson and Karels'
513511754SKacheong.Poon@Sun.COM  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
513611754SKacheong.Poon@Sun.COM  * are the same as those in Appendix A.2 of that paper.
513711754SKacheong.Poon@Sun.COM  *
513811754SKacheong.Poon@Sun.COM  * m = new measurement
513911754SKacheong.Poon@Sun.COM  * sa = smoothed RTT average (8 * average estimates).
514011754SKacheong.Poon@Sun.COM  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
514111754SKacheong.Poon@Sun.COM  */
514211754SKacheong.Poon@Sun.COM static void
tcp_set_rto(tcp_t * tcp,clock_t rtt)514311754SKacheong.Poon@Sun.COM tcp_set_rto(tcp_t *tcp, clock_t rtt)
514411754SKacheong.Poon@Sun.COM {
514511754SKacheong.Poon@Sun.COM 	long m = TICK_TO_MSEC(rtt);
514611754SKacheong.Poon@Sun.COM 	clock_t sa = tcp->tcp_rtt_sa;
514711754SKacheong.Poon@Sun.COM 	clock_t sv = tcp->tcp_rtt_sd;
514811754SKacheong.Poon@Sun.COM 	clock_t rto;
514911754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
515011754SKacheong.Poon@Sun.COM 
515111754SKacheong.Poon@Sun.COM 	TCPS_BUMP_MIB(tcps, tcpRttUpdate);
515211754SKacheong.Poon@Sun.COM 	tcp->tcp_rtt_update++;
515311754SKacheong.Poon@Sun.COM 
515411754SKacheong.Poon@Sun.COM 	/* tcp_rtt_sa is not 0 means this is a new sample. */
515511754SKacheong.Poon@Sun.COM 	if (sa != 0) {
515611754SKacheong.Poon@Sun.COM 		/*
515711754SKacheong.Poon@Sun.COM 		 * Update average estimator:
515811754SKacheong.Poon@Sun.COM 		 *	new rtt = 7/8 old rtt + 1/8 Error
515911754SKacheong.Poon@Sun.COM 		 */
516011754SKacheong.Poon@Sun.COM 
516111754SKacheong.Poon@Sun.COM 		/* m is now Error in estimate. */
516211754SKacheong.Poon@Sun.COM 		m -= sa >> 3;
516311754SKacheong.Poon@Sun.COM 		if ((sa += m) <= 0) {
516411754SKacheong.Poon@Sun.COM 			/*
516511754SKacheong.Poon@Sun.COM 			 * Don't allow the smoothed average to be negative.
516611754SKacheong.Poon@Sun.COM 			 * We use 0 to denote reinitialization of the
516711754SKacheong.Poon@Sun.COM 			 * variables.
516811754SKacheong.Poon@Sun.COM 			 */
516911754SKacheong.Poon@Sun.COM 			sa = 1;
517011754SKacheong.Poon@Sun.COM 		}
517111754SKacheong.Poon@Sun.COM 
517211754SKacheong.Poon@Sun.COM 		/*
517311754SKacheong.Poon@Sun.COM 		 * Update deviation estimator:
517411754SKacheong.Poon@Sun.COM 		 *	new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
517511754SKacheong.Poon@Sun.COM 		 */
517611754SKacheong.Poon@Sun.COM 		if (m < 0)
517711754SKacheong.Poon@Sun.COM 			m = -m;
517811754SKacheong.Poon@Sun.COM 		m -= sv >> 2;
517911754SKacheong.Poon@Sun.COM 		sv += m;
518011754SKacheong.Poon@Sun.COM 	} else {
518111754SKacheong.Poon@Sun.COM 		/*
518211754SKacheong.Poon@Sun.COM 		 * This follows BSD's implementation.  So the reinitialized
518311754SKacheong.Poon@Sun.COM 		 * RTO is 3 * m.  We cannot go less than 2 because if the
518411754SKacheong.Poon@Sun.COM 		 * link is bandwidth dominated, doubling the window size
518511754SKacheong.Poon@Sun.COM 		 * during slow start means doubling the RTT.  We want to be
518611754SKacheong.Poon@Sun.COM 		 * more conservative when we reinitialize our estimates.  3
518711754SKacheong.Poon@Sun.COM 		 * is just a convenient number.
518811754SKacheong.Poon@Sun.COM 		 */
518911754SKacheong.Poon@Sun.COM 		sa = m << 3;
519011754SKacheong.Poon@Sun.COM 		sv = m << 1;
519111754SKacheong.Poon@Sun.COM 	}
519211754SKacheong.Poon@Sun.COM 	if (sv < TCP_SD_MIN) {
519311754SKacheong.Poon@Sun.COM 		/*
519411754SKacheong.Poon@Sun.COM 		 * We do not know that if sa captures the delay ACK
519511754SKacheong.Poon@Sun.COM 		 * effect as in a long train of segments, a receiver
519611754SKacheong.Poon@Sun.COM 		 * does not delay its ACKs.  So set the minimum of sv
519711754SKacheong.Poon@Sun.COM 		 * to be TCP_SD_MIN, which is default to 400 ms, twice
519811754SKacheong.Poon@Sun.COM 		 * of BSD DATO.  That means the minimum of mean
519911754SKacheong.Poon@Sun.COM 		 * deviation is 100 ms.
520011754SKacheong.Poon@Sun.COM 		 *
520111754SKacheong.Poon@Sun.COM 		 */
520211754SKacheong.Poon@Sun.COM 		sv = TCP_SD_MIN;
520311754SKacheong.Poon@Sun.COM 	}
520411754SKacheong.Poon@Sun.COM 	tcp->tcp_rtt_sa = sa;
520511754SKacheong.Poon@Sun.COM 	tcp->tcp_rtt_sd = sv;
520611754SKacheong.Poon@Sun.COM 	/*
520711754SKacheong.Poon@Sun.COM 	 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
520811754SKacheong.Poon@Sun.COM 	 *
520911754SKacheong.Poon@Sun.COM 	 * Add tcp_rexmit_interval extra in case of extreme environment
521011754SKacheong.Poon@Sun.COM 	 * where the algorithm fails to work.  The default value of
521111754SKacheong.Poon@Sun.COM 	 * tcp_rexmit_interval_extra should be 0.
521211754SKacheong.Poon@Sun.COM 	 *
521311754SKacheong.Poon@Sun.COM 	 * As we use a finer grained clock than BSD and update
521411754SKacheong.Poon@Sun.COM 	 * RTO for every ACKs, add in another .25 of RTT to the
521511754SKacheong.Poon@Sun.COM 	 * deviation of RTO to accomodate burstiness of 1/4 of
521611754SKacheong.Poon@Sun.COM 	 * window size.
521711754SKacheong.Poon@Sun.COM 	 */
521811754SKacheong.Poon@Sun.COM 	rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
521911754SKacheong.Poon@Sun.COM 
522012544SKacheong.Poon@Sun.COM 	TCP_SET_RTO(tcp, rto);
522111754SKacheong.Poon@Sun.COM 
522211754SKacheong.Poon@Sun.COM 	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
522311754SKacheong.Poon@Sun.COM 	tcp->tcp_timer_backoff = 0;
522411754SKacheong.Poon@Sun.COM }
522511754SKacheong.Poon@Sun.COM 
522611754SKacheong.Poon@Sun.COM /*
522711754SKacheong.Poon@Sun.COM  * On a labeled system we have some protocols above TCP, such as RPC, which
522811754SKacheong.Poon@Sun.COM  * appear to assume that every mblk in a chain has a db_credp.
522911754SKacheong.Poon@Sun.COM  */
523011754SKacheong.Poon@Sun.COM static void
tcp_setcred_data(mblk_t * mp,ip_recv_attr_t * ira)523111754SKacheong.Poon@Sun.COM tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
523211754SKacheong.Poon@Sun.COM {
523311754SKacheong.Poon@Sun.COM 	ASSERT(is_system_labeled());
523411754SKacheong.Poon@Sun.COM 	ASSERT(ira->ira_cred != NULL);
523511754SKacheong.Poon@Sun.COM 
523611754SKacheong.Poon@Sun.COM 	while (mp != NULL) {
523711754SKacheong.Poon@Sun.COM 		mblk_setcred(mp, ira->ira_cred, NOPID);
523811754SKacheong.Poon@Sun.COM 		mp = mp->b_cont;
523911754SKacheong.Poon@Sun.COM 	}
524011754SKacheong.Poon@Sun.COM }
524111754SKacheong.Poon@Sun.COM 
524211754SKacheong.Poon@Sun.COM uint_t
tcp_rwnd_reopen(tcp_t * tcp)524311754SKacheong.Poon@Sun.COM tcp_rwnd_reopen(tcp_t *tcp)
524411754SKacheong.Poon@Sun.COM {
524511754SKacheong.Poon@Sun.COM 	uint_t ret = 0;
524611754SKacheong.Poon@Sun.COM 	uint_t thwin;
524711754SKacheong.Poon@Sun.COM 	conn_t *connp = tcp->tcp_connp;
524811754SKacheong.Poon@Sun.COM 
524911754SKacheong.Poon@Sun.COM 	/* Learn the latest rwnd information that we sent to the other side. */
525011754SKacheong.Poon@Sun.COM 	thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win))
525111754SKacheong.Poon@Sun.COM 	    << tcp->tcp_rcv_ws;
525211754SKacheong.Poon@Sun.COM 	/* This is peer's calculated send window (our receive window). */
525311754SKacheong.Poon@Sun.COM 	thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
525411754SKacheong.Poon@Sun.COM 	/*
525511754SKacheong.Poon@Sun.COM 	 * Increase the receive window to max.  But we need to do receiver
525611754SKacheong.Poon@Sun.COM 	 * SWS avoidance.  This means that we need to check the increase of
525711754SKacheong.Poon@Sun.COM 	 * of receive window is at least 1 MSS.
525811754SKacheong.Poon@Sun.COM 	 */
525911754SKacheong.Poon@Sun.COM 	if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) {
526011754SKacheong.Poon@Sun.COM 		/*
526111754SKacheong.Poon@Sun.COM 		 * If the window that the other side knows is less than max
526211754SKacheong.Poon@Sun.COM 		 * deferred acks segments, send an update immediately.
526311754SKacheong.Poon@Sun.COM 		 */
526411754SKacheong.Poon@Sun.COM 		if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
526511754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutWinUpdate);
526611754SKacheong.Poon@Sun.COM 			ret = TH_ACK_NEEDED;
526711754SKacheong.Poon@Sun.COM 		}
526811754SKacheong.Poon@Sun.COM 		tcp->tcp_rwnd = connp->conn_rcvbuf;
526911754SKacheong.Poon@Sun.COM 	}
527011754SKacheong.Poon@Sun.COM 	return (ret);
527111754SKacheong.Poon@Sun.COM }
527211754SKacheong.Poon@Sun.COM 
527311754SKacheong.Poon@Sun.COM /*
527411754SKacheong.Poon@Sun.COM  * Handle a packet that has been reclassified by TCP.
527511754SKacheong.Poon@Sun.COM  * This function drops the ref on connp that the caller had.
527611754SKacheong.Poon@Sun.COM  */
527711754SKacheong.Poon@Sun.COM void
tcp_reinput(conn_t * connp,mblk_t * mp,ip_recv_attr_t * ira,ip_stack_t * ipst)527811754SKacheong.Poon@Sun.COM tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
527911754SKacheong.Poon@Sun.COM {
528011754SKacheong.Poon@Sun.COM 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
528111754SKacheong.Poon@Sun.COM 
528211754SKacheong.Poon@Sun.COM 	if (connp->conn_incoming_ifindex != 0 &&
528311754SKacheong.Poon@Sun.COM 	    connp->conn_incoming_ifindex != ira->ira_ruifindex) {
528411754SKacheong.Poon@Sun.COM 		freemsg(mp);
528511754SKacheong.Poon@Sun.COM 		CONN_DEC_REF(connp);
528611754SKacheong.Poon@Sun.COM 		return;
528711754SKacheong.Poon@Sun.COM 	}
528811754SKacheong.Poon@Sun.COM 
528911754SKacheong.Poon@Sun.COM 	if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
529011754SKacheong.Poon@Sun.COM 	    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
529111754SKacheong.Poon@Sun.COM 		ip6_t *ip6h;
529211754SKacheong.Poon@Sun.COM 		ipha_t *ipha;
529311754SKacheong.Poon@Sun.COM 
529411754SKacheong.Poon@Sun.COM 		if (ira->ira_flags & IRAF_IS_IPV4) {
529511754SKacheong.Poon@Sun.COM 			ipha = (ipha_t *)mp->b_rptr;
529611754SKacheong.Poon@Sun.COM 			ip6h = NULL;
529711754SKacheong.Poon@Sun.COM 		} else {
529811754SKacheong.Poon@Sun.COM 			ipha = NULL;
529911754SKacheong.Poon@Sun.COM 			ip6h = (ip6_t *)mp->b_rptr;
530011754SKacheong.Poon@Sun.COM 		}
530111754SKacheong.Poon@Sun.COM 		mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira);
530211754SKacheong.Poon@Sun.COM 		if (mp == NULL) {
530311754SKacheong.Poon@Sun.COM 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
530411754SKacheong.Poon@Sun.COM 			/* Note that mp is NULL */
530511754SKacheong.Poon@Sun.COM 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
530611754SKacheong.Poon@Sun.COM 			CONN_DEC_REF(connp);
530711754SKacheong.Poon@Sun.COM 			return;
530811754SKacheong.Poon@Sun.COM 		}
530911754SKacheong.Poon@Sun.COM 	}
531011754SKacheong.Poon@Sun.COM 
531111754SKacheong.Poon@Sun.COM 	if (IPCL_IS_TCP(connp)) {
531211754SKacheong.Poon@Sun.COM 		/*
531311754SKacheong.Poon@Sun.COM 		 * do not drain, certain use cases can blow
531411754SKacheong.Poon@Sun.COM 		 * the stack
531511754SKacheong.Poon@Sun.COM 		 */
531611754SKacheong.Poon@Sun.COM 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
531711754SKacheong.Poon@Sun.COM 		    connp->conn_recv, connp, ira,
531811754SKacheong.Poon@Sun.COM 		    SQ_NODRAIN, SQTAG_IP_TCP_INPUT);
531911754SKacheong.Poon@Sun.COM 	} else {
532011754SKacheong.Poon@Sun.COM 		/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
532111754SKacheong.Poon@Sun.COM 		(connp->conn_recv)(connp, mp, NULL,
532211754SKacheong.Poon@Sun.COM 		    ira);
532311754SKacheong.Poon@Sun.COM 		CONN_DEC_REF(connp);
532411754SKacheong.Poon@Sun.COM 	}
532511754SKacheong.Poon@Sun.COM 
532611754SKacheong.Poon@Sun.COM }
532711754SKacheong.Poon@Sun.COM 
532811754SKacheong.Poon@Sun.COM /* ARGSUSED */
532911754SKacheong.Poon@Sun.COM static void
tcp_rsrv_input(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)533011754SKacheong.Poon@Sun.COM tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
533111754SKacheong.Poon@Sun.COM {
533211754SKacheong.Poon@Sun.COM 	conn_t	*connp = (conn_t *)arg;
533311754SKacheong.Poon@Sun.COM 	tcp_t	*tcp = connp->conn_tcp;
533411754SKacheong.Poon@Sun.COM 	queue_t	*q = connp->conn_rq;
533511754SKacheong.Poon@Sun.COM 
533611754SKacheong.Poon@Sun.COM 	ASSERT(!IPCL_IS_NONSTR(connp));
533711754SKacheong.Poon@Sun.COM 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
533811754SKacheong.Poon@Sun.COM 	tcp->tcp_rsrv_mp = mp;
533911754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
534011754SKacheong.Poon@Sun.COM 
534111754SKacheong.Poon@Sun.COM 	if (TCP_IS_DETACHED(tcp) || q == NULL) {
534211754SKacheong.Poon@Sun.COM 		return;
534311754SKacheong.Poon@Sun.COM 	}
534411754SKacheong.Poon@Sun.COM 
534511754SKacheong.Poon@Sun.COM 	if (tcp->tcp_fused) {
534611754SKacheong.Poon@Sun.COM 		tcp_fuse_backenable(tcp);
534711754SKacheong.Poon@Sun.COM 		return;
534811754SKacheong.Poon@Sun.COM 	}
534911754SKacheong.Poon@Sun.COM 
535011754SKacheong.Poon@Sun.COM 	if (canputnext(q)) {
535111754SKacheong.Poon@Sun.COM 		/* Not flow-controlled, open rwnd */
535211754SKacheong.Poon@Sun.COM 		tcp->tcp_rwnd = connp->conn_rcvbuf;
535311754SKacheong.Poon@Sun.COM 
535411754SKacheong.Poon@Sun.COM 		/*
535511754SKacheong.Poon@Sun.COM 		 * Send back a window update immediately if TCP is above
535611754SKacheong.Poon@Sun.COM 		 * ESTABLISHED state and the increase of the rcv window
535711754SKacheong.Poon@Sun.COM 		 * that the other side knows is at least 1 MSS after flow
535811754SKacheong.Poon@Sun.COM 		 * control is lifted.
535911754SKacheong.Poon@Sun.COM 		 */
536011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
536111754SKacheong.Poon@Sun.COM 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
536211754SKacheong.Poon@Sun.COM 			tcp_xmit_ctl(NULL, tcp,
536311754SKacheong.Poon@Sun.COM 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
536411754SKacheong.Poon@Sun.COM 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
536511754SKacheong.Poon@Sun.COM 		}
536611754SKacheong.Poon@Sun.COM 	}
536711754SKacheong.Poon@Sun.COM }
536811754SKacheong.Poon@Sun.COM 
536911754SKacheong.Poon@Sun.COM /*
537011754SKacheong.Poon@Sun.COM  * The read side service routine is called mostly when we get back-enabled as a
537111754SKacheong.Poon@Sun.COM  * result of flow control relief.  Since we don't actually queue anything in
537211754SKacheong.Poon@Sun.COM  * TCP, we have no data to send out of here.  What we do is clear the receive
537311754SKacheong.Poon@Sun.COM  * window, and send out a window update.
537411754SKacheong.Poon@Sun.COM  */
537511754SKacheong.Poon@Sun.COM void
tcp_rsrv(queue_t * q)537611754SKacheong.Poon@Sun.COM tcp_rsrv(queue_t *q)
537711754SKacheong.Poon@Sun.COM {
537811754SKacheong.Poon@Sun.COM 	conn_t		*connp = Q_TO_CONN(q);
537911754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
538011754SKacheong.Poon@Sun.COM 	mblk_t		*mp;
538111754SKacheong.Poon@Sun.COM 
538211754SKacheong.Poon@Sun.COM 	/* No code does a putq on the read side */
538311754SKacheong.Poon@Sun.COM 	ASSERT(q->q_first == NULL);
538411754SKacheong.Poon@Sun.COM 
538511754SKacheong.Poon@Sun.COM 	/*
538611754SKacheong.Poon@Sun.COM 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already
538711754SKacheong.Poon@Sun.COM 	 * been run.  So just return.
538811754SKacheong.Poon@Sun.COM 	 */
538911754SKacheong.Poon@Sun.COM 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
539011754SKacheong.Poon@Sun.COM 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
539111754SKacheong.Poon@Sun.COM 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
539211754SKacheong.Poon@Sun.COM 		return;
539311754SKacheong.Poon@Sun.COM 	}
539411754SKacheong.Poon@Sun.COM 	tcp->tcp_rsrv_mp = NULL;
539511754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
539611754SKacheong.Poon@Sun.COM 
539711754SKacheong.Poon@Sun.COM 	CONN_INC_REF(connp);
539811754SKacheong.Poon@Sun.COM 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
539911754SKacheong.Poon@Sun.COM 	    NULL, SQ_PROCESS, SQTAG_TCP_RSRV);
540011754SKacheong.Poon@Sun.COM }
540111754SKacheong.Poon@Sun.COM 
540211754SKacheong.Poon@Sun.COM /* At minimum we need 8 bytes in the TCP header for the lookup */
540311754SKacheong.Poon@Sun.COM #define	ICMP_MIN_TCP_HDR	8
540411754SKacheong.Poon@Sun.COM 
540511754SKacheong.Poon@Sun.COM /*
540611754SKacheong.Poon@Sun.COM  * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
540711754SKacheong.Poon@Sun.COM  * passed up by IP. The message is always received on the correct tcp_t.
540811754SKacheong.Poon@Sun.COM  * Assumes that IP has pulled up everything up to and including the ICMP header.
540911754SKacheong.Poon@Sun.COM  */
541011754SKacheong.Poon@Sun.COM /* ARGSUSED2 */
541111754SKacheong.Poon@Sun.COM void
tcp_icmp_input(void * arg1,mblk_t * mp,void * arg2,ip_recv_attr_t * ira)541211754SKacheong.Poon@Sun.COM tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
541311754SKacheong.Poon@Sun.COM {
541411754SKacheong.Poon@Sun.COM 	conn_t		*connp = (conn_t *)arg1;
541511754SKacheong.Poon@Sun.COM 	icmph_t		*icmph;
541611754SKacheong.Poon@Sun.COM 	ipha_t		*ipha;
541711754SKacheong.Poon@Sun.COM 	int		iph_hdr_length;
541811754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
541911754SKacheong.Poon@Sun.COM 	uint32_t	seg_seq;
542011754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
542111754SKacheong.Poon@Sun.COM 
542211754SKacheong.Poon@Sun.COM 	/* Assume IP provides aligned packets */
542311754SKacheong.Poon@Sun.COM 	ASSERT(OK_32PTR(mp->b_rptr));
542411754SKacheong.Poon@Sun.COM 	ASSERT((MBLKL(mp) >= sizeof (ipha_t)));
542511754SKacheong.Poon@Sun.COM 
542611754SKacheong.Poon@Sun.COM 	/*
542711754SKacheong.Poon@Sun.COM 	 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
542811754SKacheong.Poon@Sun.COM 	 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
542911754SKacheong.Poon@Sun.COM 	 */
543011754SKacheong.Poon@Sun.COM 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
543111754SKacheong.Poon@Sun.COM 		tcp_icmp_error_ipv6(tcp, mp, ira);
543211754SKacheong.Poon@Sun.COM 		return;
543311754SKacheong.Poon@Sun.COM 	}
543411754SKacheong.Poon@Sun.COM 
543511754SKacheong.Poon@Sun.COM 	/* Skip past the outer IP and ICMP headers */
543611754SKacheong.Poon@Sun.COM 	iph_hdr_length = ira->ira_ip_hdr_length;
543711754SKacheong.Poon@Sun.COM 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
543811754SKacheong.Poon@Sun.COM 	/*
543911754SKacheong.Poon@Sun.COM 	 * If we don't have the correct outer IP header length
544011754SKacheong.Poon@Sun.COM 	 * or if we don't have a complete inner IP header
544111754SKacheong.Poon@Sun.COM 	 * drop it.
544211754SKacheong.Poon@Sun.COM 	 */
544311754SKacheong.Poon@Sun.COM 	if (iph_hdr_length < sizeof (ipha_t) ||
544411754SKacheong.Poon@Sun.COM 	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
544511754SKacheong.Poon@Sun.COM noticmpv4:
544611754SKacheong.Poon@Sun.COM 		freemsg(mp);
544711754SKacheong.Poon@Sun.COM 		return;
544811754SKacheong.Poon@Sun.COM 	}
544911754SKacheong.Poon@Sun.COM 	ipha = (ipha_t *)&icmph[1];
545011754SKacheong.Poon@Sun.COM 
545111754SKacheong.Poon@Sun.COM 	/* Skip past the inner IP and find the ULP header */
545211754SKacheong.Poon@Sun.COM 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
545311754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
545411754SKacheong.Poon@Sun.COM 	/*
545511754SKacheong.Poon@Sun.COM 	 * If we don't have the correct inner IP header length or if the ULP
545611754SKacheong.Poon@Sun.COM 	 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
545711754SKacheong.Poon@Sun.COM 	 * bytes of TCP header, drop it.
545811754SKacheong.Poon@Sun.COM 	 */
545911754SKacheong.Poon@Sun.COM 	if (iph_hdr_length < sizeof (ipha_t) ||
546011754SKacheong.Poon@Sun.COM 	    ipha->ipha_protocol != IPPROTO_TCP ||
546111754SKacheong.Poon@Sun.COM 	    (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
546211754SKacheong.Poon@Sun.COM 		goto noticmpv4;
546311754SKacheong.Poon@Sun.COM 	}
546411754SKacheong.Poon@Sun.COM 
546511754SKacheong.Poon@Sun.COM 	seg_seq = ntohl(tcpha->tha_seq);
546611754SKacheong.Poon@Sun.COM 	switch (icmph->icmph_type) {
546711754SKacheong.Poon@Sun.COM 	case ICMP_DEST_UNREACHABLE:
546811754SKacheong.Poon@Sun.COM 		switch (icmph->icmph_code) {
546911754SKacheong.Poon@Sun.COM 		case ICMP_FRAGMENTATION_NEEDED:
547011754SKacheong.Poon@Sun.COM 			/*
547111754SKacheong.Poon@Sun.COM 			 * Update Path MTU, then try to send something out.
547211754SKacheong.Poon@Sun.COM 			 */
547311754SKacheong.Poon@Sun.COM 			tcp_update_pmtu(tcp, B_TRUE);
547411754SKacheong.Poon@Sun.COM 			tcp_rexmit_after_error(tcp);
547511754SKacheong.Poon@Sun.COM 			break;
547611754SKacheong.Poon@Sun.COM 		case ICMP_PORT_UNREACHABLE:
547711754SKacheong.Poon@Sun.COM 		case ICMP_PROTOCOL_UNREACHABLE:
547811754SKacheong.Poon@Sun.COM 			switch (tcp->tcp_state) {
547911754SKacheong.Poon@Sun.COM 			case TCPS_SYN_SENT:
548011754SKacheong.Poon@Sun.COM 			case TCPS_SYN_RCVD:
548111754SKacheong.Poon@Sun.COM 				/*
548211754SKacheong.Poon@Sun.COM 				 * ICMP can snipe away incipient
548311754SKacheong.Poon@Sun.COM 				 * TCP connections as long as
548411754SKacheong.Poon@Sun.COM 				 * seq number is same as initial
548511754SKacheong.Poon@Sun.COM 				 * send seq number.
548611754SKacheong.Poon@Sun.COM 				 */
548711754SKacheong.Poon@Sun.COM 				if (seg_seq == tcp->tcp_iss) {
548811754SKacheong.Poon@Sun.COM 					(void) tcp_clean_death(tcp,
548911754SKacheong.Poon@Sun.COM 					    ECONNREFUSED);
549011754SKacheong.Poon@Sun.COM 				}
549111754SKacheong.Poon@Sun.COM 				break;
549211754SKacheong.Poon@Sun.COM 			}
549311754SKacheong.Poon@Sun.COM 			break;
549411754SKacheong.Poon@Sun.COM 		case ICMP_HOST_UNREACHABLE:
549511754SKacheong.Poon@Sun.COM 		case ICMP_NET_UNREACHABLE:
549611754SKacheong.Poon@Sun.COM 			/* Record the error in case we finally time out. */
549711754SKacheong.Poon@Sun.COM 			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
549811754SKacheong.Poon@Sun.COM 				tcp->tcp_client_errno = EHOSTUNREACH;
549911754SKacheong.Poon@Sun.COM 			else
550011754SKacheong.Poon@Sun.COM 				tcp->tcp_client_errno = ENETUNREACH;
550111754SKacheong.Poon@Sun.COM 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
550211754SKacheong.Poon@Sun.COM 				if (tcp->tcp_listener != NULL &&
550311754SKacheong.Poon@Sun.COM 				    tcp->tcp_listener->tcp_syn_defense) {
550411754SKacheong.Poon@Sun.COM 					/*
550511754SKacheong.Poon@Sun.COM 					 * Ditch the half-open connection if we
550611754SKacheong.Poon@Sun.COM 					 * suspect a SYN attack is under way.
550711754SKacheong.Poon@Sun.COM 					 */
550811754SKacheong.Poon@Sun.COM 					(void) tcp_clean_death(tcp,
550911754SKacheong.Poon@Sun.COM 					    tcp->tcp_client_errno);
551011754SKacheong.Poon@Sun.COM 				}
551111754SKacheong.Poon@Sun.COM 			}
551211754SKacheong.Poon@Sun.COM 			break;
551311754SKacheong.Poon@Sun.COM 		default:
551411754SKacheong.Poon@Sun.COM 			break;
551511754SKacheong.Poon@Sun.COM 		}
551611754SKacheong.Poon@Sun.COM 		break;
551711754SKacheong.Poon@Sun.COM 	case ICMP_SOURCE_QUENCH: {
551811754SKacheong.Poon@Sun.COM 		/*
551911754SKacheong.Poon@Sun.COM 		 * use a global boolean to control
552011754SKacheong.Poon@Sun.COM 		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
552111754SKacheong.Poon@Sun.COM 		 * The default is false.
552211754SKacheong.Poon@Sun.COM 		 */
552311754SKacheong.Poon@Sun.COM 		if (tcp_icmp_source_quench) {
552411754SKacheong.Poon@Sun.COM 			/*
552511754SKacheong.Poon@Sun.COM 			 * Reduce the sending rate as if we got a
552611754SKacheong.Poon@Sun.COM 			 * retransmit timeout
552711754SKacheong.Poon@Sun.COM 			 */
552811754SKacheong.Poon@Sun.COM 			uint32_t npkt;
552911754SKacheong.Poon@Sun.COM 
553011754SKacheong.Poon@Sun.COM 			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
553111754SKacheong.Poon@Sun.COM 			    tcp->tcp_mss;
553211754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
553311754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd = tcp->tcp_mss;
553411754SKacheong.Poon@Sun.COM 			tcp->tcp_cwnd_cnt = 0;
553511754SKacheong.Poon@Sun.COM 		}
553611754SKacheong.Poon@Sun.COM 		break;
553711754SKacheong.Poon@Sun.COM 	}
553811754SKacheong.Poon@Sun.COM 	}
553911754SKacheong.Poon@Sun.COM 	freemsg(mp);
554011754SKacheong.Poon@Sun.COM }
554111754SKacheong.Poon@Sun.COM 
554211754SKacheong.Poon@Sun.COM /*
554311754SKacheong.Poon@Sun.COM  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
554411754SKacheong.Poon@Sun.COM  * error messages passed up by IP.
554511754SKacheong.Poon@Sun.COM  * Assumes that IP has pulled up all the extension headers as well
554611754SKacheong.Poon@Sun.COM  * as the ICMPv6 header.
554711754SKacheong.Poon@Sun.COM  */
554811754SKacheong.Poon@Sun.COM static void
tcp_icmp_error_ipv6(tcp_t * tcp,mblk_t * mp,ip_recv_attr_t * ira)554911754SKacheong.Poon@Sun.COM tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
555011754SKacheong.Poon@Sun.COM {
555111754SKacheong.Poon@Sun.COM 	icmp6_t		*icmp6;
555211754SKacheong.Poon@Sun.COM 	ip6_t		*ip6h;
555311754SKacheong.Poon@Sun.COM 	uint16_t	iph_hdr_length = ira->ira_ip_hdr_length;
555411754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
555511754SKacheong.Poon@Sun.COM 	uint8_t		*nexthdrp;
555611754SKacheong.Poon@Sun.COM 	uint32_t	seg_seq;
555711754SKacheong.Poon@Sun.COM 
555811754SKacheong.Poon@Sun.COM 	/*
555911754SKacheong.Poon@Sun.COM 	 * Verify that we have a complete IP header.
556011754SKacheong.Poon@Sun.COM 	 */
556111754SKacheong.Poon@Sun.COM 	ASSERT((MBLKL(mp) >= sizeof (ip6_t)));
556211754SKacheong.Poon@Sun.COM 
556311754SKacheong.Poon@Sun.COM 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
556411754SKacheong.Poon@Sun.COM 	ip6h = (ip6_t *)&icmp6[1];
556511754SKacheong.Poon@Sun.COM 	/*
556611754SKacheong.Poon@Sun.COM 	 * Verify if we have a complete ICMP and inner IP header.
556711754SKacheong.Poon@Sun.COM 	 */
556811754SKacheong.Poon@Sun.COM 	if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
556911754SKacheong.Poon@Sun.COM noticmpv6:
557011754SKacheong.Poon@Sun.COM 		freemsg(mp);
557111754SKacheong.Poon@Sun.COM 		return;
557211754SKacheong.Poon@Sun.COM 	}
557311754SKacheong.Poon@Sun.COM 
557411754SKacheong.Poon@Sun.COM 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
557511754SKacheong.Poon@Sun.COM 		goto noticmpv6;
557611754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
557711754SKacheong.Poon@Sun.COM 	/*
557811754SKacheong.Poon@Sun.COM 	 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
557911754SKacheong.Poon@Sun.COM 	 * have at least ICMP_MIN_TCP_HDR bytes of  TCP header drop the
558011754SKacheong.Poon@Sun.COM 	 * packet.
558111754SKacheong.Poon@Sun.COM 	 */
558211754SKacheong.Poon@Sun.COM 	if ((*nexthdrp != IPPROTO_TCP) ||
558311754SKacheong.Poon@Sun.COM 	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
558411754SKacheong.Poon@Sun.COM 		goto noticmpv6;
558511754SKacheong.Poon@Sun.COM 	}
558611754SKacheong.Poon@Sun.COM 
558711754SKacheong.Poon@Sun.COM 	seg_seq = ntohl(tcpha->tha_seq);
558811754SKacheong.Poon@Sun.COM 	switch (icmp6->icmp6_type) {
558911754SKacheong.Poon@Sun.COM 	case ICMP6_PACKET_TOO_BIG:
559011754SKacheong.Poon@Sun.COM 		/*
559111754SKacheong.Poon@Sun.COM 		 * Update Path MTU, then try to send something out.
559211754SKacheong.Poon@Sun.COM 		 */
559311754SKacheong.Poon@Sun.COM 		tcp_update_pmtu(tcp, B_TRUE);
559411754SKacheong.Poon@Sun.COM 		tcp_rexmit_after_error(tcp);
559511754SKacheong.Poon@Sun.COM 		break;
559611754SKacheong.Poon@Sun.COM 	case ICMP6_DST_UNREACH:
559711754SKacheong.Poon@Sun.COM 		switch (icmp6->icmp6_code) {
559811754SKacheong.Poon@Sun.COM 		case ICMP6_DST_UNREACH_NOPORT:
559911754SKacheong.Poon@Sun.COM 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
560011754SKacheong.Poon@Sun.COM 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
560111754SKacheong.Poon@Sun.COM 			    (seg_seq == tcp->tcp_iss)) {
560211754SKacheong.Poon@Sun.COM 				(void) tcp_clean_death(tcp, ECONNREFUSED);
560311754SKacheong.Poon@Sun.COM 			}
560411754SKacheong.Poon@Sun.COM 			break;
560511754SKacheong.Poon@Sun.COM 		case ICMP6_DST_UNREACH_ADMIN:
560611754SKacheong.Poon@Sun.COM 		case ICMP6_DST_UNREACH_NOROUTE:
560711754SKacheong.Poon@Sun.COM 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
560811754SKacheong.Poon@Sun.COM 		case ICMP6_DST_UNREACH_ADDR:
560911754SKacheong.Poon@Sun.COM 			/* Record the error in case we finally time out. */
561011754SKacheong.Poon@Sun.COM 			tcp->tcp_client_errno = EHOSTUNREACH;
561111754SKacheong.Poon@Sun.COM 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
561211754SKacheong.Poon@Sun.COM 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
561311754SKacheong.Poon@Sun.COM 			    (seg_seq == tcp->tcp_iss)) {
561411754SKacheong.Poon@Sun.COM 				if (tcp->tcp_listener != NULL &&
561511754SKacheong.Poon@Sun.COM 				    tcp->tcp_listener->tcp_syn_defense) {
561611754SKacheong.Poon@Sun.COM 					/*
561711754SKacheong.Poon@Sun.COM 					 * Ditch the half-open connection if we
561811754SKacheong.Poon@Sun.COM 					 * suspect a SYN attack is under way.
561911754SKacheong.Poon@Sun.COM 					 */
562011754SKacheong.Poon@Sun.COM 					(void) tcp_clean_death(tcp,
562111754SKacheong.Poon@Sun.COM 					    tcp->tcp_client_errno);
562211754SKacheong.Poon@Sun.COM 				}
562311754SKacheong.Poon@Sun.COM 			}
562411754SKacheong.Poon@Sun.COM 
562511754SKacheong.Poon@Sun.COM 
562611754SKacheong.Poon@Sun.COM 			break;
562711754SKacheong.Poon@Sun.COM 		default:
562811754SKacheong.Poon@Sun.COM 			break;
562911754SKacheong.Poon@Sun.COM 		}
563011754SKacheong.Poon@Sun.COM 		break;
563111754SKacheong.Poon@Sun.COM 	case ICMP6_PARAM_PROB:
563211754SKacheong.Poon@Sun.COM 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
563311754SKacheong.Poon@Sun.COM 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
563411754SKacheong.Poon@Sun.COM 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
563511754SKacheong.Poon@Sun.COM 		    (uchar_t *)nexthdrp) {
563611754SKacheong.Poon@Sun.COM 			if (tcp->tcp_state == TCPS_SYN_SENT ||
563711754SKacheong.Poon@Sun.COM 			    tcp->tcp_state == TCPS_SYN_RCVD) {
563811754SKacheong.Poon@Sun.COM 				(void) tcp_clean_death(tcp, ECONNREFUSED);
563911754SKacheong.Poon@Sun.COM 			}
564011754SKacheong.Poon@Sun.COM 			break;
564111754SKacheong.Poon@Sun.COM 		}
564211754SKacheong.Poon@Sun.COM 		break;
564311754SKacheong.Poon@Sun.COM 
564411754SKacheong.Poon@Sun.COM 	case ICMP6_TIME_EXCEEDED:
564511754SKacheong.Poon@Sun.COM 	default:
564611754SKacheong.Poon@Sun.COM 		break;
564711754SKacheong.Poon@Sun.COM 	}
564811754SKacheong.Poon@Sun.COM 	freemsg(mp);
564911754SKacheong.Poon@Sun.COM }
565011754SKacheong.Poon@Sun.COM 
565111754SKacheong.Poon@Sun.COM /*
565211754SKacheong.Poon@Sun.COM  * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might
565311754SKacheong.Poon@Sun.COM  * change. But it can refer to fields like tcp_suna and tcp_snxt.
565411754SKacheong.Poon@Sun.COM  *
565511754SKacheong.Poon@Sun.COM  * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
565611754SKacheong.Poon@Sun.COM  * error messages received by IP. The message is always received on the correct
565711754SKacheong.Poon@Sun.COM  * tcp_t.
565811754SKacheong.Poon@Sun.COM  */
565911754SKacheong.Poon@Sun.COM /* ARGSUSED */
566011754SKacheong.Poon@Sun.COM boolean_t
tcp_verifyicmp(conn_t * connp,void * arg2,icmph_t * icmph,icmp6_t * icmp6,ip_recv_attr_t * ira)566111754SKacheong.Poon@Sun.COM tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
566211754SKacheong.Poon@Sun.COM     ip_recv_attr_t *ira)
566311754SKacheong.Poon@Sun.COM {
566411754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha = (tcpha_t *)arg2;
566511754SKacheong.Poon@Sun.COM 	uint32_t	seq = ntohl(tcpha->tha_seq);
566611754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
566711754SKacheong.Poon@Sun.COM 
566811754SKacheong.Poon@Sun.COM 	/*
566911754SKacheong.Poon@Sun.COM 	 * TCP sequence number contained in payload of the ICMP error message
567011754SKacheong.Poon@Sun.COM 	 * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise,
567111754SKacheong.Poon@Sun.COM 	 * the message is either a stale ICMP error, or an attack from the
567211754SKacheong.Poon@Sun.COM 	 * network. Fail the verification.
567311754SKacheong.Poon@Sun.COM 	 */
567411754SKacheong.Poon@Sun.COM 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
567511754SKacheong.Poon@Sun.COM 		return (B_FALSE);
567611754SKacheong.Poon@Sun.COM 
567711754SKacheong.Poon@Sun.COM 	/* For "too big" we also check the ignore flag */
567811754SKacheong.Poon@Sun.COM 	if (ira->ira_flags & IRAF_IS_IPV4) {
567911754SKacheong.Poon@Sun.COM 		ASSERT(icmph != NULL);
568011754SKacheong.Poon@Sun.COM 		if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
568111754SKacheong.Poon@Sun.COM 		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
568211754SKacheong.Poon@Sun.COM 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
568311754SKacheong.Poon@Sun.COM 			return (B_FALSE);
568411754SKacheong.Poon@Sun.COM 	} else {
568511754SKacheong.Poon@Sun.COM 		ASSERT(icmp6 != NULL);
568611754SKacheong.Poon@Sun.COM 		if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
568711754SKacheong.Poon@Sun.COM 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
568811754SKacheong.Poon@Sun.COM 			return (B_FALSE);
568911754SKacheong.Poon@Sun.COM 	}
569011754SKacheong.Poon@Sun.COM 	return (B_TRUE);
569111754SKacheong.Poon@Sun.COM }
5692