xref: /onnv-gate/usr/src/uts/common/inet/tcp/tcp_output.c (revision 13041:16bccd6e1b08)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* This file contains all TCP output processing functions. */
2711754SKacheong.Poon@Sun.COM 
2811754SKacheong.Poon@Sun.COM #include <sys/types.h>
2911754SKacheong.Poon@Sun.COM #include <sys/stream.h>
3011754SKacheong.Poon@Sun.COM #include <sys/strsun.h>
3111754SKacheong.Poon@Sun.COM #include <sys/strsubr.h>
3211754SKacheong.Poon@Sun.COM #include <sys/stropts.h>
3311754SKacheong.Poon@Sun.COM #include <sys/strlog.h>
3411754SKacheong.Poon@Sun.COM #define	_SUN_TPI_VERSION 2
3511754SKacheong.Poon@Sun.COM #include <sys/tihdr.h>
3611754SKacheong.Poon@Sun.COM #include <sys/suntpi.h>
3711754SKacheong.Poon@Sun.COM #include <sys/xti_inet.h>
3811754SKacheong.Poon@Sun.COM #include <sys/timod.h>
3911754SKacheong.Poon@Sun.COM #include <sys/pattr.h>
4011754SKacheong.Poon@Sun.COM #include <sys/squeue_impl.h>
4111754SKacheong.Poon@Sun.COM #include <sys/squeue.h>
4211754SKacheong.Poon@Sun.COM #include <sys/sockio.h>
4311754SKacheong.Poon@Sun.COM #include <sys/tsol/tnet.h>
4411754SKacheong.Poon@Sun.COM 
4511754SKacheong.Poon@Sun.COM #include <inet/common.h>
4611754SKacheong.Poon@Sun.COM #include <inet/ip.h>
4711754SKacheong.Poon@Sun.COM #include <inet/tcp.h>
4811754SKacheong.Poon@Sun.COM #include <inet/tcp_impl.h>
4911754SKacheong.Poon@Sun.COM #include <inet/snmpcom.h>
5011754SKacheong.Poon@Sun.COM #include <inet/proto_set.h>
5111754SKacheong.Poon@Sun.COM #include <inet/ipsec_impl.h>
5211754SKacheong.Poon@Sun.COM #include <inet/ip_ndp.h>
5311754SKacheong.Poon@Sun.COM 
5411754SKacheong.Poon@Sun.COM static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
5511754SKacheong.Poon@Sun.COM static void	tcp_wput_cmdblk(queue_t *, mblk_t *);
5611754SKacheong.Poon@Sun.COM static void	tcp_wput_flush(tcp_t *, mblk_t *);
5711754SKacheong.Poon@Sun.COM static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
5811754SKacheong.Poon@Sun.COM static int	tcp_xmit_end(tcp_t *);
5911754SKacheong.Poon@Sun.COM static int	tcp_send(tcp_t *, const int, const int, const int,
6011754SKacheong.Poon@Sun.COM 		    const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
6111754SKacheong.Poon@Sun.COM static void	tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
6211754SKacheong.Poon@Sun.COM 		    int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
6311754SKacheong.Poon@Sun.COM static boolean_t	tcp_send_rst_chk(tcp_stack_t *);
6411754SKacheong.Poon@Sun.COM static void	tcp_process_shrunk_swnd(tcp_t *, uint32_t);
6511754SKacheong.Poon@Sun.COM static void	tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
6611754SKacheong.Poon@Sun.COM 
6711754SKacheong.Poon@Sun.COM /*
6811754SKacheong.Poon@Sun.COM  * Functions called directly via squeue having a prototype of edesc_t.
6911754SKacheong.Poon@Sun.COM  */
7011754SKacheong.Poon@Sun.COM static void	tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
7111754SKacheong.Poon@Sun.COM static void	tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
7211754SKacheong.Poon@Sun.COM static void	tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
7311754SKacheong.Poon@Sun.COM 
7411754SKacheong.Poon@Sun.COM /*
7511754SKacheong.Poon@Sun.COM  * This controls how tiny a write must be before we try to copy it
7611754SKacheong.Poon@Sun.COM  * into the mblk on the tail of the transmit queue.  Not much
7711754SKacheong.Poon@Sun.COM  * speedup is observed for values larger than sixteen.  Zero will
7811754SKacheong.Poon@Sun.COM  * disable the optimisation.
7911754SKacheong.Poon@Sun.COM  */
8011754SKacheong.Poon@Sun.COM static int tcp_tx_pull_len = 16;
8111754SKacheong.Poon@Sun.COM 
8211754SKacheong.Poon@Sun.COM void
tcp_wput(queue_t * q,mblk_t * mp)8311754SKacheong.Poon@Sun.COM tcp_wput(queue_t *q, mblk_t *mp)
8411754SKacheong.Poon@Sun.COM {
8511754SKacheong.Poon@Sun.COM 	conn_t	*connp = Q_TO_CONN(q);
8611754SKacheong.Poon@Sun.COM 	tcp_t	*tcp;
8711754SKacheong.Poon@Sun.COM 	void (*output_proc)();
8811754SKacheong.Poon@Sun.COM 	t_scalar_t type;
8911754SKacheong.Poon@Sun.COM 	uchar_t *rptr;
9011754SKacheong.Poon@Sun.COM 	struct iocblk	*iocp;
9111754SKacheong.Poon@Sun.COM 	size_t size;
9211754SKacheong.Poon@Sun.COM 
9311754SKacheong.Poon@Sun.COM 	ASSERT(connp->conn_ref >= 2);
9411754SKacheong.Poon@Sun.COM 
9511754SKacheong.Poon@Sun.COM 	switch (DB_TYPE(mp)) {
9611754SKacheong.Poon@Sun.COM 	case M_DATA:
9711754SKacheong.Poon@Sun.COM 		tcp = connp->conn_tcp;
9811754SKacheong.Poon@Sun.COM 		ASSERT(tcp != NULL);
9911754SKacheong.Poon@Sun.COM 
10011754SKacheong.Poon@Sun.COM 		size = msgdsize(mp);
10111754SKacheong.Poon@Sun.COM 
10211754SKacheong.Poon@Sun.COM 		mutex_enter(&tcp->tcp_non_sq_lock);
10311754SKacheong.Poon@Sun.COM 		tcp->tcp_squeue_bytes += size;
10411754SKacheong.Poon@Sun.COM 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
10511754SKacheong.Poon@Sun.COM 			tcp_setqfull(tcp);
10611754SKacheong.Poon@Sun.COM 		}
10711754SKacheong.Poon@Sun.COM 		mutex_exit(&tcp->tcp_non_sq_lock);
10811754SKacheong.Poon@Sun.COM 
10911754SKacheong.Poon@Sun.COM 		CONN_INC_REF(connp);
11011754SKacheong.Poon@Sun.COM 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
11111754SKacheong.Poon@Sun.COM 		    NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
11211754SKacheong.Poon@Sun.COM 		return;
11311754SKacheong.Poon@Sun.COM 
11411754SKacheong.Poon@Sun.COM 	case M_CMD:
11511754SKacheong.Poon@Sun.COM 		tcp_wput_cmdblk(q, mp);
11611754SKacheong.Poon@Sun.COM 		return;
11711754SKacheong.Poon@Sun.COM 
11811754SKacheong.Poon@Sun.COM 	case M_PROTO:
11911754SKacheong.Poon@Sun.COM 	case M_PCPROTO:
12011754SKacheong.Poon@Sun.COM 		/*
12111754SKacheong.Poon@Sun.COM 		 * if it is a snmp message, don't get behind the squeue
12211754SKacheong.Poon@Sun.COM 		 */
12311754SKacheong.Poon@Sun.COM 		tcp = connp->conn_tcp;
12411754SKacheong.Poon@Sun.COM 		rptr = mp->b_rptr;
12511754SKacheong.Poon@Sun.COM 		if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
12611754SKacheong.Poon@Sun.COM 			type = ((union T_primitives *)rptr)->type;
12711754SKacheong.Poon@Sun.COM 		} else {
12811754SKacheong.Poon@Sun.COM 			if (connp->conn_debug) {
12911754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1,
13011754SKacheong.Poon@Sun.COM 				    SL_ERROR|SL_TRACE,
13111754SKacheong.Poon@Sun.COM 				    "tcp_wput_proto, dropping one...");
13211754SKacheong.Poon@Sun.COM 			}
13311754SKacheong.Poon@Sun.COM 			freemsg(mp);
13411754SKacheong.Poon@Sun.COM 			return;
13511754SKacheong.Poon@Sun.COM 		}
13611754SKacheong.Poon@Sun.COM 		if (type == T_SVR4_OPTMGMT_REQ) {
13711754SKacheong.Poon@Sun.COM 			/*
13811754SKacheong.Poon@Sun.COM 			 * All Solaris components should pass a db_credp
13911754SKacheong.Poon@Sun.COM 			 * for this TPI message, hence we ASSERT.
14011754SKacheong.Poon@Sun.COM 			 * But in case there is some other M_PROTO that looks
14111754SKacheong.Poon@Sun.COM 			 * like a TPI message sent by some other kernel
14211754SKacheong.Poon@Sun.COM 			 * component, we check and return an error.
14311754SKacheong.Poon@Sun.COM 			 */
14411754SKacheong.Poon@Sun.COM 			cred_t	*cr = msg_getcred(mp, NULL);
14511754SKacheong.Poon@Sun.COM 
14611754SKacheong.Poon@Sun.COM 			ASSERT(cr != NULL);
14711754SKacheong.Poon@Sun.COM 			if (cr == NULL) {
14811754SKacheong.Poon@Sun.COM 				tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
14911754SKacheong.Poon@Sun.COM 				return;
15011754SKacheong.Poon@Sun.COM 			}
15111754SKacheong.Poon@Sun.COM 			if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get,
15211754SKacheong.Poon@Sun.COM 			    cr)) {
15311754SKacheong.Poon@Sun.COM 				/*
15411754SKacheong.Poon@Sun.COM 				 * This was a SNMP request
15511754SKacheong.Poon@Sun.COM 				 */
15611754SKacheong.Poon@Sun.COM 				return;
15711754SKacheong.Poon@Sun.COM 			} else {
15811754SKacheong.Poon@Sun.COM 				output_proc = tcp_wput_proto;
15911754SKacheong.Poon@Sun.COM 			}
16011754SKacheong.Poon@Sun.COM 		} else {
16111754SKacheong.Poon@Sun.COM 			output_proc = tcp_wput_proto;
16211754SKacheong.Poon@Sun.COM 		}
16311754SKacheong.Poon@Sun.COM 		break;
16411754SKacheong.Poon@Sun.COM 	case M_IOCTL:
16511754SKacheong.Poon@Sun.COM 		/*
16611754SKacheong.Poon@Sun.COM 		 * Most ioctls can be processed right away without going via
16711754SKacheong.Poon@Sun.COM 		 * squeues - process them right here. Those that do require
16811754SKacheong.Poon@Sun.COM 		 * squeue (currently _SIOCSOCKFALLBACK)
16911754SKacheong.Poon@Sun.COM 		 * are processed by tcp_wput_ioctl().
17011754SKacheong.Poon@Sun.COM 		 */
17111754SKacheong.Poon@Sun.COM 		iocp = (struct iocblk *)mp->b_rptr;
17211754SKacheong.Poon@Sun.COM 		tcp = connp->conn_tcp;
17311754SKacheong.Poon@Sun.COM 
17411754SKacheong.Poon@Sun.COM 		switch (iocp->ioc_cmd) {
17511754SKacheong.Poon@Sun.COM 		case TCP_IOC_ABORT_CONN:
17611754SKacheong.Poon@Sun.COM 			tcp_ioctl_abort_conn(q, mp);
17711754SKacheong.Poon@Sun.COM 			return;
17811754SKacheong.Poon@Sun.COM 		case TI_GETPEERNAME:
17911754SKacheong.Poon@Sun.COM 		case TI_GETMYNAME:
18011754SKacheong.Poon@Sun.COM 			mi_copyin(q, mp, NULL,
18111754SKacheong.Poon@Sun.COM 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
18211754SKacheong.Poon@Sun.COM 			return;
18311754SKacheong.Poon@Sun.COM 
18411754SKacheong.Poon@Sun.COM 		default:
18511754SKacheong.Poon@Sun.COM 			output_proc = tcp_wput_ioctl;
18611754SKacheong.Poon@Sun.COM 			break;
18711754SKacheong.Poon@Sun.COM 		}
18811754SKacheong.Poon@Sun.COM 		break;
18911754SKacheong.Poon@Sun.COM 	default:
19011754SKacheong.Poon@Sun.COM 		output_proc = tcp_wput_nondata;
19111754SKacheong.Poon@Sun.COM 		break;
19211754SKacheong.Poon@Sun.COM 	}
19311754SKacheong.Poon@Sun.COM 
19411754SKacheong.Poon@Sun.COM 	CONN_INC_REF(connp);
19511754SKacheong.Poon@Sun.COM 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
19611754SKacheong.Poon@Sun.COM 	    NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
19711754SKacheong.Poon@Sun.COM }
19811754SKacheong.Poon@Sun.COM 
19911754SKacheong.Poon@Sun.COM /*
20011754SKacheong.Poon@Sun.COM  * The TCP normal data output path.
20111754SKacheong.Poon@Sun.COM  * NOTE: the logic of the fast path is duplicated from this function.
20211754SKacheong.Poon@Sun.COM  */
20311754SKacheong.Poon@Sun.COM void
tcp_wput_data(tcp_t * tcp,mblk_t * mp,boolean_t urgent)20411754SKacheong.Poon@Sun.COM tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
20511754SKacheong.Poon@Sun.COM {
20611754SKacheong.Poon@Sun.COM 	int		len;
20711754SKacheong.Poon@Sun.COM 	mblk_t		*local_time;
20811754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
20911754SKacheong.Poon@Sun.COM 	uint32_t	snxt;
21011754SKacheong.Poon@Sun.COM 	int		tail_unsent;
21111754SKacheong.Poon@Sun.COM 	int		tcpstate;
21211754SKacheong.Poon@Sun.COM 	int		usable = 0;
21311754SKacheong.Poon@Sun.COM 	mblk_t		*xmit_tail;
21411754SKacheong.Poon@Sun.COM 	int32_t		mss;
21511754SKacheong.Poon@Sun.COM 	int32_t		num_sack_blk = 0;
21611754SKacheong.Poon@Sun.COM 	int32_t		total_hdr_len;
21711754SKacheong.Poon@Sun.COM 	int32_t		tcp_hdr_len;
21811754SKacheong.Poon@Sun.COM 	int		rc;
21911754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
22011754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
22111754SKacheong.Poon@Sun.COM 	clock_t		now = LBOLT_FASTPATH;
22211754SKacheong.Poon@Sun.COM 
22311754SKacheong.Poon@Sun.COM 	tcpstate = tcp->tcp_state;
22411754SKacheong.Poon@Sun.COM 	if (mp == NULL) {
22511754SKacheong.Poon@Sun.COM 		/*
22611754SKacheong.Poon@Sun.COM 		 * tcp_wput_data() with NULL mp should only be called when
22711754SKacheong.Poon@Sun.COM 		 * there is unsent data.
22811754SKacheong.Poon@Sun.COM 		 */
22911754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_unsent > 0);
23011754SKacheong.Poon@Sun.COM 		/* Really tacky... but we need this for detached closes. */
23111754SKacheong.Poon@Sun.COM 		len = tcp->tcp_unsent;
23211754SKacheong.Poon@Sun.COM 		goto data_null;
23311754SKacheong.Poon@Sun.COM 	}
23411754SKacheong.Poon@Sun.COM 
23511754SKacheong.Poon@Sun.COM 	ASSERT(mp->b_datap->db_type == M_DATA);
23611754SKacheong.Poon@Sun.COM 	/*
23711754SKacheong.Poon@Sun.COM 	 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
23811754SKacheong.Poon@Sun.COM 	 * or before a connection attempt has begun.
23911754SKacheong.Poon@Sun.COM 	 */
24011754SKacheong.Poon@Sun.COM 	if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
24111754SKacheong.Poon@Sun.COM 	    (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
24211754SKacheong.Poon@Sun.COM 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
24311754SKacheong.Poon@Sun.COM #ifdef DEBUG
24411754SKacheong.Poon@Sun.COM 			cmn_err(CE_WARN,
24511754SKacheong.Poon@Sun.COM 			    "tcp_wput_data: data after ordrel, %s",
24611754SKacheong.Poon@Sun.COM 			    tcp_display(tcp, NULL,
24711754SKacheong.Poon@Sun.COM 			    DISP_ADDR_AND_PORT));
24811754SKacheong.Poon@Sun.COM #else
24911754SKacheong.Poon@Sun.COM 			if (connp->conn_debug) {
25011754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1,
25111754SKacheong.Poon@Sun.COM 				    SL_TRACE|SL_ERROR,
25211754SKacheong.Poon@Sun.COM 				    "tcp_wput_data: data after ordrel, %s\n",
25311754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL,
25411754SKacheong.Poon@Sun.COM 				    DISP_ADDR_AND_PORT));
25511754SKacheong.Poon@Sun.COM 			}
25611754SKacheong.Poon@Sun.COM #endif /* DEBUG */
25711754SKacheong.Poon@Sun.COM 		}
25811754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_zcopy_aware &&
25911754SKacheong.Poon@Sun.COM 		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
26011754SKacheong.Poon@Sun.COM 			tcp_zcopy_notify(tcp);
26111754SKacheong.Poon@Sun.COM 		freemsg(mp);
26211754SKacheong.Poon@Sun.COM 		mutex_enter(&tcp->tcp_non_sq_lock);
26311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_flow_stopped &&
26411754SKacheong.Poon@Sun.COM 		    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
26511754SKacheong.Poon@Sun.COM 			tcp_clrqfull(tcp);
26611754SKacheong.Poon@Sun.COM 		}
26711754SKacheong.Poon@Sun.COM 		mutex_exit(&tcp->tcp_non_sq_lock);
26811754SKacheong.Poon@Sun.COM 		return;
26911754SKacheong.Poon@Sun.COM 	}
27011754SKacheong.Poon@Sun.COM 
27111754SKacheong.Poon@Sun.COM 	/* Strip empties */
27211754SKacheong.Poon@Sun.COM 	for (;;) {
27311754SKacheong.Poon@Sun.COM 		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
27411754SKacheong.Poon@Sun.COM 		    (uintptr_t)INT_MAX);
27511754SKacheong.Poon@Sun.COM 		len = (int)(mp->b_wptr - mp->b_rptr);
27611754SKacheong.Poon@Sun.COM 		if (len > 0)
27711754SKacheong.Poon@Sun.COM 			break;
27811754SKacheong.Poon@Sun.COM 		mp1 = mp;
27911754SKacheong.Poon@Sun.COM 		mp = mp->b_cont;
28011754SKacheong.Poon@Sun.COM 		freeb(mp1);
28111754SKacheong.Poon@Sun.COM 		if (mp == NULL) {
28211754SKacheong.Poon@Sun.COM 			return;
28311754SKacheong.Poon@Sun.COM 		}
28411754SKacheong.Poon@Sun.COM 	}
28511754SKacheong.Poon@Sun.COM 
28611754SKacheong.Poon@Sun.COM 	/* If we are the first on the list ... */
28711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_xmit_head == NULL) {
28811754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_head = mp;
28911754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_tail = mp;
29011754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_tail_unsent = len;
29111754SKacheong.Poon@Sun.COM 	} else {
29211754SKacheong.Poon@Sun.COM 		/* If tiny tx and room in txq tail, pullup to save mblks. */
29311754SKacheong.Poon@Sun.COM 		struct datab *dp;
29411754SKacheong.Poon@Sun.COM 
29511754SKacheong.Poon@Sun.COM 		mp1 = tcp->tcp_xmit_last;
29611754SKacheong.Poon@Sun.COM 		if (len < tcp_tx_pull_len &&
29711754SKacheong.Poon@Sun.COM 		    (dp = mp1->b_datap)->db_ref == 1 &&
29811754SKacheong.Poon@Sun.COM 		    dp->db_lim - mp1->b_wptr >= len) {
29911754SKacheong.Poon@Sun.COM 			ASSERT(len > 0);
30011754SKacheong.Poon@Sun.COM 			ASSERT(!mp1->b_cont);
30111754SKacheong.Poon@Sun.COM 			if (len == 1) {
30211754SKacheong.Poon@Sun.COM 				*mp1->b_wptr++ = *mp->b_rptr;
30311754SKacheong.Poon@Sun.COM 			} else {
30411754SKacheong.Poon@Sun.COM 				bcopy(mp->b_rptr, mp1->b_wptr, len);
30511754SKacheong.Poon@Sun.COM 				mp1->b_wptr += len;
30611754SKacheong.Poon@Sun.COM 			}
30711754SKacheong.Poon@Sun.COM 			if (mp1 == tcp->tcp_xmit_tail)
30811754SKacheong.Poon@Sun.COM 				tcp->tcp_xmit_tail_unsent += len;
30911754SKacheong.Poon@Sun.COM 			mp1->b_cont = mp->b_cont;
31011754SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_zcopy_aware &&
31111754SKacheong.Poon@Sun.COM 			    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
31211754SKacheong.Poon@Sun.COM 				mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
31311754SKacheong.Poon@Sun.COM 			freeb(mp);
31411754SKacheong.Poon@Sun.COM 			mp = mp1;
31511754SKacheong.Poon@Sun.COM 		} else {
31611754SKacheong.Poon@Sun.COM 			tcp->tcp_xmit_last->b_cont = mp;
31711754SKacheong.Poon@Sun.COM 		}
31811754SKacheong.Poon@Sun.COM 		len += tcp->tcp_unsent;
31911754SKacheong.Poon@Sun.COM 	}
32011754SKacheong.Poon@Sun.COM 
32111754SKacheong.Poon@Sun.COM 	/* Tack on however many more positive length mblks we have */
32211754SKacheong.Poon@Sun.COM 	if ((mp1 = mp->b_cont) != NULL) {
32311754SKacheong.Poon@Sun.COM 		do {
32411754SKacheong.Poon@Sun.COM 			int tlen;
32511754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
32611754SKacheong.Poon@Sun.COM 			    (uintptr_t)INT_MAX);
32711754SKacheong.Poon@Sun.COM 			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
32811754SKacheong.Poon@Sun.COM 			if (tlen <= 0) {
32911754SKacheong.Poon@Sun.COM 				mp->b_cont = mp1->b_cont;
33011754SKacheong.Poon@Sun.COM 				freeb(mp1);
33111754SKacheong.Poon@Sun.COM 			} else {
33211754SKacheong.Poon@Sun.COM 				len += tlen;
33311754SKacheong.Poon@Sun.COM 				mp = mp1;
33411754SKacheong.Poon@Sun.COM 			}
33511754SKacheong.Poon@Sun.COM 		} while ((mp1 = mp->b_cont) != NULL);
33611754SKacheong.Poon@Sun.COM 	}
33711754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_last = mp;
33811754SKacheong.Poon@Sun.COM 	tcp->tcp_unsent = len;
33911754SKacheong.Poon@Sun.COM 
34011754SKacheong.Poon@Sun.COM 	if (urgent)
34111754SKacheong.Poon@Sun.COM 		usable = 1;
34211754SKacheong.Poon@Sun.COM 
34311754SKacheong.Poon@Sun.COM data_null:
34411754SKacheong.Poon@Sun.COM 	snxt = tcp->tcp_snxt;
34511754SKacheong.Poon@Sun.COM 	xmit_tail = tcp->tcp_xmit_tail;
34611754SKacheong.Poon@Sun.COM 	tail_unsent = tcp->tcp_xmit_tail_unsent;
34711754SKacheong.Poon@Sun.COM 
34811754SKacheong.Poon@Sun.COM 	/*
34911754SKacheong.Poon@Sun.COM 	 * Note that tcp_mss has been adjusted to take into account the
35011754SKacheong.Poon@Sun.COM 	 * timestamp option if applicable.  Because SACK options do not
35111754SKacheong.Poon@Sun.COM 	 * appear in every TCP segments and they are of variable lengths,
35211754SKacheong.Poon@Sun.COM 	 * they cannot be included in tcp_mss.  Thus we need to calculate
35311754SKacheong.Poon@Sun.COM 	 * the actual segment length when we need to send a segment which
35411754SKacheong.Poon@Sun.COM 	 * includes SACK options.
35511754SKacheong.Poon@Sun.COM 	 */
35611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
35711754SKacheong.Poon@Sun.COM 		int32_t	opt_len;
35811754SKacheong.Poon@Sun.COM 
35911754SKacheong.Poon@Sun.COM 		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
36011754SKacheong.Poon@Sun.COM 		    tcp->tcp_num_sack_blk);
36111754SKacheong.Poon@Sun.COM 		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
36211754SKacheong.Poon@Sun.COM 		    2 + TCPOPT_HEADER_LEN;
36311754SKacheong.Poon@Sun.COM 		mss = tcp->tcp_mss - opt_len;
36411754SKacheong.Poon@Sun.COM 		total_hdr_len = connp->conn_ht_iphc_len + opt_len;
36511754SKacheong.Poon@Sun.COM 		tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
36611754SKacheong.Poon@Sun.COM 	} else {
36711754SKacheong.Poon@Sun.COM 		mss = tcp->tcp_mss;
36811754SKacheong.Poon@Sun.COM 		total_hdr_len = connp->conn_ht_iphc_len;
36911754SKacheong.Poon@Sun.COM 		tcp_hdr_len = connp->conn_ht_ulp_len;
37011754SKacheong.Poon@Sun.COM 	}
37111754SKacheong.Poon@Sun.COM 
37211754SKacheong.Poon@Sun.COM 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
37311754SKacheong.Poon@Sun.COM 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
37411754SKacheong.Poon@Sun.COM 		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
37511754SKacheong.Poon@Sun.COM 	}
37611754SKacheong.Poon@Sun.COM 	if (tcpstate == TCPS_SYN_RCVD) {
37711754SKacheong.Poon@Sun.COM 		/*
37811754SKacheong.Poon@Sun.COM 		 * The three-way connection establishment handshake is not
37911754SKacheong.Poon@Sun.COM 		 * complete yet. We want to queue the data for transmission
38011754SKacheong.Poon@Sun.COM 		 * after entering ESTABLISHED state (RFC793). A jump to
38111754SKacheong.Poon@Sun.COM 		 * "done" label effectively leaves data on the queue.
38211754SKacheong.Poon@Sun.COM 		 */
38311754SKacheong.Poon@Sun.COM 		goto done;
38411754SKacheong.Poon@Sun.COM 	} else {
38511754SKacheong.Poon@Sun.COM 		int usable_r;
38611754SKacheong.Poon@Sun.COM 
38711754SKacheong.Poon@Sun.COM 		/*
38811754SKacheong.Poon@Sun.COM 		 * In the special case when cwnd is zero, which can only
38911754SKacheong.Poon@Sun.COM 		 * happen if the connection is ECN capable, return now.
39011754SKacheong.Poon@Sun.COM 		 * New segments is sent using tcp_timer().  The timer
39111754SKacheong.Poon@Sun.COM 		 * is set in tcp_input_data().
39211754SKacheong.Poon@Sun.COM 		 */
39311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_cwnd == 0) {
39411754SKacheong.Poon@Sun.COM 			/*
39511754SKacheong.Poon@Sun.COM 			 * Note that tcp_cwnd is 0 before 3-way handshake is
39611754SKacheong.Poon@Sun.COM 			 * finished.
39711754SKacheong.Poon@Sun.COM 			 */
39811754SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_ecn_ok ||
39911754SKacheong.Poon@Sun.COM 			    tcp->tcp_state < TCPS_ESTABLISHED);
40011754SKacheong.Poon@Sun.COM 			return;
40111754SKacheong.Poon@Sun.COM 		}
40211754SKacheong.Poon@Sun.COM 
40311754SKacheong.Poon@Sun.COM 		/* NOTE: trouble if xmitting while SYN not acked? */
40411754SKacheong.Poon@Sun.COM 		usable_r = snxt - tcp->tcp_suna;
40511754SKacheong.Poon@Sun.COM 		usable_r = tcp->tcp_swnd - usable_r;
40611754SKacheong.Poon@Sun.COM 
40711754SKacheong.Poon@Sun.COM 		/*
40811754SKacheong.Poon@Sun.COM 		 * Check if the receiver has shrunk the window.  If
40911754SKacheong.Poon@Sun.COM 		 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
41011754SKacheong.Poon@Sun.COM 		 * cannot be set as there is unsent data, so FIN cannot
41111754SKacheong.Poon@Sun.COM 		 * be sent out.  Otherwise, we need to take into account
41211754SKacheong.Poon@Sun.COM 		 * of FIN as it consumes an "invisible" sequence number.
41311754SKacheong.Poon@Sun.COM 		 */
41411754SKacheong.Poon@Sun.COM 		ASSERT(tcp->tcp_fin_sent == 0);
41511754SKacheong.Poon@Sun.COM 		if (usable_r < 0) {
41611754SKacheong.Poon@Sun.COM 			/*
41711754SKacheong.Poon@Sun.COM 			 * The receiver has shrunk the window and we have sent
41811754SKacheong.Poon@Sun.COM 			 * -usable_r date beyond the window, re-adjust.
41911754SKacheong.Poon@Sun.COM 			 *
42011754SKacheong.Poon@Sun.COM 			 * If TCP window scaling is enabled, there can be
42111754SKacheong.Poon@Sun.COM 			 * round down error as the advertised receive window
42211754SKacheong.Poon@Sun.COM 			 * is actually right shifted n bits.  This means that
42311754SKacheong.Poon@Sun.COM 			 * the lower n bits info is wiped out.  It will look
42411754SKacheong.Poon@Sun.COM 			 * like the window is shrunk.  Do a check here to
42511754SKacheong.Poon@Sun.COM 			 * see if the shrunk amount is actually within the
42611754SKacheong.Poon@Sun.COM 			 * error in window calculation.  If it is, just
42711754SKacheong.Poon@Sun.COM 			 * return.  Note that this check is inside the
42811754SKacheong.Poon@Sun.COM 			 * shrunk window check.  This makes sure that even
42911754SKacheong.Poon@Sun.COM 			 * though tcp_process_shrunk_swnd() is not called,
43011754SKacheong.Poon@Sun.COM 			 * we will stop further processing.
43111754SKacheong.Poon@Sun.COM 			 */
43211754SKacheong.Poon@Sun.COM 			if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
43311754SKacheong.Poon@Sun.COM 				tcp_process_shrunk_swnd(tcp, -usable_r);
43411754SKacheong.Poon@Sun.COM 			}
43511754SKacheong.Poon@Sun.COM 			return;
43611754SKacheong.Poon@Sun.COM 		}
43711754SKacheong.Poon@Sun.COM 
43811754SKacheong.Poon@Sun.COM 		/* usable = MIN(swnd, cwnd) - unacked_bytes */
43911754SKacheong.Poon@Sun.COM 		if (tcp->tcp_swnd > tcp->tcp_cwnd)
44011754SKacheong.Poon@Sun.COM 			usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
44111754SKacheong.Poon@Sun.COM 
44211754SKacheong.Poon@Sun.COM 		/* usable = MIN(usable, unsent) */
44311754SKacheong.Poon@Sun.COM 		if (usable_r > len)
44411754SKacheong.Poon@Sun.COM 			usable_r = len;
44511754SKacheong.Poon@Sun.COM 
44611754SKacheong.Poon@Sun.COM 		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
44711754SKacheong.Poon@Sun.COM 		if (usable_r > 0) {
44811754SKacheong.Poon@Sun.COM 			usable = usable_r;
44911754SKacheong.Poon@Sun.COM 		} else {
45011754SKacheong.Poon@Sun.COM 			/* Bypass all other unnecessary processing. */
45111754SKacheong.Poon@Sun.COM 			goto done;
45211754SKacheong.Poon@Sun.COM 		}
45311754SKacheong.Poon@Sun.COM 	}
45411754SKacheong.Poon@Sun.COM 
45511754SKacheong.Poon@Sun.COM 	local_time = (mblk_t *)now;
45611754SKacheong.Poon@Sun.COM 
45711754SKacheong.Poon@Sun.COM 	/*
45811754SKacheong.Poon@Sun.COM 	 * "Our" Nagle Algorithm.  This is not the same as in the old
45911754SKacheong.Poon@Sun.COM 	 * BSD.  This is more in line with the true intent of Nagle.
46011754SKacheong.Poon@Sun.COM 	 *
46111754SKacheong.Poon@Sun.COM 	 * The conditions are:
46211754SKacheong.Poon@Sun.COM 	 * 1. The amount of unsent data (or amount of data which can be
46311754SKacheong.Poon@Sun.COM 	 *    sent, whichever is smaller) is less than Nagle limit.
46411754SKacheong.Poon@Sun.COM 	 * 2. The last sent size is also less than Nagle limit.
46511754SKacheong.Poon@Sun.COM 	 * 3. There is unack'ed data.
46611754SKacheong.Poon@Sun.COM 	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
46711754SKacheong.Poon@Sun.COM 	 *    Nagle algorithm.  This reduces the probability that urgent
46811754SKacheong.Poon@Sun.COM 	 *    bytes get "merged" together.
46911754SKacheong.Poon@Sun.COM 	 * 5. The app has not closed the connection.  This eliminates the
47011754SKacheong.Poon@Sun.COM 	 *    wait time of the receiving side waiting for the last piece of
47111754SKacheong.Poon@Sun.COM 	 *    (small) data.
47211754SKacheong.Poon@Sun.COM 	 *
47311754SKacheong.Poon@Sun.COM 	 * If all are satisified, exit without sending anything.  Note
47411754SKacheong.Poon@Sun.COM 	 * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
47511754SKacheong.Poon@Sun.COM 	 * the smaller of 1 MSS and global tcp_naglim_def (default to be
47611754SKacheong.Poon@Sun.COM 	 * 4095).
47711754SKacheong.Poon@Sun.COM 	 */
47811754SKacheong.Poon@Sun.COM 	if (usable < (int)tcp->tcp_naglim &&
47911754SKacheong.Poon@Sun.COM 	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
48011754SKacheong.Poon@Sun.COM 	    snxt != tcp->tcp_suna &&
48111754SKacheong.Poon@Sun.COM 	    !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
48211754SKacheong.Poon@Sun.COM 	    !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
48311754SKacheong.Poon@Sun.COM 		goto done;
48411754SKacheong.Poon@Sun.COM 	}
48511754SKacheong.Poon@Sun.COM 
48611754SKacheong.Poon@Sun.COM 	/*
48711754SKacheong.Poon@Sun.COM 	 * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
48811754SKacheong.Poon@Sun.COM 	 * is set, then we have to force TCP not to send partial segment
48911754SKacheong.Poon@Sun.COM 	 * (smaller than MSS bytes). We are calculating the usable now
49011754SKacheong.Poon@Sun.COM 	 * based on full mss and will save the rest of remaining data for
49111754SKacheong.Poon@Sun.COM 	 * later. When tcp_zero_win_probe is set, TCP needs to send out
49211754SKacheong.Poon@Sun.COM 	 * something to do zero window probe.
49311754SKacheong.Poon@Sun.COM 	 */
49411754SKacheong.Poon@Sun.COM 	if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) {
49511754SKacheong.Poon@Sun.COM 		if (usable < mss)
49611754SKacheong.Poon@Sun.COM 			goto done;
49711754SKacheong.Poon@Sun.COM 		usable = (usable / mss) * mss;
49811754SKacheong.Poon@Sun.COM 	}
49911754SKacheong.Poon@Sun.COM 
50011754SKacheong.Poon@Sun.COM 	/* Update the latest receive window size in TCP header. */
50111754SKacheong.Poon@Sun.COM 	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
50211754SKacheong.Poon@Sun.COM 
50311754SKacheong.Poon@Sun.COM 	/* Send the packet. */
50411754SKacheong.Poon@Sun.COM 	rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
50511754SKacheong.Poon@Sun.COM 	    num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
50611754SKacheong.Poon@Sun.COM 	    local_time);
50711754SKacheong.Poon@Sun.COM 
50811754SKacheong.Poon@Sun.COM 	/* Pretend that all we were trying to send really got sent */
50911754SKacheong.Poon@Sun.COM 	if (rc < 0 && tail_unsent < 0) {
51011754SKacheong.Poon@Sun.COM 		do {
51111754SKacheong.Poon@Sun.COM 			xmit_tail = xmit_tail->b_cont;
51211754SKacheong.Poon@Sun.COM 			xmit_tail->b_prev = local_time;
51311754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)(xmit_tail->b_wptr -
51411754SKacheong.Poon@Sun.COM 			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
51511754SKacheong.Poon@Sun.COM 			tail_unsent += (int)(xmit_tail->b_wptr -
51611754SKacheong.Poon@Sun.COM 			    xmit_tail->b_rptr);
51711754SKacheong.Poon@Sun.COM 		} while (tail_unsent < 0);
51811754SKacheong.Poon@Sun.COM 	}
51911754SKacheong.Poon@Sun.COM done:;
52011754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_tail = xmit_tail;
52111754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_tail_unsent = tail_unsent;
52211754SKacheong.Poon@Sun.COM 	len = tcp->tcp_snxt - snxt;
52311754SKacheong.Poon@Sun.COM 	if (len) {
52411754SKacheong.Poon@Sun.COM 		/*
52511754SKacheong.Poon@Sun.COM 		 * If new data was sent, need to update the notsack
52611754SKacheong.Poon@Sun.COM 		 * list, which is, afterall, data blocks that have
52711754SKacheong.Poon@Sun.COM 		 * not been sack'ed by the receiver.  New data is
52811754SKacheong.Poon@Sun.COM 		 * not sack'ed.
52911754SKacheong.Poon@Sun.COM 		 */
53011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
53111754SKacheong.Poon@Sun.COM 			/* len is a negative value. */
53211754SKacheong.Poon@Sun.COM 			tcp->tcp_pipe -= len;
53311754SKacheong.Poon@Sun.COM 			tcp_notsack_update(&(tcp->tcp_notsack_list),
53411754SKacheong.Poon@Sun.COM 			    tcp->tcp_snxt, snxt,
53511754SKacheong.Poon@Sun.COM 			    &(tcp->tcp_num_notsack_blk),
53611754SKacheong.Poon@Sun.COM 			    &(tcp->tcp_cnt_notsack_list));
53711754SKacheong.Poon@Sun.COM 		}
53811754SKacheong.Poon@Sun.COM 		tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
53911754SKacheong.Poon@Sun.COM 		tcp->tcp_rack = tcp->tcp_rnxt;
54011754SKacheong.Poon@Sun.COM 		tcp->tcp_rack_cnt = 0;
54111754SKacheong.Poon@Sun.COM 		if ((snxt + len) == tcp->tcp_suna) {
54211754SKacheong.Poon@Sun.COM 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
54311754SKacheong.Poon@Sun.COM 		}
54411754SKacheong.Poon@Sun.COM 	} else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
54511754SKacheong.Poon@Sun.COM 		/*
54611754SKacheong.Poon@Sun.COM 		 * Didn't send anything. Make sure the timer is running
54711754SKacheong.Poon@Sun.COM 		 * so that we will probe a zero window.
54811754SKacheong.Poon@Sun.COM 		 */
54911754SKacheong.Poon@Sun.COM 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
55011754SKacheong.Poon@Sun.COM 	}
55111754SKacheong.Poon@Sun.COM 	/* Note that len is the amount we just sent but with a negative sign */
55211754SKacheong.Poon@Sun.COM 	tcp->tcp_unsent += len;
55311754SKacheong.Poon@Sun.COM 	mutex_enter(&tcp->tcp_non_sq_lock);
55411754SKacheong.Poon@Sun.COM 	if (tcp->tcp_flow_stopped) {
55511754SKacheong.Poon@Sun.COM 		if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
55611754SKacheong.Poon@Sun.COM 			tcp_clrqfull(tcp);
55711754SKacheong.Poon@Sun.COM 		}
55811754SKacheong.Poon@Sun.COM 	} else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
55911754SKacheong.Poon@Sun.COM 		if (!(tcp->tcp_detached))
56011754SKacheong.Poon@Sun.COM 			tcp_setqfull(tcp);
56111754SKacheong.Poon@Sun.COM 	}
56211754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_non_sq_lock);
56311754SKacheong.Poon@Sun.COM }
56411754SKacheong.Poon@Sun.COM 
56511754SKacheong.Poon@Sun.COM /*
56611754SKacheong.Poon@Sun.COM  * Initial STREAMS write side put() procedure for sockets. It tries to
56711754SKacheong.Poon@Sun.COM  * handle the T_CAPABILITY_REQ which sockfs sends down while setting
56811754SKacheong.Poon@Sun.COM  * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
56911754SKacheong.Poon@Sun.COM  * are handled by tcp_wput() as usual.
57011754SKacheong.Poon@Sun.COM  *
57111754SKacheong.Poon@Sun.COM  * All further messages will also be handled by tcp_wput() because we cannot
57211754SKacheong.Poon@Sun.COM  * be sure that the above short cut is safe later.
57311754SKacheong.Poon@Sun.COM  */
57411754SKacheong.Poon@Sun.COM void
tcp_wput_sock(queue_t * wq,mblk_t * mp)57511754SKacheong.Poon@Sun.COM tcp_wput_sock(queue_t *wq, mblk_t *mp)
57611754SKacheong.Poon@Sun.COM {
57711754SKacheong.Poon@Sun.COM 	conn_t			*connp = Q_TO_CONN(wq);
57811754SKacheong.Poon@Sun.COM 	tcp_t			*tcp = connp->conn_tcp;
57911754SKacheong.Poon@Sun.COM 	struct T_capability_req	*car = (struct T_capability_req *)mp->b_rptr;
58011754SKacheong.Poon@Sun.COM 
58111754SKacheong.Poon@Sun.COM 	ASSERT(wq->q_qinfo == &tcp_sock_winit);
58211754SKacheong.Poon@Sun.COM 	wq->q_qinfo = &tcp_winit;
58311754SKacheong.Poon@Sun.COM 
58411754SKacheong.Poon@Sun.COM 	ASSERT(IPCL_IS_TCP(connp));
58511754SKacheong.Poon@Sun.COM 	ASSERT(TCP_IS_SOCKET(tcp));
58611754SKacheong.Poon@Sun.COM 
58711754SKacheong.Poon@Sun.COM 	if (DB_TYPE(mp) == M_PCPROTO &&
58811754SKacheong.Poon@Sun.COM 	    MBLKL(mp) == sizeof (struct T_capability_req) &&
58911754SKacheong.Poon@Sun.COM 	    car->PRIM_type == T_CAPABILITY_REQ) {
59011754SKacheong.Poon@Sun.COM 		tcp_capability_req(tcp, mp);
59111754SKacheong.Poon@Sun.COM 		return;
59211754SKacheong.Poon@Sun.COM 	}
59311754SKacheong.Poon@Sun.COM 
59411754SKacheong.Poon@Sun.COM 	tcp_wput(wq, mp);
59511754SKacheong.Poon@Sun.COM }
59611754SKacheong.Poon@Sun.COM 
59711754SKacheong.Poon@Sun.COM /* ARGSUSED */
59811754SKacheong.Poon@Sun.COM void
tcp_wput_fallback(queue_t * wq,mblk_t * mp)59911754SKacheong.Poon@Sun.COM tcp_wput_fallback(queue_t *wq, mblk_t *mp)
60011754SKacheong.Poon@Sun.COM {
60111754SKacheong.Poon@Sun.COM #ifdef DEBUG
60211754SKacheong.Poon@Sun.COM 	cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
60311754SKacheong.Poon@Sun.COM #endif
60411754SKacheong.Poon@Sun.COM 	freemsg(mp);
60511754SKacheong.Poon@Sun.COM }
60611754SKacheong.Poon@Sun.COM 
60711754SKacheong.Poon@Sun.COM /*
60811754SKacheong.Poon@Sun.COM  * Call by tcp_wput() to handle misc non M_DATA messages.
60911754SKacheong.Poon@Sun.COM  */
61011754SKacheong.Poon@Sun.COM /* ARGSUSED */
61111754SKacheong.Poon@Sun.COM static void
tcp_wput_nondata(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)61211754SKacheong.Poon@Sun.COM tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
61311754SKacheong.Poon@Sun.COM {
61411754SKacheong.Poon@Sun.COM 	conn_t	*connp = (conn_t *)arg;
61511754SKacheong.Poon@Sun.COM 	tcp_t	*tcp = connp->conn_tcp;
61611754SKacheong.Poon@Sun.COM 
61711754SKacheong.Poon@Sun.COM 	ASSERT(DB_TYPE(mp) != M_IOCTL);
61811754SKacheong.Poon@Sun.COM 	/*
61911754SKacheong.Poon@Sun.COM 	 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close.
62011754SKacheong.Poon@Sun.COM 	 * Once the close starts, streamhead and sockfs will not let any data
62111754SKacheong.Poon@Sun.COM 	 * packets come down (close ensures that there are no threads using the
62211754SKacheong.Poon@Sun.COM 	 * queue and no new threads will come down) but since qprocsoff()
62311754SKacheong.Poon@Sun.COM 	 * hasn't happened yet, a M_FLUSH or some non data message might
62411754SKacheong.Poon@Sun.COM 	 * get reflected back (in response to our own FLUSHRW) and get
62511754SKacheong.Poon@Sun.COM 	 * processed after tcp_close() is done. The conn would still be valid
62611754SKacheong.Poon@Sun.COM 	 * because a ref would have added but we need to check the state
62711754SKacheong.Poon@Sun.COM 	 * before actually processing the packet.
62811754SKacheong.Poon@Sun.COM 	 */
62911754SKacheong.Poon@Sun.COM 	if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
63011754SKacheong.Poon@Sun.COM 		freemsg(mp);
63111754SKacheong.Poon@Sun.COM 		return;
63211754SKacheong.Poon@Sun.COM 	}
63311754SKacheong.Poon@Sun.COM 
63411754SKacheong.Poon@Sun.COM 	switch (DB_TYPE(mp)) {
63511754SKacheong.Poon@Sun.COM 	case M_IOCDATA:
63611754SKacheong.Poon@Sun.COM 		tcp_wput_iocdata(tcp, mp);
63711754SKacheong.Poon@Sun.COM 		break;
63811754SKacheong.Poon@Sun.COM 	case M_FLUSH:
63911754SKacheong.Poon@Sun.COM 		tcp_wput_flush(tcp, mp);
64011754SKacheong.Poon@Sun.COM 		break;
64111754SKacheong.Poon@Sun.COM 	default:
64211754SKacheong.Poon@Sun.COM 		ip_wput_nondata(connp->conn_wq, mp);
64311754SKacheong.Poon@Sun.COM 		break;
64411754SKacheong.Poon@Sun.COM 	}
64511754SKacheong.Poon@Sun.COM }
64611754SKacheong.Poon@Sun.COM 
64711754SKacheong.Poon@Sun.COM /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
64811754SKacheong.Poon@Sun.COM static void
tcp_wput_flush(tcp_t * tcp,mblk_t * mp)64911754SKacheong.Poon@Sun.COM tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
65011754SKacheong.Poon@Sun.COM {
65111754SKacheong.Poon@Sun.COM 	uchar_t	fval = *mp->b_rptr;
65211754SKacheong.Poon@Sun.COM 	mblk_t	*tail;
65311754SKacheong.Poon@Sun.COM 	conn_t	*connp = tcp->tcp_connp;
65411754SKacheong.Poon@Sun.COM 	queue_t	*q = connp->conn_wq;
65511754SKacheong.Poon@Sun.COM 
65611754SKacheong.Poon@Sun.COM 	/* TODO: How should flush interact with urgent data? */
65711754SKacheong.Poon@Sun.COM 	if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL &&
65811754SKacheong.Poon@Sun.COM 	    !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
65911754SKacheong.Poon@Sun.COM 		/*
66011754SKacheong.Poon@Sun.COM 		 * Flush only data that has not yet been put on the wire.  If
66111754SKacheong.Poon@Sun.COM 		 * we flush data that we have already transmitted, life, as we
66211754SKacheong.Poon@Sun.COM 		 * know it, may come to an end.
66311754SKacheong.Poon@Sun.COM 		 */
66411754SKacheong.Poon@Sun.COM 		tail = tcp->tcp_xmit_tail;
66511754SKacheong.Poon@Sun.COM 		tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
66611754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_tail_unsent = 0;
66711754SKacheong.Poon@Sun.COM 		tcp->tcp_unsent = 0;
66811754SKacheong.Poon@Sun.COM 		if (tail->b_wptr != tail->b_rptr)
66911754SKacheong.Poon@Sun.COM 			tail = tail->b_cont;
67011754SKacheong.Poon@Sun.COM 		if (tail) {
67111754SKacheong.Poon@Sun.COM 			mblk_t **excess = &tcp->tcp_xmit_head;
67211754SKacheong.Poon@Sun.COM 			for (;;) {
67311754SKacheong.Poon@Sun.COM 				mblk_t *mp1 = *excess;
67411754SKacheong.Poon@Sun.COM 				if (mp1 == tail)
67511754SKacheong.Poon@Sun.COM 					break;
67611754SKacheong.Poon@Sun.COM 				tcp->tcp_xmit_tail = mp1;
67711754SKacheong.Poon@Sun.COM 				tcp->tcp_xmit_last = mp1;
67811754SKacheong.Poon@Sun.COM 				excess = &mp1->b_cont;
67911754SKacheong.Poon@Sun.COM 			}
68011754SKacheong.Poon@Sun.COM 			*excess = NULL;
68111754SKacheong.Poon@Sun.COM 			tcp_close_mpp(&tail);
68211754SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_zcopy_aware)
68311754SKacheong.Poon@Sun.COM 				tcp_zcopy_notify(tcp);
68411754SKacheong.Poon@Sun.COM 		}
68511754SKacheong.Poon@Sun.COM 		/*
68611754SKacheong.Poon@Sun.COM 		 * We have no unsent data, so unsent must be less than
68711754SKacheong.Poon@Sun.COM 		 * conn_sndlowat, so re-enable flow.
68811754SKacheong.Poon@Sun.COM 		 */
68911754SKacheong.Poon@Sun.COM 		mutex_enter(&tcp->tcp_non_sq_lock);
69011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_flow_stopped) {
69111754SKacheong.Poon@Sun.COM 			tcp_clrqfull(tcp);
69211754SKacheong.Poon@Sun.COM 		}
69311754SKacheong.Poon@Sun.COM 		mutex_exit(&tcp->tcp_non_sq_lock);
69411754SKacheong.Poon@Sun.COM 	}
69511754SKacheong.Poon@Sun.COM 	/*
69611754SKacheong.Poon@Sun.COM 	 * TODO: you can't just flush these, you have to increase rwnd for one
69711754SKacheong.Poon@Sun.COM 	 * thing.  For another, how should urgent data interact?
69811754SKacheong.Poon@Sun.COM 	 */
69911754SKacheong.Poon@Sun.COM 	if (fval & FLUSHR) {
70011754SKacheong.Poon@Sun.COM 		*mp->b_rptr = fval & ~FLUSHW;
70111754SKacheong.Poon@Sun.COM 		/* XXX */
70211754SKacheong.Poon@Sun.COM 		qreply(q, mp);
70311754SKacheong.Poon@Sun.COM 		return;
70411754SKacheong.Poon@Sun.COM 	}
70511754SKacheong.Poon@Sun.COM 	freemsg(mp);
70611754SKacheong.Poon@Sun.COM }
70711754SKacheong.Poon@Sun.COM 
70811754SKacheong.Poon@Sun.COM /*
70911754SKacheong.Poon@Sun.COM  * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
71011754SKacheong.Poon@Sun.COM  * messages.
71111754SKacheong.Poon@Sun.COM  */
71211754SKacheong.Poon@Sun.COM static void
tcp_wput_iocdata(tcp_t * tcp,mblk_t * mp)71311754SKacheong.Poon@Sun.COM tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
71411754SKacheong.Poon@Sun.COM {
71511754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
71611754SKacheong.Poon@Sun.COM 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
71711754SKacheong.Poon@Sun.COM 	STRUCT_HANDLE(strbuf, sb);
71811754SKacheong.Poon@Sun.COM 	uint_t		addrlen;
71911754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
72011754SKacheong.Poon@Sun.COM 	queue_t 	*q = connp->conn_wq;
72111754SKacheong.Poon@Sun.COM 
72211754SKacheong.Poon@Sun.COM 	/* Make sure it is one of ours. */
72311754SKacheong.Poon@Sun.COM 	switch (iocp->ioc_cmd) {
72411754SKacheong.Poon@Sun.COM 	case TI_GETMYNAME:
72511754SKacheong.Poon@Sun.COM 	case TI_GETPEERNAME:
72611754SKacheong.Poon@Sun.COM 		break;
72711754SKacheong.Poon@Sun.COM 	default:
72811754SKacheong.Poon@Sun.COM 		/*
72911754SKacheong.Poon@Sun.COM 		 * If the conn is closing, then error the ioctl here. Otherwise
73011754SKacheong.Poon@Sun.COM 		 * use the CONN_IOCTLREF_* macros to hold off tcp_close until
73111754SKacheong.Poon@Sun.COM 		 * we're done here.
73211754SKacheong.Poon@Sun.COM 		 */
73311754SKacheong.Poon@Sun.COM 		mutex_enter(&connp->conn_lock);
73411754SKacheong.Poon@Sun.COM 		if (connp->conn_state_flags & CONN_CLOSING) {
73511754SKacheong.Poon@Sun.COM 			mutex_exit(&connp->conn_lock);
73611754SKacheong.Poon@Sun.COM 			iocp->ioc_error = EINVAL;
73711754SKacheong.Poon@Sun.COM 			mp->b_datap->db_type = M_IOCNAK;
73811754SKacheong.Poon@Sun.COM 			iocp->ioc_count = 0;
73911754SKacheong.Poon@Sun.COM 			qreply(q, mp);
74011754SKacheong.Poon@Sun.COM 			return;
74111754SKacheong.Poon@Sun.COM 		}
74211754SKacheong.Poon@Sun.COM 
74311754SKacheong.Poon@Sun.COM 		CONN_INC_IOCTLREF_LOCKED(connp);
74411754SKacheong.Poon@Sun.COM 		ip_wput_nondata(q, mp);
74511754SKacheong.Poon@Sun.COM 		CONN_DEC_IOCTLREF(connp);
74611754SKacheong.Poon@Sun.COM 		return;
74711754SKacheong.Poon@Sun.COM 	}
74811754SKacheong.Poon@Sun.COM 	switch (mi_copy_state(q, mp, &mp1)) {
74911754SKacheong.Poon@Sun.COM 	case -1:
75011754SKacheong.Poon@Sun.COM 		return;
75111754SKacheong.Poon@Sun.COM 	case MI_COPY_CASE(MI_COPY_IN, 1):
75211754SKacheong.Poon@Sun.COM 		break;
75311754SKacheong.Poon@Sun.COM 	case MI_COPY_CASE(MI_COPY_OUT, 1):
75411754SKacheong.Poon@Sun.COM 		/* Copy out the strbuf. */
75511754SKacheong.Poon@Sun.COM 		mi_copyout(q, mp);
75611754SKacheong.Poon@Sun.COM 		return;
75711754SKacheong.Poon@Sun.COM 	case MI_COPY_CASE(MI_COPY_OUT, 2):
75811754SKacheong.Poon@Sun.COM 		/* All done. */
75911754SKacheong.Poon@Sun.COM 		mi_copy_done(q, mp, 0);
76011754SKacheong.Poon@Sun.COM 		return;
76111754SKacheong.Poon@Sun.COM 	default:
76211754SKacheong.Poon@Sun.COM 		mi_copy_done(q, mp, EPROTO);
76311754SKacheong.Poon@Sun.COM 		return;
76411754SKacheong.Poon@Sun.COM 	}
76511754SKacheong.Poon@Sun.COM 	/* Check alignment of the strbuf */
76611754SKacheong.Poon@Sun.COM 	if (!OK_32PTR(mp1->b_rptr)) {
76711754SKacheong.Poon@Sun.COM 		mi_copy_done(q, mp, EINVAL);
76811754SKacheong.Poon@Sun.COM 		return;
76911754SKacheong.Poon@Sun.COM 	}
77011754SKacheong.Poon@Sun.COM 
77111754SKacheong.Poon@Sun.COM 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
77211754SKacheong.Poon@Sun.COM 
77311754SKacheong.Poon@Sun.COM 	if (connp->conn_family == AF_INET)
77411754SKacheong.Poon@Sun.COM 		addrlen = sizeof (sin_t);
77511754SKacheong.Poon@Sun.COM 	else
77611754SKacheong.Poon@Sun.COM 		addrlen = sizeof (sin6_t);
77711754SKacheong.Poon@Sun.COM 
77811754SKacheong.Poon@Sun.COM 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
77911754SKacheong.Poon@Sun.COM 		mi_copy_done(q, mp, EINVAL);
78011754SKacheong.Poon@Sun.COM 		return;
78111754SKacheong.Poon@Sun.COM 	}
78211754SKacheong.Poon@Sun.COM 
78311754SKacheong.Poon@Sun.COM 	switch (iocp->ioc_cmd) {
78411754SKacheong.Poon@Sun.COM 	case TI_GETMYNAME:
78511754SKacheong.Poon@Sun.COM 		break;
78611754SKacheong.Poon@Sun.COM 	case TI_GETPEERNAME:
78711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state < TCPS_SYN_RCVD) {
78811754SKacheong.Poon@Sun.COM 			mi_copy_done(q, mp, ENOTCONN);
78911754SKacheong.Poon@Sun.COM 			return;
79011754SKacheong.Poon@Sun.COM 		}
79111754SKacheong.Poon@Sun.COM 		break;
79211754SKacheong.Poon@Sun.COM 	}
79311754SKacheong.Poon@Sun.COM 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
79411754SKacheong.Poon@Sun.COM 	if (!mp1)
79511754SKacheong.Poon@Sun.COM 		return;
79611754SKacheong.Poon@Sun.COM 
79711754SKacheong.Poon@Sun.COM 	STRUCT_FSET(sb, len, addrlen);
79811754SKacheong.Poon@Sun.COM 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
79911754SKacheong.Poon@Sun.COM 	case TI_GETMYNAME:
80011754SKacheong.Poon@Sun.COM 		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
80111754SKacheong.Poon@Sun.COM 		    &addrlen);
80211754SKacheong.Poon@Sun.COM 		break;
80311754SKacheong.Poon@Sun.COM 	case TI_GETPEERNAME:
80411754SKacheong.Poon@Sun.COM 		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
80511754SKacheong.Poon@Sun.COM 		    &addrlen);
80611754SKacheong.Poon@Sun.COM 		break;
80711754SKacheong.Poon@Sun.COM 	}
80811754SKacheong.Poon@Sun.COM 	mp1->b_wptr += addrlen;
80911754SKacheong.Poon@Sun.COM 	/* Copy out the address */
81011754SKacheong.Poon@Sun.COM 	mi_copyout(q, mp);
81111754SKacheong.Poon@Sun.COM }
81211754SKacheong.Poon@Sun.COM 
81311754SKacheong.Poon@Sun.COM /*
81411754SKacheong.Poon@Sun.COM  * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
81511754SKacheong.Poon@Sun.COM  * messages.
81611754SKacheong.Poon@Sun.COM  */
81711754SKacheong.Poon@Sun.COM /* ARGSUSED */
81811754SKacheong.Poon@Sun.COM static void
tcp_wput_ioctl(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)81911754SKacheong.Poon@Sun.COM tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
82011754SKacheong.Poon@Sun.COM {
82111754SKacheong.Poon@Sun.COM 	conn_t 		*connp = (conn_t *)arg;
82211754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
82311754SKacheong.Poon@Sun.COM 	queue_t		*q = connp->conn_wq;
82411754SKacheong.Poon@Sun.COM 	struct iocblk	*iocp;
82511754SKacheong.Poon@Sun.COM 
82611754SKacheong.Poon@Sun.COM 	ASSERT(DB_TYPE(mp) == M_IOCTL);
82711754SKacheong.Poon@Sun.COM 	/*
82811754SKacheong.Poon@Sun.COM 	 * Try and ASSERT the minimum possible references on the
82911754SKacheong.Poon@Sun.COM 	 * conn early enough. Since we are executing on write side,
83011754SKacheong.Poon@Sun.COM 	 * the connection is obviously not detached and that means
83111754SKacheong.Poon@Sun.COM 	 * there is a ref each for TCP and IP. Since we are behind
83211754SKacheong.Poon@Sun.COM 	 * the squeue, the minimum references needed are 3. If the
83311754SKacheong.Poon@Sun.COM 	 * conn is in classifier hash list, there should be an
83411754SKacheong.Poon@Sun.COM 	 * extra ref for that (we check both the possibilities).
83511754SKacheong.Poon@Sun.COM 	 */
83611754SKacheong.Poon@Sun.COM 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
83711754SKacheong.Poon@Sun.COM 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
83811754SKacheong.Poon@Sun.COM 
83911754SKacheong.Poon@Sun.COM 	iocp = (struct iocblk *)mp->b_rptr;
84011754SKacheong.Poon@Sun.COM 	switch (iocp->ioc_cmd) {
84111754SKacheong.Poon@Sun.COM 	case _SIOCSOCKFALLBACK:
84211754SKacheong.Poon@Sun.COM 		/*
84311754SKacheong.Poon@Sun.COM 		 * Either sockmod is about to be popped and the socket
84411754SKacheong.Poon@Sun.COM 		 * would now be treated as a plain stream, or a module
84511754SKacheong.Poon@Sun.COM 		 * is about to be pushed so we could no longer use read-
84611754SKacheong.Poon@Sun.COM 		 * side synchronous streams for fused loopback tcp.
84711754SKacheong.Poon@Sun.COM 		 * Drain any queued data and disable direct sockfs
84811754SKacheong.Poon@Sun.COM 		 * interface from now on.
84911754SKacheong.Poon@Sun.COM 		 */
85011754SKacheong.Poon@Sun.COM 		if (!tcp->tcp_issocket) {
85111754SKacheong.Poon@Sun.COM 			DB_TYPE(mp) = M_IOCNAK;
85211754SKacheong.Poon@Sun.COM 			iocp->ioc_error = EINVAL;
85311754SKacheong.Poon@Sun.COM 		} else {
85411754SKacheong.Poon@Sun.COM 			tcp_use_pure_tpi(tcp);
85511754SKacheong.Poon@Sun.COM 			DB_TYPE(mp) = M_IOCACK;
85611754SKacheong.Poon@Sun.COM 			iocp->ioc_error = 0;
85711754SKacheong.Poon@Sun.COM 		}
85811754SKacheong.Poon@Sun.COM 		iocp->ioc_count = 0;
85911754SKacheong.Poon@Sun.COM 		iocp->ioc_rval = 0;
86011754SKacheong.Poon@Sun.COM 		qreply(q, mp);
86111754SKacheong.Poon@Sun.COM 		return;
86211754SKacheong.Poon@Sun.COM 	}
86311754SKacheong.Poon@Sun.COM 
86411754SKacheong.Poon@Sun.COM 	/*
86511754SKacheong.Poon@Sun.COM 	 * If the conn is closing, then error the ioctl here. Otherwise bump the
86611754SKacheong.Poon@Sun.COM 	 * conn_ioctlref to hold off tcp_close until we're done here.
86711754SKacheong.Poon@Sun.COM 	 */
86811754SKacheong.Poon@Sun.COM 	mutex_enter(&(connp)->conn_lock);
86911754SKacheong.Poon@Sun.COM 	if ((connp)->conn_state_flags & CONN_CLOSING) {
87011754SKacheong.Poon@Sun.COM 		mutex_exit(&(connp)->conn_lock);
87111754SKacheong.Poon@Sun.COM 		iocp->ioc_error = EINVAL;
87211754SKacheong.Poon@Sun.COM 		mp->b_datap->db_type = M_IOCNAK;
87311754SKacheong.Poon@Sun.COM 		iocp->ioc_count = 0;
87411754SKacheong.Poon@Sun.COM 		qreply(q, mp);
87511754SKacheong.Poon@Sun.COM 		return;
87611754SKacheong.Poon@Sun.COM 	}
87711754SKacheong.Poon@Sun.COM 
87811754SKacheong.Poon@Sun.COM 	CONN_INC_IOCTLREF_LOCKED(connp);
87911754SKacheong.Poon@Sun.COM 	ip_wput_nondata(q, mp);
88011754SKacheong.Poon@Sun.COM 	CONN_DEC_IOCTLREF(connp);
88111754SKacheong.Poon@Sun.COM }
88211754SKacheong.Poon@Sun.COM 
88311754SKacheong.Poon@Sun.COM /*
88411754SKacheong.Poon@Sun.COM  * This routine is called by tcp_wput() to handle all TPI requests.
88511754SKacheong.Poon@Sun.COM  */
88611754SKacheong.Poon@Sun.COM /* ARGSUSED */
88711754SKacheong.Poon@Sun.COM static void
tcp_wput_proto(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)88811754SKacheong.Poon@Sun.COM tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
88911754SKacheong.Poon@Sun.COM {
89011754SKacheong.Poon@Sun.COM 	conn_t		*connp = (conn_t *)arg;
89111754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
89211754SKacheong.Poon@Sun.COM 	union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
89311754SKacheong.Poon@Sun.COM 	uchar_t		*rptr;
89411754SKacheong.Poon@Sun.COM 	t_scalar_t	type;
89511754SKacheong.Poon@Sun.COM 	cred_t		*cr;
89611754SKacheong.Poon@Sun.COM 
89711754SKacheong.Poon@Sun.COM 	/*
89811754SKacheong.Poon@Sun.COM 	 * Try and ASSERT the minimum possible references on the
89911754SKacheong.Poon@Sun.COM 	 * conn early enough. Since we are executing on write side,
90011754SKacheong.Poon@Sun.COM 	 * the connection is obviously not detached and that means
90111754SKacheong.Poon@Sun.COM 	 * there is a ref each for TCP and IP. Since we are behind
90211754SKacheong.Poon@Sun.COM 	 * the squeue, the minimum references needed are 3. If the
90311754SKacheong.Poon@Sun.COM 	 * conn is in classifier hash list, there should be an
90411754SKacheong.Poon@Sun.COM 	 * extra ref for that (we check both the possibilities).
90511754SKacheong.Poon@Sun.COM 	 */
90611754SKacheong.Poon@Sun.COM 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
90711754SKacheong.Poon@Sun.COM 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
90811754SKacheong.Poon@Sun.COM 
90911754SKacheong.Poon@Sun.COM 	rptr = mp->b_rptr;
91011754SKacheong.Poon@Sun.COM 	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
91111754SKacheong.Poon@Sun.COM 	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
91211754SKacheong.Poon@Sun.COM 		type = ((union T_primitives *)rptr)->type;
91311754SKacheong.Poon@Sun.COM 		if (type == T_EXDATA_REQ) {
91411754SKacheong.Poon@Sun.COM 			tcp_output_urgent(connp, mp, arg2, NULL);
91511754SKacheong.Poon@Sun.COM 		} else if (type != T_DATA_REQ) {
91611754SKacheong.Poon@Sun.COM 			goto non_urgent_data;
91711754SKacheong.Poon@Sun.COM 		} else {
91811754SKacheong.Poon@Sun.COM 			/* TODO: options, flags, ... from user */
91911754SKacheong.Poon@Sun.COM 			/* Set length to zero for reclamation below */
92011754SKacheong.Poon@Sun.COM 			tcp_wput_data(tcp, mp->b_cont, B_TRUE);
92111754SKacheong.Poon@Sun.COM 			freeb(mp);
92211754SKacheong.Poon@Sun.COM 		}
92311754SKacheong.Poon@Sun.COM 		return;
92411754SKacheong.Poon@Sun.COM 	} else {
92511754SKacheong.Poon@Sun.COM 		if (connp->conn_debug) {
92611754SKacheong.Poon@Sun.COM 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
92711754SKacheong.Poon@Sun.COM 			    "tcp_wput_proto, dropping one...");
92811754SKacheong.Poon@Sun.COM 		}
92911754SKacheong.Poon@Sun.COM 		freemsg(mp);
93011754SKacheong.Poon@Sun.COM 		return;
93111754SKacheong.Poon@Sun.COM 	}
93211754SKacheong.Poon@Sun.COM 
93311754SKacheong.Poon@Sun.COM non_urgent_data:
93411754SKacheong.Poon@Sun.COM 
93511754SKacheong.Poon@Sun.COM 	switch ((int)tprim->type) {
93611754SKacheong.Poon@Sun.COM 	case O_T_BIND_REQ:	/* bind request */
93711754SKacheong.Poon@Sun.COM 	case T_BIND_REQ:	/* new semantics bind request */
93811754SKacheong.Poon@Sun.COM 		tcp_tpi_bind(tcp, mp);
93911754SKacheong.Poon@Sun.COM 		break;
94011754SKacheong.Poon@Sun.COM 	case T_UNBIND_REQ:	/* unbind request */
94111754SKacheong.Poon@Sun.COM 		tcp_tpi_unbind(tcp, mp);
94211754SKacheong.Poon@Sun.COM 		break;
94311754SKacheong.Poon@Sun.COM 	case O_T_CONN_RES:	/* old connection response XXX */
94411754SKacheong.Poon@Sun.COM 	case T_CONN_RES:	/* connection response */
94511754SKacheong.Poon@Sun.COM 		tcp_tli_accept(tcp, mp);
94611754SKacheong.Poon@Sun.COM 		break;
94711754SKacheong.Poon@Sun.COM 	case T_CONN_REQ:	/* connection request */
94811754SKacheong.Poon@Sun.COM 		tcp_tpi_connect(tcp, mp);
94911754SKacheong.Poon@Sun.COM 		break;
95011754SKacheong.Poon@Sun.COM 	case T_DISCON_REQ:	/* disconnect request */
95111754SKacheong.Poon@Sun.COM 		tcp_disconnect(tcp, mp);
95211754SKacheong.Poon@Sun.COM 		break;
95311754SKacheong.Poon@Sun.COM 	case T_CAPABILITY_REQ:
95411754SKacheong.Poon@Sun.COM 		tcp_capability_req(tcp, mp);	/* capability request */
95511754SKacheong.Poon@Sun.COM 		break;
95611754SKacheong.Poon@Sun.COM 	case T_INFO_REQ:	/* information request */
95711754SKacheong.Poon@Sun.COM 		tcp_info_req(tcp, mp);
95811754SKacheong.Poon@Sun.COM 		break;
95911754SKacheong.Poon@Sun.COM 	case T_SVR4_OPTMGMT_REQ:	/* manage options req */
96011754SKacheong.Poon@Sun.COM 	case T_OPTMGMT_REQ:
96111754SKacheong.Poon@Sun.COM 		/*
96211754SKacheong.Poon@Sun.COM 		 * Note:  no support for snmpcom_req() through new
96311754SKacheong.Poon@Sun.COM 		 * T_OPTMGMT_REQ. See comments in ip.c
96411754SKacheong.Poon@Sun.COM 		 */
96511754SKacheong.Poon@Sun.COM 
96611754SKacheong.Poon@Sun.COM 		/*
96711754SKacheong.Poon@Sun.COM 		 * All Solaris components should pass a db_credp
96811754SKacheong.Poon@Sun.COM 		 * for this TPI message, hence we ASSERT.
96911754SKacheong.Poon@Sun.COM 		 * But in case there is some other M_PROTO that looks
97011754SKacheong.Poon@Sun.COM 		 * like a TPI message sent by some other kernel
97111754SKacheong.Poon@Sun.COM 		 * component, we check and return an error.
97211754SKacheong.Poon@Sun.COM 		 */
97311754SKacheong.Poon@Sun.COM 		cr = msg_getcred(mp, NULL);
97411754SKacheong.Poon@Sun.COM 		ASSERT(cr != NULL);
97511754SKacheong.Poon@Sun.COM 		if (cr == NULL) {
97611754SKacheong.Poon@Sun.COM 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
97711754SKacheong.Poon@Sun.COM 			return;
97811754SKacheong.Poon@Sun.COM 		}
97911754SKacheong.Poon@Sun.COM 		/*
98011754SKacheong.Poon@Sun.COM 		 * If EINPROGRESS is returned, the request has been queued
98111754SKacheong.Poon@Sun.COM 		 * for subsequent processing by ip_restart_optmgmt(), which
98211754SKacheong.Poon@Sun.COM 		 * will do the CONN_DEC_REF().
98311754SKacheong.Poon@Sun.COM 		 */
98411754SKacheong.Poon@Sun.COM 		if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
98511754SKacheong.Poon@Sun.COM 			svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
98611754SKacheong.Poon@Sun.COM 		} else {
98711754SKacheong.Poon@Sun.COM 			tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
98811754SKacheong.Poon@Sun.COM 		}
98911754SKacheong.Poon@Sun.COM 		break;
99011754SKacheong.Poon@Sun.COM 
99111754SKacheong.Poon@Sun.COM 	case T_UNITDATA_REQ:	/* unitdata request */
99211754SKacheong.Poon@Sun.COM 		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
99311754SKacheong.Poon@Sun.COM 		break;
99411754SKacheong.Poon@Sun.COM 	case T_ORDREL_REQ:	/* orderly release req */
99511754SKacheong.Poon@Sun.COM 		freemsg(mp);
99611754SKacheong.Poon@Sun.COM 
99711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_fused)
99811754SKacheong.Poon@Sun.COM 			tcp_unfuse(tcp);
99911754SKacheong.Poon@Sun.COM 
100011754SKacheong.Poon@Sun.COM 		if (tcp_xmit_end(tcp) != 0) {
100111754SKacheong.Poon@Sun.COM 			/*
100211754SKacheong.Poon@Sun.COM 			 * We were crossing FINs and got a reset from
100311754SKacheong.Poon@Sun.COM 			 * the other side. Just ignore it.
100411754SKacheong.Poon@Sun.COM 			 */
100511754SKacheong.Poon@Sun.COM 			if (connp->conn_debug) {
100611754SKacheong.Poon@Sun.COM 				(void) strlog(TCP_MOD_ID, 0, 1,
100711754SKacheong.Poon@Sun.COM 				    SL_ERROR|SL_TRACE,
100811754SKacheong.Poon@Sun.COM 				    "tcp_wput_proto, T_ORDREL_REQ out of "
100911754SKacheong.Poon@Sun.COM 				    "state %s",
101011754SKacheong.Poon@Sun.COM 				    tcp_display(tcp, NULL,
101111754SKacheong.Poon@Sun.COM 				    DISP_ADDR_AND_PORT));
101211754SKacheong.Poon@Sun.COM 			}
101311754SKacheong.Poon@Sun.COM 		}
101411754SKacheong.Poon@Sun.COM 		break;
101511754SKacheong.Poon@Sun.COM 	case T_ADDR_REQ:
101611754SKacheong.Poon@Sun.COM 		tcp_addr_req(tcp, mp);
101711754SKacheong.Poon@Sun.COM 		break;
101811754SKacheong.Poon@Sun.COM 	default:
101911754SKacheong.Poon@Sun.COM 		if (connp->conn_debug) {
102011754SKacheong.Poon@Sun.COM 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
102111754SKacheong.Poon@Sun.COM 			    "tcp_wput_proto, bogus TPI msg, type %d",
102211754SKacheong.Poon@Sun.COM 			    tprim->type);
102311754SKacheong.Poon@Sun.COM 		}
102411754SKacheong.Poon@Sun.COM 		/*
102511754SKacheong.Poon@Sun.COM 		 * We used to M_ERROR.  Sending TNOTSUPPORT gives the user
102611754SKacheong.Poon@Sun.COM 		 * to recover.
102711754SKacheong.Poon@Sun.COM 		 */
102811754SKacheong.Poon@Sun.COM 		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
102911754SKacheong.Poon@Sun.COM 		break;
103011754SKacheong.Poon@Sun.COM 	}
103111754SKacheong.Poon@Sun.COM }
103211754SKacheong.Poon@Sun.COM 
103311754SKacheong.Poon@Sun.COM /*
103411754SKacheong.Poon@Sun.COM  * Handle special out-of-band ioctl requests (see PSARC/2008/265).
103511754SKacheong.Poon@Sun.COM  */
103611754SKacheong.Poon@Sun.COM static void
tcp_wput_cmdblk(queue_t * q,mblk_t * mp)103711754SKacheong.Poon@Sun.COM tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
103811754SKacheong.Poon@Sun.COM {
103911754SKacheong.Poon@Sun.COM 	void	*data;
104011754SKacheong.Poon@Sun.COM 	mblk_t	*datamp = mp->b_cont;
104111754SKacheong.Poon@Sun.COM 	conn_t	*connp = Q_TO_CONN(q);
104211754SKacheong.Poon@Sun.COM 	tcp_t	*tcp = connp->conn_tcp;
104311754SKacheong.Poon@Sun.COM 	cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
104411754SKacheong.Poon@Sun.COM 
104511754SKacheong.Poon@Sun.COM 	if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
104611754SKacheong.Poon@Sun.COM 		cmdp->cb_error = EPROTO;
104711754SKacheong.Poon@Sun.COM 		qreply(q, mp);
104811754SKacheong.Poon@Sun.COM 		return;
104911754SKacheong.Poon@Sun.COM 	}
105011754SKacheong.Poon@Sun.COM 
105111754SKacheong.Poon@Sun.COM 	data = datamp->b_rptr;
105211754SKacheong.Poon@Sun.COM 
105311754SKacheong.Poon@Sun.COM 	switch (cmdp->cb_cmd) {
105411754SKacheong.Poon@Sun.COM 	case TI_GETPEERNAME:
105511754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state < TCPS_SYN_RCVD)
105611754SKacheong.Poon@Sun.COM 			cmdp->cb_error = ENOTCONN;
105711754SKacheong.Poon@Sun.COM 		else
105811754SKacheong.Poon@Sun.COM 			cmdp->cb_error = conn_getpeername(connp, data,
105911754SKacheong.Poon@Sun.COM 			    &cmdp->cb_len);
106011754SKacheong.Poon@Sun.COM 		break;
106111754SKacheong.Poon@Sun.COM 	case TI_GETMYNAME:
106211754SKacheong.Poon@Sun.COM 		cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
106311754SKacheong.Poon@Sun.COM 		break;
106411754SKacheong.Poon@Sun.COM 	default:
106511754SKacheong.Poon@Sun.COM 		cmdp->cb_error = EINVAL;
106611754SKacheong.Poon@Sun.COM 		break;
106711754SKacheong.Poon@Sun.COM 	}
106811754SKacheong.Poon@Sun.COM 
106911754SKacheong.Poon@Sun.COM 	qreply(q, mp);
107011754SKacheong.Poon@Sun.COM }
107111754SKacheong.Poon@Sun.COM 
107211754SKacheong.Poon@Sun.COM /*
107311754SKacheong.Poon@Sun.COM  * The TCP fast path write put procedure.
107411754SKacheong.Poon@Sun.COM  * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
107511754SKacheong.Poon@Sun.COM  */
107611754SKacheong.Poon@Sun.COM /* ARGSUSED */
107711754SKacheong.Poon@Sun.COM void
tcp_output(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)107811754SKacheong.Poon@Sun.COM tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
107911754SKacheong.Poon@Sun.COM {
108011754SKacheong.Poon@Sun.COM 	int		len;
108111754SKacheong.Poon@Sun.COM 	int		hdrlen;
108211754SKacheong.Poon@Sun.COM 	int		plen;
108311754SKacheong.Poon@Sun.COM 	mblk_t		*mp1;
108411754SKacheong.Poon@Sun.COM 	uchar_t		*rptr;
108511754SKacheong.Poon@Sun.COM 	uint32_t	snxt;
108611754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
108711754SKacheong.Poon@Sun.COM 	struct datab	*db;
108811754SKacheong.Poon@Sun.COM 	uint32_t	suna;
108911754SKacheong.Poon@Sun.COM 	uint32_t	mss;
109011754SKacheong.Poon@Sun.COM 	ipaddr_t	*dst;
109111754SKacheong.Poon@Sun.COM 	ipaddr_t	*src;
109211754SKacheong.Poon@Sun.COM 	uint32_t	sum;
109311754SKacheong.Poon@Sun.COM 	int		usable;
109411754SKacheong.Poon@Sun.COM 	conn_t		*connp = (conn_t *)arg;
109511754SKacheong.Poon@Sun.COM 	tcp_t		*tcp = connp->conn_tcp;
109611754SKacheong.Poon@Sun.COM 	uint32_t	msize;
109711754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
109811754SKacheong.Poon@Sun.COM 	ip_xmit_attr_t	*ixa;
109911754SKacheong.Poon@Sun.COM 	clock_t		now;
110011754SKacheong.Poon@Sun.COM 
110111754SKacheong.Poon@Sun.COM 	/*
110211754SKacheong.Poon@Sun.COM 	 * Try and ASSERT the minimum possible references on the
110311754SKacheong.Poon@Sun.COM 	 * conn early enough. Since we are executing on write side,
110411754SKacheong.Poon@Sun.COM 	 * the connection is obviously not detached and that means
110511754SKacheong.Poon@Sun.COM 	 * there is a ref each for TCP and IP. Since we are behind
110611754SKacheong.Poon@Sun.COM 	 * the squeue, the minimum references needed are 3. If the
110711754SKacheong.Poon@Sun.COM 	 * conn is in classifier hash list, there should be an
110811754SKacheong.Poon@Sun.COM 	 * extra ref for that (we check both the possibilities).
110911754SKacheong.Poon@Sun.COM 	 */
111011754SKacheong.Poon@Sun.COM 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
111111754SKacheong.Poon@Sun.COM 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
111211754SKacheong.Poon@Sun.COM 
111311754SKacheong.Poon@Sun.COM 	ASSERT(DB_TYPE(mp) == M_DATA);
111411754SKacheong.Poon@Sun.COM 	msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
111511754SKacheong.Poon@Sun.COM 
111611754SKacheong.Poon@Sun.COM 	mutex_enter(&tcp->tcp_non_sq_lock);
111711754SKacheong.Poon@Sun.COM 	tcp->tcp_squeue_bytes -= msize;
111811754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_non_sq_lock);
111911754SKacheong.Poon@Sun.COM 
112011754SKacheong.Poon@Sun.COM 	/* Bypass tcp protocol for fused tcp loopback */
112111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
112211754SKacheong.Poon@Sun.COM 		return;
112311754SKacheong.Poon@Sun.COM 
112411754SKacheong.Poon@Sun.COM 	mss = tcp->tcp_mss;
112511754SKacheong.Poon@Sun.COM 	/*
112611754SKacheong.Poon@Sun.COM 	 * If ZEROCOPY has turned off, try not to send any zero-copy message
112711754SKacheong.Poon@Sun.COM 	 * down. Do backoff, now.
112811754SKacheong.Poon@Sun.COM 	 */
112911754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
113011754SKacheong.Poon@Sun.COM 		mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);
113111754SKacheong.Poon@Sun.COM 
113211754SKacheong.Poon@Sun.COM 
113311754SKacheong.Poon@Sun.COM 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
113411754SKacheong.Poon@Sun.COM 	len = (int)(mp->b_wptr - mp->b_rptr);
113511754SKacheong.Poon@Sun.COM 
113611754SKacheong.Poon@Sun.COM 	/*
113711754SKacheong.Poon@Sun.COM 	 * Criteria for fast path:
113811754SKacheong.Poon@Sun.COM 	 *
113911754SKacheong.Poon@Sun.COM 	 *   1. no unsent data
114011754SKacheong.Poon@Sun.COM 	 *   2. single mblk in request
114111754SKacheong.Poon@Sun.COM 	 *   3. connection established
114211754SKacheong.Poon@Sun.COM 	 *   4. data in mblk
114311754SKacheong.Poon@Sun.COM 	 *   5. len <= mss
114411754SKacheong.Poon@Sun.COM 	 *   6. no tcp_valid bits
114511754SKacheong.Poon@Sun.COM 	 */
114611754SKacheong.Poon@Sun.COM 	if ((tcp->tcp_unsent != 0) ||
114711754SKacheong.Poon@Sun.COM 	    (tcp->tcp_cork) ||
114811754SKacheong.Poon@Sun.COM 	    (mp->b_cont != NULL) ||
114911754SKacheong.Poon@Sun.COM 	    (tcp->tcp_state != TCPS_ESTABLISHED) ||
115011754SKacheong.Poon@Sun.COM 	    (len == 0) ||
115111754SKacheong.Poon@Sun.COM 	    (len > mss) ||
115211754SKacheong.Poon@Sun.COM 	    (tcp->tcp_valid_bits != 0)) {
115311754SKacheong.Poon@Sun.COM 		tcp_wput_data(tcp, mp, B_FALSE);
115411754SKacheong.Poon@Sun.COM 		return;
115511754SKacheong.Poon@Sun.COM 	}
115611754SKacheong.Poon@Sun.COM 
115711754SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
115811754SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_fin_sent == 0);
115911754SKacheong.Poon@Sun.COM 
116011754SKacheong.Poon@Sun.COM 	/* queue new packet onto retransmission queue */
116111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_xmit_head == NULL) {
116211754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_head = mp;
116311754SKacheong.Poon@Sun.COM 	} else {
116411754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_last->b_cont = mp;
116511754SKacheong.Poon@Sun.COM 	}
116611754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_last = mp;
116711754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_tail = mp;
116811754SKacheong.Poon@Sun.COM 
116911754SKacheong.Poon@Sun.COM 	/* find out how much we can send */
117011754SKacheong.Poon@Sun.COM 	/* BEGIN CSTYLED */
117111754SKacheong.Poon@Sun.COM 	/*
117211754SKacheong.Poon@Sun.COM 	 *    un-acked	   usable
117311754SKacheong.Poon@Sun.COM 	 *  |--------------|-----------------|
117411754SKacheong.Poon@Sun.COM 	 *  tcp_suna       tcp_snxt	  tcp_suna+tcp_swnd
117511754SKacheong.Poon@Sun.COM 	 */
117611754SKacheong.Poon@Sun.COM 	/* END CSTYLED */
117711754SKacheong.Poon@Sun.COM 
117811754SKacheong.Poon@Sun.COM 	/* start sending from tcp_snxt */
117911754SKacheong.Poon@Sun.COM 	snxt = tcp->tcp_snxt;
118011754SKacheong.Poon@Sun.COM 
118111754SKacheong.Poon@Sun.COM 	/*
118211754SKacheong.Poon@Sun.COM 	 * Check to see if this connection has been idled for some
118311754SKacheong.Poon@Sun.COM 	 * time and no ACK is expected.  If it is, we need to slow
118411754SKacheong.Poon@Sun.COM 	 * start again to get back the connection's "self-clock" as
118511754SKacheong.Poon@Sun.COM 	 * described in VJ's paper.
118611754SKacheong.Poon@Sun.COM 	 *
118711754SKacheong.Poon@Sun.COM 	 * Reinitialize tcp_cwnd after idle.
118811754SKacheong.Poon@Sun.COM 	 */
118911754SKacheong.Poon@Sun.COM 	now = LBOLT_FASTPATH;
119011754SKacheong.Poon@Sun.COM 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
119111754SKacheong.Poon@Sun.COM 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
119211754SKacheong.Poon@Sun.COM 		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
119311754SKacheong.Poon@Sun.COM 	}
119411754SKacheong.Poon@Sun.COM 
119511754SKacheong.Poon@Sun.COM 	usable = tcp->tcp_swnd;		/* tcp window size */
119611754SKacheong.Poon@Sun.COM 	if (usable > tcp->tcp_cwnd)
119711754SKacheong.Poon@Sun.COM 		usable = tcp->tcp_cwnd;	/* congestion window smaller */
119811754SKacheong.Poon@Sun.COM 	usable -= snxt;		/* subtract stuff already sent */
119911754SKacheong.Poon@Sun.COM 	suna = tcp->tcp_suna;
120011754SKacheong.Poon@Sun.COM 	usable += suna;
120111754SKacheong.Poon@Sun.COM 	/* usable can be < 0 if the congestion window is smaller */
120211754SKacheong.Poon@Sun.COM 	if (len > usable) {
120311754SKacheong.Poon@Sun.COM 		/* Can't send complete M_DATA in one shot */
120411754SKacheong.Poon@Sun.COM 		goto slow;
120511754SKacheong.Poon@Sun.COM 	}
120611754SKacheong.Poon@Sun.COM 
120711754SKacheong.Poon@Sun.COM 	mutex_enter(&tcp->tcp_non_sq_lock);
120811754SKacheong.Poon@Sun.COM 	if (tcp->tcp_flow_stopped &&
120911754SKacheong.Poon@Sun.COM 	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
121011754SKacheong.Poon@Sun.COM 		tcp_clrqfull(tcp);
121111754SKacheong.Poon@Sun.COM 	}
121211754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_non_sq_lock);
121311754SKacheong.Poon@Sun.COM 
121411754SKacheong.Poon@Sun.COM 	/*
121511754SKacheong.Poon@Sun.COM 	 * determine if anything to send (Nagle).
121611754SKacheong.Poon@Sun.COM 	 *
121711754SKacheong.Poon@Sun.COM 	 *   1. len < tcp_mss (i.e. small)
121811754SKacheong.Poon@Sun.COM 	 *   2. unacknowledged data present
121911754SKacheong.Poon@Sun.COM 	 *   3. len < nagle limit
122011754SKacheong.Poon@Sun.COM 	 *   4. last packet sent < nagle limit (previous packet sent)
122111754SKacheong.Poon@Sun.COM 	 */
122211754SKacheong.Poon@Sun.COM 	if ((len < mss) && (snxt != suna) &&
122311754SKacheong.Poon@Sun.COM 	    (len < (int)tcp->tcp_naglim) &&
122411754SKacheong.Poon@Sun.COM 	    (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
122511754SKacheong.Poon@Sun.COM 		/*
122611754SKacheong.Poon@Sun.COM 		 * This was the first unsent packet and normally
122711754SKacheong.Poon@Sun.COM 		 * mss < xmit_hiwater so there is no need to worry
122811754SKacheong.Poon@Sun.COM 		 * about flow control. The next packet will go
122911754SKacheong.Poon@Sun.COM 		 * through the flow control check in tcp_wput_data().
123011754SKacheong.Poon@Sun.COM 		 */
123111754SKacheong.Poon@Sun.COM 		/* leftover work from above */
123211754SKacheong.Poon@Sun.COM 		tcp->tcp_unsent = len;
123311754SKacheong.Poon@Sun.COM 		tcp->tcp_xmit_tail_unsent = len;
123411754SKacheong.Poon@Sun.COM 
123511754SKacheong.Poon@Sun.COM 		return;
123611754SKacheong.Poon@Sun.COM 	}
123711754SKacheong.Poon@Sun.COM 
123811754SKacheong.Poon@Sun.COM 	/*
123911754SKacheong.Poon@Sun.COM 	 * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
124011754SKacheong.Poon@Sun.COM 	 * send now.
124111754SKacheong.Poon@Sun.COM 	 */
124211754SKacheong.Poon@Sun.COM 
124311754SKacheong.Poon@Sun.COM 	if (snxt == suna) {
124411754SKacheong.Poon@Sun.COM 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
124511754SKacheong.Poon@Sun.COM 	}
124611754SKacheong.Poon@Sun.COM 
124711754SKacheong.Poon@Sun.COM 	/* we have always sent something */
124811754SKacheong.Poon@Sun.COM 	tcp->tcp_rack_cnt = 0;
124911754SKacheong.Poon@Sun.COM 
125011754SKacheong.Poon@Sun.COM 	tcp->tcp_snxt = snxt + len;
125111754SKacheong.Poon@Sun.COM 	tcp->tcp_rack = tcp->tcp_rnxt;
125211754SKacheong.Poon@Sun.COM 
125311754SKacheong.Poon@Sun.COM 	if ((mp1 = dupb(mp)) == 0)
125411754SKacheong.Poon@Sun.COM 		goto no_memory;
125511754SKacheong.Poon@Sun.COM 	mp->b_prev = (mblk_t *)(uintptr_t)now;
125611754SKacheong.Poon@Sun.COM 	mp->b_next = (mblk_t *)(uintptr_t)snxt;
125711754SKacheong.Poon@Sun.COM 
125811754SKacheong.Poon@Sun.COM 	/* adjust tcp header information */
125911754SKacheong.Poon@Sun.COM 	tcpha = tcp->tcp_tcpha;
126011754SKacheong.Poon@Sun.COM 	tcpha->tha_flags = (TH_ACK|TH_PUSH);
126111754SKacheong.Poon@Sun.COM 
126211754SKacheong.Poon@Sun.COM 	sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
126311754SKacheong.Poon@Sun.COM 	sum = (sum >> 16) + (sum & 0xFFFF);
126411754SKacheong.Poon@Sun.COM 	tcpha->tha_sum = htons(sum);
126511754SKacheong.Poon@Sun.COM 
126611754SKacheong.Poon@Sun.COM 	tcpha->tha_seq = htonl(snxt);
126711754SKacheong.Poon@Sun.COM 
126811754SKacheong.Poon@Sun.COM 	TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
126911754SKacheong.Poon@Sun.COM 	TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
127011754SKacheong.Poon@Sun.COM 	BUMP_LOCAL(tcp->tcp_obsegs);
127111754SKacheong.Poon@Sun.COM 
127211754SKacheong.Poon@Sun.COM 	/* Update the latest receive window size in TCP header. */
127311754SKacheong.Poon@Sun.COM 	tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
127411754SKacheong.Poon@Sun.COM 
127511754SKacheong.Poon@Sun.COM 	tcp->tcp_last_sent_len = (ushort_t)len;
127611754SKacheong.Poon@Sun.COM 
127711754SKacheong.Poon@Sun.COM 	plen = len + connp->conn_ht_iphc_len;
127811754SKacheong.Poon@Sun.COM 
127911754SKacheong.Poon@Sun.COM 	ixa = connp->conn_ixa;
128011754SKacheong.Poon@Sun.COM 	ixa->ixa_pktlen = plen;
128111754SKacheong.Poon@Sun.COM 
128211754SKacheong.Poon@Sun.COM 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
128311754SKacheong.Poon@Sun.COM 		tcp->tcp_ipha->ipha_length = htons(plen);
128411754SKacheong.Poon@Sun.COM 	} else {
128511754SKacheong.Poon@Sun.COM 		tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
128611754SKacheong.Poon@Sun.COM 	}
128711754SKacheong.Poon@Sun.COM 
128811754SKacheong.Poon@Sun.COM 	/* see if we need to allocate a mblk for the headers */
128911754SKacheong.Poon@Sun.COM 	hdrlen = connp->conn_ht_iphc_len;
129011754SKacheong.Poon@Sun.COM 	rptr = mp1->b_rptr - hdrlen;
129111754SKacheong.Poon@Sun.COM 	db = mp1->b_datap;
129211754SKacheong.Poon@Sun.COM 	if ((db->db_ref != 2) || rptr < db->db_base ||
129311754SKacheong.Poon@Sun.COM 	    (!OK_32PTR(rptr))) {
129411754SKacheong.Poon@Sun.COM 		/* NOTE: we assume allocb returns an OK_32PTR */
129511754SKacheong.Poon@Sun.COM 		mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
129611754SKacheong.Poon@Sun.COM 		if (!mp) {
129711754SKacheong.Poon@Sun.COM 			freemsg(mp1);
129811754SKacheong.Poon@Sun.COM 			goto no_memory;
129911754SKacheong.Poon@Sun.COM 		}
130011754SKacheong.Poon@Sun.COM 		mp->b_cont = mp1;
130111754SKacheong.Poon@Sun.COM 		mp1 = mp;
130211754SKacheong.Poon@Sun.COM 		/* Leave room for Link Level header */
130311754SKacheong.Poon@Sun.COM 		rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
130411754SKacheong.Poon@Sun.COM 		mp1->b_wptr = &rptr[hdrlen];
130511754SKacheong.Poon@Sun.COM 	}
130611754SKacheong.Poon@Sun.COM 	mp1->b_rptr = rptr;
130711754SKacheong.Poon@Sun.COM 
130811754SKacheong.Poon@Sun.COM 	/* Fill in the timestamp option. */
130911754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ts_ok) {
131011754SKacheong.Poon@Sun.COM 		uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
131111754SKacheong.Poon@Sun.COM 
131211754SKacheong.Poon@Sun.COM 		U32_TO_BE32(llbolt,
131311754SKacheong.Poon@Sun.COM 		    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
131411754SKacheong.Poon@Sun.COM 		U32_TO_BE32(tcp->tcp_ts_recent,
131511754SKacheong.Poon@Sun.COM 		    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
131611754SKacheong.Poon@Sun.COM 	} else {
131711754SKacheong.Poon@Sun.COM 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
131811754SKacheong.Poon@Sun.COM 	}
131911754SKacheong.Poon@Sun.COM 
132011754SKacheong.Poon@Sun.COM 	/* copy header into outgoing packet */
132111754SKacheong.Poon@Sun.COM 	dst = (ipaddr_t *)rptr;
132211754SKacheong.Poon@Sun.COM 	src = (ipaddr_t *)connp->conn_ht_iphc;
132311754SKacheong.Poon@Sun.COM 	dst[0] = src[0];
132411754SKacheong.Poon@Sun.COM 	dst[1] = src[1];
132511754SKacheong.Poon@Sun.COM 	dst[2] = src[2];
132611754SKacheong.Poon@Sun.COM 	dst[3] = src[3];
132711754SKacheong.Poon@Sun.COM 	dst[4] = src[4];
132811754SKacheong.Poon@Sun.COM 	dst[5] = src[5];
132911754SKacheong.Poon@Sun.COM 	dst[6] = src[6];
133011754SKacheong.Poon@Sun.COM 	dst[7] = src[7];
133111754SKacheong.Poon@Sun.COM 	dst[8] = src[8];
133211754SKacheong.Poon@Sun.COM 	dst[9] = src[9];
133311754SKacheong.Poon@Sun.COM 	if (hdrlen -= 40) {
133411754SKacheong.Poon@Sun.COM 		hdrlen >>= 2;
133511754SKacheong.Poon@Sun.COM 		dst += 10;
133611754SKacheong.Poon@Sun.COM 		src += 10;
133711754SKacheong.Poon@Sun.COM 		do {
133811754SKacheong.Poon@Sun.COM 			*dst++ = *src++;
133911754SKacheong.Poon@Sun.COM 		} while (--hdrlen);
134011754SKacheong.Poon@Sun.COM 	}
134111754SKacheong.Poon@Sun.COM 
134211754SKacheong.Poon@Sun.COM 	/*
134311754SKacheong.Poon@Sun.COM 	 * Set the ECN info in the TCP header.  Note that this
134411754SKacheong.Poon@Sun.COM 	 * is not the template header.
134511754SKacheong.Poon@Sun.COM 	 */
134611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ecn_ok) {
134711754SKacheong.Poon@Sun.COM 		TCP_SET_ECT(tcp, rptr);
134811754SKacheong.Poon@Sun.COM 
134911754SKacheong.Poon@Sun.COM 		tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
135011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_echo_on)
135111754SKacheong.Poon@Sun.COM 			tcpha->tha_flags |= TH_ECE;
135211754SKacheong.Poon@Sun.COM 		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
135311754SKacheong.Poon@Sun.COM 			tcpha->tha_flags |= TH_CWR;
135411754SKacheong.Poon@Sun.COM 			tcp->tcp_ecn_cwr_sent = B_TRUE;
135511754SKacheong.Poon@Sun.COM 		}
135611754SKacheong.Poon@Sun.COM 	}
135711754SKacheong.Poon@Sun.COM 
135811754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ip_forward_progress) {
135911754SKacheong.Poon@Sun.COM 		tcp->tcp_ip_forward_progress = B_FALSE;
136011754SKacheong.Poon@Sun.COM 		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
136111754SKacheong.Poon@Sun.COM 	} else {
136211754SKacheong.Poon@Sun.COM 		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
136311754SKacheong.Poon@Sun.COM 	}
136411754SKacheong.Poon@Sun.COM 	tcp_send_data(tcp, mp1);
136511754SKacheong.Poon@Sun.COM 	return;
136611754SKacheong.Poon@Sun.COM 
136711754SKacheong.Poon@Sun.COM 	/*
136811754SKacheong.Poon@Sun.COM 	 * If we ran out of memory, we pretend to have sent the packet
136911754SKacheong.Poon@Sun.COM 	 * and that it was lost on the wire.
137011754SKacheong.Poon@Sun.COM 	 */
137111754SKacheong.Poon@Sun.COM no_memory:
137211754SKacheong.Poon@Sun.COM 	return;
137311754SKacheong.Poon@Sun.COM 
137411754SKacheong.Poon@Sun.COM slow:
137511754SKacheong.Poon@Sun.COM 	/* leftover work from above */
137611754SKacheong.Poon@Sun.COM 	tcp->tcp_unsent = len;
137711754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_tail_unsent = len;
137811754SKacheong.Poon@Sun.COM 	tcp_wput_data(tcp, NULL, B_FALSE);
137911754SKacheong.Poon@Sun.COM }
138011754SKacheong.Poon@Sun.COM 
138111754SKacheong.Poon@Sun.COM /* ARGSUSED2 */
138211754SKacheong.Poon@Sun.COM void
tcp_output_urgent(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)138311754SKacheong.Poon@Sun.COM tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
138411754SKacheong.Poon@Sun.COM {
138511754SKacheong.Poon@Sun.COM 	int len;
138611754SKacheong.Poon@Sun.COM 	uint32_t msize;
138711754SKacheong.Poon@Sun.COM 	conn_t *connp = (conn_t *)arg;
138811754SKacheong.Poon@Sun.COM 	tcp_t *tcp = connp->conn_tcp;
138911754SKacheong.Poon@Sun.COM 
139011754SKacheong.Poon@Sun.COM 	msize = msgdsize(mp);
139111754SKacheong.Poon@Sun.COM 
139211754SKacheong.Poon@Sun.COM 	len = msize - 1;
139311754SKacheong.Poon@Sun.COM 	if (len < 0) {
139411754SKacheong.Poon@Sun.COM 		freemsg(mp);
139511754SKacheong.Poon@Sun.COM 		return;
139611754SKacheong.Poon@Sun.COM 	}
139711754SKacheong.Poon@Sun.COM 
139811754SKacheong.Poon@Sun.COM 	/*
139911754SKacheong.Poon@Sun.COM 	 * Try to force urgent data out on the wire. Even if we have unsent
140011754SKacheong.Poon@Sun.COM 	 * data this will at least send the urgent flag.
140111754SKacheong.Poon@Sun.COM 	 * XXX does not handle more flag correctly.
140211754SKacheong.Poon@Sun.COM 	 */
140311754SKacheong.Poon@Sun.COM 	len += tcp->tcp_unsent;
140411754SKacheong.Poon@Sun.COM 	len += tcp->tcp_snxt;
140511754SKacheong.Poon@Sun.COM 	tcp->tcp_urg = len;
140611754SKacheong.Poon@Sun.COM 	tcp->tcp_valid_bits |= TCP_URG_VALID;
140711754SKacheong.Poon@Sun.COM 
140811754SKacheong.Poon@Sun.COM 	/* Bypass tcp protocol for fused tcp loopback */
140911754SKacheong.Poon@Sun.COM 	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
141011754SKacheong.Poon@Sun.COM 		return;
141111754SKacheong.Poon@Sun.COM 
141211754SKacheong.Poon@Sun.COM 	/* Strip off the T_EXDATA_REQ if the data is from TPI */
141311754SKacheong.Poon@Sun.COM 	if (DB_TYPE(mp) != M_DATA) {
141411754SKacheong.Poon@Sun.COM 		mblk_t *mp1 = mp;
141511754SKacheong.Poon@Sun.COM 		ASSERT(!IPCL_IS_NONSTR(connp));
141611754SKacheong.Poon@Sun.COM 		mp = mp->b_cont;
141711754SKacheong.Poon@Sun.COM 		freeb(mp1);
141811754SKacheong.Poon@Sun.COM 	}
141911754SKacheong.Poon@Sun.COM 	tcp_wput_data(tcp, mp, B_TRUE);
142011754SKacheong.Poon@Sun.COM }
142111754SKacheong.Poon@Sun.COM 
/*
 * Called by the streams close routine via squeues when our client closes
 * its descriptor.  We take this to mean: "close the stream state NOW, close
 * the tcp connection politely".  When SO_LINGER is set (with a non-zero
 * linger time and it is not a nonblocking socket) then this routine sleeps
 * until the FIN is acked.
 *
 * NOTE: tcp_close potentially returns error when lingering.
 * However, the stream head currently does not pass these errors
 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
 * errors to the application (from tsleep()) and not errors
 * like ECONNRESET caused by receiving a reset packet.
 */
143511754SKacheong.Poon@Sun.COM 
143611754SKacheong.Poon@Sun.COM /* ARGSUSED */
143711754SKacheong.Poon@Sun.COM void
tcp_close_output(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)143811754SKacheong.Poon@Sun.COM tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
143911754SKacheong.Poon@Sun.COM {
144011754SKacheong.Poon@Sun.COM 	char	*msg;
144111754SKacheong.Poon@Sun.COM 	conn_t	*connp = (conn_t *)arg;
144211754SKacheong.Poon@Sun.COM 	tcp_t	*tcp = connp->conn_tcp;
144311754SKacheong.Poon@Sun.COM 	clock_t	delta = 0;
144411754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
144511754SKacheong.Poon@Sun.COM 
144612643SAnders.Persson@Sun.COM 	/*
144712643SAnders.Persson@Sun.COM 	 * When a non-STREAMS socket is being closed, it does not always
144812643SAnders.Persson@Sun.COM 	 * stick around waiting for tcp_close_output to run and can therefore
144912643SAnders.Persson@Sun.COM 	 * have dropped a reference already. So adjust the asserts accordingly.
145012643SAnders.Persson@Sun.COM 	 */
145112643SAnders.Persson@Sun.COM 	ASSERT((connp->conn_fanout != NULL &&
145212643SAnders.Persson@Sun.COM 	    connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) ||
145312643SAnders.Persson@Sun.COM 	    (connp->conn_fanout == NULL &&
145412643SAnders.Persson@Sun.COM 	    connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)));
145511754SKacheong.Poon@Sun.COM 
145611754SKacheong.Poon@Sun.COM 	mutex_enter(&tcp->tcp_eager_lock);
145711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
145812643SAnders.Persson@Sun.COM 		/*
145912643SAnders.Persson@Sun.COM 		 * Cleanup for listener. For non-STREAM sockets sockfs will
146012643SAnders.Persson@Sun.COM 		 * close all the eagers on 'q', so in that case only deal
146112643SAnders.Persson@Sun.COM 		 * with 'q0'.
146212643SAnders.Persson@Sun.COM 		 */
146312643SAnders.Persson@Sun.COM 		tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0);
146411754SKacheong.Poon@Sun.COM 		tcp->tcp_wait_for_eagers = 1;
146511754SKacheong.Poon@Sun.COM 	}
146611754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_eager_lock);
146711754SKacheong.Poon@Sun.COM 
146811754SKacheong.Poon@Sun.COM 	tcp->tcp_lso = B_FALSE;
146911754SKacheong.Poon@Sun.COM 
147011754SKacheong.Poon@Sun.COM 	msg = NULL;
147111754SKacheong.Poon@Sun.COM 	switch (tcp->tcp_state) {
147211754SKacheong.Poon@Sun.COM 	case TCPS_CLOSED:
147311754SKacheong.Poon@Sun.COM 	case TCPS_IDLE:
147412840SAnders.Persson@Sun.COM 		break;
147511754SKacheong.Poon@Sun.COM 	case TCPS_BOUND:
147612840SAnders.Persson@Sun.COM 		if (tcp->tcp_listener != NULL) {
147712840SAnders.Persson@Sun.COM 			ASSERT(IPCL_IS_NONSTR(connp));
147812840SAnders.Persson@Sun.COM 			/*
147912840SAnders.Persson@Sun.COM 			 * Unlink from the listener and drop the reference
148012840SAnders.Persson@Sun.COM 			 * put on it by the eager. tcp_closei_local will not
148112840SAnders.Persson@Sun.COM 			 * do it because tcp_tconnind_started is TRUE.
148212840SAnders.Persson@Sun.COM 			 */
148312840SAnders.Persson@Sun.COM 			mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
148412840SAnders.Persson@Sun.COM 			tcp_eager_unlink(tcp);
148512840SAnders.Persson@Sun.COM 			mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
148612840SAnders.Persson@Sun.COM 			CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
148712840SAnders.Persson@Sun.COM 		}
148812840SAnders.Persson@Sun.COM 		break;
148911754SKacheong.Poon@Sun.COM 	case TCPS_LISTEN:
149011754SKacheong.Poon@Sun.COM 		break;
149111754SKacheong.Poon@Sun.COM 	case TCPS_SYN_SENT:
149211754SKacheong.Poon@Sun.COM 		msg = "tcp_close, during connect";
149311754SKacheong.Poon@Sun.COM 		break;
149411754SKacheong.Poon@Sun.COM 	case TCPS_SYN_RCVD:
149511754SKacheong.Poon@Sun.COM 		/*
149611754SKacheong.Poon@Sun.COM 		 * Close during the connect 3-way handshake
149711754SKacheong.Poon@Sun.COM 		 * but here there may or may not be pending data
149811754SKacheong.Poon@Sun.COM 		 * already on queue. Process almost same as in
149911754SKacheong.Poon@Sun.COM 		 * the ESTABLISHED state.
150011754SKacheong.Poon@Sun.COM 		 */
150111754SKacheong.Poon@Sun.COM 		/* FALLTHRU */
150211754SKacheong.Poon@Sun.COM 	default:
150311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_fused)
150411754SKacheong.Poon@Sun.COM 			tcp_unfuse(tcp);
150511754SKacheong.Poon@Sun.COM 
150611754SKacheong.Poon@Sun.COM 		/*
150711754SKacheong.Poon@Sun.COM 		 * If SO_LINGER has set a zero linger time, abort the
150811754SKacheong.Poon@Sun.COM 		 * connection with a reset.
150911754SKacheong.Poon@Sun.COM 		 */
151011754SKacheong.Poon@Sun.COM 		if (connp->conn_linger && connp->conn_lingertime == 0) {
151111754SKacheong.Poon@Sun.COM 			msg = "tcp_close, zero lingertime";
151211754SKacheong.Poon@Sun.COM 			break;
151311754SKacheong.Poon@Sun.COM 		}
151411754SKacheong.Poon@Sun.COM 
151511754SKacheong.Poon@Sun.COM 		/*
151611754SKacheong.Poon@Sun.COM 		 * Abort connection if there is unread data queued.
151711754SKacheong.Poon@Sun.COM 		 */
151811754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
151911754SKacheong.Poon@Sun.COM 			msg = "tcp_close, unread data";
152011754SKacheong.Poon@Sun.COM 			break;
152111754SKacheong.Poon@Sun.COM 		}
152212643SAnders.Persson@Sun.COM 
152311754SKacheong.Poon@Sun.COM 		/*
152412643SAnders.Persson@Sun.COM 		 * Abort connection if it is being closed without first
152512643SAnders.Persson@Sun.COM 		 * being accepted. This can happen if a listening non-STREAM
152612643SAnders.Persson@Sun.COM 		 * socket wants to get rid of the socket, for example, if the
152712643SAnders.Persson@Sun.COM 		 * listener is closing.
152811754SKacheong.Poon@Sun.COM 		 */
152912643SAnders.Persson@Sun.COM 		if (tcp->tcp_listener != NULL) {
153012643SAnders.Persson@Sun.COM 			ASSERT(IPCL_IS_NONSTR(connp));
153112643SAnders.Persson@Sun.COM 			msg = "tcp_close, close before accept";
153212643SAnders.Persson@Sun.COM 
153312643SAnders.Persson@Sun.COM 			/*
153412643SAnders.Persson@Sun.COM 			 * Unlink from the listener and drop the reference
153512643SAnders.Persson@Sun.COM 			 * put on it by the eager. tcp_closei_local will not
153612643SAnders.Persson@Sun.COM 			 * do it because tcp_tconnind_started is TRUE.
153712643SAnders.Persson@Sun.COM 			 */
153812643SAnders.Persson@Sun.COM 			mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
153912643SAnders.Persson@Sun.COM 			tcp_eager_unlink(tcp);
154012643SAnders.Persson@Sun.COM 			mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
154112643SAnders.Persson@Sun.COM 			CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
154211754SKacheong.Poon@Sun.COM 			break;
154312643SAnders.Persson@Sun.COM 		}
154411754SKacheong.Poon@Sun.COM 
154511754SKacheong.Poon@Sun.COM 		/*
154611754SKacheong.Poon@Sun.COM 		 * Transmit the FIN before detaching the tcp_t.
154711754SKacheong.Poon@Sun.COM 		 * After tcp_detach returns this queue/perimeter
154811754SKacheong.Poon@Sun.COM 		 * no longer owns the tcp_t thus others can modify it.
154911754SKacheong.Poon@Sun.COM 		 */
155011754SKacheong.Poon@Sun.COM 		(void) tcp_xmit_end(tcp);
155111754SKacheong.Poon@Sun.COM 
155211754SKacheong.Poon@Sun.COM 		/*
155311754SKacheong.Poon@Sun.COM 		 * If lingering on close then wait until the fin is acked,
155411754SKacheong.Poon@Sun.COM 		 * the SO_LINGER time passes, or a reset is sent/received.
155511754SKacheong.Poon@Sun.COM 		 */
155611754SKacheong.Poon@Sun.COM 		if (connp->conn_linger && connp->conn_lingertime > 0 &&
155711754SKacheong.Poon@Sun.COM 		    !(tcp->tcp_fin_acked) &&
155811754SKacheong.Poon@Sun.COM 		    tcp->tcp_state >= TCPS_ESTABLISHED) {
155911754SKacheong.Poon@Sun.COM 			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
156011754SKacheong.Poon@Sun.COM 				tcp->tcp_client_errno = EWOULDBLOCK;
156111754SKacheong.Poon@Sun.COM 			} else if (tcp->tcp_client_errno == 0) {
156211754SKacheong.Poon@Sun.COM 
156311754SKacheong.Poon@Sun.COM 				ASSERT(tcp->tcp_linger_tid == 0);
156411754SKacheong.Poon@Sun.COM 
156512056SKacheong.Poon@Sun.COM 				/* conn_lingertime is in sec. */
156611754SKacheong.Poon@Sun.COM 				tcp->tcp_linger_tid = TCP_TIMER(tcp,
156711754SKacheong.Poon@Sun.COM 				    tcp_close_linger_timeout,
156812056SKacheong.Poon@Sun.COM 				    connp->conn_lingertime * MILLISEC);
156911754SKacheong.Poon@Sun.COM 
157011754SKacheong.Poon@Sun.COM 				/* tcp_close_linger_timeout will finish close */
157111754SKacheong.Poon@Sun.COM 				if (tcp->tcp_linger_tid == 0)
157211754SKacheong.Poon@Sun.COM 					tcp->tcp_client_errno = ENOSR;
157311754SKacheong.Poon@Sun.COM 				else
157411754SKacheong.Poon@Sun.COM 					return;
157511754SKacheong.Poon@Sun.COM 			}
157611754SKacheong.Poon@Sun.COM 
157711754SKacheong.Poon@Sun.COM 			/*
157811754SKacheong.Poon@Sun.COM 			 * Check if we need to detach or just close
157911754SKacheong.Poon@Sun.COM 			 * the instance.
158011754SKacheong.Poon@Sun.COM 			 */
158111754SKacheong.Poon@Sun.COM 			if (tcp->tcp_state <= TCPS_LISTEN)
158211754SKacheong.Poon@Sun.COM 				break;
158311754SKacheong.Poon@Sun.COM 		}
158411754SKacheong.Poon@Sun.COM 
158511754SKacheong.Poon@Sun.COM 		/*
158611754SKacheong.Poon@Sun.COM 		 * Make sure that no other thread will access the conn_rq of
158711754SKacheong.Poon@Sun.COM 		 * this instance (through lookups etc.) as conn_rq will go
158811754SKacheong.Poon@Sun.COM 		 * away shortly.
158911754SKacheong.Poon@Sun.COM 		 */
159011754SKacheong.Poon@Sun.COM 		tcp_acceptor_hash_remove(tcp);
159111754SKacheong.Poon@Sun.COM 
159211754SKacheong.Poon@Sun.COM 		mutex_enter(&tcp->tcp_non_sq_lock);
159311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_flow_stopped) {
159411754SKacheong.Poon@Sun.COM 			tcp_clrqfull(tcp);
159511754SKacheong.Poon@Sun.COM 		}
159611754SKacheong.Poon@Sun.COM 		mutex_exit(&tcp->tcp_non_sq_lock);
159711754SKacheong.Poon@Sun.COM 
159811754SKacheong.Poon@Sun.COM 		if (tcp->tcp_timer_tid != 0) {
159911754SKacheong.Poon@Sun.COM 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
160011754SKacheong.Poon@Sun.COM 			tcp->tcp_timer_tid = 0;
160111754SKacheong.Poon@Sun.COM 		}
160211754SKacheong.Poon@Sun.COM 		/*
160311754SKacheong.Poon@Sun.COM 		 * Need to cancel those timers which will not be used when
160411754SKacheong.Poon@Sun.COM 		 * TCP is detached.  This has to be done before the conn_wq
160511754SKacheong.Poon@Sun.COM 		 * is set to NULL.
160611754SKacheong.Poon@Sun.COM 		 */
160711754SKacheong.Poon@Sun.COM 		tcp_timers_stop(tcp);
160811754SKacheong.Poon@Sun.COM 
160911754SKacheong.Poon@Sun.COM 		tcp->tcp_detached = B_TRUE;
161011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
161111754SKacheong.Poon@Sun.COM 			tcp_time_wait_append(tcp);
161211754SKacheong.Poon@Sun.COM 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
161312643SAnders.Persson@Sun.COM 			ASSERT(connp->conn_ref >=
161412643SAnders.Persson@Sun.COM 			    (IPCL_IS_NONSTR(connp) ? 2 : 3));
161511754SKacheong.Poon@Sun.COM 			goto finish;
161611754SKacheong.Poon@Sun.COM 		}
161711754SKacheong.Poon@Sun.COM 
161811754SKacheong.Poon@Sun.COM 		/*
161911754SKacheong.Poon@Sun.COM 		 * If delta is zero the timer event wasn't executed and was
162011754SKacheong.Poon@Sun.COM 		 * successfully canceled. In this case we need to restart it
162111754SKacheong.Poon@Sun.COM 		 * with the minimal delta possible.
162211754SKacheong.Poon@Sun.COM 		 */
162311754SKacheong.Poon@Sun.COM 		if (delta >= 0)
162411754SKacheong.Poon@Sun.COM 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
162511754SKacheong.Poon@Sun.COM 			    delta ? delta : 1);
162611754SKacheong.Poon@Sun.COM 
162712643SAnders.Persson@Sun.COM 		ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3));
162811754SKacheong.Poon@Sun.COM 		goto finish;
162911754SKacheong.Poon@Sun.COM 	}
163011754SKacheong.Poon@Sun.COM 
163111754SKacheong.Poon@Sun.COM 	/* Detach did not complete. Still need to remove q from stream. */
163211754SKacheong.Poon@Sun.COM 	if (msg) {
163311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state == TCPS_ESTABLISHED ||
163411754SKacheong.Poon@Sun.COM 		    tcp->tcp_state == TCPS_CLOSE_WAIT)
163511754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpEstabResets);
163611754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state == TCPS_SYN_SENT ||
163711754SKacheong.Poon@Sun.COM 		    tcp->tcp_state == TCPS_SYN_RCVD)
163811754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpAttemptFails);
163911754SKacheong.Poon@Sun.COM 		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
164011754SKacheong.Poon@Sun.COM 	}
164111754SKacheong.Poon@Sun.COM 
164211754SKacheong.Poon@Sun.COM 	tcp_closei_local(tcp);
164311754SKacheong.Poon@Sun.COM 	CONN_DEC_REF(connp);
164412643SAnders.Persson@Sun.COM 	ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2));
164511754SKacheong.Poon@Sun.COM 
164611754SKacheong.Poon@Sun.COM finish:
164711754SKacheong.Poon@Sun.COM 	/*
164811754SKacheong.Poon@Sun.COM 	 * Don't change the queues in the case of a listener that has
164911754SKacheong.Poon@Sun.COM 	 * eagers in its q or q0. It could surprise the eagers.
165011754SKacheong.Poon@Sun.COM 	 * Instead wait for the eagers outside the squeue.
165112643SAnders.Persson@Sun.COM 	 *
165212643SAnders.Persson@Sun.COM 	 * For non-STREAMS sockets tcp_wait_for_eagers implies that
165312643SAnders.Persson@Sun.COM 	 * we should delay the su_closed upcall until all eagers have
165412643SAnders.Persson@Sun.COM 	 * dropped their references.
165511754SKacheong.Poon@Sun.COM 	 */
165611754SKacheong.Poon@Sun.COM 	if (!tcp->tcp_wait_for_eagers) {
165711754SKacheong.Poon@Sun.COM 		tcp->tcp_detached = B_TRUE;
165811754SKacheong.Poon@Sun.COM 		connp->conn_rq = NULL;
165911754SKacheong.Poon@Sun.COM 		connp->conn_wq = NULL;
166012643SAnders.Persson@Sun.COM 
166112643SAnders.Persson@Sun.COM 		/* non-STREAM socket, release the upper handle */
166212643SAnders.Persson@Sun.COM 		if (IPCL_IS_NONSTR(connp)) {
166312643SAnders.Persson@Sun.COM 			ASSERT(connp->conn_upper_handle != NULL);
166412643SAnders.Persson@Sun.COM 			(*connp->conn_upcalls->su_closed)
166512643SAnders.Persson@Sun.COM 			    (connp->conn_upper_handle);
166612643SAnders.Persson@Sun.COM 			connp->conn_upper_handle = NULL;
166712643SAnders.Persson@Sun.COM 			connp->conn_upcalls = NULL;
166812643SAnders.Persson@Sun.COM 		}
166911754SKacheong.Poon@Sun.COM 	}
167011754SKacheong.Poon@Sun.COM 
167111754SKacheong.Poon@Sun.COM 	/* Signal tcp_close() to finish closing. */
167212643SAnders.Persson@Sun.COM 	mutex_enter(&tcp->tcp_closelock);
167311754SKacheong.Poon@Sun.COM 	tcp->tcp_closed = 1;
167411754SKacheong.Poon@Sun.COM 	cv_signal(&tcp->tcp_closecv);
167511754SKacheong.Poon@Sun.COM 	mutex_exit(&tcp->tcp_closelock);
167611754SKacheong.Poon@Sun.COM }
167711754SKacheong.Poon@Sun.COM 
167811754SKacheong.Poon@Sun.COM /* ARGSUSED */
167911754SKacheong.Poon@Sun.COM void
tcp_shutdown_output(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)168011754SKacheong.Poon@Sun.COM tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
168111754SKacheong.Poon@Sun.COM {
168211754SKacheong.Poon@Sun.COM 	conn_t 	*connp = (conn_t *)arg;
168311754SKacheong.Poon@Sun.COM 	tcp_t	*tcp = connp->conn_tcp;
168411754SKacheong.Poon@Sun.COM 
168511754SKacheong.Poon@Sun.COM 	freemsg(mp);
168611754SKacheong.Poon@Sun.COM 
168711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_fused)
168811754SKacheong.Poon@Sun.COM 		tcp_unfuse(tcp);
168911754SKacheong.Poon@Sun.COM 
169011754SKacheong.Poon@Sun.COM 	if (tcp_xmit_end(tcp) != 0) {
169111754SKacheong.Poon@Sun.COM 		/*
169211754SKacheong.Poon@Sun.COM 		 * We were crossing FINs and got a reset from
169311754SKacheong.Poon@Sun.COM 		 * the other side. Just ignore it.
169411754SKacheong.Poon@Sun.COM 		 */
169511754SKacheong.Poon@Sun.COM 		if (connp->conn_debug) {
169611754SKacheong.Poon@Sun.COM 			(void) strlog(TCP_MOD_ID, 0, 1,
169711754SKacheong.Poon@Sun.COM 			    SL_ERROR|SL_TRACE,
169811754SKacheong.Poon@Sun.COM 			    "tcp_shutdown_output() out of state %s",
169911754SKacheong.Poon@Sun.COM 			    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
170011754SKacheong.Poon@Sun.COM 		}
170111754SKacheong.Poon@Sun.COM 	}
170211754SKacheong.Poon@Sun.COM }
170311754SKacheong.Poon@Sun.COM 
170411754SKacheong.Poon@Sun.COM #pragma inline(tcp_send_data)
170511754SKacheong.Poon@Sun.COM 
170611754SKacheong.Poon@Sun.COM void
tcp_send_data(tcp_t * tcp,mblk_t * mp)170711754SKacheong.Poon@Sun.COM tcp_send_data(tcp_t *tcp, mblk_t *mp)
170811754SKacheong.Poon@Sun.COM {
170911754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
171011754SKacheong.Poon@Sun.COM 
171111754SKacheong.Poon@Sun.COM 	/*
171211754SKacheong.Poon@Sun.COM 	 * Check here to avoid sending zero-copy message down to IP when
171311754SKacheong.Poon@Sun.COM 	 * ZEROCOPY capability has turned off. We only need to deal with
171411754SKacheong.Poon@Sun.COM 	 * the race condition between sockfs and the notification here.
171511754SKacheong.Poon@Sun.COM 	 * Since we have tried to backoff the tcp_xmit_head when turning
171611754SKacheong.Poon@Sun.COM 	 * zero-copy off and new messages in tcp_output(), we simply drop
171711754SKacheong.Poon@Sun.COM 	 * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean
171811754SKacheong.Poon@Sun.COM 	 * is not true.
171911754SKacheong.Poon@Sun.COM 	 */
172011754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
172111754SKacheong.Poon@Sun.COM 	    !tcp->tcp_xmit_zc_clean) {
172211754SKacheong.Poon@Sun.COM 		ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
172311754SKacheong.Poon@Sun.COM 		freemsg(mp);
172411754SKacheong.Poon@Sun.COM 		return;
172511754SKacheong.Poon@Sun.COM 	}
172611754SKacheong.Poon@Sun.COM 
172712507SAlan.Maguire@Sun.COM 	DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
172812507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, tcp,
172912507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_tcph_t *,
173012507SAlan.Maguire@Sun.COM 	    &mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]);
173112507SAlan.Maguire@Sun.COM 
173211754SKacheong.Poon@Sun.COM 	ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
173311754SKacheong.Poon@Sun.COM 	(void) conn_ip_output(mp, connp->conn_ixa);
173411754SKacheong.Poon@Sun.COM }
173511754SKacheong.Poon@Sun.COM 
173611754SKacheong.Poon@Sun.COM /* ARGSUSED2 */
173711754SKacheong.Poon@Sun.COM void
tcp_send_synack(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)173811754SKacheong.Poon@Sun.COM tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
173911754SKacheong.Poon@Sun.COM {
174011754SKacheong.Poon@Sun.COM 	conn_t	*econnp = (conn_t *)arg;
174111754SKacheong.Poon@Sun.COM 	tcp_t	*tcp = econnp->conn_tcp;
174212266SErik.Nordmark@Sun.COM 	ip_xmit_attr_t *ixa = econnp->conn_ixa;
174311754SKacheong.Poon@Sun.COM 
174411754SKacheong.Poon@Sun.COM 	/* Guard against a RST having blown it away while on the squeue */
174511754SKacheong.Poon@Sun.COM 	if (tcp->tcp_state == TCPS_CLOSED) {
174611754SKacheong.Poon@Sun.COM 		freemsg(mp);
174711754SKacheong.Poon@Sun.COM 		return;
174811754SKacheong.Poon@Sun.COM 	}
174911754SKacheong.Poon@Sun.COM 
175012266SErik.Nordmark@Sun.COM 	/*
175112266SErik.Nordmark@Sun.COM 	 * In the off-chance that the eager received and responded to
175212266SErik.Nordmark@Sun.COM 	 * some other packet while the SYN|ACK was queued, we recalculate
175312266SErik.Nordmark@Sun.COM 	 * the ixa_pktlen. It would be better to fix the SYN/accept
175412266SErik.Nordmark@Sun.COM 	 * multithreading scheme to avoid this complexity.
175512266SErik.Nordmark@Sun.COM 	 */
175612266SErik.Nordmark@Sun.COM 	ixa->ixa_pktlen = msgdsize(mp);
175712266SErik.Nordmark@Sun.COM 	(void) conn_ip_output(mp, ixa);
175811754SKacheong.Poon@Sun.COM }
175911754SKacheong.Poon@Sun.COM 
/*
 * tcp_send() is called by tcp_wput_data() and returns one of the following:
 *
 * -1 = failed allocation.
 *  0 = success; either the burst count was reached, or the usable send
 *      window is too small and we would rather wait until later before
 *      sending again.
 */
176711754SKacheong.Poon@Sun.COM static int
tcp_send(tcp_t * tcp,const int mss,const int total_hdr_len,const int tcp_hdr_len,const int num_sack_blk,int * usable,uint_t * snxt,int * tail_unsent,mblk_t ** xmit_tail,mblk_t * local_time)176811754SKacheong.Poon@Sun.COM tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
176911754SKacheong.Poon@Sun.COM     const int tcp_hdr_len, const int num_sack_blk, int *usable,
177011754SKacheong.Poon@Sun.COM     uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
177111754SKacheong.Poon@Sun.COM {
177211754SKacheong.Poon@Sun.COM 	int		num_burst_seg = tcp->tcp_snd_burst;
177311754SKacheong.Poon@Sun.COM 	int		num_lso_seg = 1;
177411754SKacheong.Poon@Sun.COM 	uint_t		lso_usable;
177511754SKacheong.Poon@Sun.COM 	boolean_t	do_lso_send = B_FALSE;
177611754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
177711754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
177811754SKacheong.Poon@Sun.COM 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
177911754SKacheong.Poon@Sun.COM 
178011754SKacheong.Poon@Sun.COM 	/*
178111754SKacheong.Poon@Sun.COM 	 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
178211754SKacheong.Poon@Sun.COM 	 * the underlying connection is LSO capable. Will check whether having
178311754SKacheong.Poon@Sun.COM 	 * enough available data to initiate LSO transmission in the for(){}
178411754SKacheong.Poon@Sun.COM 	 * loops.
178511754SKacheong.Poon@Sun.COM 	 */
178611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
178711754SKacheong.Poon@Sun.COM 		do_lso_send = B_TRUE;
178811754SKacheong.Poon@Sun.COM 
178911754SKacheong.Poon@Sun.COM 	for (;;) {
179011754SKacheong.Poon@Sun.COM 		struct datab	*db;
179111754SKacheong.Poon@Sun.COM 		tcpha_t		*tcpha;
179211754SKacheong.Poon@Sun.COM 		uint32_t	sum;
179311754SKacheong.Poon@Sun.COM 		mblk_t		*mp, *mp1;
179411754SKacheong.Poon@Sun.COM 		uchar_t		*rptr;
179511754SKacheong.Poon@Sun.COM 		int		len;
179611754SKacheong.Poon@Sun.COM 
179711754SKacheong.Poon@Sun.COM 		/*
179811754SKacheong.Poon@Sun.COM 		 * Burst count reached, return successfully.
179911754SKacheong.Poon@Sun.COM 		 */
180011754SKacheong.Poon@Sun.COM 		if (num_burst_seg == 0)
180111754SKacheong.Poon@Sun.COM 			break;
180211754SKacheong.Poon@Sun.COM 
180311754SKacheong.Poon@Sun.COM 		/*
180411754SKacheong.Poon@Sun.COM 		 * Calculate the maximum payload length we can send at one
180511754SKacheong.Poon@Sun.COM 		 * time.
180611754SKacheong.Poon@Sun.COM 		 */
180711754SKacheong.Poon@Sun.COM 		if (do_lso_send) {
			/*
			 * Check whether we are able to do LSO for the
			 * currently available data.
			 */
181211754SKacheong.Poon@Sun.COM 			if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
181311754SKacheong.Poon@Sun.COM 				lso_usable = MIN(tcp->tcp_lso_max, *usable);
181411754SKacheong.Poon@Sun.COM 				lso_usable = MIN(lso_usable,
181511754SKacheong.Poon@Sun.COM 				    num_burst_seg * mss);
181611754SKacheong.Poon@Sun.COM 
181711754SKacheong.Poon@Sun.COM 				num_lso_seg = lso_usable / mss;
181811754SKacheong.Poon@Sun.COM 				if (lso_usable % mss) {
181911754SKacheong.Poon@Sun.COM 					num_lso_seg++;
182011754SKacheong.Poon@Sun.COM 					tcp->tcp_last_sent_len = (ushort_t)
182111754SKacheong.Poon@Sun.COM 					    (lso_usable % mss);
182211754SKacheong.Poon@Sun.COM 				} else {
182311754SKacheong.Poon@Sun.COM 					tcp->tcp_last_sent_len = (ushort_t)mss;
182411754SKacheong.Poon@Sun.COM 				}
182511754SKacheong.Poon@Sun.COM 			} else {
182611754SKacheong.Poon@Sun.COM 				do_lso_send = B_FALSE;
182711754SKacheong.Poon@Sun.COM 				num_lso_seg = 1;
182811754SKacheong.Poon@Sun.COM 				lso_usable = mss;
182911754SKacheong.Poon@Sun.COM 			}
183011754SKacheong.Poon@Sun.COM 		}
183111754SKacheong.Poon@Sun.COM 
183211754SKacheong.Poon@Sun.COM 		ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
183311754SKacheong.Poon@Sun.COM #ifdef DEBUG
183411754SKacheong.Poon@Sun.COM 		DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
183511754SKacheong.Poon@Sun.COM 		    do_lso_send);
183611754SKacheong.Poon@Sun.COM #endif
183711754SKacheong.Poon@Sun.COM 		/*
183811754SKacheong.Poon@Sun.COM 		 * Adjust num_burst_seg here.
183911754SKacheong.Poon@Sun.COM 		 */
184011754SKacheong.Poon@Sun.COM 		num_burst_seg -= num_lso_seg;
184111754SKacheong.Poon@Sun.COM 
184211754SKacheong.Poon@Sun.COM 		len = mss;
184311754SKacheong.Poon@Sun.COM 		if (len > *usable) {
184411754SKacheong.Poon@Sun.COM 			ASSERT(do_lso_send == B_FALSE);
184511754SKacheong.Poon@Sun.COM 
184611754SKacheong.Poon@Sun.COM 			len = *usable;
184711754SKacheong.Poon@Sun.COM 			if (len <= 0) {
184811754SKacheong.Poon@Sun.COM 				/* Terminate the loop */
184911754SKacheong.Poon@Sun.COM 				break;	/* success; too small */
185011754SKacheong.Poon@Sun.COM 			}
185111754SKacheong.Poon@Sun.COM 			/*
185211754SKacheong.Poon@Sun.COM 			 * Sender silly-window avoidance.
185311754SKacheong.Poon@Sun.COM 			 * Ignore this if we are going to send a
185411754SKacheong.Poon@Sun.COM 			 * zero window probe out.
185511754SKacheong.Poon@Sun.COM 			 *
185611754SKacheong.Poon@Sun.COM 			 * TODO: force data into microscopic window?
185711754SKacheong.Poon@Sun.COM 			 *	==> (!pushed || (unsent > usable))
185811754SKacheong.Poon@Sun.COM 			 */
185911754SKacheong.Poon@Sun.COM 			if (len < (tcp->tcp_max_swnd >> 1) &&
186011754SKacheong.Poon@Sun.COM 			    (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
186111754SKacheong.Poon@Sun.COM 			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
186211754SKacheong.Poon@Sun.COM 			    len == 1) && (! tcp->tcp_zero_win_probe)) {
186311754SKacheong.Poon@Sun.COM 				/*
186411754SKacheong.Poon@Sun.COM 				 * If the retransmit timer is not running
186511754SKacheong.Poon@Sun.COM 				 * we start it so that we will retransmit
186611754SKacheong.Poon@Sun.COM 				 * in the case when the receiver has
186711754SKacheong.Poon@Sun.COM 				 * decremented the window.
186811754SKacheong.Poon@Sun.COM 				 */
186911754SKacheong.Poon@Sun.COM 				if (*snxt == tcp->tcp_snxt &&
187011754SKacheong.Poon@Sun.COM 				    *snxt == tcp->tcp_suna) {
187111754SKacheong.Poon@Sun.COM 					/*
187211754SKacheong.Poon@Sun.COM 					 * We are not supposed to send
187311754SKacheong.Poon@Sun.COM 					 * anything.  So let's wait a little
187411754SKacheong.Poon@Sun.COM 					 * bit longer before breaking SWS
187511754SKacheong.Poon@Sun.COM 					 * avoidance.
187611754SKacheong.Poon@Sun.COM 					 *
187711754SKacheong.Poon@Sun.COM 					 * What should the value be?
187811754SKacheong.Poon@Sun.COM 					 * Suggestion: MAX(init rexmit time,
187911754SKacheong.Poon@Sun.COM 					 * tcp->tcp_rto)
188011754SKacheong.Poon@Sun.COM 					 */
188111754SKacheong.Poon@Sun.COM 					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
188211754SKacheong.Poon@Sun.COM 				}
188311754SKacheong.Poon@Sun.COM 				break;	/* success; too small */
188411754SKacheong.Poon@Sun.COM 			}
188511754SKacheong.Poon@Sun.COM 		}
188611754SKacheong.Poon@Sun.COM 
188711754SKacheong.Poon@Sun.COM 		tcpha = tcp->tcp_tcpha;
188811754SKacheong.Poon@Sun.COM 
188911754SKacheong.Poon@Sun.COM 		/*
189011754SKacheong.Poon@Sun.COM 		 * The reason to adjust len here is that we need to set flags
189111754SKacheong.Poon@Sun.COM 		 * and calculate checksum.
189211754SKacheong.Poon@Sun.COM 		 */
189311754SKacheong.Poon@Sun.COM 		if (do_lso_send)
189411754SKacheong.Poon@Sun.COM 			len = lso_usable;
189511754SKacheong.Poon@Sun.COM 
189611754SKacheong.Poon@Sun.COM 		*usable -= len; /* Approximate - can be adjusted later */
189711754SKacheong.Poon@Sun.COM 		if (*usable > 0)
189811754SKacheong.Poon@Sun.COM 			tcpha->tha_flags = TH_ACK;
189911754SKacheong.Poon@Sun.COM 		else
190011754SKacheong.Poon@Sun.COM 			tcpha->tha_flags = (TH_ACK | TH_PUSH);
190111754SKacheong.Poon@Sun.COM 
		/*
		 * Prime pump for IP's checksumming on our behalf.
		 * Include the adjustment for a source route if any.
		 * In case of LSO, the partial pseudo-header checksum should
		 * exclude the TCP length, so zero tha_sum before IP calculates
		 * the pseudo-header checksum for partial checksum offload.
		 */
190911754SKacheong.Poon@Sun.COM 		if (do_lso_send) {
191011754SKacheong.Poon@Sun.COM 			sum = 0;
191111754SKacheong.Poon@Sun.COM 		} else {
191211754SKacheong.Poon@Sun.COM 			sum = len + tcp_hdr_len + connp->conn_sum;
191311754SKacheong.Poon@Sun.COM 			sum = (sum >> 16) + (sum & 0xFFFF);
191411754SKacheong.Poon@Sun.COM 		}
191511754SKacheong.Poon@Sun.COM 		tcpha->tha_sum = htons(sum);
191611754SKacheong.Poon@Sun.COM 		tcpha->tha_seq = htonl(*snxt);
191711754SKacheong.Poon@Sun.COM 
191811754SKacheong.Poon@Sun.COM 		/*
191911754SKacheong.Poon@Sun.COM 		 * Branch off to tcp_xmit_mp() if any of the VALID bits is
192011754SKacheong.Poon@Sun.COM 		 * set.  For the case when TCP_FSS_VALID is the only valid
192111754SKacheong.Poon@Sun.COM 		 * bit (normal active close), branch off only when we think
192211754SKacheong.Poon@Sun.COM 		 * that the FIN flag needs to be set.  Note for this case,
192311754SKacheong.Poon@Sun.COM 		 * that (snxt + len) may not reflect the actual seg_len,
192411754SKacheong.Poon@Sun.COM 		 * as len may be further reduced in tcp_xmit_mp().  If len
192511754SKacheong.Poon@Sun.COM 		 * gets modified, we will end up here again.
192611754SKacheong.Poon@Sun.COM 		 */
192711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_valid_bits != 0 &&
192811754SKacheong.Poon@Sun.COM 		    (tcp->tcp_valid_bits != TCP_FSS_VALID ||
192911754SKacheong.Poon@Sun.COM 		    ((*snxt + len) == tcp->tcp_fss))) {
193011754SKacheong.Poon@Sun.COM 			uchar_t		*prev_rptr;
193111754SKacheong.Poon@Sun.COM 			uint32_t	prev_snxt = tcp->tcp_snxt;
193211754SKacheong.Poon@Sun.COM 
193311754SKacheong.Poon@Sun.COM 			if (*tail_unsent == 0) {
193411754SKacheong.Poon@Sun.COM 				ASSERT((*xmit_tail)->b_cont != NULL);
193511754SKacheong.Poon@Sun.COM 				*xmit_tail = (*xmit_tail)->b_cont;
193611754SKacheong.Poon@Sun.COM 				prev_rptr = (*xmit_tail)->b_rptr;
193711754SKacheong.Poon@Sun.COM 				*tail_unsent = (int)((*xmit_tail)->b_wptr -
193811754SKacheong.Poon@Sun.COM 				    (*xmit_tail)->b_rptr);
193911754SKacheong.Poon@Sun.COM 			} else {
194011754SKacheong.Poon@Sun.COM 				prev_rptr = (*xmit_tail)->b_rptr;
194111754SKacheong.Poon@Sun.COM 				(*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
194211754SKacheong.Poon@Sun.COM 				    *tail_unsent;
194311754SKacheong.Poon@Sun.COM 			}
194411754SKacheong.Poon@Sun.COM 			mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
194511754SKacheong.Poon@Sun.COM 			    *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
194611754SKacheong.Poon@Sun.COM 			/* Restore tcp_snxt so we get amount sent right. */
194711754SKacheong.Poon@Sun.COM 			tcp->tcp_snxt = prev_snxt;
194811754SKacheong.Poon@Sun.COM 			if (prev_rptr == (*xmit_tail)->b_rptr) {
194911754SKacheong.Poon@Sun.COM 				/*
195011754SKacheong.Poon@Sun.COM 				 * If the previous timestamp is still in use,
195111754SKacheong.Poon@Sun.COM 				 * don't stomp on it.
195211754SKacheong.Poon@Sun.COM 				 */
195311754SKacheong.Poon@Sun.COM 				if ((*xmit_tail)->b_next == NULL) {
195411754SKacheong.Poon@Sun.COM 					(*xmit_tail)->b_prev = local_time;
195511754SKacheong.Poon@Sun.COM 					(*xmit_tail)->b_next =
195611754SKacheong.Poon@Sun.COM 					    (mblk_t *)(uintptr_t)(*snxt);
195711754SKacheong.Poon@Sun.COM 				}
195811754SKacheong.Poon@Sun.COM 			} else
195911754SKacheong.Poon@Sun.COM 				(*xmit_tail)->b_rptr = prev_rptr;
196011754SKacheong.Poon@Sun.COM 
196111754SKacheong.Poon@Sun.COM 			if (mp == NULL) {
196211754SKacheong.Poon@Sun.COM 				return (-1);
196311754SKacheong.Poon@Sun.COM 			}
196411754SKacheong.Poon@Sun.COM 			mp1 = mp->b_cont;
196511754SKacheong.Poon@Sun.COM 
196611754SKacheong.Poon@Sun.COM 			if (len <= mss) /* LSO is unusable (!do_lso_send) */
196711754SKacheong.Poon@Sun.COM 				tcp->tcp_last_sent_len = (ushort_t)len;
196811754SKacheong.Poon@Sun.COM 			while (mp1->b_cont) {
196911754SKacheong.Poon@Sun.COM 				*xmit_tail = (*xmit_tail)->b_cont;
197011754SKacheong.Poon@Sun.COM 				(*xmit_tail)->b_prev = local_time;
197111754SKacheong.Poon@Sun.COM 				(*xmit_tail)->b_next =
197211754SKacheong.Poon@Sun.COM 				    (mblk_t *)(uintptr_t)(*snxt);
197311754SKacheong.Poon@Sun.COM 				mp1 = mp1->b_cont;
197411754SKacheong.Poon@Sun.COM 			}
197511754SKacheong.Poon@Sun.COM 			*snxt += len;
197611754SKacheong.Poon@Sun.COM 			*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
197711754SKacheong.Poon@Sun.COM 			BUMP_LOCAL(tcp->tcp_obsegs);
197811754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
197911754SKacheong.Poon@Sun.COM 			TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
198011754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, mp);
198111754SKacheong.Poon@Sun.COM 			continue;
198211754SKacheong.Poon@Sun.COM 		}
198311754SKacheong.Poon@Sun.COM 
198411754SKacheong.Poon@Sun.COM 		*snxt += len;	/* Adjust later if we don't send all of len */
198511754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
198611754SKacheong.Poon@Sun.COM 		TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
198711754SKacheong.Poon@Sun.COM 
198811754SKacheong.Poon@Sun.COM 		if (*tail_unsent) {
198911754SKacheong.Poon@Sun.COM 			/* Are the bytes above us in flight? */
199011754SKacheong.Poon@Sun.COM 			rptr = (*xmit_tail)->b_wptr - *tail_unsent;
199111754SKacheong.Poon@Sun.COM 			if (rptr != (*xmit_tail)->b_rptr) {
199211754SKacheong.Poon@Sun.COM 				*tail_unsent -= len;
199311754SKacheong.Poon@Sun.COM 				if (len <= mss) /* LSO is unusable */
199411754SKacheong.Poon@Sun.COM 					tcp->tcp_last_sent_len = (ushort_t)len;
199511754SKacheong.Poon@Sun.COM 				len += total_hdr_len;
199611754SKacheong.Poon@Sun.COM 				ixa->ixa_pktlen = len;
199711754SKacheong.Poon@Sun.COM 
199811754SKacheong.Poon@Sun.COM 				if (ixa->ixa_flags & IXAF_IS_IPV4) {
199911754SKacheong.Poon@Sun.COM 					tcp->tcp_ipha->ipha_length = htons(len);
200011754SKacheong.Poon@Sun.COM 				} else {
200111754SKacheong.Poon@Sun.COM 					tcp->tcp_ip6h->ip6_plen =
200211754SKacheong.Poon@Sun.COM 					    htons(len - IPV6_HDR_LEN);
200311754SKacheong.Poon@Sun.COM 				}
200411754SKacheong.Poon@Sun.COM 
200511754SKacheong.Poon@Sun.COM 				mp = dupb(*xmit_tail);
200611754SKacheong.Poon@Sun.COM 				if (mp == NULL) {
200711754SKacheong.Poon@Sun.COM 					return (-1);	/* out_of_mem */
200811754SKacheong.Poon@Sun.COM 				}
200911754SKacheong.Poon@Sun.COM 				mp->b_rptr = rptr;
201011754SKacheong.Poon@Sun.COM 				/*
201111754SKacheong.Poon@Sun.COM 				 * If the old timestamp is no longer in use,
201211754SKacheong.Poon@Sun.COM 				 * sample a new timestamp now.
201311754SKacheong.Poon@Sun.COM 				 */
201411754SKacheong.Poon@Sun.COM 				if ((*xmit_tail)->b_next == NULL) {
201511754SKacheong.Poon@Sun.COM 					(*xmit_tail)->b_prev = local_time;
201611754SKacheong.Poon@Sun.COM 					(*xmit_tail)->b_next =
201711754SKacheong.Poon@Sun.COM 					    (mblk_t *)(uintptr_t)(*snxt-len);
201811754SKacheong.Poon@Sun.COM 				}
201911754SKacheong.Poon@Sun.COM 				goto must_alloc;
202011754SKacheong.Poon@Sun.COM 			}
202111754SKacheong.Poon@Sun.COM 		} else {
202211754SKacheong.Poon@Sun.COM 			*xmit_tail = (*xmit_tail)->b_cont;
202311754SKacheong.Poon@Sun.COM 			ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
202411754SKacheong.Poon@Sun.COM 			    (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
202511754SKacheong.Poon@Sun.COM 			*tail_unsent = (int)((*xmit_tail)->b_wptr -
202611754SKacheong.Poon@Sun.COM 			    (*xmit_tail)->b_rptr);
202711754SKacheong.Poon@Sun.COM 		}
202811754SKacheong.Poon@Sun.COM 
202911754SKacheong.Poon@Sun.COM 		(*xmit_tail)->b_prev = local_time;
203011754SKacheong.Poon@Sun.COM 		(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);
203111754SKacheong.Poon@Sun.COM 
203211754SKacheong.Poon@Sun.COM 		*tail_unsent -= len;
203311754SKacheong.Poon@Sun.COM 		if (len <= mss) /* LSO is unusable (!do_lso_send) */
203411754SKacheong.Poon@Sun.COM 			tcp->tcp_last_sent_len = (ushort_t)len;
203511754SKacheong.Poon@Sun.COM 
203611754SKacheong.Poon@Sun.COM 		len += total_hdr_len;
203711754SKacheong.Poon@Sun.COM 		ixa->ixa_pktlen = len;
203811754SKacheong.Poon@Sun.COM 
203911754SKacheong.Poon@Sun.COM 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
204011754SKacheong.Poon@Sun.COM 			tcp->tcp_ipha->ipha_length = htons(len);
204111754SKacheong.Poon@Sun.COM 		} else {
204211754SKacheong.Poon@Sun.COM 			tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
204311754SKacheong.Poon@Sun.COM 		}
204411754SKacheong.Poon@Sun.COM 
204511754SKacheong.Poon@Sun.COM 		mp = dupb(*xmit_tail);
204611754SKacheong.Poon@Sun.COM 		if (mp == NULL) {
204711754SKacheong.Poon@Sun.COM 			return (-1);	/* out_of_mem */
204811754SKacheong.Poon@Sun.COM 		}
204911754SKacheong.Poon@Sun.COM 
205011754SKacheong.Poon@Sun.COM 		len = total_hdr_len;
205111754SKacheong.Poon@Sun.COM 		/*
205211754SKacheong.Poon@Sun.COM 		 * There are four reasons to allocate a new hdr mblk:
205311754SKacheong.Poon@Sun.COM 		 *  1) The bytes above us are in use by another packet
205411754SKacheong.Poon@Sun.COM 		 *  2) We don't have good alignment
205511754SKacheong.Poon@Sun.COM 		 *  3) The mblk is being shared
205611754SKacheong.Poon@Sun.COM 		 *  4) We don't have enough room for a header
205711754SKacheong.Poon@Sun.COM 		 */
205811754SKacheong.Poon@Sun.COM 		rptr = mp->b_rptr - len;
205911754SKacheong.Poon@Sun.COM 		if (!OK_32PTR(rptr) ||
206011754SKacheong.Poon@Sun.COM 		    ((db = mp->b_datap), db->db_ref != 2) ||
206111754SKacheong.Poon@Sun.COM 		    rptr < db->db_base) {
206211754SKacheong.Poon@Sun.COM 			/* NOTE: we assume allocb returns an OK_32PTR */
206311754SKacheong.Poon@Sun.COM 
206411754SKacheong.Poon@Sun.COM 		must_alloc:;
206511754SKacheong.Poon@Sun.COM 			mp1 = allocb(connp->conn_ht_iphc_allocated +
206611754SKacheong.Poon@Sun.COM 			    tcps->tcps_wroff_xtra, BPRI_MED);
206711754SKacheong.Poon@Sun.COM 			if (mp1 == NULL) {
206811754SKacheong.Poon@Sun.COM 				freemsg(mp);
206911754SKacheong.Poon@Sun.COM 				return (-1);	/* out_of_mem */
207011754SKacheong.Poon@Sun.COM 			}
207111754SKacheong.Poon@Sun.COM 			mp1->b_cont = mp;
207211754SKacheong.Poon@Sun.COM 			mp = mp1;
207311754SKacheong.Poon@Sun.COM 			/* Leave room for Link Level header */
207411754SKacheong.Poon@Sun.COM 			len = total_hdr_len;
207511754SKacheong.Poon@Sun.COM 			rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
207611754SKacheong.Poon@Sun.COM 			mp->b_wptr = &rptr[len];
207711754SKacheong.Poon@Sun.COM 		}
207811754SKacheong.Poon@Sun.COM 
207911754SKacheong.Poon@Sun.COM 		/*
208011754SKacheong.Poon@Sun.COM 		 * Fill in the header using the template header, and add
208111754SKacheong.Poon@Sun.COM 		 * options such as time-stamp, ECN and/or SACK, as needed.
208211754SKacheong.Poon@Sun.COM 		 */
208311754SKacheong.Poon@Sun.COM 		tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
208411754SKacheong.Poon@Sun.COM 
208511754SKacheong.Poon@Sun.COM 		mp->b_rptr = rptr;
208611754SKacheong.Poon@Sun.COM 
208711754SKacheong.Poon@Sun.COM 		if (*tail_unsent) {
208811754SKacheong.Poon@Sun.COM 			int spill = *tail_unsent;
208911754SKacheong.Poon@Sun.COM 
209011754SKacheong.Poon@Sun.COM 			mp1 = mp->b_cont;
209111754SKacheong.Poon@Sun.COM 			if (mp1 == NULL)
209211754SKacheong.Poon@Sun.COM 				mp1 = mp;
209311754SKacheong.Poon@Sun.COM 
209411754SKacheong.Poon@Sun.COM 			/*
209511754SKacheong.Poon@Sun.COM 			 * If we're a little short, tack on more mblks until
209611754SKacheong.Poon@Sun.COM 			 * there is no more spillover.
209711754SKacheong.Poon@Sun.COM 			 */
209811754SKacheong.Poon@Sun.COM 			while (spill < 0) {
209911754SKacheong.Poon@Sun.COM 				mblk_t *nmp;
210011754SKacheong.Poon@Sun.COM 				int nmpsz;
210111754SKacheong.Poon@Sun.COM 
210211754SKacheong.Poon@Sun.COM 				nmp = (*xmit_tail)->b_cont;
210311754SKacheong.Poon@Sun.COM 				nmpsz = MBLKL(nmp);
210411754SKacheong.Poon@Sun.COM 
210511754SKacheong.Poon@Sun.COM 				/*
210611754SKacheong.Poon@Sun.COM 				 * Excess data in mblk; can we split it?
210711754SKacheong.Poon@Sun.COM 				 * If LSO is enabled for the connection,
210811754SKacheong.Poon@Sun.COM 				 * keep on splitting as this is a transient
210911754SKacheong.Poon@Sun.COM 				 * send path.
211011754SKacheong.Poon@Sun.COM 				 */
211111754SKacheong.Poon@Sun.COM 				if (!do_lso_send && (spill + nmpsz > 0)) {
211211754SKacheong.Poon@Sun.COM 					/*
211311754SKacheong.Poon@Sun.COM 					 * Don't split if stream head was
211411754SKacheong.Poon@Sun.COM 					 * told to break up larger writes
211511754SKacheong.Poon@Sun.COM 					 * into smaller ones.
211611754SKacheong.Poon@Sun.COM 					 */
211711754SKacheong.Poon@Sun.COM 					if (tcp->tcp_maxpsz_multiplier > 0)
211811754SKacheong.Poon@Sun.COM 						break;
211911754SKacheong.Poon@Sun.COM 
212011754SKacheong.Poon@Sun.COM 					/*
212111754SKacheong.Poon@Sun.COM 					 * Next mblk is less than SMSS/2
212211754SKacheong.Poon@Sun.COM 					 * rounded up to nearest 64-byte;
212311754SKacheong.Poon@Sun.COM 					 * let it get sent as part of the
212411754SKacheong.Poon@Sun.COM 					 * next segment.
212511754SKacheong.Poon@Sun.COM 					 */
212611754SKacheong.Poon@Sun.COM 					if (tcp->tcp_localnet &&
212711754SKacheong.Poon@Sun.COM 					    !tcp->tcp_cork &&
212811754SKacheong.Poon@Sun.COM 					    (nmpsz < roundup((mss >> 1), 64)))
212911754SKacheong.Poon@Sun.COM 						break;
213011754SKacheong.Poon@Sun.COM 				}
213111754SKacheong.Poon@Sun.COM 
213211754SKacheong.Poon@Sun.COM 				*xmit_tail = nmp;
213311754SKacheong.Poon@Sun.COM 				ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX);
213411754SKacheong.Poon@Sun.COM 				/* Stash for rtt use later */
213511754SKacheong.Poon@Sun.COM 				(*xmit_tail)->b_prev = local_time;
213611754SKacheong.Poon@Sun.COM 				(*xmit_tail)->b_next =
213711754SKacheong.Poon@Sun.COM 				    (mblk_t *)(uintptr_t)(*snxt - len);
213811754SKacheong.Poon@Sun.COM 				mp1->b_cont = dupb(*xmit_tail);
213911754SKacheong.Poon@Sun.COM 				mp1 = mp1->b_cont;
214011754SKacheong.Poon@Sun.COM 
214111754SKacheong.Poon@Sun.COM 				spill += nmpsz;
214211754SKacheong.Poon@Sun.COM 				if (mp1 == NULL) {
214311754SKacheong.Poon@Sun.COM 					*tail_unsent = spill;
214411754SKacheong.Poon@Sun.COM 					freemsg(mp);
214511754SKacheong.Poon@Sun.COM 					return (-1);	/* out_of_mem */
214611754SKacheong.Poon@Sun.COM 				}
214711754SKacheong.Poon@Sun.COM 			}
214811754SKacheong.Poon@Sun.COM 
214911754SKacheong.Poon@Sun.COM 			/* Trim back any surplus on the last mblk */
215011754SKacheong.Poon@Sun.COM 			if (spill >= 0) {
215111754SKacheong.Poon@Sun.COM 				mp1->b_wptr -= spill;
215211754SKacheong.Poon@Sun.COM 				*tail_unsent = spill;
215311754SKacheong.Poon@Sun.COM 			} else {
215411754SKacheong.Poon@Sun.COM 				/*
215511754SKacheong.Poon@Sun.COM 				 * We did not send everything we could in
215611754SKacheong.Poon@Sun.COM 				 * order to remain within the b_cont limit.
215711754SKacheong.Poon@Sun.COM 				 */
215811754SKacheong.Poon@Sun.COM 				*usable -= spill;
215911754SKacheong.Poon@Sun.COM 				*snxt += spill;
216011754SKacheong.Poon@Sun.COM 				tcp->tcp_last_sent_len += spill;
216111754SKacheong.Poon@Sun.COM 				TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
216211754SKacheong.Poon@Sun.COM 				/*
216311754SKacheong.Poon@Sun.COM 				 * Adjust the checksum
216411754SKacheong.Poon@Sun.COM 				 */
216511754SKacheong.Poon@Sun.COM 				tcpha = (tcpha_t *)(rptr +
216611754SKacheong.Poon@Sun.COM 				    ixa->ixa_ip_hdr_length);
216711754SKacheong.Poon@Sun.COM 				sum += spill;
216811754SKacheong.Poon@Sun.COM 				sum = (sum >> 16) + (sum & 0xFFFF);
216911754SKacheong.Poon@Sun.COM 				tcpha->tha_sum = htons(sum);
217011754SKacheong.Poon@Sun.COM 				if (connp->conn_ipversion == IPV4_VERSION) {
217111754SKacheong.Poon@Sun.COM 					sum = ntohs(
217211754SKacheong.Poon@Sun.COM 					    ((ipha_t *)rptr)->ipha_length) +
217311754SKacheong.Poon@Sun.COM 					    spill;
217411754SKacheong.Poon@Sun.COM 					((ipha_t *)rptr)->ipha_length =
217511754SKacheong.Poon@Sun.COM 					    htons(sum);
217611754SKacheong.Poon@Sun.COM 				} else {
217711754SKacheong.Poon@Sun.COM 					sum = ntohs(
217811754SKacheong.Poon@Sun.COM 					    ((ip6_t *)rptr)->ip6_plen) +
217911754SKacheong.Poon@Sun.COM 					    spill;
218011754SKacheong.Poon@Sun.COM 					((ip6_t *)rptr)->ip6_plen =
218111754SKacheong.Poon@Sun.COM 					    htons(sum);
218211754SKacheong.Poon@Sun.COM 				}
218311754SKacheong.Poon@Sun.COM 				ixa->ixa_pktlen += spill;
218411754SKacheong.Poon@Sun.COM 				*tail_unsent = 0;
218511754SKacheong.Poon@Sun.COM 			}
218611754SKacheong.Poon@Sun.COM 		}
218711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ip_forward_progress) {
218811754SKacheong.Poon@Sun.COM 			tcp->tcp_ip_forward_progress = B_FALSE;
218911754SKacheong.Poon@Sun.COM 			ixa->ixa_flags |= IXAF_REACH_CONF;
219011754SKacheong.Poon@Sun.COM 		} else {
219111754SKacheong.Poon@Sun.COM 			ixa->ixa_flags &= ~IXAF_REACH_CONF;
219211754SKacheong.Poon@Sun.COM 		}
219311754SKacheong.Poon@Sun.COM 
219411754SKacheong.Poon@Sun.COM 		if (do_lso_send) {
219511754SKacheong.Poon@Sun.COM 			/* Append LSO information to the mp. */
219611754SKacheong.Poon@Sun.COM 			lso_info_set(mp, mss, HW_LSO);
219711754SKacheong.Poon@Sun.COM 			ixa->ixa_fragsize = IP_MAXPACKET;
219811754SKacheong.Poon@Sun.COM 			ixa->ixa_extra_ident = num_lso_seg - 1;
219911754SKacheong.Poon@Sun.COM 
220011754SKacheong.Poon@Sun.COM 			DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
220111754SKacheong.Poon@Sun.COM 			    boolean_t, B_TRUE);
220211754SKacheong.Poon@Sun.COM 
220311754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, mp);
220411754SKacheong.Poon@Sun.COM 
220511754SKacheong.Poon@Sun.COM 			/*
220611754SKacheong.Poon@Sun.COM 			 * Restore values of ixa_fragsize and ixa_extra_ident.
220711754SKacheong.Poon@Sun.COM 			 */
220811754SKacheong.Poon@Sun.COM 			ixa->ixa_fragsize = ixa->ixa_pmtu;
220911754SKacheong.Poon@Sun.COM 			ixa->ixa_extra_ident = 0;
221011754SKacheong.Poon@Sun.COM 			tcp->tcp_obsegs += num_lso_seg;
221111754SKacheong.Poon@Sun.COM 			TCP_STAT(tcps, tcp_lso_times);
221211754SKacheong.Poon@Sun.COM 			TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
221311754SKacheong.Poon@Sun.COM 		} else {
221411754SKacheong.Poon@Sun.COM 			/*
221511754SKacheong.Poon@Sun.COM 			 * Make sure to clean up LSO information. Wherever a
221611754SKacheong.Poon@Sun.COM 			 * new mp uses the prepended header room after dupb(),
221711754SKacheong.Poon@Sun.COM 			 * lso_info_cleanup() should be called.
221811754SKacheong.Poon@Sun.COM 			 */
221911754SKacheong.Poon@Sun.COM 			lso_info_cleanup(mp);
222011754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, mp);
222111754SKacheong.Poon@Sun.COM 			BUMP_LOCAL(tcp->tcp_obsegs);
222211754SKacheong.Poon@Sun.COM 		}
222311754SKacheong.Poon@Sun.COM 	}
222411754SKacheong.Poon@Sun.COM 
222511754SKacheong.Poon@Sun.COM 	return (0);
222611754SKacheong.Poon@Sun.COM }
222711754SKacheong.Poon@Sun.COM 
222811754SKacheong.Poon@Sun.COM /*
222911754SKacheong.Poon@Sun.COM  * Initiate closedown sequence on an active connection.  (May be called as
223011754SKacheong.Poon@Sun.COM  * writer.)  Return value zero for OK return, non-zero for error return.
223111754SKacheong.Poon@Sun.COM  */
223211754SKacheong.Poon@Sun.COM static int
tcp_xmit_end(tcp_t * tcp)223311754SKacheong.Poon@Sun.COM tcp_xmit_end(tcp_t *tcp)
223411754SKacheong.Poon@Sun.COM {
223511754SKacheong.Poon@Sun.COM 	mblk_t		*mp;
223611754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
223711754SKacheong.Poon@Sun.COM 	iulp_t		uinfo;
223811754SKacheong.Poon@Sun.COM 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
223911754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
224011754SKacheong.Poon@Sun.COM 
224111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_state < TCPS_SYN_RCVD ||
224211754SKacheong.Poon@Sun.COM 	    tcp->tcp_state > TCPS_CLOSE_WAIT) {
224311754SKacheong.Poon@Sun.COM 		/*
224411754SKacheong.Poon@Sun.COM 		 * Invalid state, only states TCPS_SYN_RCVD,
224511754SKacheong.Poon@Sun.COM 		 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
224611754SKacheong.Poon@Sun.COM 		 */
224711754SKacheong.Poon@Sun.COM 		return (-1);
224811754SKacheong.Poon@Sun.COM 	}
224911754SKacheong.Poon@Sun.COM 
225011754SKacheong.Poon@Sun.COM 	tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
225111754SKacheong.Poon@Sun.COM 	tcp->tcp_valid_bits |= TCP_FSS_VALID;
225211754SKacheong.Poon@Sun.COM 	/*
225311754SKacheong.Poon@Sun.COM 	 * If there is nothing more unsent, send the FIN now.
225411754SKacheong.Poon@Sun.COM 	 * Otherwise, it will go out with the last segment.
225511754SKacheong.Poon@Sun.COM 	 */
225611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_unsent == 0) {
225711754SKacheong.Poon@Sun.COM 		mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
225811754SKacheong.Poon@Sun.COM 		    tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
225911754SKacheong.Poon@Sun.COM 
226011754SKacheong.Poon@Sun.COM 		if (mp) {
226111754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, mp);
226211754SKacheong.Poon@Sun.COM 		} else {
226311754SKacheong.Poon@Sun.COM 			/*
226411754SKacheong.Poon@Sun.COM 			 * Couldn't allocate msg.  Pretend we got it out.
226511754SKacheong.Poon@Sun.COM 			 * Wait for rexmit timeout.
226611754SKacheong.Poon@Sun.COM 			 */
226711754SKacheong.Poon@Sun.COM 			tcp->tcp_snxt = tcp->tcp_fss + 1;
226811754SKacheong.Poon@Sun.COM 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
226911754SKacheong.Poon@Sun.COM 		}
227011754SKacheong.Poon@Sun.COM 
227111754SKacheong.Poon@Sun.COM 		/*
227211754SKacheong.Poon@Sun.COM 		 * If needed, update tcp_rexmit_snxt as tcp_snxt is
227311754SKacheong.Poon@Sun.COM 		 * changed.
227411754SKacheong.Poon@Sun.COM 		 */
227511754SKacheong.Poon@Sun.COM 		if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
227611754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
227711754SKacheong.Poon@Sun.COM 		}
227811754SKacheong.Poon@Sun.COM 	} else {
227911754SKacheong.Poon@Sun.COM 		/*
228011754SKacheong.Poon@Sun.COM 		 * If tcp->tcp_cork is set, then the data will not get sent,
228111754SKacheong.Poon@Sun.COM 		 * so we have to check that and unset it first.
228211754SKacheong.Poon@Sun.COM 		 */
228311754SKacheong.Poon@Sun.COM 		if (tcp->tcp_cork)
228411754SKacheong.Poon@Sun.COM 			tcp->tcp_cork = B_FALSE;
228511754SKacheong.Poon@Sun.COM 		tcp_wput_data(tcp, NULL, B_FALSE);
228611754SKacheong.Poon@Sun.COM 	}
228711754SKacheong.Poon@Sun.COM 
228811754SKacheong.Poon@Sun.COM 	/*
228911754SKacheong.Poon@Sun.COM 	 * If TCP does not get enough samples of RTT or tcp_rtt_updates
229011754SKacheong.Poon@Sun.COM 	 * is 0, don't update the cache.
229111754SKacheong.Poon@Sun.COM 	 */
229211754SKacheong.Poon@Sun.COM 	if (tcps->tcps_rtt_updates == 0 ||
229311754SKacheong.Poon@Sun.COM 	    tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
229411754SKacheong.Poon@Sun.COM 		return (0);
229511754SKacheong.Poon@Sun.COM 
229611754SKacheong.Poon@Sun.COM 	/*
229711754SKacheong.Poon@Sun.COM 	 * We do not have a good algorithm to update ssthresh at this time.
229811754SKacheong.Poon@Sun.COM 	 * So don't do any update.
229911754SKacheong.Poon@Sun.COM 	 */
230011754SKacheong.Poon@Sun.COM 	bzero(&uinfo, sizeof (uinfo));
230111754SKacheong.Poon@Sun.COM 	uinfo.iulp_rtt = tcp->tcp_rtt_sa;
230211754SKacheong.Poon@Sun.COM 	uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
230311754SKacheong.Poon@Sun.COM 
230411754SKacheong.Poon@Sun.COM 	/*
230511754SKacheong.Poon@Sun.COM 	 * Note that uinfo is kept for conn_faddr in the DCE. Could update even
230611754SKacheong.Poon@Sun.COM 	 * if source routed but we don't.
230711754SKacheong.Poon@Sun.COM 	 */
230811754SKacheong.Poon@Sun.COM 	if (connp->conn_ipversion == IPV4_VERSION) {
230911754SKacheong.Poon@Sun.COM 		if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
231011754SKacheong.Poon@Sun.COM 			return (0);
231111754SKacheong.Poon@Sun.COM 		}
231211754SKacheong.Poon@Sun.COM 		(void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
231311754SKacheong.Poon@Sun.COM 	} else {
231411754SKacheong.Poon@Sun.COM 		uint_t ifindex;
231511754SKacheong.Poon@Sun.COM 
231611754SKacheong.Poon@Sun.COM 		if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
231711754SKacheong.Poon@Sun.COM 		    &tcp->tcp_ip6h->ip6_dst))) {
231811754SKacheong.Poon@Sun.COM 			return (0);
231911754SKacheong.Poon@Sun.COM 		}
232011754SKacheong.Poon@Sun.COM 		ifindex = 0;
232111754SKacheong.Poon@Sun.COM 		if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
232211754SKacheong.Poon@Sun.COM 			ip_xmit_attr_t *ixa = connp->conn_ixa;
232311754SKacheong.Poon@Sun.COM 
232411754SKacheong.Poon@Sun.COM 			/*
232511754SKacheong.Poon@Sun.COM 			 * If we are going to create a DCE we'd better have
232611754SKacheong.Poon@Sun.COM 			 * an ifindex
232711754SKacheong.Poon@Sun.COM 			 */
232811754SKacheong.Poon@Sun.COM 			if (ixa->ixa_nce != NULL) {
232911754SKacheong.Poon@Sun.COM 				ifindex = ixa->ixa_nce->nce_common->ncec_ill->
233011754SKacheong.Poon@Sun.COM 				    ill_phyint->phyint_ifindex;
233111754SKacheong.Poon@Sun.COM 			} else {
233211754SKacheong.Poon@Sun.COM 				return (0);
233311754SKacheong.Poon@Sun.COM 			}
233411754SKacheong.Poon@Sun.COM 		}
233511754SKacheong.Poon@Sun.COM 
233611754SKacheong.Poon@Sun.COM 		(void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo,
233711754SKacheong.Poon@Sun.COM 		    ipst);
233811754SKacheong.Poon@Sun.COM 	}
233911754SKacheong.Poon@Sun.COM 	return (0);
234011754SKacheong.Poon@Sun.COM }
234111754SKacheong.Poon@Sun.COM 
234211754SKacheong.Poon@Sun.COM /*
234311754SKacheong.Poon@Sun.COM  * Send out a control packet on the tcp connection specified.  This routine
234411754SKacheong.Poon@Sun.COM  * is typically called where we need a simple ACK or RST generated.
234511754SKacheong.Poon@Sun.COM  */
234611754SKacheong.Poon@Sun.COM void
tcp_xmit_ctl(char * str,tcp_t * tcp,uint32_t seq,uint32_t ack,int ctl)234711754SKacheong.Poon@Sun.COM tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
234811754SKacheong.Poon@Sun.COM {
234911754SKacheong.Poon@Sun.COM 	uchar_t		*rptr;
235011754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
235111754SKacheong.Poon@Sun.COM 	ipha_t		*ipha = NULL;
235211754SKacheong.Poon@Sun.COM 	ip6_t		*ip6h = NULL;
235311754SKacheong.Poon@Sun.COM 	uint32_t	sum;
235411754SKacheong.Poon@Sun.COM 	int		total_hdr_len;
235511754SKacheong.Poon@Sun.COM 	int		ip_hdr_len;
235611754SKacheong.Poon@Sun.COM 	mblk_t		*mp;
235711754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
235811754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
235911754SKacheong.Poon@Sun.COM 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
236011754SKacheong.Poon@Sun.COM 
236111754SKacheong.Poon@Sun.COM 	/*
236211754SKacheong.Poon@Sun.COM 	 * Save sum for use in source route later.
236311754SKacheong.Poon@Sun.COM 	 */
236411754SKacheong.Poon@Sun.COM 	sum = connp->conn_ht_ulp_len + connp->conn_sum;
236511754SKacheong.Poon@Sun.COM 	total_hdr_len = connp->conn_ht_iphc_len;
236611754SKacheong.Poon@Sun.COM 	ip_hdr_len = ixa->ixa_ip_hdr_length;
236711754SKacheong.Poon@Sun.COM 
236811754SKacheong.Poon@Sun.COM 	/* If a text string is passed in with the request, pass it to strlog. */
236911754SKacheong.Poon@Sun.COM 	if (str != NULL && connp->conn_debug) {
237011754SKacheong.Poon@Sun.COM 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
237111754SKacheong.Poon@Sun.COM 		    "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
237211754SKacheong.Poon@Sun.COM 		    str, seq, ack, ctl);
237311754SKacheong.Poon@Sun.COM 	}
237411754SKacheong.Poon@Sun.COM 	mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
237511754SKacheong.Poon@Sun.COM 	    BPRI_MED);
237611754SKacheong.Poon@Sun.COM 	if (mp == NULL) {
237711754SKacheong.Poon@Sun.COM 		return;
237811754SKacheong.Poon@Sun.COM 	}
237911754SKacheong.Poon@Sun.COM 	rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
238011754SKacheong.Poon@Sun.COM 	mp->b_rptr = rptr;
238111754SKacheong.Poon@Sun.COM 	mp->b_wptr = &rptr[total_hdr_len];
238211754SKacheong.Poon@Sun.COM 	bcopy(connp->conn_ht_iphc, rptr, total_hdr_len);
238311754SKacheong.Poon@Sun.COM 
238411754SKacheong.Poon@Sun.COM 	ixa->ixa_pktlen = total_hdr_len;
238511754SKacheong.Poon@Sun.COM 
238611754SKacheong.Poon@Sun.COM 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
238711754SKacheong.Poon@Sun.COM 		ipha = (ipha_t *)rptr;
238811754SKacheong.Poon@Sun.COM 		ipha->ipha_length = htons(total_hdr_len);
238911754SKacheong.Poon@Sun.COM 	} else {
239011754SKacheong.Poon@Sun.COM 		ip6h = (ip6_t *)rptr;
239111754SKacheong.Poon@Sun.COM 		ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
239211754SKacheong.Poon@Sun.COM 	}
239311754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
239411754SKacheong.Poon@Sun.COM 	tcpha->tha_flags = (uint8_t)ctl;
239511754SKacheong.Poon@Sun.COM 	if (ctl & TH_RST) {
239611754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutRsts);
239711754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutControl);
239811754SKacheong.Poon@Sun.COM 		/*
239911754SKacheong.Poon@Sun.COM 		 * Don't send TSopt w/ TH_RST packets per RFC 1323.
240011754SKacheong.Poon@Sun.COM 		 */
240111754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_ts_ok &&
240211754SKacheong.Poon@Sun.COM 		    tcp->tcp_state > TCPS_SYN_SENT) {
240311754SKacheong.Poon@Sun.COM 			mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN];
240411754SKacheong.Poon@Sun.COM 			*(mp->b_wptr) = TCPOPT_EOL;
240511754SKacheong.Poon@Sun.COM 
240611754SKacheong.Poon@Sun.COM 			ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN;
240711754SKacheong.Poon@Sun.COM 
240811754SKacheong.Poon@Sun.COM 			if (connp->conn_ipversion == IPV4_VERSION) {
240911754SKacheong.Poon@Sun.COM 				ipha->ipha_length = htons(total_hdr_len -
241011754SKacheong.Poon@Sun.COM 				    TCPOPT_REAL_TS_LEN);
241111754SKacheong.Poon@Sun.COM 			} else {
241211754SKacheong.Poon@Sun.COM 				ip6h->ip6_plen = htons(total_hdr_len -
241311754SKacheong.Poon@Sun.COM 				    IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN);
241411754SKacheong.Poon@Sun.COM 			}
241511754SKacheong.Poon@Sun.COM 			tcpha->tha_offset_and_reserved -= (3 << 4);
241611754SKacheong.Poon@Sun.COM 			sum -= TCPOPT_REAL_TS_LEN;
241711754SKacheong.Poon@Sun.COM 		}
241811754SKacheong.Poon@Sun.COM 	}
241911754SKacheong.Poon@Sun.COM 	if (ctl & TH_ACK) {
242011754SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_ts_ok) {
242111754SKacheong.Poon@Sun.COM 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
242211754SKacheong.Poon@Sun.COM 
242311754SKacheong.Poon@Sun.COM 			U32_TO_BE32(llbolt,
242411754SKacheong.Poon@Sun.COM 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
242511754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tcp->tcp_ts_recent,
242611754SKacheong.Poon@Sun.COM 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
242711754SKacheong.Poon@Sun.COM 		}
242811754SKacheong.Poon@Sun.COM 
242911754SKacheong.Poon@Sun.COM 		/* Update the latest receive window size in TCP header. */
243011754SKacheong.Poon@Sun.COM 		tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
243111754SKacheong.Poon@Sun.COM 		/* Track what we sent to the peer */
243211754SKacheong.Poon@Sun.COM 		tcp->tcp_tcpha->tha_win = tcpha->tha_win;
243311754SKacheong.Poon@Sun.COM 		tcp->tcp_rack = ack;
243411754SKacheong.Poon@Sun.COM 		tcp->tcp_rack_cnt = 0;
243511754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutAck);
243611754SKacheong.Poon@Sun.COM 	}
243711754SKacheong.Poon@Sun.COM 	BUMP_LOCAL(tcp->tcp_obsegs);
243811754SKacheong.Poon@Sun.COM 	tcpha->tha_seq = htonl(seq);
243911754SKacheong.Poon@Sun.COM 	tcpha->tha_ack = htonl(ack);
244011754SKacheong.Poon@Sun.COM 	/*
244111754SKacheong.Poon@Sun.COM 	 * Include the adjustment for a source route if any.
244211754SKacheong.Poon@Sun.COM 	 */
244311754SKacheong.Poon@Sun.COM 	sum = (sum >> 16) + (sum & 0xFFFF);
244411754SKacheong.Poon@Sun.COM 	tcpha->tha_sum = htons(sum);
244511754SKacheong.Poon@Sun.COM 	tcp_send_data(tcp, mp);
244611754SKacheong.Poon@Sun.COM }
244711754SKacheong.Poon@Sun.COM 
244811754SKacheong.Poon@Sun.COM /*
244911754SKacheong.Poon@Sun.COM  * Generate a reset based on an inbound packet, connp is set by caller
245011754SKacheong.Poon@Sun.COM  * when RST is in response to an unexpected inbound packet for which
245111754SKacheong.Poon@Sun.COM  * there is active tcp state in the system.
245211754SKacheong.Poon@Sun.COM  *
245311754SKacheong.Poon@Sun.COM  * IPSEC NOTE : Try to send the reply with the same protection as it came
245411754SKacheong.Poon@Sun.COM  * in.  We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
245511754SKacheong.Poon@Sun.COM  * That way the packet will go out at the same level of protection as it
245611754SKacheong.Poon@Sun.COM  * came in with.
245711754SKacheong.Poon@Sun.COM  */
245811754SKacheong.Poon@Sun.COM static void
tcp_xmit_early_reset(char * str,mblk_t * mp,uint32_t seq,uint32_t ack,int ctl,ip_recv_attr_t * ira,ip_stack_t * ipst,conn_t * connp)245911754SKacheong.Poon@Sun.COM tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl,
246011754SKacheong.Poon@Sun.COM     ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp)
246111754SKacheong.Poon@Sun.COM {
246211754SKacheong.Poon@Sun.COM 	ipha_t		*ipha = NULL;
246311754SKacheong.Poon@Sun.COM 	ip6_t		*ip6h = NULL;
246411754SKacheong.Poon@Sun.COM 	ushort_t	len;
246511754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
246611754SKacheong.Poon@Sun.COM 	int		i;
246711754SKacheong.Poon@Sun.COM 	ipaddr_t	v4addr;
246811754SKacheong.Poon@Sun.COM 	in6_addr_t	v6addr;
246911754SKacheong.Poon@Sun.COM 	netstack_t	*ns = ipst->ips_netstack;
247011754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = ns->netstack_tcp;
247111754SKacheong.Poon@Sun.COM 	ip_xmit_attr_t	ixas, *ixa;
247211754SKacheong.Poon@Sun.COM 	uint_t		ip_hdr_len = ira->ira_ip_hdr_length;
247311754SKacheong.Poon@Sun.COM 	boolean_t	need_refrele = B_FALSE;		/* ixa_refrele(ixa) */
247411754SKacheong.Poon@Sun.COM 	ushort_t	port;
247511754SKacheong.Poon@Sun.COM 
247611754SKacheong.Poon@Sun.COM 	if (!tcp_send_rst_chk(tcps)) {
247711754SKacheong.Poon@Sun.COM 		TCP_STAT(tcps, tcp_rst_unsent);
247811754SKacheong.Poon@Sun.COM 		freemsg(mp);
247911754SKacheong.Poon@Sun.COM 		return;
248011754SKacheong.Poon@Sun.COM 	}
248111754SKacheong.Poon@Sun.COM 
248211754SKacheong.Poon@Sun.COM 	/*
248311754SKacheong.Poon@Sun.COM 	 * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
248411754SKacheong.Poon@Sun.COM 	 * options from the listener. In that case the caller must ensure that
248511754SKacheong.Poon@Sun.COM 	 * we are running on the listener = connp squeue.
248611754SKacheong.Poon@Sun.COM 	 *
248711754SKacheong.Poon@Sun.COM 	 * We get a safe copy of conn_ixa so we don't need to restore anything
248811754SKacheong.Poon@Sun.COM 	 * we or ip_output_simple might change in the ixa.
248911754SKacheong.Poon@Sun.COM 	 */
249011754SKacheong.Poon@Sun.COM 	if (connp != NULL) {
249111754SKacheong.Poon@Sun.COM 		ASSERT(connp->conn_on_sqp);
249211754SKacheong.Poon@Sun.COM 
249311754SKacheong.Poon@Sun.COM 		ixa = conn_get_ixa_exclusive(connp);
249411754SKacheong.Poon@Sun.COM 		if (ixa == NULL) {
249511754SKacheong.Poon@Sun.COM 			TCP_STAT(tcps, tcp_rst_unsent);
249611754SKacheong.Poon@Sun.COM 			freemsg(mp);
249711754SKacheong.Poon@Sun.COM 			return;
249811754SKacheong.Poon@Sun.COM 		}
249911754SKacheong.Poon@Sun.COM 		need_refrele = B_TRUE;
250011754SKacheong.Poon@Sun.COM 	} else {
250111754SKacheong.Poon@Sun.COM 		bzero(&ixas, sizeof (ixas));
250211754SKacheong.Poon@Sun.COM 		ixa = &ixas;
250311754SKacheong.Poon@Sun.COM 		/*
250411754SKacheong.Poon@Sun.COM 		 * IXAF_VERIFY_SOURCE is overkill since we know the
250511754SKacheong.Poon@Sun.COM 		 * packet was for us.
250611754SKacheong.Poon@Sun.COM 		 */
250711754SKacheong.Poon@Sun.COM 		ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE;
250811754SKacheong.Poon@Sun.COM 		ixa->ixa_protocol = IPPROTO_TCP;
250911754SKacheong.Poon@Sun.COM 		ixa->ixa_zoneid = ira->ira_zoneid;
251011754SKacheong.Poon@Sun.COM 		ixa->ixa_ifindex = 0;
251111754SKacheong.Poon@Sun.COM 		ixa->ixa_ipst = ipst;
251211754SKacheong.Poon@Sun.COM 		ixa->ixa_cred = kcred;
251311754SKacheong.Poon@Sun.COM 		ixa->ixa_cpid = NOPID;
251411754SKacheong.Poon@Sun.COM 	}
251511754SKacheong.Poon@Sun.COM 
251611754SKacheong.Poon@Sun.COM 	if (str && tcps->tcps_dbg) {
251711754SKacheong.Poon@Sun.COM 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
251811754SKacheong.Poon@Sun.COM 		    "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
251911754SKacheong.Poon@Sun.COM 		    "flags 0x%x",
252011754SKacheong.Poon@Sun.COM 		    str, seq, ack, ctl);
252111754SKacheong.Poon@Sun.COM 	}
252211754SKacheong.Poon@Sun.COM 	if (mp->b_datap->db_ref != 1) {
252311754SKacheong.Poon@Sun.COM 		mblk_t *mp1 = copyb(mp);
252411754SKacheong.Poon@Sun.COM 		freemsg(mp);
252511754SKacheong.Poon@Sun.COM 		mp = mp1;
252611754SKacheong.Poon@Sun.COM 		if (mp == NULL)
252711754SKacheong.Poon@Sun.COM 			goto done;
252811754SKacheong.Poon@Sun.COM 	} else if (mp->b_cont) {
252911754SKacheong.Poon@Sun.COM 		freemsg(mp->b_cont);
253011754SKacheong.Poon@Sun.COM 		mp->b_cont = NULL;
253111754SKacheong.Poon@Sun.COM 		DB_CKSUMFLAGS(mp) = 0;
253211754SKacheong.Poon@Sun.COM 	}
253311754SKacheong.Poon@Sun.COM 	/*
253411754SKacheong.Poon@Sun.COM 	 * We skip reversing source route here.
253511754SKacheong.Poon@Sun.COM 	 * (for now we replace all IP options with EOL)
253611754SKacheong.Poon@Sun.COM 	 */
253711754SKacheong.Poon@Sun.COM 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
253811754SKacheong.Poon@Sun.COM 		ipha = (ipha_t *)mp->b_rptr;
253911754SKacheong.Poon@Sun.COM 		for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
254011754SKacheong.Poon@Sun.COM 			mp->b_rptr[i] = IPOPT_EOL;
254111754SKacheong.Poon@Sun.COM 		/*
254211754SKacheong.Poon@Sun.COM 		 * Make sure that src address isn't flagrantly invalid.
254311754SKacheong.Poon@Sun.COM 		 * Not all broadcast address checking for the src address
254411754SKacheong.Poon@Sun.COM 		 * is possible, since we don't know the netmask of the src
254511754SKacheong.Poon@Sun.COM 		 * addr.  No check for destination address is done, since
254611754SKacheong.Poon@Sun.COM 		 * IP will not pass up a packet with a broadcast dest
254711754SKacheong.Poon@Sun.COM 		 * address to TCP.  Similar checks are done below for IPv6.
254811754SKacheong.Poon@Sun.COM 		 */
254911754SKacheong.Poon@Sun.COM 		if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
255011754SKacheong.Poon@Sun.COM 		    CLASSD(ipha->ipha_src)) {
255111754SKacheong.Poon@Sun.COM 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
255211754SKacheong.Poon@Sun.COM 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
255311754SKacheong.Poon@Sun.COM 			freemsg(mp);
255411754SKacheong.Poon@Sun.COM 			goto done;
255511754SKacheong.Poon@Sun.COM 		}
255611754SKacheong.Poon@Sun.COM 	} else {
255711754SKacheong.Poon@Sun.COM 		ip6h = (ip6_t *)mp->b_rptr;
255811754SKacheong.Poon@Sun.COM 
255911754SKacheong.Poon@Sun.COM 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
256011754SKacheong.Poon@Sun.COM 		    IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
256111754SKacheong.Poon@Sun.COM 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
256211754SKacheong.Poon@Sun.COM 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
256311754SKacheong.Poon@Sun.COM 			freemsg(mp);
256411754SKacheong.Poon@Sun.COM 			goto done;
256511754SKacheong.Poon@Sun.COM 		}
256611754SKacheong.Poon@Sun.COM 
256711754SKacheong.Poon@Sun.COM 		/* Remove any extension headers assuming partial overlay */
256811754SKacheong.Poon@Sun.COM 		if (ip_hdr_len > IPV6_HDR_LEN) {
256911754SKacheong.Poon@Sun.COM 			uint8_t *to;
257011754SKacheong.Poon@Sun.COM 
257111754SKacheong.Poon@Sun.COM 			to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
257211754SKacheong.Poon@Sun.COM 			ovbcopy(ip6h, to, IPV6_HDR_LEN);
257311754SKacheong.Poon@Sun.COM 			mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
257411754SKacheong.Poon@Sun.COM 			ip_hdr_len = IPV6_HDR_LEN;
257511754SKacheong.Poon@Sun.COM 			ip6h = (ip6_t *)mp->b_rptr;
257611754SKacheong.Poon@Sun.COM 			ip6h->ip6_nxt = IPPROTO_TCP;
257711754SKacheong.Poon@Sun.COM 		}
257811754SKacheong.Poon@Sun.COM 	}
257911754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
258011754SKacheong.Poon@Sun.COM 	if (tcpha->tha_flags & TH_RST) {
258111754SKacheong.Poon@Sun.COM 		freemsg(mp);
258211754SKacheong.Poon@Sun.COM 		goto done;
258311754SKacheong.Poon@Sun.COM 	}
258411754SKacheong.Poon@Sun.COM 	tcpha->tha_offset_and_reserved = (5 << 4);
258511754SKacheong.Poon@Sun.COM 	len = ip_hdr_len + sizeof (tcpha_t);
258611754SKacheong.Poon@Sun.COM 	mp->b_wptr = &mp->b_rptr[len];
258711754SKacheong.Poon@Sun.COM 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
258811754SKacheong.Poon@Sun.COM 		ipha->ipha_length = htons(len);
258911754SKacheong.Poon@Sun.COM 		/* Swap addresses */
259011754SKacheong.Poon@Sun.COM 		v4addr = ipha->ipha_src;
259111754SKacheong.Poon@Sun.COM 		ipha->ipha_src = ipha->ipha_dst;
259211754SKacheong.Poon@Sun.COM 		ipha->ipha_dst = v4addr;
259311754SKacheong.Poon@Sun.COM 		ipha->ipha_ident = 0;
259411754SKacheong.Poon@Sun.COM 		ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
259511754SKacheong.Poon@Sun.COM 		ixa->ixa_flags |= IXAF_IS_IPV4;
259611754SKacheong.Poon@Sun.COM 		ixa->ixa_ip_hdr_length = ip_hdr_len;
259711754SKacheong.Poon@Sun.COM 	} else {
259811754SKacheong.Poon@Sun.COM 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
259911754SKacheong.Poon@Sun.COM 		/* Swap addresses */
260011754SKacheong.Poon@Sun.COM 		v6addr = ip6h->ip6_src;
260111754SKacheong.Poon@Sun.COM 		ip6h->ip6_src = ip6h->ip6_dst;
260211754SKacheong.Poon@Sun.COM 		ip6h->ip6_dst = v6addr;
260311754SKacheong.Poon@Sun.COM 		ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit;
260411754SKacheong.Poon@Sun.COM 		ixa->ixa_flags &= ~IXAF_IS_IPV4;
260511754SKacheong.Poon@Sun.COM 
260611754SKacheong.Poon@Sun.COM 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
260711754SKacheong.Poon@Sun.COM 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
260811754SKacheong.Poon@Sun.COM 			ixa->ixa_scopeid = ira->ira_ruifindex;
260911754SKacheong.Poon@Sun.COM 		}
261011754SKacheong.Poon@Sun.COM 		ixa->ixa_ip_hdr_length = IPV6_HDR_LEN;
261111754SKacheong.Poon@Sun.COM 	}
261211754SKacheong.Poon@Sun.COM 	ixa->ixa_pktlen = len;
261311754SKacheong.Poon@Sun.COM 
261411754SKacheong.Poon@Sun.COM 	/* Swap the ports */
261511754SKacheong.Poon@Sun.COM 	port = tcpha->tha_fport;
261611754SKacheong.Poon@Sun.COM 	tcpha->tha_fport = tcpha->tha_lport;
261711754SKacheong.Poon@Sun.COM 	tcpha->tha_lport = port;
261811754SKacheong.Poon@Sun.COM 
261911754SKacheong.Poon@Sun.COM 	tcpha->tha_ack = htonl(ack);
262011754SKacheong.Poon@Sun.COM 	tcpha->tha_seq = htonl(seq);
262111754SKacheong.Poon@Sun.COM 	tcpha->tha_win = 0;
262211754SKacheong.Poon@Sun.COM 	tcpha->tha_sum = htons(sizeof (tcpha_t));
262311754SKacheong.Poon@Sun.COM 	tcpha->tha_flags = (uint8_t)ctl;
262411754SKacheong.Poon@Sun.COM 	if (ctl & TH_RST) {
262512507SAlan.Maguire@Sun.COM 		if (ctl & TH_ACK) {
262612507SAlan.Maguire@Sun.COM 			/*
262712507SAlan.Maguire@Sun.COM 			 * Probe connection rejection here.
262812507SAlan.Maguire@Sun.COM 			 * tcp_xmit_listeners_reset() drops non-SYN segments
262912507SAlan.Maguire@Sun.COM 			 * that do not specify TH_ACK in their flags without
263012507SAlan.Maguire@Sun.COM 			 * calling this function.  As a consequence, if this
263112507SAlan.Maguire@Sun.COM 			 * function is called with a TH_RST|TH_ACK ctl argument,
263212507SAlan.Maguire@Sun.COM 			 * it is being called in response to a SYN segment
263312507SAlan.Maguire@Sun.COM 			 * and thus the tcp:::accept-refused probe point
263412507SAlan.Maguire@Sun.COM 			 * is valid here.
263512507SAlan.Maguire@Sun.COM 			 */
263612507SAlan.Maguire@Sun.COM 			DTRACE_TCP5(accept__refused, mblk_t *, NULL,
263712507SAlan.Maguire@Sun.COM 			    void, NULL, void_ip_t *, mp->b_rptr, tcp_t *, NULL,
263812507SAlan.Maguire@Sun.COM 			    tcph_t *, tcpha);
263912507SAlan.Maguire@Sun.COM 		}
264011754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutRsts);
264111754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutControl);
264211754SKacheong.Poon@Sun.COM 	}
264311754SKacheong.Poon@Sun.COM 
264411754SKacheong.Poon@Sun.COM 	/* Discard any old label */
264511754SKacheong.Poon@Sun.COM 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
264611754SKacheong.Poon@Sun.COM 		ASSERT(ixa->ixa_tsl != NULL);
264711754SKacheong.Poon@Sun.COM 		label_rele(ixa->ixa_tsl);
264811754SKacheong.Poon@Sun.COM 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
264911754SKacheong.Poon@Sun.COM 	}
265011754SKacheong.Poon@Sun.COM 	ixa->ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
265111754SKacheong.Poon@Sun.COM 
265211754SKacheong.Poon@Sun.COM 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
265311754SKacheong.Poon@Sun.COM 		/*
265411754SKacheong.Poon@Sun.COM 		 * Apply IPsec based on how IPsec was applied to
265511754SKacheong.Poon@Sun.COM 		 * the packet that caused the RST.
265611754SKacheong.Poon@Sun.COM 		 */
265711754SKacheong.Poon@Sun.COM 		if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) {
265811754SKacheong.Poon@Sun.COM 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
265911754SKacheong.Poon@Sun.COM 			/* Note: mp already consumed and ip_drop_packet done */
266011754SKacheong.Poon@Sun.COM 			goto done;
266111754SKacheong.Poon@Sun.COM 		}
266211754SKacheong.Poon@Sun.COM 	} else {
266311754SKacheong.Poon@Sun.COM 		/*
266411754SKacheong.Poon@Sun.COM 		 * This is in clear. The RST message we are building
266511754SKacheong.Poon@Sun.COM 		 * here should go out in clear, independent of our policy.
266611754SKacheong.Poon@Sun.COM 		 */
266711754SKacheong.Poon@Sun.COM 		ixa->ixa_flags |= IXAF_NO_IPSEC;
266811754SKacheong.Poon@Sun.COM 	}
266911754SKacheong.Poon@Sun.COM 
267012507SAlan.Maguire@Sun.COM 	DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
267112507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
267212507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_tcph_t *, tcpha);
267312507SAlan.Maguire@Sun.COM 
267411754SKacheong.Poon@Sun.COM 	/*
267511754SKacheong.Poon@Sun.COM 	 * NOTE:  one might consider tracing a TCP packet here, but
267611754SKacheong.Poon@Sun.COM 	 * this function has no active TCP state and no tcp structure
267711754SKacheong.Poon@Sun.COM 	 * that has a trace buffer.  If we traced here, we would have
267811754SKacheong.Poon@Sun.COM 	 * to keep a local trace buffer in tcp_record_trace().
267911754SKacheong.Poon@Sun.COM 	 */
268011754SKacheong.Poon@Sun.COM 
268111754SKacheong.Poon@Sun.COM 	(void) ip_output_simple(mp, ixa);
268211754SKacheong.Poon@Sun.COM done:
268311754SKacheong.Poon@Sun.COM 	ixa_cleanup(ixa);
268411754SKacheong.Poon@Sun.COM 	if (need_refrele) {
268511754SKacheong.Poon@Sun.COM 		ASSERT(ixa != &ixas);
268611754SKacheong.Poon@Sun.COM 		ixa_refrele(ixa);
268711754SKacheong.Poon@Sun.COM 	}
268811754SKacheong.Poon@Sun.COM }
268911754SKacheong.Poon@Sun.COM 
269011754SKacheong.Poon@Sun.COM /*
269111754SKacheong.Poon@Sun.COM  * Generate a "no listener here" RST in response to an "unknown" segment.
269211754SKacheong.Poon@Sun.COM  * connp is set by caller when RST is in response to an unexpected
269311754SKacheong.Poon@Sun.COM  * inbound packet for which there is active tcp state in the system.
269411754SKacheong.Poon@Sun.COM  * Note that we are reusing the incoming mp to construct the outgoing RST.
269511754SKacheong.Poon@Sun.COM  */
269611754SKacheong.Poon@Sun.COM void
tcp_xmit_listeners_reset(mblk_t * mp,ip_recv_attr_t * ira,ip_stack_t * ipst,conn_t * connp)269711754SKacheong.Poon@Sun.COM tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
269811754SKacheong.Poon@Sun.COM     conn_t *connp)
269911754SKacheong.Poon@Sun.COM {
270011754SKacheong.Poon@Sun.COM 	uchar_t		*rptr;
270111754SKacheong.Poon@Sun.COM 	uint32_t	seg_len;
270211754SKacheong.Poon@Sun.COM 	tcpha_t		*tcpha;
270311754SKacheong.Poon@Sun.COM 	uint32_t	seg_seq;
270411754SKacheong.Poon@Sun.COM 	uint32_t	seg_ack;
270511754SKacheong.Poon@Sun.COM 	uint_t		flags;
270611754SKacheong.Poon@Sun.COM 	ipha_t 		*ipha;
270711754SKacheong.Poon@Sun.COM 	ip6_t 		*ip6h;
270811754SKacheong.Poon@Sun.COM 	boolean_t	policy_present;
270911754SKacheong.Poon@Sun.COM 	netstack_t	*ns = ipst->ips_netstack;
271011754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = ns->netstack_tcp;
271111754SKacheong.Poon@Sun.COM 	ipsec_stack_t	*ipss = tcps->tcps_netstack->netstack_ipsec;
271211754SKacheong.Poon@Sun.COM 	uint_t		ip_hdr_len = ira->ira_ip_hdr_length;
271311754SKacheong.Poon@Sun.COM 
271411754SKacheong.Poon@Sun.COM 	TCP_STAT(tcps, tcp_no_listener);
271511754SKacheong.Poon@Sun.COM 
271612507SAlan.Maguire@Sun.COM 	/*
271712507SAlan.Maguire@Sun.COM 	 * DTrace this "unknown" segment as a tcp:::receive, as we did
271812507SAlan.Maguire@Sun.COM 	 * just receive something that was TCP.
271912507SAlan.Maguire@Sun.COM 	 */
272012507SAlan.Maguire@Sun.COM 	DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, NULL,
272112507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
272212507SAlan.Maguire@Sun.COM 	    __dtrace_tcp_tcph_t *, &mp->b_rptr[ip_hdr_len]);
272312507SAlan.Maguire@Sun.COM 
272411754SKacheong.Poon@Sun.COM 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
272511754SKacheong.Poon@Sun.COM 		policy_present = ipss->ipsec_inbound_v4_policy_present;
272611754SKacheong.Poon@Sun.COM 		ipha = (ipha_t *)mp->b_rptr;
272711754SKacheong.Poon@Sun.COM 		ip6h = NULL;
272811754SKacheong.Poon@Sun.COM 	} else {
272911754SKacheong.Poon@Sun.COM 		policy_present = ipss->ipsec_inbound_v6_policy_present;
273011754SKacheong.Poon@Sun.COM 		ipha = NULL;
273111754SKacheong.Poon@Sun.COM 		ip6h = (ip6_t *)mp->b_rptr;
273211754SKacheong.Poon@Sun.COM 	}
273311754SKacheong.Poon@Sun.COM 
273411754SKacheong.Poon@Sun.COM 	if (policy_present) {
273511754SKacheong.Poon@Sun.COM 		/*
273611754SKacheong.Poon@Sun.COM 		 * The conn_t parameter is NULL because we already know
273711754SKacheong.Poon@Sun.COM 		 * nobody's home.
273811754SKacheong.Poon@Sun.COM 		 */
273911754SKacheong.Poon@Sun.COM 		mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h,
274011754SKacheong.Poon@Sun.COM 		    ira, ns);
274111754SKacheong.Poon@Sun.COM 		if (mp == NULL)
274211754SKacheong.Poon@Sun.COM 			return;
274311754SKacheong.Poon@Sun.COM 	}
274411754SKacheong.Poon@Sun.COM 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
274511754SKacheong.Poon@Sun.COM 		DTRACE_PROBE2(
274611754SKacheong.Poon@Sun.COM 		    tx__ip__log__error__nolistener__tcp,
274711754SKacheong.Poon@Sun.COM 		    char *, "Could not reply with RST to mp(1)",
274811754SKacheong.Poon@Sun.COM 		    mblk_t *, mp);
274911754SKacheong.Poon@Sun.COM 		ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
275011754SKacheong.Poon@Sun.COM 		freemsg(mp);
275111754SKacheong.Poon@Sun.COM 		return;
275211754SKacheong.Poon@Sun.COM 	}
275311754SKacheong.Poon@Sun.COM 
275411754SKacheong.Poon@Sun.COM 	rptr = mp->b_rptr;
275511754SKacheong.Poon@Sun.COM 
275611754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
275711754SKacheong.Poon@Sun.COM 	seg_seq = ntohl(tcpha->tha_seq);
275811754SKacheong.Poon@Sun.COM 	seg_ack = ntohl(tcpha->tha_ack);
275911754SKacheong.Poon@Sun.COM 	flags = tcpha->tha_flags;
276011754SKacheong.Poon@Sun.COM 
276111754SKacheong.Poon@Sun.COM 	seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len);
276211754SKacheong.Poon@Sun.COM 	if (flags & TH_RST) {
276311754SKacheong.Poon@Sun.COM 		freemsg(mp);
276411754SKacheong.Poon@Sun.COM 	} else if (flags & TH_ACK) {
276511754SKacheong.Poon@Sun.COM 		tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST,
276611754SKacheong.Poon@Sun.COM 		    ira, ipst, connp);
276711754SKacheong.Poon@Sun.COM 	} else {
276811754SKacheong.Poon@Sun.COM 		if (flags & TH_SYN) {
276911754SKacheong.Poon@Sun.COM 			seg_len++;
277011754SKacheong.Poon@Sun.COM 		} else {
277111754SKacheong.Poon@Sun.COM 			/*
277211754SKacheong.Poon@Sun.COM 			 * Here we violate the RFC.  Note that a normal
277311754SKacheong.Poon@Sun.COM 			 * TCP will never send a segment without the ACK
277411754SKacheong.Poon@Sun.COM 			 * flag, except for RST or SYN segment.  This
277511754SKacheong.Poon@Sun.COM 			 * segment is neither.  Just drop it on the
277611754SKacheong.Poon@Sun.COM 			 * floor.
277711754SKacheong.Poon@Sun.COM 			 */
277811754SKacheong.Poon@Sun.COM 			freemsg(mp);
277911754SKacheong.Poon@Sun.COM 			TCP_STAT(tcps, tcp_rst_unsent);
278011754SKacheong.Poon@Sun.COM 			return;
278111754SKacheong.Poon@Sun.COM 		}
278211754SKacheong.Poon@Sun.COM 
278311754SKacheong.Poon@Sun.COM 		tcp_xmit_early_reset("no tcp, reset/ack", mp, 0,
278411754SKacheong.Poon@Sun.COM 		    seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp);
278511754SKacheong.Poon@Sun.COM 	}
278611754SKacheong.Poon@Sun.COM }
278711754SKacheong.Poon@Sun.COM 
278811754SKacheong.Poon@Sun.COM /*
2789*13041SKacheong.Poon@Sun.COM  * Helper function for tcp_xmit_mp() in handling connection set up flag
2790*13041SKacheong.Poon@Sun.COM  * options setting.
2791*13041SKacheong.Poon@Sun.COM  */
2792*13041SKacheong.Poon@Sun.COM static void
tcp_xmit_mp_aux_iss(tcp_t * tcp,conn_t * connp,tcpha_t * tcpha,mblk_t * mp,uint_t * flags)2793*13041SKacheong.Poon@Sun.COM tcp_xmit_mp_aux_iss(tcp_t *tcp, conn_t *connp, tcpha_t *tcpha, mblk_t *mp,
2794*13041SKacheong.Poon@Sun.COM     uint_t *flags)
2795*13041SKacheong.Poon@Sun.COM {
2796*13041SKacheong.Poon@Sun.COM 	uint32_t u1;
2797*13041SKacheong.Poon@Sun.COM 	uint8_t	*wptr = mp->b_wptr;
2798*13041SKacheong.Poon@Sun.COM 	tcp_stack_t *tcps = tcp->tcp_tcps;
2799*13041SKacheong.Poon@Sun.COM 	boolean_t add_sack = B_FALSE;
2800*13041SKacheong.Poon@Sun.COM 
2801*13041SKacheong.Poon@Sun.COM 	/*
2802*13041SKacheong.Poon@Sun.COM 	 * If TCP_ISS_VALID and the seq number is tcp_iss,
2803*13041SKacheong.Poon@Sun.COM 	 * TCP can only be in SYN-SENT, SYN-RCVD or
2804*13041SKacheong.Poon@Sun.COM 	 * FIN-WAIT-1 state.  It can be FIN-WAIT-1 if
2805*13041SKacheong.Poon@Sun.COM 	 * our SYN is not ack'ed but the app closes this
2806*13041SKacheong.Poon@Sun.COM 	 * TCP connection.
2807*13041SKacheong.Poon@Sun.COM 	 */
2808*13041SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
2809*13041SKacheong.Poon@Sun.COM 	    tcp->tcp_state == TCPS_SYN_RCVD ||
2810*13041SKacheong.Poon@Sun.COM 	    tcp->tcp_state == TCPS_FIN_WAIT_1);
2811*13041SKacheong.Poon@Sun.COM 
2812*13041SKacheong.Poon@Sun.COM 	/*
2813*13041SKacheong.Poon@Sun.COM 	 * Tack on the MSS option.  It is always needed
2814*13041SKacheong.Poon@Sun.COM 	 * for both active and passive open.
2815*13041SKacheong.Poon@Sun.COM 	 *
2816*13041SKacheong.Poon@Sun.COM 	 * MSS option value should be interface MTU - MIN
2817*13041SKacheong.Poon@Sun.COM 	 * TCP/IP header according to RFC 793 as it means
2818*13041SKacheong.Poon@Sun.COM 	 * the maximum segment size TCP can receive.  But
2819*13041SKacheong.Poon@Sun.COM 	 * to get around some broken middle boxes/end hosts
2820*13041SKacheong.Poon@Sun.COM 	 * out there, we allow the option value to be the
2821*13041SKacheong.Poon@Sun.COM 	 * same as the MSS option size on the peer side.
2822*13041SKacheong.Poon@Sun.COM 	 * In this way, the other side will not send
2823*13041SKacheong.Poon@Sun.COM 	 * anything larger than they can receive.
2824*13041SKacheong.Poon@Sun.COM 	 *
2825*13041SKacheong.Poon@Sun.COM 	 * Note that for SYN_SENT state, the ndd param
2826*13041SKacheong.Poon@Sun.COM 	 * tcp_use_smss_as_mss_opt has no effect as we
2827*13041SKacheong.Poon@Sun.COM 	 * don't know the peer's MSS option value. So
2828*13041SKacheong.Poon@Sun.COM 	 * the only case we need to take care of is in
2829*13041SKacheong.Poon@Sun.COM 	 * SYN_RCVD state, which is done later.
2830*13041SKacheong.Poon@Sun.COM 	 */
2831*13041SKacheong.Poon@Sun.COM 	wptr[0] = TCPOPT_MAXSEG;
2832*13041SKacheong.Poon@Sun.COM 	wptr[1] = TCPOPT_MAXSEG_LEN;
2833*13041SKacheong.Poon@Sun.COM 	wptr += 2;
2834*13041SKacheong.Poon@Sun.COM 	u1 = tcp->tcp_initial_pmtu - (connp->conn_ipversion == IPV4_VERSION ?
2835*13041SKacheong.Poon@Sun.COM 	    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH;
2836*13041SKacheong.Poon@Sun.COM 	U16_TO_BE16(u1, wptr);
2837*13041SKacheong.Poon@Sun.COM 	wptr += 2;
2838*13041SKacheong.Poon@Sun.COM 
2839*13041SKacheong.Poon@Sun.COM 	/* Update the offset to cover the additional word */
2840*13041SKacheong.Poon@Sun.COM 	tcpha->tha_offset_and_reserved += (1 << 4);
2841*13041SKacheong.Poon@Sun.COM 
2842*13041SKacheong.Poon@Sun.COM 	switch (tcp->tcp_state) {
2843*13041SKacheong.Poon@Sun.COM 	case TCPS_SYN_SENT:
2844*13041SKacheong.Poon@Sun.COM 		*flags = TH_SYN;
2845*13041SKacheong.Poon@Sun.COM 
2846*13041SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_sack_ok)
2847*13041SKacheong.Poon@Sun.COM 			add_sack = B_TRUE;
2848*13041SKacheong.Poon@Sun.COM 
2849*13041SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_ts_ok) {
2850*13041SKacheong.Poon@Sun.COM 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2851*13041SKacheong.Poon@Sun.COM 
2852*13041SKacheong.Poon@Sun.COM 			if (add_sack) {
2853*13041SKacheong.Poon@Sun.COM 				wptr[0] = TCPOPT_SACK_PERMITTED;
2854*13041SKacheong.Poon@Sun.COM 				wptr[1] = TCPOPT_SACK_OK_LEN;
2855*13041SKacheong.Poon@Sun.COM 				add_sack = B_FALSE;
2856*13041SKacheong.Poon@Sun.COM 			} else {
2857*13041SKacheong.Poon@Sun.COM 				wptr[0] = TCPOPT_NOP;
2858*13041SKacheong.Poon@Sun.COM 				wptr[1] = TCPOPT_NOP;
2859*13041SKacheong.Poon@Sun.COM 			}
2860*13041SKacheong.Poon@Sun.COM 			wptr[2] = TCPOPT_TSTAMP;
2861*13041SKacheong.Poon@Sun.COM 			wptr[3] = TCPOPT_TSTAMP_LEN;
2862*13041SKacheong.Poon@Sun.COM 			wptr += 4;
2863*13041SKacheong.Poon@Sun.COM 			U32_TO_BE32(llbolt, wptr);
2864*13041SKacheong.Poon@Sun.COM 			wptr += 4;
2865*13041SKacheong.Poon@Sun.COM 			ASSERT(tcp->tcp_ts_recent == 0);
2866*13041SKacheong.Poon@Sun.COM 			U32_TO_BE32(0L, wptr);
2867*13041SKacheong.Poon@Sun.COM 			wptr += 4;
2868*13041SKacheong.Poon@Sun.COM 			tcpha->tha_offset_and_reserved += (3 << 4);
2869*13041SKacheong.Poon@Sun.COM 		}
2870*13041SKacheong.Poon@Sun.COM 
2871*13041SKacheong.Poon@Sun.COM 		/*
2872*13041SKacheong.Poon@Sun.COM 		 * Set up all the bits to tell other side
2873*13041SKacheong.Poon@Sun.COM 		 * we are ECN capable.
2874*13041SKacheong.Poon@Sun.COM 		 */
2875*13041SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_ok)
2876*13041SKacheong.Poon@Sun.COM 			*flags |= (TH_ECE | TH_CWR);
2877*13041SKacheong.Poon@Sun.COM 
2878*13041SKacheong.Poon@Sun.COM 		break;
2879*13041SKacheong.Poon@Sun.COM 
2880*13041SKacheong.Poon@Sun.COM 	case TCPS_SYN_RCVD:
2881*13041SKacheong.Poon@Sun.COM 		*flags |= TH_SYN;
2882*13041SKacheong.Poon@Sun.COM 
2883*13041SKacheong.Poon@Sun.COM 		/*
2884*13041SKacheong.Poon@Sun.COM 		 * Reset the MSS option value to be SMSS
2885*13041SKacheong.Poon@Sun.COM 		 * We should probably add back the bytes
2886*13041SKacheong.Poon@Sun.COM 		 * for timestamp option and IPsec.  We
2887*13041SKacheong.Poon@Sun.COM 		 * don't do that as this is a workaround
2888*13041SKacheong.Poon@Sun.COM 		 * for broken middle boxes/end hosts, it
2889*13041SKacheong.Poon@Sun.COM 		 * is better for us to be more cautious.
2890*13041SKacheong.Poon@Sun.COM 		 * They may not take these things into
2891*13041SKacheong.Poon@Sun.COM 		 * account in their SMSS calculation.  Thus
2892*13041SKacheong.Poon@Sun.COM 		 * the peer's calculated SMSS may be smaller
2893*13041SKacheong.Poon@Sun.COM 		 * than what it can be.  This should be OK.
2894*13041SKacheong.Poon@Sun.COM 		 */
2895*13041SKacheong.Poon@Sun.COM 		if (tcps->tcps_use_smss_as_mss_opt) {
2896*13041SKacheong.Poon@Sun.COM 			u1 = tcp->tcp_mss;
2897*13041SKacheong.Poon@Sun.COM 			/*
2898*13041SKacheong.Poon@Sun.COM 			 * Note that wptr points just past the MSS
2899*13041SKacheong.Poon@Sun.COM 			 * option value.
2900*13041SKacheong.Poon@Sun.COM 			 */
2901*13041SKacheong.Poon@Sun.COM 			U16_TO_BE16(u1, wptr - 2);
2902*13041SKacheong.Poon@Sun.COM 		}
2903*13041SKacheong.Poon@Sun.COM 
2904*13041SKacheong.Poon@Sun.COM 		/*
2905*13041SKacheong.Poon@Sun.COM 		 * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD
2906*13041SKacheong.Poon@Sun.COM 		 * when the peer also uses timestamps option.  And
2907*13041SKacheong.Poon@Sun.COM 		 * the TCP header template must have already been
2908*13041SKacheong.Poon@Sun.COM 		 * updated to include the timestamps option.
2909*13041SKacheong.Poon@Sun.COM 		 */
2910*13041SKacheong.Poon@Sun.COM 		if (tcp->tcp_snd_sack_ok) {
2911*13041SKacheong.Poon@Sun.COM 			if (tcp->tcp_snd_ts_ok) {
2912*13041SKacheong.Poon@Sun.COM 				uint8_t *tmp_wptr;
2913*13041SKacheong.Poon@Sun.COM 
2914*13041SKacheong.Poon@Sun.COM 				/*
2915*13041SKacheong.Poon@Sun.COM 				 * Use the NOP in the header just
2916*13041SKacheong.Poon@Sun.COM 				 * before timestamps opton.
2917*13041SKacheong.Poon@Sun.COM 				 */
2918*13041SKacheong.Poon@Sun.COM 				tmp_wptr = (uint8_t *)tcpha +
2919*13041SKacheong.Poon@Sun.COM 				    TCP_MIN_HEADER_LENGTH;
2920*13041SKacheong.Poon@Sun.COM 				ASSERT(tmp_wptr[0] == TCPOPT_NOP &&
2921*13041SKacheong.Poon@Sun.COM 				    tmp_wptr[1] == TCPOPT_NOP);
2922*13041SKacheong.Poon@Sun.COM 				tmp_wptr[0] = TCPOPT_SACK_PERMITTED;
2923*13041SKacheong.Poon@Sun.COM 				tmp_wptr[1] = TCPOPT_SACK_OK_LEN;
2924*13041SKacheong.Poon@Sun.COM 			} else {
2925*13041SKacheong.Poon@Sun.COM 				add_sack = B_TRUE;
2926*13041SKacheong.Poon@Sun.COM 			}
2927*13041SKacheong.Poon@Sun.COM 		}
2928*13041SKacheong.Poon@Sun.COM 
2929*13041SKacheong.Poon@Sun.COM 
2930*13041SKacheong.Poon@Sun.COM 		/*
2931*13041SKacheong.Poon@Sun.COM 		 * If the other side is ECN capable, reply
2932*13041SKacheong.Poon@Sun.COM 		 * that we are also ECN capable.
2933*13041SKacheong.Poon@Sun.COM 		 */
2934*13041SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_ok)
2935*13041SKacheong.Poon@Sun.COM 			*flags |= TH_ECE;
2936*13041SKacheong.Poon@Sun.COM 		break;
2937*13041SKacheong.Poon@Sun.COM 
2938*13041SKacheong.Poon@Sun.COM 	default:
2939*13041SKacheong.Poon@Sun.COM 		/*
2940*13041SKacheong.Poon@Sun.COM 		 * The above ASSERT() makes sure that this
2941*13041SKacheong.Poon@Sun.COM 		 * must be FIN-WAIT-1 state.  Our SYN has
2942*13041SKacheong.Poon@Sun.COM 		 * not been ack'ed so retransmit it.
2943*13041SKacheong.Poon@Sun.COM 		 */
2944*13041SKacheong.Poon@Sun.COM 		*flags |= TH_SYN;
2945*13041SKacheong.Poon@Sun.COM 		break;
2946*13041SKacheong.Poon@Sun.COM 	}
2947*13041SKacheong.Poon@Sun.COM 
2948*13041SKacheong.Poon@Sun.COM 	if (add_sack) {
2949*13041SKacheong.Poon@Sun.COM 		wptr[0] = TCPOPT_NOP;
2950*13041SKacheong.Poon@Sun.COM 		wptr[1] = TCPOPT_NOP;
2951*13041SKacheong.Poon@Sun.COM 		wptr[2] = TCPOPT_SACK_PERMITTED;
2952*13041SKacheong.Poon@Sun.COM 		wptr[3] = TCPOPT_SACK_OK_LEN;
2953*13041SKacheong.Poon@Sun.COM 		wptr += TCPOPT_REAL_SACK_OK_LEN;
2954*13041SKacheong.Poon@Sun.COM 		tcpha->tha_offset_and_reserved += (1 << 4);
2955*13041SKacheong.Poon@Sun.COM 	}
2956*13041SKacheong.Poon@Sun.COM 
2957*13041SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ws_ok) {
2958*13041SKacheong.Poon@Sun.COM 		wptr[0] =  TCPOPT_NOP;
2959*13041SKacheong.Poon@Sun.COM 		wptr[1] =  TCPOPT_WSCALE;
2960*13041SKacheong.Poon@Sun.COM 		wptr[2] =  TCPOPT_WS_LEN;
2961*13041SKacheong.Poon@Sun.COM 		wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
2962*13041SKacheong.Poon@Sun.COM 		wptr += TCPOPT_REAL_WS_LEN;
2963*13041SKacheong.Poon@Sun.COM 		tcpha->tha_offset_and_reserved += (1 << 4);
2964*13041SKacheong.Poon@Sun.COM 	}
2965*13041SKacheong.Poon@Sun.COM 
2966*13041SKacheong.Poon@Sun.COM 	mp->b_wptr = wptr;
2967*13041SKacheong.Poon@Sun.COM 	u1 = (int)(mp->b_wptr - mp->b_rptr);
2968*13041SKacheong.Poon@Sun.COM 	/*
2969*13041SKacheong.Poon@Sun.COM 	 * Get IP set to checksum on our behalf
2970*13041SKacheong.Poon@Sun.COM 	 * Include the adjustment for a source route if any.
2971*13041SKacheong.Poon@Sun.COM 	 */
2972*13041SKacheong.Poon@Sun.COM 	u1 += connp->conn_sum;
2973*13041SKacheong.Poon@Sun.COM 	u1 = (u1 >> 16) + (u1 & 0xFFFF);
2974*13041SKacheong.Poon@Sun.COM 	tcpha->tha_sum = htons(u1);
2975*13041SKacheong.Poon@Sun.COM 	TCPS_BUMP_MIB(tcps, tcpOutControl);
2976*13041SKacheong.Poon@Sun.COM }
2977*13041SKacheong.Poon@Sun.COM 
2978*13041SKacheong.Poon@Sun.COM /*
2979*13041SKacheong.Poon@Sun.COM  * Helper function for tcp_xmit_mp() in handling connection tear down
2980*13041SKacheong.Poon@Sun.COM  * flag setting and state changes.
2981*13041SKacheong.Poon@Sun.COM  */
2982*13041SKacheong.Poon@Sun.COM static void
tcp_xmit_mp_aux_fss(tcp_t * tcp,ip_xmit_attr_t * ixa,uint_t * flags)2983*13041SKacheong.Poon@Sun.COM tcp_xmit_mp_aux_fss(tcp_t *tcp, ip_xmit_attr_t *ixa, uint_t *flags)
2984*13041SKacheong.Poon@Sun.COM {
2985*13041SKacheong.Poon@Sun.COM 	if (!tcp->tcp_fin_acked) {
2986*13041SKacheong.Poon@Sun.COM 		*flags |= TH_FIN;
2987*13041SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutControl);
2988*13041SKacheong.Poon@Sun.COM 	}
2989*13041SKacheong.Poon@Sun.COM 	if (!tcp->tcp_fin_sent) {
2990*13041SKacheong.Poon@Sun.COM 		tcp->tcp_fin_sent = B_TRUE;
2991*13041SKacheong.Poon@Sun.COM 		switch (tcp->tcp_state) {
2992*13041SKacheong.Poon@Sun.COM 		case TCPS_SYN_RCVD:
2993*13041SKacheong.Poon@Sun.COM 			tcp->tcp_state = TCPS_FIN_WAIT_1;
2994*13041SKacheong.Poon@Sun.COM 			DTRACE_TCP6(state__change, void, NULL,
2995*13041SKacheong.Poon@Sun.COM 			    ip_xmit_attr_t *, ixa, void, NULL,
2996*13041SKacheong.Poon@Sun.COM 			    tcp_t *, tcp, void, NULL,
2997*13041SKacheong.Poon@Sun.COM 			    int32_t, TCPS_SYN_RCVD);
2998*13041SKacheong.Poon@Sun.COM 			break;
2999*13041SKacheong.Poon@Sun.COM 		case TCPS_ESTABLISHED:
3000*13041SKacheong.Poon@Sun.COM 			tcp->tcp_state = TCPS_FIN_WAIT_1;
3001*13041SKacheong.Poon@Sun.COM 			DTRACE_TCP6(state__change, void, NULL,
3002*13041SKacheong.Poon@Sun.COM 			    ip_xmit_attr_t *, ixa, void, NULL,
3003*13041SKacheong.Poon@Sun.COM 			    tcp_t *, tcp, void, NULL,
3004*13041SKacheong.Poon@Sun.COM 			    int32_t, TCPS_ESTABLISHED);
3005*13041SKacheong.Poon@Sun.COM 			break;
3006*13041SKacheong.Poon@Sun.COM 		case TCPS_CLOSE_WAIT:
3007*13041SKacheong.Poon@Sun.COM 			tcp->tcp_state = TCPS_LAST_ACK;
3008*13041SKacheong.Poon@Sun.COM 			DTRACE_TCP6(state__change, void, NULL,
3009*13041SKacheong.Poon@Sun.COM 			    ip_xmit_attr_t *, ixa, void, NULL,
3010*13041SKacheong.Poon@Sun.COM 			    tcp_t *, tcp, void, NULL,
3011*13041SKacheong.Poon@Sun.COM 			    int32_t, TCPS_CLOSE_WAIT);
3012*13041SKacheong.Poon@Sun.COM 			break;
3013*13041SKacheong.Poon@Sun.COM 		}
3014*13041SKacheong.Poon@Sun.COM 		if (tcp->tcp_suna == tcp->tcp_snxt)
3015*13041SKacheong.Poon@Sun.COM 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3016*13041SKacheong.Poon@Sun.COM 		tcp->tcp_snxt = tcp->tcp_fss + 1;
3017*13041SKacheong.Poon@Sun.COM 	}
3018*13041SKacheong.Poon@Sun.COM }
3019*13041SKacheong.Poon@Sun.COM 
3020*13041SKacheong.Poon@Sun.COM /*
302111754SKacheong.Poon@Sun.COM  * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
302211754SKacheong.Poon@Sun.COM  * ip and tcp header ready to pass down to IP.  If the mp passed in is
302311754SKacheong.Poon@Sun.COM  * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
302411754SKacheong.Poon@Sun.COM  * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
302511754SKacheong.Poon@Sun.COM  * otherwise it will dup partial mblks.)
302611754SKacheong.Poon@Sun.COM  * Otherwise, an appropriate ACK packet will be generated.  This
302711754SKacheong.Poon@Sun.COM  * routine is not usually called to send new data for the first time.  It
302811754SKacheong.Poon@Sun.COM  * is mostly called out of the timer for retransmits, and to generate ACKs.
302911754SKacheong.Poon@Sun.COM  *
303011754SKacheong.Poon@Sun.COM  * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
303111754SKacheong.Poon@Sun.COM  * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
303211754SKacheong.Poon@Sun.COM  * of the original mblk chain will be returned in *offset and *end_mp.
303311754SKacheong.Poon@Sun.COM  */
303411754SKacheong.Poon@Sun.COM mblk_t *
tcp_xmit_mp(tcp_t * tcp,mblk_t * mp,int32_t max_to_send,int32_t * offset,mblk_t ** end_mp,uint32_t seq,boolean_t sendall,uint32_t * seg_len,boolean_t rexmit)303511754SKacheong.Poon@Sun.COM tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
303611754SKacheong.Poon@Sun.COM     mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
303711754SKacheong.Poon@Sun.COM     boolean_t rexmit)
303811754SKacheong.Poon@Sun.COM {
303911754SKacheong.Poon@Sun.COM 	int	data_length;
304011754SKacheong.Poon@Sun.COM 	int32_t	off = 0;
304111754SKacheong.Poon@Sun.COM 	uint_t	flags;
304211754SKacheong.Poon@Sun.COM 	mblk_t	*mp1;
304311754SKacheong.Poon@Sun.COM 	mblk_t	*mp2;
304411754SKacheong.Poon@Sun.COM 	uchar_t	*rptr;
304511754SKacheong.Poon@Sun.COM 	tcpha_t	*tcpha;
304611754SKacheong.Poon@Sun.COM 	int32_t	num_sack_blk = 0;
304711754SKacheong.Poon@Sun.COM 	int32_t	sack_opt_len = 0;
304811754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
304911754SKacheong.Poon@Sun.COM 	conn_t		*connp = tcp->tcp_connp;
305011754SKacheong.Poon@Sun.COM 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
305111754SKacheong.Poon@Sun.COM 
305211754SKacheong.Poon@Sun.COM 	/* Allocate for our maximum TCP header + link-level */
305311754SKacheong.Poon@Sun.COM 	mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
305411754SKacheong.Poon@Sun.COM 	    BPRI_MED);
3055*13041SKacheong.Poon@Sun.COM 	if (mp1 == NULL)
305611754SKacheong.Poon@Sun.COM 		return (NULL);
305711754SKacheong.Poon@Sun.COM 	data_length = 0;
305811754SKacheong.Poon@Sun.COM 
305911754SKacheong.Poon@Sun.COM 	/*
306011754SKacheong.Poon@Sun.COM 	 * Note that tcp_mss has been adjusted to take into account the
306111754SKacheong.Poon@Sun.COM 	 * timestamp option if applicable.  Because SACK options do not
306211754SKacheong.Poon@Sun.COM 	 * appear in every TCP segments and they are of variable lengths,
306311754SKacheong.Poon@Sun.COM 	 * they cannot be included in tcp_mss.  Thus we need to calculate
306411754SKacheong.Poon@Sun.COM 	 * the actual segment length when we need to send a segment which
306511754SKacheong.Poon@Sun.COM 	 * includes SACK options.
306611754SKacheong.Poon@Sun.COM 	 */
306711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
306811754SKacheong.Poon@Sun.COM 		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
306911754SKacheong.Poon@Sun.COM 		    tcp->tcp_num_sack_blk);
307011754SKacheong.Poon@Sun.COM 		sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
307111754SKacheong.Poon@Sun.COM 		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
307211754SKacheong.Poon@Sun.COM 		if (max_to_send + sack_opt_len > tcp->tcp_mss)
307311754SKacheong.Poon@Sun.COM 			max_to_send -= sack_opt_len;
307411754SKacheong.Poon@Sun.COM 	}
307511754SKacheong.Poon@Sun.COM 
307611754SKacheong.Poon@Sun.COM 	if (offset != NULL) {
307711754SKacheong.Poon@Sun.COM 		off = *offset;
307811754SKacheong.Poon@Sun.COM 		/* We use offset as an indicator that end_mp is not NULL. */
307911754SKacheong.Poon@Sun.COM 		*end_mp = NULL;
308011754SKacheong.Poon@Sun.COM 	}
308111754SKacheong.Poon@Sun.COM 	for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
308211754SKacheong.Poon@Sun.COM 		/* This could be faster with cooperation from downstream */
308311754SKacheong.Poon@Sun.COM 		if (mp2 != mp1 && !sendall &&
308411754SKacheong.Poon@Sun.COM 		    data_length + (int)(mp->b_wptr - mp->b_rptr) >
308511754SKacheong.Poon@Sun.COM 		    max_to_send)
308611754SKacheong.Poon@Sun.COM 			/*
308711754SKacheong.Poon@Sun.COM 			 * Don't send the next mblk since the whole mblk
308811754SKacheong.Poon@Sun.COM 			 * does not fit.
308911754SKacheong.Poon@Sun.COM 			 */
309011754SKacheong.Poon@Sun.COM 			break;
309111754SKacheong.Poon@Sun.COM 		mp2->b_cont = dupb(mp);
309211754SKacheong.Poon@Sun.COM 		mp2 = mp2->b_cont;
309311754SKacheong.Poon@Sun.COM 		if (!mp2) {
309411754SKacheong.Poon@Sun.COM 			freemsg(mp1);
309511754SKacheong.Poon@Sun.COM 			return (NULL);
309611754SKacheong.Poon@Sun.COM 		}
309711754SKacheong.Poon@Sun.COM 		mp2->b_rptr += off;
309811754SKacheong.Poon@Sun.COM 		ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
309911754SKacheong.Poon@Sun.COM 		    (uintptr_t)INT_MAX);
310011754SKacheong.Poon@Sun.COM 
310111754SKacheong.Poon@Sun.COM 		data_length += (int)(mp2->b_wptr - mp2->b_rptr);
310211754SKacheong.Poon@Sun.COM 		if (data_length > max_to_send) {
310311754SKacheong.Poon@Sun.COM 			mp2->b_wptr -= data_length - max_to_send;
310411754SKacheong.Poon@Sun.COM 			data_length = max_to_send;
310511754SKacheong.Poon@Sun.COM 			off = mp2->b_wptr - mp->b_rptr;
310611754SKacheong.Poon@Sun.COM 			break;
310711754SKacheong.Poon@Sun.COM 		} else {
310811754SKacheong.Poon@Sun.COM 			off = 0;
310911754SKacheong.Poon@Sun.COM 		}
311011754SKacheong.Poon@Sun.COM 	}
311111754SKacheong.Poon@Sun.COM 	if (offset != NULL) {
311211754SKacheong.Poon@Sun.COM 		*offset = off;
311311754SKacheong.Poon@Sun.COM 		*end_mp = mp;
311411754SKacheong.Poon@Sun.COM 	}
311511754SKacheong.Poon@Sun.COM 	if (seg_len != NULL) {
311611754SKacheong.Poon@Sun.COM 		*seg_len = data_length;
311711754SKacheong.Poon@Sun.COM 	}
311811754SKacheong.Poon@Sun.COM 
311911754SKacheong.Poon@Sun.COM 	/* Update the latest receive window size in TCP header. */
312011754SKacheong.Poon@Sun.COM 	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
312111754SKacheong.Poon@Sun.COM 
312211754SKacheong.Poon@Sun.COM 	rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
312311754SKacheong.Poon@Sun.COM 	mp1->b_rptr = rptr;
312411754SKacheong.Poon@Sun.COM 	mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len;
312511754SKacheong.Poon@Sun.COM 	bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
312611754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
312711754SKacheong.Poon@Sun.COM 	tcpha->tha_seq = htonl(seq);
312811754SKacheong.Poon@Sun.COM 
312911754SKacheong.Poon@Sun.COM 	/*
313011754SKacheong.Poon@Sun.COM 	 * Use tcp_unsent to determine if the PUSH bit should be used assumes
313111754SKacheong.Poon@Sun.COM 	 * that this function was called from tcp_wput_data. Thus, when called
313211754SKacheong.Poon@Sun.COM 	 * to retransmit data the setting of the PUSH bit may appear some
313311754SKacheong.Poon@Sun.COM 	 * what random in that it might get set when it should not. This
313411754SKacheong.Poon@Sun.COM 	 * should not pose any performance issues.
313511754SKacheong.Poon@Sun.COM 	 */
313611754SKacheong.Poon@Sun.COM 	if (data_length != 0 && (tcp->tcp_unsent == 0 ||
313711754SKacheong.Poon@Sun.COM 	    tcp->tcp_unsent == data_length)) {
313811754SKacheong.Poon@Sun.COM 		flags = TH_ACK | TH_PUSH;
313911754SKacheong.Poon@Sun.COM 	} else {
314011754SKacheong.Poon@Sun.COM 		flags = TH_ACK;
314111754SKacheong.Poon@Sun.COM 	}
314211754SKacheong.Poon@Sun.COM 
314311754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ecn_ok) {
314411754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_echo_on)
314511754SKacheong.Poon@Sun.COM 			flags |= TH_ECE;
314611754SKacheong.Poon@Sun.COM 
314711754SKacheong.Poon@Sun.COM 		/*
314811754SKacheong.Poon@Sun.COM 		 * Only set ECT bit and ECN_CWR if a segment contains new data.
314911754SKacheong.Poon@Sun.COM 		 * There is no TCP flow control for non-data segments, and
315011754SKacheong.Poon@Sun.COM 		 * only data segment is transmitted reliably.
315111754SKacheong.Poon@Sun.COM 		 */
315211754SKacheong.Poon@Sun.COM 		if (data_length > 0 && !rexmit) {
315311754SKacheong.Poon@Sun.COM 			TCP_SET_ECT(tcp, rptr);
315411754SKacheong.Poon@Sun.COM 			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
315511754SKacheong.Poon@Sun.COM 				flags |= TH_CWR;
315611754SKacheong.Poon@Sun.COM 				tcp->tcp_ecn_cwr_sent = B_TRUE;
315711754SKacheong.Poon@Sun.COM 			}
315811754SKacheong.Poon@Sun.COM 		}
315911754SKacheong.Poon@Sun.COM 	}
316011754SKacheong.Poon@Sun.COM 
3161*13041SKacheong.Poon@Sun.COM 	/* Check if there is any special processing needs to be done. */
316211754SKacheong.Poon@Sun.COM 	if (tcp->tcp_valid_bits) {
316311754SKacheong.Poon@Sun.COM 		uint32_t u1;
316411754SKacheong.Poon@Sun.COM 
3165*13041SKacheong.Poon@Sun.COM 		/* We don't allow having SYN and FIN in the same segment... */
316611754SKacheong.Poon@Sun.COM 		if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
316711754SKacheong.Poon@Sun.COM 		    seq == tcp->tcp_iss) {
3168*13041SKacheong.Poon@Sun.COM 			/* Need to do connection set up processing. */
3169*13041SKacheong.Poon@Sun.COM 			tcp_xmit_mp_aux_iss(tcp, connp, tcpha, mp1, &flags);
3170*13041SKacheong.Poon@Sun.COM 		} else if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3171*13041SKacheong.Poon@Sun.COM 		    (seq + data_length) == tcp->tcp_fss) {
3172*13041SKacheong.Poon@Sun.COM 			/* Need to do connection tear down processing. */
3173*13041SKacheong.Poon@Sun.COM 			tcp_xmit_mp_aux_fss(tcp, ixa, &flags);
317411754SKacheong.Poon@Sun.COM 		}
3175*13041SKacheong.Poon@Sun.COM 
317611754SKacheong.Poon@Sun.COM 		/*
3177*13041SKacheong.Poon@Sun.COM 		 * Need to do urgent pointer processing.
3178*13041SKacheong.Poon@Sun.COM 		 *
317911754SKacheong.Poon@Sun.COM 		 * Note the trick here.  u1 is unsigned.  When tcp_urg
318011754SKacheong.Poon@Sun.COM 		 * is smaller than seq, u1 will become a very huge value.
318111754SKacheong.Poon@Sun.COM 		 * So the comparison will fail.  Also note that tcp_urp
318211754SKacheong.Poon@Sun.COM 		 * should be positive, see RFC 793 page 17.
318311754SKacheong.Poon@Sun.COM 		 */
318411754SKacheong.Poon@Sun.COM 		u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
318511754SKacheong.Poon@Sun.COM 		if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
318611754SKacheong.Poon@Sun.COM 		    u1 < (uint32_t)(64 * 1024)) {
318711754SKacheong.Poon@Sun.COM 			flags |= TH_URG;
318811754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpOutUrg);
318911754SKacheong.Poon@Sun.COM 			tcpha->tha_urp = htons(u1);
319011754SKacheong.Poon@Sun.COM 		}
319111754SKacheong.Poon@Sun.COM 	}
319211754SKacheong.Poon@Sun.COM 	tcpha->tha_flags = (uchar_t)flags;
319311754SKacheong.Poon@Sun.COM 	tcp->tcp_rack = tcp->tcp_rnxt;
319411754SKacheong.Poon@Sun.COM 	tcp->tcp_rack_cnt = 0;
319511754SKacheong.Poon@Sun.COM 
3196*13041SKacheong.Poon@Sun.COM 	/* Fill in the current value of timestamps option. */
319711754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ts_ok) {
319811754SKacheong.Poon@Sun.COM 		if (tcp->tcp_state != TCPS_SYN_SENT) {
319911754SKacheong.Poon@Sun.COM 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
320011754SKacheong.Poon@Sun.COM 
320111754SKacheong.Poon@Sun.COM 			U32_TO_BE32(llbolt,
320211754SKacheong.Poon@Sun.COM 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
320311754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tcp->tcp_ts_recent,
320411754SKacheong.Poon@Sun.COM 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
320511754SKacheong.Poon@Sun.COM 		}
320611754SKacheong.Poon@Sun.COM 	}
320711754SKacheong.Poon@Sun.COM 
3208*13041SKacheong.Poon@Sun.COM 	/* Fill in the SACK blocks. */
320911754SKacheong.Poon@Sun.COM 	if (num_sack_blk > 0) {
321011754SKacheong.Poon@Sun.COM 		uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
321111754SKacheong.Poon@Sun.COM 		sack_blk_t *tmp;
321211754SKacheong.Poon@Sun.COM 		int32_t	i;
321311754SKacheong.Poon@Sun.COM 
321411754SKacheong.Poon@Sun.COM 		wptr[0] = TCPOPT_NOP;
321511754SKacheong.Poon@Sun.COM 		wptr[1] = TCPOPT_NOP;
321611754SKacheong.Poon@Sun.COM 		wptr[2] = TCPOPT_SACK;
321711754SKacheong.Poon@Sun.COM 		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
321811754SKacheong.Poon@Sun.COM 		    sizeof (sack_blk_t);
321911754SKacheong.Poon@Sun.COM 		wptr += TCPOPT_REAL_SACK_LEN;
322011754SKacheong.Poon@Sun.COM 
322111754SKacheong.Poon@Sun.COM 		tmp = tcp->tcp_sack_list;
322211754SKacheong.Poon@Sun.COM 		for (i = 0; i < num_sack_blk; i++) {
322311754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tmp[i].begin, wptr);
322411754SKacheong.Poon@Sun.COM 			wptr += sizeof (tcp_seq);
322511754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tmp[i].end, wptr);
322611754SKacheong.Poon@Sun.COM 			wptr += sizeof (tcp_seq);
322711754SKacheong.Poon@Sun.COM 		}
322811754SKacheong.Poon@Sun.COM 		tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4);
322911754SKacheong.Poon@Sun.COM 	}
323011754SKacheong.Poon@Sun.COM 	ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
323111754SKacheong.Poon@Sun.COM 	data_length += (int)(mp1->b_wptr - rptr);
323211754SKacheong.Poon@Sun.COM 
323311754SKacheong.Poon@Sun.COM 	ixa->ixa_pktlen = data_length;
323411754SKacheong.Poon@Sun.COM 
323511754SKacheong.Poon@Sun.COM 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
323611754SKacheong.Poon@Sun.COM 		((ipha_t *)rptr)->ipha_length = htons(data_length);
323711754SKacheong.Poon@Sun.COM 	} else {
323811754SKacheong.Poon@Sun.COM 		ip6_t *ip6 = (ip6_t *)rptr;
323911754SKacheong.Poon@Sun.COM 
324011754SKacheong.Poon@Sun.COM 		ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN);
324111754SKacheong.Poon@Sun.COM 	}
324211754SKacheong.Poon@Sun.COM 
324311754SKacheong.Poon@Sun.COM 	/*
324411754SKacheong.Poon@Sun.COM 	 * Prime pump for IP
324511754SKacheong.Poon@Sun.COM 	 * Include the adjustment for a source route if any.
324611754SKacheong.Poon@Sun.COM 	 */
324711754SKacheong.Poon@Sun.COM 	data_length -= ixa->ixa_ip_hdr_length;
324811754SKacheong.Poon@Sun.COM 	data_length += connp->conn_sum;
324911754SKacheong.Poon@Sun.COM 	data_length = (data_length >> 16) + (data_length & 0xFFFF);
325011754SKacheong.Poon@Sun.COM 	tcpha->tha_sum = htons(data_length);
325111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ip_forward_progress) {
325211754SKacheong.Poon@Sun.COM 		tcp->tcp_ip_forward_progress = B_FALSE;
325311754SKacheong.Poon@Sun.COM 		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
325411754SKacheong.Poon@Sun.COM 	} else {
325511754SKacheong.Poon@Sun.COM 		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
325611754SKacheong.Poon@Sun.COM 	}
325711754SKacheong.Poon@Sun.COM 	return (mp1);
325811754SKacheong.Poon@Sun.COM }
325911754SKacheong.Poon@Sun.COM 
326011754SKacheong.Poon@Sun.COM /*
326111754SKacheong.Poon@Sun.COM  * If this routine returns B_TRUE, TCP can generate a RST in response
326211754SKacheong.Poon@Sun.COM  * to a segment.  If it returns B_FALSE, TCP should not respond.
326311754SKacheong.Poon@Sun.COM  */
326411754SKacheong.Poon@Sun.COM static boolean_t
tcp_send_rst_chk(tcp_stack_t * tcps)326511754SKacheong.Poon@Sun.COM tcp_send_rst_chk(tcp_stack_t *tcps)
326611754SKacheong.Poon@Sun.COM {
326711754SKacheong.Poon@Sun.COM 	int64_t	now;
326811754SKacheong.Poon@Sun.COM 
326911754SKacheong.Poon@Sun.COM 	/*
327011754SKacheong.Poon@Sun.COM 	 * TCP needs to protect itself from generating too many RSTs.
327111754SKacheong.Poon@Sun.COM 	 * This can be a DoS attack by sending us random segments
327211754SKacheong.Poon@Sun.COM 	 * soliciting RSTs.
327311754SKacheong.Poon@Sun.COM 	 *
327411754SKacheong.Poon@Sun.COM 	 * What we do here is to have a limit of tcp_rst_sent_rate RSTs
327511754SKacheong.Poon@Sun.COM 	 * in each 1 second interval.  In this way, TCP still generate
327611754SKacheong.Poon@Sun.COM 	 * RSTs in normal cases but when under attack, the impact is
327711754SKacheong.Poon@Sun.COM 	 * limited.
327811754SKacheong.Poon@Sun.COM 	 */
327911754SKacheong.Poon@Sun.COM 	if (tcps->tcps_rst_sent_rate_enabled != 0) {
328011754SKacheong.Poon@Sun.COM 		now = ddi_get_lbolt64();
328111754SKacheong.Poon@Sun.COM 		if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) >
328211754SKacheong.Poon@Sun.COM 		    1*SECONDS) {
328311754SKacheong.Poon@Sun.COM 			tcps->tcps_last_rst_intrvl = now;
328411754SKacheong.Poon@Sun.COM 			tcps->tcps_rst_cnt = 1;
328511754SKacheong.Poon@Sun.COM 		} else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) {
328611754SKacheong.Poon@Sun.COM 			return (B_FALSE);
328711754SKacheong.Poon@Sun.COM 		}
328811754SKacheong.Poon@Sun.COM 	}
328911754SKacheong.Poon@Sun.COM 	return (B_TRUE);
329011754SKacheong.Poon@Sun.COM }
329111754SKacheong.Poon@Sun.COM 
329211754SKacheong.Poon@Sun.COM /*
329311754SKacheong.Poon@Sun.COM  * This function handles all retransmissions if SACK is enabled for this
329411754SKacheong.Poon@Sun.COM  * connection.  First it calculates how many segments can be retransmitted
329511754SKacheong.Poon@Sun.COM  * based on tcp_pipe.  Then it goes thru the notsack list to find eligible
329611754SKacheong.Poon@Sun.COM  * segments.  A segment is eligible if sack_cnt for that segment is greater
329711754SKacheong.Poon@Sun.COM  * than or equal tcp_dupack_fast_retransmit.  After it has retransmitted
329811754SKacheong.Poon@Sun.COM  * all eligible segments, it checks to see if TCP can send some new segments
329911754SKacheong.Poon@Sun.COM  * (fast recovery).  If it can, set the appropriate flag for tcp_input_data().
330011754SKacheong.Poon@Sun.COM  *
330111754SKacheong.Poon@Sun.COM  * Parameters:
330211754SKacheong.Poon@Sun.COM  *	tcp_t *tcp: the tcp structure of the connection.
330311754SKacheong.Poon@Sun.COM  *	uint_t *flags: in return, appropriate value will be set for
330411754SKacheong.Poon@Sun.COM  *	tcp_input_data().
330511754SKacheong.Poon@Sun.COM  */
330611754SKacheong.Poon@Sun.COM void
tcp_sack_rexmit(tcp_t * tcp,uint_t * flags)330711754SKacheong.Poon@Sun.COM tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
330811754SKacheong.Poon@Sun.COM {
330911754SKacheong.Poon@Sun.COM 	notsack_blk_t	*notsack_blk;
331011754SKacheong.Poon@Sun.COM 	int32_t		usable_swnd;
331111754SKacheong.Poon@Sun.COM 	int32_t		mss;
331211754SKacheong.Poon@Sun.COM 	uint32_t	seg_len;
331311754SKacheong.Poon@Sun.COM 	mblk_t		*xmit_mp;
331411754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
331511754SKacheong.Poon@Sun.COM 
331611754SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_notsack_list != NULL);
331711754SKacheong.Poon@Sun.COM 	ASSERT(tcp->tcp_rexmit == B_FALSE);
331811754SKacheong.Poon@Sun.COM 
331911754SKacheong.Poon@Sun.COM 	/* Defensive coding in case there is a bug... */
332011754SKacheong.Poon@Sun.COM 	if (tcp->tcp_notsack_list == NULL) {
332111754SKacheong.Poon@Sun.COM 		return;
332211754SKacheong.Poon@Sun.COM 	}
332311754SKacheong.Poon@Sun.COM 	notsack_blk = tcp->tcp_notsack_list;
332411754SKacheong.Poon@Sun.COM 	mss = tcp->tcp_mss;
332511754SKacheong.Poon@Sun.COM 
332611754SKacheong.Poon@Sun.COM 	/*
332711754SKacheong.Poon@Sun.COM 	 * Limit the num of outstanding data in the network to be
332811754SKacheong.Poon@Sun.COM 	 * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
332911754SKacheong.Poon@Sun.COM 	 */
333011754SKacheong.Poon@Sun.COM 	usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
333111754SKacheong.Poon@Sun.COM 
333211754SKacheong.Poon@Sun.COM 	/* At least retransmit 1 MSS of data. */
333311754SKacheong.Poon@Sun.COM 	if (usable_swnd <= 0) {
333411754SKacheong.Poon@Sun.COM 		usable_swnd = mss;
333511754SKacheong.Poon@Sun.COM 	}
333611754SKacheong.Poon@Sun.COM 
333711754SKacheong.Poon@Sun.COM 	/* Make sure no new RTT samples will be taken. */
333811754SKacheong.Poon@Sun.COM 	tcp->tcp_csuna = tcp->tcp_snxt;
333911754SKacheong.Poon@Sun.COM 
334011754SKacheong.Poon@Sun.COM 	notsack_blk = tcp->tcp_notsack_list;
334111754SKacheong.Poon@Sun.COM 	while (usable_swnd > 0) {
334211754SKacheong.Poon@Sun.COM 		mblk_t		*snxt_mp, *tmp_mp;
334311754SKacheong.Poon@Sun.COM 		tcp_seq		begin = tcp->tcp_sack_snxt;
334411754SKacheong.Poon@Sun.COM 		tcp_seq		end;
334511754SKacheong.Poon@Sun.COM 		int32_t		off;
334611754SKacheong.Poon@Sun.COM 
334711754SKacheong.Poon@Sun.COM 		for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
334811754SKacheong.Poon@Sun.COM 			if (SEQ_GT(notsack_blk->end, begin) &&
334911754SKacheong.Poon@Sun.COM 			    (notsack_blk->sack_cnt >=
335011754SKacheong.Poon@Sun.COM 			    tcps->tcps_dupack_fast_retransmit)) {
335111754SKacheong.Poon@Sun.COM 				end = notsack_blk->end;
335211754SKacheong.Poon@Sun.COM 				if (SEQ_LT(begin, notsack_blk->begin)) {
335311754SKacheong.Poon@Sun.COM 					begin = notsack_blk->begin;
335411754SKacheong.Poon@Sun.COM 				}
335511754SKacheong.Poon@Sun.COM 				break;
335611754SKacheong.Poon@Sun.COM 			}
335711754SKacheong.Poon@Sun.COM 		}
335811754SKacheong.Poon@Sun.COM 		/*
335911754SKacheong.Poon@Sun.COM 		 * All holes are filled.  Manipulate tcp_cwnd to send more
336011754SKacheong.Poon@Sun.COM 		 * if we can.  Note that after the SACK recovery, tcp_cwnd is
336111754SKacheong.Poon@Sun.COM 		 * set to tcp_cwnd_ssthresh.
336211754SKacheong.Poon@Sun.COM 		 */
336311754SKacheong.Poon@Sun.COM 		if (notsack_blk == NULL) {
336411754SKacheong.Poon@Sun.COM 			usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
336511754SKacheong.Poon@Sun.COM 			if (usable_swnd <= 0 || tcp->tcp_unsent == 0) {
336611754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
336711754SKacheong.Poon@Sun.COM 				ASSERT(tcp->tcp_cwnd > 0);
336811754SKacheong.Poon@Sun.COM 				return;
336911754SKacheong.Poon@Sun.COM 			} else {
337011754SKacheong.Poon@Sun.COM 				usable_swnd = usable_swnd / mss;
337111754SKacheong.Poon@Sun.COM 				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
337211754SKacheong.Poon@Sun.COM 				    MAX(usable_swnd * mss, mss);
337311754SKacheong.Poon@Sun.COM 				*flags |= TH_XMIT_NEEDED;
337411754SKacheong.Poon@Sun.COM 				return;
337511754SKacheong.Poon@Sun.COM 			}
337611754SKacheong.Poon@Sun.COM 		}
337711754SKacheong.Poon@Sun.COM 
337811754SKacheong.Poon@Sun.COM 		/*
337911754SKacheong.Poon@Sun.COM 		 * Note that we may send more than usable_swnd allows here
338011754SKacheong.Poon@Sun.COM 		 * because of round off, but no more than 1 MSS of data.
338111754SKacheong.Poon@Sun.COM 		 */
338211754SKacheong.Poon@Sun.COM 		seg_len = end - begin;
338311754SKacheong.Poon@Sun.COM 		if (seg_len > mss)
338411754SKacheong.Poon@Sun.COM 			seg_len = mss;
338511754SKacheong.Poon@Sun.COM 		snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
338611754SKacheong.Poon@Sun.COM 		ASSERT(snxt_mp != NULL);
338711754SKacheong.Poon@Sun.COM 		/* This should not happen.  Defensive coding again... */
338811754SKacheong.Poon@Sun.COM 		if (snxt_mp == NULL) {
338911754SKacheong.Poon@Sun.COM 			return;
339011754SKacheong.Poon@Sun.COM 		}
339111754SKacheong.Poon@Sun.COM 
339211754SKacheong.Poon@Sun.COM 		xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
339311754SKacheong.Poon@Sun.COM 		    &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
339411754SKacheong.Poon@Sun.COM 		if (xmit_mp == NULL)
339511754SKacheong.Poon@Sun.COM 			return;
339611754SKacheong.Poon@Sun.COM 
339711754SKacheong.Poon@Sun.COM 		usable_swnd -= seg_len;
339811754SKacheong.Poon@Sun.COM 		tcp->tcp_pipe += seg_len;
339911754SKacheong.Poon@Sun.COM 		tcp->tcp_sack_snxt = begin + seg_len;
340011754SKacheong.Poon@Sun.COM 
340111754SKacheong.Poon@Sun.COM 		tcp_send_data(tcp, xmit_mp);
340211754SKacheong.Poon@Sun.COM 
340311754SKacheong.Poon@Sun.COM 		/*
340411754SKacheong.Poon@Sun.COM 		 * Update the send timestamp to avoid false retransmission.
340511754SKacheong.Poon@Sun.COM 		 */
340611754SKacheong.Poon@Sun.COM 		snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
340711754SKacheong.Poon@Sun.COM 
340811754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpRetransSegs);
340911754SKacheong.Poon@Sun.COM 		TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
341011754SKacheong.Poon@Sun.COM 		TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
341111754SKacheong.Poon@Sun.COM 		/*
341211754SKacheong.Poon@Sun.COM 		 * Update tcp_rexmit_max to extend this SACK recovery phase.
341311754SKacheong.Poon@Sun.COM 		 * This happens when new data sent during fast recovery is
341411754SKacheong.Poon@Sun.COM 		 * also lost.  If TCP retransmits those new data, it needs
341511754SKacheong.Poon@Sun.COM 		 * to extend SACK recover phase to avoid starting another
341611754SKacheong.Poon@Sun.COM 		 * fast retransmit/recovery unnecessarily.
341711754SKacheong.Poon@Sun.COM 		 */
341811754SKacheong.Poon@Sun.COM 		if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
341911754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
342011754SKacheong.Poon@Sun.COM 		}
342111754SKacheong.Poon@Sun.COM 	}
342211754SKacheong.Poon@Sun.COM }
342311754SKacheong.Poon@Sun.COM 
342411754SKacheong.Poon@Sun.COM /*
342511754SKacheong.Poon@Sun.COM  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
342611754SKacheong.Poon@Sun.COM  * or ICMP errors.
342711754SKacheong.Poon@Sun.COM  *
342811754SKacheong.Poon@Sun.COM  * To limit the number of duplicate segments, we limit the number of segment
342911754SKacheong.Poon@Sun.COM  * to be sent in one time to tcp_snd_burst, the burst variable.
343011754SKacheong.Poon@Sun.COM  */
343111754SKacheong.Poon@Sun.COM void
tcp_ss_rexmit(tcp_t * tcp)343211754SKacheong.Poon@Sun.COM tcp_ss_rexmit(tcp_t *tcp)
343311754SKacheong.Poon@Sun.COM {
343411754SKacheong.Poon@Sun.COM 	uint32_t	snxt;
343511754SKacheong.Poon@Sun.COM 	uint32_t	smax;
343611754SKacheong.Poon@Sun.COM 	int32_t		win;
343711754SKacheong.Poon@Sun.COM 	int32_t		mss;
343811754SKacheong.Poon@Sun.COM 	int32_t		off;
343911754SKacheong.Poon@Sun.COM 	int32_t		burst = tcp->tcp_snd_burst;
344011754SKacheong.Poon@Sun.COM 	mblk_t		*snxt_mp;
344111754SKacheong.Poon@Sun.COM 	tcp_stack_t	*tcps = tcp->tcp_tcps;
344211754SKacheong.Poon@Sun.COM 
344311754SKacheong.Poon@Sun.COM 	/*
344411754SKacheong.Poon@Sun.COM 	 * Note that tcp_rexmit can be set even though TCP has retransmitted
344511754SKacheong.Poon@Sun.COM 	 * all unack'ed segments.
344611754SKacheong.Poon@Sun.COM 	 */
344711754SKacheong.Poon@Sun.COM 	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
344811754SKacheong.Poon@Sun.COM 		smax = tcp->tcp_rexmit_max;
344911754SKacheong.Poon@Sun.COM 		snxt = tcp->tcp_rexmit_nxt;
345011754SKacheong.Poon@Sun.COM 		if (SEQ_LT(snxt, tcp->tcp_suna)) {
345111754SKacheong.Poon@Sun.COM 			snxt = tcp->tcp_suna;
345211754SKacheong.Poon@Sun.COM 		}
345311754SKacheong.Poon@Sun.COM 		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
345411754SKacheong.Poon@Sun.COM 		win -= snxt - tcp->tcp_suna;
345511754SKacheong.Poon@Sun.COM 		mss = tcp->tcp_mss;
345611754SKacheong.Poon@Sun.COM 		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
345711754SKacheong.Poon@Sun.COM 
345811754SKacheong.Poon@Sun.COM 		while (SEQ_LT(snxt, smax) && (win > 0) &&
345911754SKacheong.Poon@Sun.COM 		    (burst > 0) && (snxt_mp != NULL)) {
346011754SKacheong.Poon@Sun.COM 			mblk_t	*xmit_mp;
346111754SKacheong.Poon@Sun.COM 			mblk_t	*old_snxt_mp = snxt_mp;
346211754SKacheong.Poon@Sun.COM 			uint32_t cnt = mss;
346311754SKacheong.Poon@Sun.COM 
346411754SKacheong.Poon@Sun.COM 			if (win < cnt) {
346511754SKacheong.Poon@Sun.COM 				cnt = win;
346611754SKacheong.Poon@Sun.COM 			}
346711754SKacheong.Poon@Sun.COM 			if (SEQ_GT(snxt + cnt, smax)) {
346811754SKacheong.Poon@Sun.COM 				cnt = smax - snxt;
346911754SKacheong.Poon@Sun.COM 			}
347011754SKacheong.Poon@Sun.COM 			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
347111754SKacheong.Poon@Sun.COM 			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
347211754SKacheong.Poon@Sun.COM 			if (xmit_mp == NULL)
347311754SKacheong.Poon@Sun.COM 				return;
347411754SKacheong.Poon@Sun.COM 
347511754SKacheong.Poon@Sun.COM 			tcp_send_data(tcp, xmit_mp);
347611754SKacheong.Poon@Sun.COM 
347711754SKacheong.Poon@Sun.COM 			snxt += cnt;
347811754SKacheong.Poon@Sun.COM 			win -= cnt;
347911754SKacheong.Poon@Sun.COM 			/*
348011754SKacheong.Poon@Sun.COM 			 * Update the send timestamp to avoid false
348111754SKacheong.Poon@Sun.COM 			 * retransmission.
348211754SKacheong.Poon@Sun.COM 			 */
348311754SKacheong.Poon@Sun.COM 			old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
348411754SKacheong.Poon@Sun.COM 			TCPS_BUMP_MIB(tcps, tcpRetransSegs);
348511754SKacheong.Poon@Sun.COM 			TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
348611754SKacheong.Poon@Sun.COM 
348711754SKacheong.Poon@Sun.COM 			tcp->tcp_rexmit_nxt = snxt;
348811754SKacheong.Poon@Sun.COM 			burst--;
348911754SKacheong.Poon@Sun.COM 		}
349011754SKacheong.Poon@Sun.COM 		/*
349111754SKacheong.Poon@Sun.COM 		 * If we have transmitted all we have at the time
349211754SKacheong.Poon@Sun.COM 		 * we started the retranmission, we can leave
349311754SKacheong.Poon@Sun.COM 		 * the rest of the job to tcp_wput_data().  But we
349411754SKacheong.Poon@Sun.COM 		 * need to check the send window first.  If the
349511754SKacheong.Poon@Sun.COM 		 * win is not 0, go on with tcp_wput_data().
349611754SKacheong.Poon@Sun.COM 		 */
349711754SKacheong.Poon@Sun.COM 		if (SEQ_LT(snxt, smax) || win == 0) {
349811754SKacheong.Poon@Sun.COM 			return;
349911754SKacheong.Poon@Sun.COM 		}
350011754SKacheong.Poon@Sun.COM 	}
350111754SKacheong.Poon@Sun.COM 	/* Only call tcp_wput_data() if there is data to be sent. */
350211754SKacheong.Poon@Sun.COM 	if (tcp->tcp_unsent) {
350311754SKacheong.Poon@Sun.COM 		tcp_wput_data(tcp, NULL, B_FALSE);
350411754SKacheong.Poon@Sun.COM 	}
350511754SKacheong.Poon@Sun.COM }
350611754SKacheong.Poon@Sun.COM 
350711754SKacheong.Poon@Sun.COM /*
350811754SKacheong.Poon@Sun.COM  * Do slow start retransmission after ICMP errors of PMTU changes.
350911754SKacheong.Poon@Sun.COM  */
351011754SKacheong.Poon@Sun.COM void
tcp_rexmit_after_error(tcp_t * tcp)351111754SKacheong.Poon@Sun.COM tcp_rexmit_after_error(tcp_t *tcp)
351211754SKacheong.Poon@Sun.COM {
351311754SKacheong.Poon@Sun.COM 	/*
351411754SKacheong.Poon@Sun.COM 	 * All sent data has been acknowledged or no data left to send, just
351511754SKacheong.Poon@Sun.COM 	 * to return.
351611754SKacheong.Poon@Sun.COM 	 */
351711754SKacheong.Poon@Sun.COM 	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
351811754SKacheong.Poon@Sun.COM 	    (tcp->tcp_xmit_head == NULL))
351911754SKacheong.Poon@Sun.COM 		return;
352011754SKacheong.Poon@Sun.COM 
352111754SKacheong.Poon@Sun.COM 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
352211754SKacheong.Poon@Sun.COM 		tcp->tcp_rexmit_max = tcp->tcp_fss;
352311754SKacheong.Poon@Sun.COM 	else
352411754SKacheong.Poon@Sun.COM 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
352511754SKacheong.Poon@Sun.COM 
352611754SKacheong.Poon@Sun.COM 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
352711754SKacheong.Poon@Sun.COM 	tcp->tcp_rexmit = B_TRUE;
352811754SKacheong.Poon@Sun.COM 	tcp->tcp_dupack_cnt = 0;
352911754SKacheong.Poon@Sun.COM 	tcp->tcp_snd_burst = TCP_CWND_SS;
353011754SKacheong.Poon@Sun.COM 	tcp_ss_rexmit(tcp);
353111754SKacheong.Poon@Sun.COM }
353211754SKacheong.Poon@Sun.COM 
353311754SKacheong.Poon@Sun.COM /*
353411754SKacheong.Poon@Sun.COM  * tcp_get_seg_mp() is called to get the pointer to a segment in the
353511754SKacheong.Poon@Sun.COM  * send queue which starts at the given sequence number. If the given
353611754SKacheong.Poon@Sun.COM  * sequence number is equal to last valid sequence number (tcp_snxt), the
353711754SKacheong.Poon@Sun.COM  * returned mblk is the last valid mblk, and off is set to the length of
353811754SKacheong.Poon@Sun.COM  * that mblk.
353911754SKacheong.Poon@Sun.COM  *
354011754SKacheong.Poon@Sun.COM  * send queue which starts at the given seq. no.
354111754SKacheong.Poon@Sun.COM  *
354211754SKacheong.Poon@Sun.COM  * Parameters:
354311754SKacheong.Poon@Sun.COM  *	tcp_t *tcp: the tcp instance pointer.
354411754SKacheong.Poon@Sun.COM  *	uint32_t seq: the starting seq. no of the requested segment.
354511754SKacheong.Poon@Sun.COM  *	int32_t *off: after the execution, *off will be the offset to
354611754SKacheong.Poon@Sun.COM  *		the returned mblk which points to the requested seq no.
354711754SKacheong.Poon@Sun.COM  *		It is the caller's responsibility to send in a non-null off.
354811754SKacheong.Poon@Sun.COM  *
354911754SKacheong.Poon@Sun.COM  * Return:
355011754SKacheong.Poon@Sun.COM  *	A mblk_t pointer pointing to the requested segment in send queue.
355111754SKacheong.Poon@Sun.COM  */
355211754SKacheong.Poon@Sun.COM static mblk_t *
tcp_get_seg_mp(tcp_t * tcp,uint32_t seq,int32_t * off)355311754SKacheong.Poon@Sun.COM tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
355411754SKacheong.Poon@Sun.COM {
355511754SKacheong.Poon@Sun.COM 	int32_t	cnt;
355611754SKacheong.Poon@Sun.COM 	mblk_t	*mp;
355711754SKacheong.Poon@Sun.COM 
355811754SKacheong.Poon@Sun.COM 	/* Defensive coding.  Make sure we don't send incorrect data. */
355911754SKacheong.Poon@Sun.COM 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt))
356011754SKacheong.Poon@Sun.COM 		return (NULL);
356111754SKacheong.Poon@Sun.COM 
356211754SKacheong.Poon@Sun.COM 	cnt = seq - tcp->tcp_suna;
356311754SKacheong.Poon@Sun.COM 	mp = tcp->tcp_xmit_head;
356411754SKacheong.Poon@Sun.COM 	while (cnt > 0 && mp != NULL) {
356511754SKacheong.Poon@Sun.COM 		cnt -= mp->b_wptr - mp->b_rptr;
356611754SKacheong.Poon@Sun.COM 		if (cnt <= 0) {
356711754SKacheong.Poon@Sun.COM 			cnt += mp->b_wptr - mp->b_rptr;
356811754SKacheong.Poon@Sun.COM 			break;
356911754SKacheong.Poon@Sun.COM 		}
357011754SKacheong.Poon@Sun.COM 		mp = mp->b_cont;
357111754SKacheong.Poon@Sun.COM 	}
357211754SKacheong.Poon@Sun.COM 	ASSERT(mp != NULL);
357311754SKacheong.Poon@Sun.COM 	*off = cnt;
357411754SKacheong.Poon@Sun.COM 	return (mp);
357511754SKacheong.Poon@Sun.COM }
357611754SKacheong.Poon@Sun.COM 
357711754SKacheong.Poon@Sun.COM /*
357811754SKacheong.Poon@Sun.COM  * This routine adjusts next-to-send sequence number variables, in the
357911754SKacheong.Poon@Sun.COM  * case where the reciever has shrunk it's window.
358011754SKacheong.Poon@Sun.COM  */
358111754SKacheong.Poon@Sun.COM void
tcp_update_xmit_tail(tcp_t * tcp,uint32_t snxt)358211754SKacheong.Poon@Sun.COM tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
358311754SKacheong.Poon@Sun.COM {
358411754SKacheong.Poon@Sun.COM 	mblk_t *xmit_tail;
358511754SKacheong.Poon@Sun.COM 	int32_t offset;
358611754SKacheong.Poon@Sun.COM 
358711754SKacheong.Poon@Sun.COM 	tcp->tcp_snxt = snxt;
358811754SKacheong.Poon@Sun.COM 
358911754SKacheong.Poon@Sun.COM 	/* Get the mblk, and the offset in it, as per the shrunk window */
359011754SKacheong.Poon@Sun.COM 	xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
359111754SKacheong.Poon@Sun.COM 	ASSERT(xmit_tail != NULL);
359211754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_tail = xmit_tail;
359311754SKacheong.Poon@Sun.COM 	tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
359411754SKacheong.Poon@Sun.COM 	    xmit_tail->b_rptr - offset;
359511754SKacheong.Poon@Sun.COM }
359611754SKacheong.Poon@Sun.COM 
359711754SKacheong.Poon@Sun.COM /*
359811754SKacheong.Poon@Sun.COM  * This handles the case when the receiver has shrunk its win. Per RFC 1122
359911754SKacheong.Poon@Sun.COM  * if the receiver shrinks the window, i.e. moves the right window to the
360011754SKacheong.Poon@Sun.COM  * left, the we should not send new data, but should retransmit normally the
360111754SKacheong.Poon@Sun.COM  * old unacked data between suna and suna + swnd. We might has sent data
360211754SKacheong.Poon@Sun.COM  * that is now outside the new window, pretend that we didn't send  it.
360311754SKacheong.Poon@Sun.COM  */
360411754SKacheong.Poon@Sun.COM static void
tcp_process_shrunk_swnd(tcp_t * tcp,uint32_t shrunk_count)360511754SKacheong.Poon@Sun.COM tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
360611754SKacheong.Poon@Sun.COM {
360711754SKacheong.Poon@Sun.COM 	uint32_t	snxt = tcp->tcp_snxt;
360811754SKacheong.Poon@Sun.COM 
360911754SKacheong.Poon@Sun.COM 	ASSERT(shrunk_count > 0);
361011754SKacheong.Poon@Sun.COM 
361111754SKacheong.Poon@Sun.COM 	if (!tcp->tcp_is_wnd_shrnk) {
361211754SKacheong.Poon@Sun.COM 		tcp->tcp_snxt_shrunk = snxt;
361311754SKacheong.Poon@Sun.COM 		tcp->tcp_is_wnd_shrnk = B_TRUE;
361411754SKacheong.Poon@Sun.COM 	} else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) {
361511754SKacheong.Poon@Sun.COM 		tcp->tcp_snxt_shrunk = snxt;
361611754SKacheong.Poon@Sun.COM 	}
361711754SKacheong.Poon@Sun.COM 
361811754SKacheong.Poon@Sun.COM 	/* Pretend we didn't send the data outside the window */
361911754SKacheong.Poon@Sun.COM 	snxt -= shrunk_count;
362011754SKacheong.Poon@Sun.COM 
362111754SKacheong.Poon@Sun.COM 	/* Reset all the values per the now shrunk window */
362211754SKacheong.Poon@Sun.COM 	tcp_update_xmit_tail(tcp, snxt);
362311754SKacheong.Poon@Sun.COM 	tcp->tcp_unsent += shrunk_count;
362411754SKacheong.Poon@Sun.COM 
362511754SKacheong.Poon@Sun.COM 	/*
362611754SKacheong.Poon@Sun.COM 	 * If the SACK option is set, delete the entire list of
362711754SKacheong.Poon@Sun.COM 	 * notsack'ed blocks.
362811754SKacheong.Poon@Sun.COM 	 */
362912056SKacheong.Poon@Sun.COM 	TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
363011754SKacheong.Poon@Sun.COM 
363111754SKacheong.Poon@Sun.COM 	if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
363211754SKacheong.Poon@Sun.COM 		/*
363311754SKacheong.Poon@Sun.COM 		 * Make sure the timer is running so that we will probe a zero
363411754SKacheong.Poon@Sun.COM 		 * window.
363511754SKacheong.Poon@Sun.COM 		 */
363611754SKacheong.Poon@Sun.COM 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
363711754SKacheong.Poon@Sun.COM }
363811754SKacheong.Poon@Sun.COM 
363911754SKacheong.Poon@Sun.COM /*
364011754SKacheong.Poon@Sun.COM  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
364111754SKacheong.Poon@Sun.COM  * with the template header, as well as other options such as time-stamp,
364211754SKacheong.Poon@Sun.COM  * ECN and/or SACK.
364311754SKacheong.Poon@Sun.COM  */
364411754SKacheong.Poon@Sun.COM static void
tcp_fill_header(tcp_t * tcp,uchar_t * rptr,clock_t now,int num_sack_blk)364511754SKacheong.Poon@Sun.COM tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
364611754SKacheong.Poon@Sun.COM {
364711754SKacheong.Poon@Sun.COM 	tcpha_t *tcp_tmpl, *tcpha;
364811754SKacheong.Poon@Sun.COM 	uint32_t *dst, *src;
364911754SKacheong.Poon@Sun.COM 	int hdrlen;
365011754SKacheong.Poon@Sun.COM 	conn_t *connp = tcp->tcp_connp;
365111754SKacheong.Poon@Sun.COM 
365211754SKacheong.Poon@Sun.COM 	ASSERT(OK_32PTR(rptr));
365311754SKacheong.Poon@Sun.COM 
365411754SKacheong.Poon@Sun.COM 	/* Template header */
365511754SKacheong.Poon@Sun.COM 	tcp_tmpl = tcp->tcp_tcpha;
365611754SKacheong.Poon@Sun.COM 
365711754SKacheong.Poon@Sun.COM 	/* Header of outgoing packet */
365811754SKacheong.Poon@Sun.COM 	tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
365911754SKacheong.Poon@Sun.COM 
366011754SKacheong.Poon@Sun.COM 	/* dst and src are opaque 32-bit fields, used for copying */
366111754SKacheong.Poon@Sun.COM 	dst = (uint32_t *)rptr;
366211754SKacheong.Poon@Sun.COM 	src = (uint32_t *)connp->conn_ht_iphc;
366311754SKacheong.Poon@Sun.COM 	hdrlen = connp->conn_ht_iphc_len;
366411754SKacheong.Poon@Sun.COM 
366511754SKacheong.Poon@Sun.COM 	/* Fill time-stamp option if needed */
366611754SKacheong.Poon@Sun.COM 	if (tcp->tcp_snd_ts_ok) {
366711754SKacheong.Poon@Sun.COM 		U32_TO_BE32((uint32_t)now,
366811754SKacheong.Poon@Sun.COM 		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
366911754SKacheong.Poon@Sun.COM 		U32_TO_BE32(tcp->tcp_ts_recent,
367011754SKacheong.Poon@Sun.COM 		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
367111754SKacheong.Poon@Sun.COM 	} else {
367211754SKacheong.Poon@Sun.COM 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
367311754SKacheong.Poon@Sun.COM 	}
367411754SKacheong.Poon@Sun.COM 
367511754SKacheong.Poon@Sun.COM 	/*
367611754SKacheong.Poon@Sun.COM 	 * Copy the template header; is this really more efficient than
367711754SKacheong.Poon@Sun.COM 	 * calling bcopy()?  For simple IPv4/TCP, it may be the case,
367811754SKacheong.Poon@Sun.COM 	 * but perhaps not for other scenarios.
367911754SKacheong.Poon@Sun.COM 	 */
368011754SKacheong.Poon@Sun.COM 	dst[0] = src[0];
368111754SKacheong.Poon@Sun.COM 	dst[1] = src[1];
368211754SKacheong.Poon@Sun.COM 	dst[2] = src[2];
368311754SKacheong.Poon@Sun.COM 	dst[3] = src[3];
368411754SKacheong.Poon@Sun.COM 	dst[4] = src[4];
368511754SKacheong.Poon@Sun.COM 	dst[5] = src[5];
368611754SKacheong.Poon@Sun.COM 	dst[6] = src[6];
368711754SKacheong.Poon@Sun.COM 	dst[7] = src[7];
368811754SKacheong.Poon@Sun.COM 	dst[8] = src[8];
368911754SKacheong.Poon@Sun.COM 	dst[9] = src[9];
369011754SKacheong.Poon@Sun.COM 	if (hdrlen -= 40) {
369111754SKacheong.Poon@Sun.COM 		hdrlen >>= 2;
369211754SKacheong.Poon@Sun.COM 		dst += 10;
369311754SKacheong.Poon@Sun.COM 		src += 10;
369411754SKacheong.Poon@Sun.COM 		do {
369511754SKacheong.Poon@Sun.COM 			*dst++ = *src++;
369611754SKacheong.Poon@Sun.COM 		} while (--hdrlen);
369711754SKacheong.Poon@Sun.COM 	}
369811754SKacheong.Poon@Sun.COM 
369911754SKacheong.Poon@Sun.COM 	/*
370011754SKacheong.Poon@Sun.COM 	 * Set the ECN info in the TCP header if it is not a zero
370111754SKacheong.Poon@Sun.COM 	 * window probe.  Zero window probe is only sent in
370211754SKacheong.Poon@Sun.COM 	 * tcp_wput_data() and tcp_timer().
370311754SKacheong.Poon@Sun.COM 	 */
370411754SKacheong.Poon@Sun.COM 	if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
370511754SKacheong.Poon@Sun.COM 		TCP_SET_ECT(tcp, rptr);
370611754SKacheong.Poon@Sun.COM 
370711754SKacheong.Poon@Sun.COM 		if (tcp->tcp_ecn_echo_on)
370811754SKacheong.Poon@Sun.COM 			tcpha->tha_flags |= TH_ECE;
370911754SKacheong.Poon@Sun.COM 		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
371011754SKacheong.Poon@Sun.COM 			tcpha->tha_flags |= TH_CWR;
371111754SKacheong.Poon@Sun.COM 			tcp->tcp_ecn_cwr_sent = B_TRUE;
371211754SKacheong.Poon@Sun.COM 		}
371311754SKacheong.Poon@Sun.COM 	}
371411754SKacheong.Poon@Sun.COM 
371511754SKacheong.Poon@Sun.COM 	/* Fill in SACK options */
371611754SKacheong.Poon@Sun.COM 	if (num_sack_blk > 0) {
371711754SKacheong.Poon@Sun.COM 		uchar_t *wptr = rptr + connp->conn_ht_iphc_len;
371811754SKacheong.Poon@Sun.COM 		sack_blk_t *tmp;
371911754SKacheong.Poon@Sun.COM 		int32_t	i;
372011754SKacheong.Poon@Sun.COM 
372111754SKacheong.Poon@Sun.COM 		wptr[0] = TCPOPT_NOP;
372211754SKacheong.Poon@Sun.COM 		wptr[1] = TCPOPT_NOP;
372311754SKacheong.Poon@Sun.COM 		wptr[2] = TCPOPT_SACK;
372411754SKacheong.Poon@Sun.COM 		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
372511754SKacheong.Poon@Sun.COM 		    sizeof (sack_blk_t);
372611754SKacheong.Poon@Sun.COM 		wptr += TCPOPT_REAL_SACK_LEN;
372711754SKacheong.Poon@Sun.COM 
372811754SKacheong.Poon@Sun.COM 		tmp = tcp->tcp_sack_list;
372911754SKacheong.Poon@Sun.COM 		for (i = 0; i < num_sack_blk; i++) {
373011754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tmp[i].begin, wptr);
373111754SKacheong.Poon@Sun.COM 			wptr += sizeof (tcp_seq);
373211754SKacheong.Poon@Sun.COM 			U32_TO_BE32(tmp[i].end, wptr);
373311754SKacheong.Poon@Sun.COM 			wptr += sizeof (tcp_seq);
373411754SKacheong.Poon@Sun.COM 		}
373511754SKacheong.Poon@Sun.COM 		tcpha->tha_offset_and_reserved +=
373611754SKacheong.Poon@Sun.COM 		    ((num_sack_blk * 2 + 1) << 4);
373711754SKacheong.Poon@Sun.COM 	}
373811754SKacheong.Poon@Sun.COM }
3739