xref: /onnv-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 8586:3f902abf1f04)
18348SEric.Yu@Sun.COM /*
28348SEric.Yu@Sun.COM  * CDDL HEADER START
38348SEric.Yu@Sun.COM  *
48348SEric.Yu@Sun.COM  * The contents of this file are subject to the terms of the
58348SEric.Yu@Sun.COM  * Common Development and Distribution License (the "License").
68348SEric.Yu@Sun.COM  * You may not use this file except in compliance with the License.
78348SEric.Yu@Sun.COM  *
88348SEric.Yu@Sun.COM  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
98348SEric.Yu@Sun.COM  * or http://www.opensolaris.org/os/licensing.
108348SEric.Yu@Sun.COM  * See the License for the specific language governing permissions
118348SEric.Yu@Sun.COM  * and limitations under the License.
128348SEric.Yu@Sun.COM  *
138348SEric.Yu@Sun.COM  * When distributing Covered Code, include this CDDL HEADER in each
148348SEric.Yu@Sun.COM  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
158348SEric.Yu@Sun.COM  * If applicable, add the following below this CDDL HEADER, with the
168348SEric.Yu@Sun.COM  * fields enclosed by brackets "[]" replaced with your own identifying
178348SEric.Yu@Sun.COM  * information: Portions Copyright [yyyy] [name of copyright owner]
188348SEric.Yu@Sun.COM  *
198348SEric.Yu@Sun.COM  * CDDL HEADER END
208348SEric.Yu@Sun.COM  */
218348SEric.Yu@Sun.COM 
228348SEric.Yu@Sun.COM /*
238489Sshenjian  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
248348SEric.Yu@Sun.COM  * Use is subject to license terms.
258348SEric.Yu@Sun.COM  */
268348SEric.Yu@Sun.COM 
278348SEric.Yu@Sun.COM #include <sys/types.h>
288348SEric.Yu@Sun.COM #include <sys/param.h>
298348SEric.Yu@Sun.COM #include <sys/systm.h>
308348SEric.Yu@Sun.COM #include <sys/sysmacros.h>
318348SEric.Yu@Sun.COM #include <sys/debug.h>
328348SEric.Yu@Sun.COM #include <sys/cmn_err.h>
338348SEric.Yu@Sun.COM #include <sys/vfs.h>
348348SEric.Yu@Sun.COM #include <sys/policy.h>
358348SEric.Yu@Sun.COM #include <sys/modctl.h>
368348SEric.Yu@Sun.COM 
378348SEric.Yu@Sun.COM #include <sys/sunddi.h>
388348SEric.Yu@Sun.COM 
398348SEric.Yu@Sun.COM #include <sys/strsun.h>
408348SEric.Yu@Sun.COM #include <sys/stropts.h>
418348SEric.Yu@Sun.COM #include <sys/strsubr.h>
428348SEric.Yu@Sun.COM #include <sys/socket.h>
438348SEric.Yu@Sun.COM #include <sys/socketvar.h>
448348SEric.Yu@Sun.COM #include <sys/sodirect.h>
458348SEric.Yu@Sun.COM #include <sys/uio.h>
468348SEric.Yu@Sun.COM 
478348SEric.Yu@Sun.COM #include <inet/ipclassifier.h>
488348SEric.Yu@Sun.COM #include <fs/sockfs/sockcommon.h>
498348SEric.Yu@Sun.COM #include <fs/sockfs/nl7c.h>
508399SRao.Shoaib@Sun.COM #include <fs/sockfs/socktpi.h>
518348SEric.Yu@Sun.COM #include <inet/ip.h>
528348SEric.Yu@Sun.COM 
/* Integer variables defined outside this file; only declared here. */
extern int xnet_skip_checks;
extern int xnet_check_print;
extern int xnet_truncate_print;

/* kmem cache to which sonode_fini() returns a socket's sodirect_t. */
static struct kmem_cache *sock_sod_cache;
568348SEric.Yu@Sun.COM 
578348SEric.Yu@Sun.COM /*
588348SEric.Yu@Sun.COM  * Common socket access functions.
598348SEric.Yu@Sun.COM  *
608348SEric.Yu@Sun.COM  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
618348SEric.Yu@Sun.COM  * the socket_xxx() function should be used.
628348SEric.Yu@Sun.COM  */
638348SEric.Yu@Sun.COM 
648348SEric.Yu@Sun.COM /*
658348SEric.Yu@Sun.COM  * Try to create a new sonode of the requested <family, type, protocol>.
668348SEric.Yu@Sun.COM  */
678348SEric.Yu@Sun.COM /* ARGSUSED */
688348SEric.Yu@Sun.COM struct sonode *
698348SEric.Yu@Sun.COM socket_create(int family, int type, int protocol, char *devpath, char *mod,
708348SEric.Yu@Sun.COM     int flags, int version, struct cred *cr, int *errorp)
718348SEric.Yu@Sun.COM {
728348SEric.Yu@Sun.COM 	struct sonode *so;
738348SEric.Yu@Sun.COM 	struct sockparams *sp = NULL;
748489Sshenjian 	int saved_error;
758348SEric.Yu@Sun.COM 
768348SEric.Yu@Sun.COM 	/*
778348SEric.Yu@Sun.COM 	 * Look for a sockparams entry that match the given criteria.
788348SEric.Yu@Sun.COM 	 * solookup() returns with the entry held.
798348SEric.Yu@Sun.COM 	 */
808348SEric.Yu@Sun.COM 	*errorp = solookup(family, type, protocol, &sp);
818489Sshenjian 	saved_error = *errorp;
828348SEric.Yu@Sun.COM 	if (sp == NULL) {
838348SEric.Yu@Sun.COM 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
848348SEric.Yu@Sun.COM 		/*
858348SEric.Yu@Sun.COM 		 * There is no matching sockparams entry. An ephemeral entry is
868348SEric.Yu@Sun.COM 		 * created if the caller specifies a device or a socket module.
878348SEric.Yu@Sun.COM 		 */
888348SEric.Yu@Sun.COM 		if (devpath != NULL) {
898489Sshenjian 			saved_error = 0;
908348SEric.Yu@Sun.COM 			sp = sockparams_hold_ephemeral_bydev(family, type,
918348SEric.Yu@Sun.COM 			    protocol, devpath, kmflags, errorp);
928348SEric.Yu@Sun.COM 		} else if (mod != NULL) {
938489Sshenjian 			saved_error = 0;
948348SEric.Yu@Sun.COM 			sp = sockparams_hold_ephemeral_bymod(family, type,
958348SEric.Yu@Sun.COM 			    protocol, mod, kmflags, errorp);
968348SEric.Yu@Sun.COM 		} else {
978489Sshenjian 			*errorp = solookup(family, type, 0, &sp);
988348SEric.Yu@Sun.COM 		}
998348SEric.Yu@Sun.COM 
1008489Sshenjian 		if (sp == NULL) {
1018489Sshenjian 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
1028489Sshenjian 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
1038489Sshenjian 				*errorp = saved_error;
1048348SEric.Yu@Sun.COM 			return (NULL);
1058489Sshenjian 		}
1068348SEric.Yu@Sun.COM 	}
1078348SEric.Yu@Sun.COM 
1088348SEric.Yu@Sun.COM 	ASSERT(sp->sp_smod_info != NULL);
1098348SEric.Yu@Sun.COM 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
1108348SEric.Yu@Sun.COM 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
1118348SEric.Yu@Sun.COM 	    protocol, version, flags, errorp, cr);
1128348SEric.Yu@Sun.COM 	if (so == NULL) {
1138348SEric.Yu@Sun.COM 		SOCKPARAMS_DEC_REF(sp);
1148348SEric.Yu@Sun.COM 	} else {
1158348SEric.Yu@Sun.COM 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
1168348SEric.Yu@Sun.COM 			/* Cannot fail, only bumps so_count */
1178348SEric.Yu@Sun.COM 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
1188348SEric.Yu@Sun.COM 		} else {
1198489Sshenjian 			if (saved_error && (*errorp == EPROTONOSUPPORT ||
1208489Sshenjian 			    *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
1218489Sshenjian 				*errorp = saved_error;
1228348SEric.Yu@Sun.COM 			socket_destroy(so);
1238348SEric.Yu@Sun.COM 			so = NULL;
1248348SEric.Yu@Sun.COM 		}
1258348SEric.Yu@Sun.COM 	}
1268348SEric.Yu@Sun.COM 	return (so);
1278348SEric.Yu@Sun.COM }
1288348SEric.Yu@Sun.COM 
1298348SEric.Yu@Sun.COM struct sonode *
1308348SEric.Yu@Sun.COM socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
1318348SEric.Yu@Sun.COM     sock_downcalls_t *dc, int flags, int *errorp)
1328348SEric.Yu@Sun.COM {
1338348SEric.Yu@Sun.COM 	struct sonode *so;
1348348SEric.Yu@Sun.COM 	struct sockparams *sp;
1358348SEric.Yu@Sun.COM 	struct cred *cr;
1368348SEric.Yu@Sun.COM 
1378348SEric.Yu@Sun.COM 	if ((cr = CRED()) == NULL)
1388348SEric.Yu@Sun.COM 		cr = kcred;
1398348SEric.Yu@Sun.COM 
1408348SEric.Yu@Sun.COM 	sp = parent->so_sockparams;
1418348SEric.Yu@Sun.COM 	ASSERT(sp != NULL);
1428348SEric.Yu@Sun.COM 
1438348SEric.Yu@Sun.COM 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
1448348SEric.Yu@Sun.COM 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
1458348SEric.Yu@Sun.COM 	    errorp, cr);
1468348SEric.Yu@Sun.COM 	if (so != NULL) {
1478348SEric.Yu@Sun.COM 		SOCKPARAMS_INC_REF(sp);
1488348SEric.Yu@Sun.COM 
1498348SEric.Yu@Sun.COM 		so->so_proto_handle = lh;
1508348SEric.Yu@Sun.COM 		so->so_downcalls = dc;
1518348SEric.Yu@Sun.COM 		/*
1528348SEric.Yu@Sun.COM 		 * This function may be called in interrupt context, and CRED()
1538348SEric.Yu@Sun.COM 		 * will be NULL. In this case, pass in kcred.
1548348SEric.Yu@Sun.COM 		 */
1558348SEric.Yu@Sun.COM 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
1568348SEric.Yu@Sun.COM 			/* Cannot fail, only bumps so_count */
1578348SEric.Yu@Sun.COM 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
1588348SEric.Yu@Sun.COM 		} else  {
1598348SEric.Yu@Sun.COM 			socket_destroy(so);
1608348SEric.Yu@Sun.COM 			so = NULL;
1618348SEric.Yu@Sun.COM 		}
1628348SEric.Yu@Sun.COM 	}
1638348SEric.Yu@Sun.COM 
1648348SEric.Yu@Sun.COM 	return (so);
1658348SEric.Yu@Sun.COM }
1668348SEric.Yu@Sun.COM 
1678348SEric.Yu@Sun.COM /*
1688348SEric.Yu@Sun.COM  * Bind local endpoint.
1698348SEric.Yu@Sun.COM  */
1708348SEric.Yu@Sun.COM int
1718348SEric.Yu@Sun.COM socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1728348SEric.Yu@Sun.COM     int flags, cred_t *cr)
1738348SEric.Yu@Sun.COM {
1748348SEric.Yu@Sun.COM 	return (SOP_BIND(so, name, namelen, flags, cr));
1758348SEric.Yu@Sun.COM }
1768348SEric.Yu@Sun.COM 
1778348SEric.Yu@Sun.COM /*
1788348SEric.Yu@Sun.COM  * Turn socket into a listen socket.
1798348SEric.Yu@Sun.COM  */
1808348SEric.Yu@Sun.COM int
1818348SEric.Yu@Sun.COM socket_listen(struct sonode *so, int backlog, cred_t *cr)
1828348SEric.Yu@Sun.COM {
1838348SEric.Yu@Sun.COM 	if (backlog < 0) {
1848348SEric.Yu@Sun.COM 		backlog = 0;
1858348SEric.Yu@Sun.COM 	}
1868348SEric.Yu@Sun.COM 
1878348SEric.Yu@Sun.COM 	/*
1888348SEric.Yu@Sun.COM 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1898348SEric.Yu@Sun.COM 	 * before queuing the next connection implying that a
1908348SEric.Yu@Sun.COM 	 * listen(sock, 0) allows one connection to be queued.
1918348SEric.Yu@Sun.COM 	 * BSD also uses 1.5 times the requested backlog.
1928348SEric.Yu@Sun.COM 	 *
1938348SEric.Yu@Sun.COM 	 * XNS Issue 4 required a strict interpretation of the backlog.
1948348SEric.Yu@Sun.COM 	 * This has been waived subsequently for Issue 4 and the change
1958348SEric.Yu@Sun.COM 	 * incorporated in XNS Issue 5. So we aren't required to do
1968348SEric.Yu@Sun.COM 	 * anything special for XPG apps.
1978348SEric.Yu@Sun.COM 	 */
1988348SEric.Yu@Sun.COM 	if (backlog >= (INT_MAX - 1) / 3)
1998348SEric.Yu@Sun.COM 		backlog = INT_MAX;
2008348SEric.Yu@Sun.COM 	else
2018348SEric.Yu@Sun.COM 		backlog = backlog * 3 / 2 + 1;
2028348SEric.Yu@Sun.COM 
2038348SEric.Yu@Sun.COM 	return (SOP_LISTEN(so, backlog, cr));
2048348SEric.Yu@Sun.COM }
2058348SEric.Yu@Sun.COM 
2068348SEric.Yu@Sun.COM /*
2078348SEric.Yu@Sun.COM  * Accept incoming connection.
2088348SEric.Yu@Sun.COM  */
2098348SEric.Yu@Sun.COM int
2108348SEric.Yu@Sun.COM socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
2118348SEric.Yu@Sun.COM {
2128348SEric.Yu@Sun.COM 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
2138348SEric.Yu@Sun.COM }
2148348SEric.Yu@Sun.COM 
2158348SEric.Yu@Sun.COM /*
2168348SEric.Yu@Sun.COM  * Active open.
2178348SEric.Yu@Sun.COM  */
2188348SEric.Yu@Sun.COM int
2198348SEric.Yu@Sun.COM socket_connect(struct sonode *so, const struct sockaddr *name,
2208348SEric.Yu@Sun.COM     socklen_t namelen, int fflag, int flags, cred_t *cr)
2218348SEric.Yu@Sun.COM {
2228348SEric.Yu@Sun.COM 	int error;
2238348SEric.Yu@Sun.COM 
2248348SEric.Yu@Sun.COM 	/*
2258348SEric.Yu@Sun.COM 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2268348SEric.Yu@Sun.COM 	 * connect to a null address. This is the portable method to
2278348SEric.Yu@Sun.COM 	 * unconnect a socket.
2288348SEric.Yu@Sun.COM 	 */
2298348SEric.Yu@Sun.COM 	if ((namelen >= sizeof (sa_family_t)) &&
2308348SEric.Yu@Sun.COM 	    (name->sa_family == AF_UNSPEC)) {
2318348SEric.Yu@Sun.COM 		name = NULL;
2328348SEric.Yu@Sun.COM 		namelen = 0;
2338348SEric.Yu@Sun.COM 	}
2348348SEric.Yu@Sun.COM 
2358348SEric.Yu@Sun.COM 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
2368348SEric.Yu@Sun.COM 
2378348SEric.Yu@Sun.COM 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
2388348SEric.Yu@Sun.COM 		/*
2398348SEric.Yu@Sun.COM 		 * X/Open specification contains a requirement that
2408348SEric.Yu@Sun.COM 		 * ENETUNREACH be returned but does not require
2418348SEric.Yu@Sun.COM 		 * EHOSTUNREACH. In order to keep the test suite
2428348SEric.Yu@Sun.COM 		 * happy we mess with the errno here.
2438348SEric.Yu@Sun.COM 		 */
2448348SEric.Yu@Sun.COM 		error = ENETUNREACH;
2458348SEric.Yu@Sun.COM 	}
2468348SEric.Yu@Sun.COM 
2478348SEric.Yu@Sun.COM 	return (error);
2488348SEric.Yu@Sun.COM }
2498348SEric.Yu@Sun.COM 
2508348SEric.Yu@Sun.COM /*
2518348SEric.Yu@Sun.COM  * Get address of remote node.
2528348SEric.Yu@Sun.COM  */
2538348SEric.Yu@Sun.COM int
2548348SEric.Yu@Sun.COM socket_getpeername(struct sonode *so, struct sockaddr *addr,
2558348SEric.Yu@Sun.COM     socklen_t *addrlen, boolean_t accept, cred_t *cr)
2568348SEric.Yu@Sun.COM {
2578348SEric.Yu@Sun.COM 	ASSERT(*addrlen > 0);
2588348SEric.Yu@Sun.COM 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
2598348SEric.Yu@Sun.COM 
2608348SEric.Yu@Sun.COM }
2618348SEric.Yu@Sun.COM 
2628348SEric.Yu@Sun.COM /*
2638348SEric.Yu@Sun.COM  * Get local address.
2648348SEric.Yu@Sun.COM  */
2658348SEric.Yu@Sun.COM int
2668348SEric.Yu@Sun.COM socket_getsockname(struct sonode *so, struct sockaddr *addr,
2678348SEric.Yu@Sun.COM     socklen_t *addrlen, cred_t *cr)
2688348SEric.Yu@Sun.COM {
2698348SEric.Yu@Sun.COM 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
2708348SEric.Yu@Sun.COM 
2718348SEric.Yu@Sun.COM }
2728348SEric.Yu@Sun.COM 
2738348SEric.Yu@Sun.COM /*
2748348SEric.Yu@Sun.COM  * Called from shutdown().
2758348SEric.Yu@Sun.COM  */
2768348SEric.Yu@Sun.COM int
2778348SEric.Yu@Sun.COM socket_shutdown(struct sonode *so, int how, cred_t *cr)
2788348SEric.Yu@Sun.COM {
2798348SEric.Yu@Sun.COM 	return (SOP_SHUTDOWN(so, how, cr));
2808348SEric.Yu@Sun.COM }
2818348SEric.Yu@Sun.COM 
2828348SEric.Yu@Sun.COM /*
2838348SEric.Yu@Sun.COM  * Get socket options.
2848348SEric.Yu@Sun.COM  */
2858348SEric.Yu@Sun.COM /*ARGSUSED*/
2868348SEric.Yu@Sun.COM int
2878348SEric.Yu@Sun.COM socket_getsockopt(struct sonode *so, int level, int option_name,
2888348SEric.Yu@Sun.COM     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
2898348SEric.Yu@Sun.COM {
2908348SEric.Yu@Sun.COM 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
2918348SEric.Yu@Sun.COM 	    optlenp, flags, cr));
2928348SEric.Yu@Sun.COM }
2938348SEric.Yu@Sun.COM 
2948348SEric.Yu@Sun.COM /*
2958348SEric.Yu@Sun.COM  * Set socket options
2968348SEric.Yu@Sun.COM  */
2978348SEric.Yu@Sun.COM int
2988348SEric.Yu@Sun.COM socket_setsockopt(struct sonode *so, int level, int option_name,
2998348SEric.Yu@Sun.COM     const void *optval, t_uscalar_t optlen, cred_t *cr)
3008348SEric.Yu@Sun.COM {
3018489Sshenjian 	int val = 1;
3028348SEric.Yu@Sun.COM 	/* Caller allocates aligned optval, or passes null */
3038348SEric.Yu@Sun.COM 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
3048348SEric.Yu@Sun.COM 	/* If optval is null optlen is 0, and vice-versa */
3058348SEric.Yu@Sun.COM 	ASSERT(optval != NULL || optlen == 0);
3068348SEric.Yu@Sun.COM 	ASSERT(optlen != 0 || optval == NULL);
3078348SEric.Yu@Sun.COM 
3088489Sshenjian 	if (optval == NULL && optlen == 0)
3098489Sshenjian 		optval = &val;
3108348SEric.Yu@Sun.COM 
3118348SEric.Yu@Sun.COM 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
3128348SEric.Yu@Sun.COM }
3138348SEric.Yu@Sun.COM 
3148348SEric.Yu@Sun.COM int
3158348SEric.Yu@Sun.COM socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3168348SEric.Yu@Sun.COM     cred_t *cr)
3178348SEric.Yu@Sun.COM {
3188348SEric.Yu@Sun.COM 	int error = 0;
3198348SEric.Yu@Sun.COM 	ssize_t orig_resid = uiop->uio_resid;
3208348SEric.Yu@Sun.COM 
3218348SEric.Yu@Sun.COM 	/*
3228348SEric.Yu@Sun.COM 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
3238348SEric.Yu@Sun.COM 	 */
3248348SEric.Yu@Sun.COM 	if (so->so_family == AF_UNIX)
3258348SEric.Yu@Sun.COM 		uiop->uio_extflg |= UIO_COPY_CACHED;
3268348SEric.Yu@Sun.COM 	else
3278348SEric.Yu@Sun.COM 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
3288348SEric.Yu@Sun.COM 
3298348SEric.Yu@Sun.COM 	error = SOP_SENDMSG(so, msg, uiop, cr);
3308348SEric.Yu@Sun.COM 	switch (error) {
3318348SEric.Yu@Sun.COM 	default:
3328348SEric.Yu@Sun.COM 		break;
3338348SEric.Yu@Sun.COM 	case EINTR:
334*8586Sshenjian 	/* EAGAIN is EWOULDBLOCK */
3358348SEric.Yu@Sun.COM 	case EWOULDBLOCK:
3368348SEric.Yu@Sun.COM 		/* We did a partial send */
3378348SEric.Yu@Sun.COM 		if (uiop->uio_resid != orig_resid)
3388348SEric.Yu@Sun.COM 			error = 0;
3398348SEric.Yu@Sun.COM 		break;
3408348SEric.Yu@Sun.COM 	case EPIPE:
3418348SEric.Yu@Sun.COM 		if ((so->so_mode & SM_KERNEL) == 0)
3428348SEric.Yu@Sun.COM 			tsignal(curthread, SIGPIPE);
3438348SEric.Yu@Sun.COM 		break;
3448348SEric.Yu@Sun.COM 	}
3458348SEric.Yu@Sun.COM 
3468348SEric.Yu@Sun.COM 	return (error);
3478348SEric.Yu@Sun.COM }
3488348SEric.Yu@Sun.COM 
3498348SEric.Yu@Sun.COM int
3508348SEric.Yu@Sun.COM socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
3518348SEric.Yu@Sun.COM     struct cred *cr, mblk_t **mpp)
3528348SEric.Yu@Sun.COM {
3538348SEric.Yu@Sun.COM 	int error = 0;
3548348SEric.Yu@Sun.COM 
3558348SEric.Yu@Sun.COM 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
3568348SEric.Yu@Sun.COM 	if (error == EPIPE) {
3578348SEric.Yu@Sun.COM 		tsignal(curthread, SIGPIPE);
3588348SEric.Yu@Sun.COM 	}
3598348SEric.Yu@Sun.COM 	return (error);
3608348SEric.Yu@Sun.COM }
3618348SEric.Yu@Sun.COM 
3628348SEric.Yu@Sun.COM int
3638348SEric.Yu@Sun.COM socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3648348SEric.Yu@Sun.COM     cred_t *cr)
3658348SEric.Yu@Sun.COM {
3668348SEric.Yu@Sun.COM 	int error;
3678348SEric.Yu@Sun.COM 	ssize_t orig_resid = uiop->uio_resid;
3688348SEric.Yu@Sun.COM 
3698348SEric.Yu@Sun.COM 	/*
3708348SEric.Yu@Sun.COM 	 * Do not bypass the cache when reading data, as the application
3718348SEric.Yu@Sun.COM 	 * is likely to access the data shortly.
3728348SEric.Yu@Sun.COM 	 */
3738348SEric.Yu@Sun.COM 	uiop->uio_extflg |= UIO_COPY_CACHED;
3748348SEric.Yu@Sun.COM 
3758348SEric.Yu@Sun.COM 	error = SOP_RECVMSG(so, msg, uiop, cr);
3768348SEric.Yu@Sun.COM 
3778348SEric.Yu@Sun.COM 	switch (error) {
3788348SEric.Yu@Sun.COM 	case EINTR:
379*8586Sshenjian 	/* EAGAIN is EWOULDBLOCK */
3808348SEric.Yu@Sun.COM 	case EWOULDBLOCK:
3818348SEric.Yu@Sun.COM 		/* We did a partial read */
3828348SEric.Yu@Sun.COM 		if (uiop->uio_resid != orig_resid)
3838348SEric.Yu@Sun.COM 			error = 0;
3848348SEric.Yu@Sun.COM 		break;
3858348SEric.Yu@Sun.COM 	default:
3868348SEric.Yu@Sun.COM 		break;
3878348SEric.Yu@Sun.COM 	}
3888348SEric.Yu@Sun.COM 	return (error);
3898348SEric.Yu@Sun.COM }
3908348SEric.Yu@Sun.COM 
/*
 * Dispatch an ioctl to the sonode's SOP_IOCTL operation. All arguments
 * are passed through unchanged.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int error;

	error = SOP_IOCTL(so, cmd, arg, mode, cr, rvalp);
	return (error);
}
3978348SEric.Yu@Sun.COM 
/*
 * Dispatch a poll request to the sonode's SOP_POLL operation. All
 * arguments are passed through unchanged.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
4048348SEric.Yu@Sun.COM 
/*
 * Close the socket by calling VOP_CLOSE on its vnode with a count of 1
 * and an offset of 0.
 */
int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
	int error;

	error = VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL);
	return (error);
}
4108348SEric.Yu@Sun.COM 
4118348SEric.Yu@Sun.COM int
4128348SEric.Yu@Sun.COM socket_close_internal(struct sonode *so, int flag, cred_t *cr)
4138348SEric.Yu@Sun.COM {
4148348SEric.Yu@Sun.COM 	ASSERT(so->so_count == 0);
4158348SEric.Yu@Sun.COM 
4168348SEric.Yu@Sun.COM 	return (SOP_CLOSE(so, flag, cr));
4178348SEric.Yu@Sun.COM }
4188348SEric.Yu@Sun.COM 
/*
 * Invalidate the socket's vnode with vn_invalid() and release one hold on
 * it with VN_RELE().
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
4258348SEric.Yu@Sun.COM 
4268348SEric.Yu@Sun.COM /* ARGSUSED */
4278348SEric.Yu@Sun.COM void
4288348SEric.Yu@Sun.COM socket_destroy_internal(struct sonode *so, cred_t *cr)
4298348SEric.Yu@Sun.COM {
4308348SEric.Yu@Sun.COM 	struct sockparams *sp = so->so_sockparams;
4318348SEric.Yu@Sun.COM 	ASSERT(so->so_count == 0 && sp != NULL);
4328348SEric.Yu@Sun.COM 
4338348SEric.Yu@Sun.COM 	sp->sp_smod_info->smod_sock_destroy_func(so);
4348348SEric.Yu@Sun.COM 
4358348SEric.Yu@Sun.COM 	SOCKPARAMS_DEC_REF(sp);
4368348SEric.Yu@Sun.COM }
4378348SEric.Yu@Sun.COM 
4388348SEric.Yu@Sun.COM /*
4398348SEric.Yu@Sun.COM  * TODO Once the common vnode ops is available, then the vnops argument
4408348SEric.Yu@Sun.COM  * should be removed.
4418348SEric.Yu@Sun.COM  */
4428348SEric.Yu@Sun.COM /*ARGSUSED*/
4438348SEric.Yu@Sun.COM int
4448348SEric.Yu@Sun.COM sonode_constructor(void *buf, void *cdrarg, int kmflags)
4458348SEric.Yu@Sun.COM {
4468348SEric.Yu@Sun.COM 	struct sonode *so = buf;
4478348SEric.Yu@Sun.COM 	struct vnode *vp;
4488348SEric.Yu@Sun.COM 
4498348SEric.Yu@Sun.COM 	vp = so->so_vnode = vn_alloc(kmflags);
4508348SEric.Yu@Sun.COM 	if (vp == NULL) {
4518348SEric.Yu@Sun.COM 		return (-1);
4528348SEric.Yu@Sun.COM 	}
4538348SEric.Yu@Sun.COM 	vp->v_data = so;
4548348SEric.Yu@Sun.COM 	vn_setops(vp, socket_vnodeops);
4558348SEric.Yu@Sun.COM 
4568348SEric.Yu@Sun.COM 	so->so_priv 		= NULL;
4578348SEric.Yu@Sun.COM 	so->so_oobmsg		= NULL;
4588348SEric.Yu@Sun.COM 
4598348SEric.Yu@Sun.COM 	so->so_proto_handle	= NULL;
4608348SEric.Yu@Sun.COM 
4618348SEric.Yu@Sun.COM 	so->so_peercred 	= NULL;
4628348SEric.Yu@Sun.COM 
4638348SEric.Yu@Sun.COM 	so->so_rcv_queued	= 0;
4648348SEric.Yu@Sun.COM 	so->so_rcv_q_head 	= NULL;
4658348SEric.Yu@Sun.COM 	so->so_rcv_q_last_head 	= NULL;
4668348SEric.Yu@Sun.COM 	so->so_rcv_head		= NULL;
4678348SEric.Yu@Sun.COM 	so->so_rcv_last_head	= NULL;
4688348SEric.Yu@Sun.COM 	so->so_rcv_wanted	= 0;
4698348SEric.Yu@Sun.COM 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
4708348SEric.Yu@Sun.COM 	so->so_rcv_timer_tid	= 0;
4718348SEric.Yu@Sun.COM 	so->so_rcv_thresh	= 0;
4728348SEric.Yu@Sun.COM 
4738348SEric.Yu@Sun.COM 	so->so_acceptq_head	= NULL;
4748348SEric.Yu@Sun.COM 	so->so_acceptq_tail	= &so->so_acceptq_head;
4758348SEric.Yu@Sun.COM 	so->so_acceptq_next	= NULL;
4768348SEric.Yu@Sun.COM 	so->so_acceptq_len	= 0;
4778348SEric.Yu@Sun.COM 	so->so_backlog		= 0;
4788348SEric.Yu@Sun.COM 
4798348SEric.Yu@Sun.COM 	so->so_snd_qfull	= B_FALSE;
4808348SEric.Yu@Sun.COM 
4818348SEric.Yu@Sun.COM 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
4828348SEric.Yu@Sun.COM 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
4838348SEric.Yu@Sun.COM 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
4848348SEric.Yu@Sun.COM 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
4858348SEric.Yu@Sun.COM 	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
4868348SEric.Yu@Sun.COM 
4878348SEric.Yu@Sun.COM 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
4888348SEric.Yu@Sun.COM 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
4898348SEric.Yu@Sun.COM 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
4908348SEric.Yu@Sun.COM 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
4918348SEric.Yu@Sun.COM 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
4928348SEric.Yu@Sun.COM 
4938348SEric.Yu@Sun.COM 	return (0);
4948348SEric.Yu@Sun.COM }
4958348SEric.Yu@Sun.COM 
4968348SEric.Yu@Sun.COM /*ARGSUSED*/
4978348SEric.Yu@Sun.COM void
4988348SEric.Yu@Sun.COM sonode_destructor(void *buf, void *cdrarg)
4998348SEric.Yu@Sun.COM {
5008348SEric.Yu@Sun.COM 	struct sonode *so = buf;
5018348SEric.Yu@Sun.COM 	struct vnode *vp = SOTOV(so);
5028348SEric.Yu@Sun.COM 
5038348SEric.Yu@Sun.COM 	ASSERT(so->so_priv == NULL);
5048348SEric.Yu@Sun.COM 	ASSERT(so->so_peercred == NULL);
5058348SEric.Yu@Sun.COM 
5068348SEric.Yu@Sun.COM 	ASSERT(so->so_oobmsg == NULL);
5078348SEric.Yu@Sun.COM 
5088348SEric.Yu@Sun.COM 	ASSERT(so->so_rcv_q_head == NULL);
5098348SEric.Yu@Sun.COM 
5108348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_head == NULL);
5118348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
5128348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_next == NULL);
5138348SEric.Yu@Sun.COM 
5148348SEric.Yu@Sun.COM 	ASSERT(vp->v_data == so);
5158348SEric.Yu@Sun.COM 	ASSERT(vn_matchops(vp, socket_vnodeops));
5168348SEric.Yu@Sun.COM 
5178348SEric.Yu@Sun.COM 	vn_free(vp);
5188348SEric.Yu@Sun.COM 
5198348SEric.Yu@Sun.COM 	mutex_destroy(&so->so_lock);
5208348SEric.Yu@Sun.COM 	mutex_destroy(&so->so_acceptq_lock);
5218348SEric.Yu@Sun.COM 	rw_destroy(&so->so_fallback_rwlock);
5228348SEric.Yu@Sun.COM 
5238348SEric.Yu@Sun.COM 	cv_destroy(&so->so_state_cv);
5248348SEric.Yu@Sun.COM 	cv_destroy(&so->so_want_cv);
5258348SEric.Yu@Sun.COM 	cv_destroy(&so->so_acceptq_cv);
5268348SEric.Yu@Sun.COM 	cv_destroy(&so->so_snd_cv);
5278348SEric.Yu@Sun.COM 	cv_destroy(&so->so_rcv_cv);
5288348SEric.Yu@Sun.COM 	cv_destroy(&so->so_closing_cv);
5298348SEric.Yu@Sun.COM }
5308348SEric.Yu@Sun.COM 
5318348SEric.Yu@Sun.COM void
5328348SEric.Yu@Sun.COM sonode_init(struct sonode *so, struct sockparams *sp, int family,
5338348SEric.Yu@Sun.COM     int type, int protocol, sonodeops_t *sops)
5348348SEric.Yu@Sun.COM {
5358348SEric.Yu@Sun.COM 	vnode_t *vp;
5368348SEric.Yu@Sun.COM 
5378348SEric.Yu@Sun.COM 	vp = SOTOV(so);
5388348SEric.Yu@Sun.COM 
5398348SEric.Yu@Sun.COM 	so->so_flag	= 0;
5408348SEric.Yu@Sun.COM 
5418348SEric.Yu@Sun.COM 	so->so_state	= 0;
5428348SEric.Yu@Sun.COM 	so->so_mode	= 0;
5438348SEric.Yu@Sun.COM 
5448348SEric.Yu@Sun.COM 	so->so_count	= 0;
5458348SEric.Yu@Sun.COM 
5468348SEric.Yu@Sun.COM 	so->so_family	= family;
5478348SEric.Yu@Sun.COM 	so->so_type	= type;
5488348SEric.Yu@Sun.COM 	so->so_protocol	= protocol;
5498348SEric.Yu@Sun.COM 
5508348SEric.Yu@Sun.COM 	SOCK_CONNID_INIT(so->so_proto_connid);
5518348SEric.Yu@Sun.COM 
5528348SEric.Yu@Sun.COM 	so->so_options	= 0;
5538348SEric.Yu@Sun.COM 	so->so_linger.l_onoff   = 0;
5548348SEric.Yu@Sun.COM 	so->so_linger.l_linger = 0;
5558348SEric.Yu@Sun.COM 	so->so_sndbuf	= 0;
5568348SEric.Yu@Sun.COM 	so->so_error	= 0;
5578348SEric.Yu@Sun.COM 	so->so_rcvtimeo	= 0;
5588348SEric.Yu@Sun.COM 	so->so_sndtimeo = 0;
5598465SEric.Yu@Sun.COM 	so->so_xpg_rcvbuf = 0;
5608348SEric.Yu@Sun.COM 
5618348SEric.Yu@Sun.COM 	ASSERT(so->so_oobmsg == NULL);
5628348SEric.Yu@Sun.COM 	so->so_oobmark	= 0;
5638348SEric.Yu@Sun.COM 	so->so_pgrp	= 0;
5648348SEric.Yu@Sun.COM 
5658348SEric.Yu@Sun.COM 	ASSERT(so->so_peercred == NULL);
5668348SEric.Yu@Sun.COM 
5678348SEric.Yu@Sun.COM 	so->so_zoneid = getzoneid();
5688348SEric.Yu@Sun.COM 
5698348SEric.Yu@Sun.COM 	so->so_sockparams = sp;
5708348SEric.Yu@Sun.COM 
5718348SEric.Yu@Sun.COM 	so->so_ops = sops;
5728348SEric.Yu@Sun.COM 
5738399SRao.Shoaib@Sun.COM 	so->so_not_str = (sops != &sotpi_sonodeops);
5748399SRao.Shoaib@Sun.COM 
5758348SEric.Yu@Sun.COM 	so->so_proto_handle = NULL;
5768348SEric.Yu@Sun.COM 
5778348SEric.Yu@Sun.COM 	so->so_downcalls = NULL;
5788348SEric.Yu@Sun.COM 
5798348SEric.Yu@Sun.COM 	so->so_copyflag = 0;
5808348SEric.Yu@Sun.COM 
5818348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_head == NULL);
5828348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
5838348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_next == NULL);
5848348SEric.Yu@Sun.COM 
5858348SEric.Yu@Sun.COM 	vn_reinit(vp);
5868348SEric.Yu@Sun.COM 	vp->v_vfsp	= rootvfs;
5878348SEric.Yu@Sun.COM 	vp->v_type	= VSOCK;
5888348SEric.Yu@Sun.COM 	vp->v_rdev	= sockdev;
5898348SEric.Yu@Sun.COM 
5908348SEric.Yu@Sun.COM 	so->so_rcv_queued = 0;
5918348SEric.Yu@Sun.COM 	so->so_rcv_q_head = NULL;
5928348SEric.Yu@Sun.COM 	so->so_rcv_q_last_head = NULL;
5938348SEric.Yu@Sun.COM 	so->so_rcv_head	= NULL;
5948348SEric.Yu@Sun.COM 	so->so_rcv_last_head = NULL;
5958348SEric.Yu@Sun.COM 
5968348SEric.Yu@Sun.COM 	so->so_snd_qfull = B_FALSE;
5978348SEric.Yu@Sun.COM 	so->so_minpsz = 0;
5988348SEric.Yu@Sun.COM 
5998348SEric.Yu@Sun.COM 	so->so_rcv_wakeup = B_FALSE;
6008348SEric.Yu@Sun.COM 	so->so_snd_wakeup = B_FALSE;
6018348SEric.Yu@Sun.COM 	so->so_flowctrld = B_FALSE;
6028348SEric.Yu@Sun.COM 
6038348SEric.Yu@Sun.COM 	so->so_pollev = 0;
6048348SEric.Yu@Sun.COM 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
6058348SEric.Yu@Sun.COM 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
6068348SEric.Yu@Sun.COM 
6078348SEric.Yu@Sun.COM 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
6088348SEric.Yu@Sun.COM 	so->so_ksock_cb_arg = NULL;
6098348SEric.Yu@Sun.COM 
6108348SEric.Yu@Sun.COM 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
6118348SEric.Yu@Sun.COM 
6128348SEric.Yu@Sun.COM 	so->so_direct = NULL;
6138348SEric.Yu@Sun.COM 
6148348SEric.Yu@Sun.COM 	vn_exists(vp);
6158348SEric.Yu@Sun.COM }
6168348SEric.Yu@Sun.COM 
6178348SEric.Yu@Sun.COM void
6188348SEric.Yu@Sun.COM sonode_fini(struct sonode *so)
6198348SEric.Yu@Sun.COM {
6208348SEric.Yu@Sun.COM 	mblk_t *mp;
6218348SEric.Yu@Sun.COM 	vnode_t *vp;
6228348SEric.Yu@Sun.COM 
6238348SEric.Yu@Sun.COM 	ASSERT(so->so_count == 0);
6248348SEric.Yu@Sun.COM 
6258348SEric.Yu@Sun.COM 	if (so->so_rcv_timer_tid) {
6268348SEric.Yu@Sun.COM 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
6278348SEric.Yu@Sun.COM 		(void) untimeout(so->so_rcv_timer_tid);
6288348SEric.Yu@Sun.COM 		so->so_rcv_timer_tid = 0;
6298348SEric.Yu@Sun.COM 	}
6308348SEric.Yu@Sun.COM 
6318348SEric.Yu@Sun.COM 	so_acceptq_flush(so);
6328348SEric.Yu@Sun.COM 
6338348SEric.Yu@Sun.COM 	if ((mp = so->so_oobmsg) != NULL) {
6348348SEric.Yu@Sun.COM 		freemsg(mp);
6358348SEric.Yu@Sun.COM 		so->so_oobmsg = NULL;
6368348SEric.Yu@Sun.COM 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
6378348SEric.Yu@Sun.COM 		    SS_RCVATMARK);
6388348SEric.Yu@Sun.COM 	}
6398348SEric.Yu@Sun.COM 
6408348SEric.Yu@Sun.COM 	if (so->so_poll_list.ph_list != NULL) {
6418348SEric.Yu@Sun.COM 		pollwakeup(&so->so_poll_list, POLLERR);
6428348SEric.Yu@Sun.COM 		pollhead_clean(&so->so_poll_list);
6438348SEric.Yu@Sun.COM 	}
6448348SEric.Yu@Sun.COM 
6458348SEric.Yu@Sun.COM 	if (so->so_direct != NULL) {
6468348SEric.Yu@Sun.COM 		sodirect_t *sodp = so->so_direct;
6478348SEric.Yu@Sun.COM 
6488348SEric.Yu@Sun.COM 		ASSERT(sodp->sod_uioafh == NULL);
6498348SEric.Yu@Sun.COM 
6508348SEric.Yu@Sun.COM 		so->so_direct = NULL;
6518348SEric.Yu@Sun.COM 		kmem_cache_free(sock_sod_cache, sodp);
6528348SEric.Yu@Sun.COM 	}
6538348SEric.Yu@Sun.COM 
6548348SEric.Yu@Sun.COM 	vp = SOTOV(so);
6558348SEric.Yu@Sun.COM 	vn_invalid(vp);
6568348SEric.Yu@Sun.COM 
6578348SEric.Yu@Sun.COM 	if (so->so_peercred != NULL) {
6588348SEric.Yu@Sun.COM 		crfree(so->so_peercred);
6598348SEric.Yu@Sun.COM 		so->so_peercred = NULL;
6608348SEric.Yu@Sun.COM 	}
6618348SEric.Yu@Sun.COM }
6628348SEric.Yu@Sun.COM 
6638348SEric.Yu@Sun.COM /*
6648348SEric.Yu@Sun.COM  * This function is called at the beginning of recvmsg().
6658348SEric.Yu@Sun.COM  *
6668348SEric.Yu@Sun.COM  * If I/OAT is enabled on this sonode, initialize the uioa state machine
6678348SEric.Yu@Sun.COM  * with state UIOA_ALLOC.
6688348SEric.Yu@Sun.COM  */
6698348SEric.Yu@Sun.COM uio_t *
6708348SEric.Yu@Sun.COM sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
6718348SEric.Yu@Sun.COM {
6728348SEric.Yu@Sun.COM 	struct uio *suiop;
6738348SEric.Yu@Sun.COM 	struct uio *uiop;
6748348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
6758348SEric.Yu@Sun.COM 
6768348SEric.Yu@Sun.COM 	if (sodp == NULL)
6778348SEric.Yu@Sun.COM 		return (NULL);
6788348SEric.Yu@Sun.COM 
6798348SEric.Yu@Sun.COM 	suiop = NULL;
6808348SEric.Yu@Sun.COM 	uiop = *uiopp;
6818348SEric.Yu@Sun.COM 
6828348SEric.Yu@Sun.COM 	mutex_enter(sodp->sod_lockp);
6838348SEric.Yu@Sun.COM 	if (uiop->uio_resid >= uioasync.mincnt &&
6848348SEric.Yu@Sun.COM 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
6858348SEric.Yu@Sun.COM 	    uioasync.enabled && !(flags & MSG_PEEK) &&
6868348SEric.Yu@Sun.COM 	    !(so->so_state & SS_CANTRCVMORE)) {
6878348SEric.Yu@Sun.COM 		/*
6888348SEric.Yu@Sun.COM 		 * Big enough I/O for uioa min setup and an sodirect socket
6898348SEric.Yu@Sun.COM 		 * and sodirect enabled and uioa enabled and I/O will be done
6908348SEric.Yu@Sun.COM 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
6918348SEric.Yu@Sun.COM 		 */
6928348SEric.Yu@Sun.COM 		if (!uioainit(uiop, &sodp->sod_uioa)) {
6938348SEric.Yu@Sun.COM 			/*
6948348SEric.Yu@Sun.COM 			 * Successful uioainit() so the uio_t part of the
6958348SEric.Yu@Sun.COM 			 * uioa_t will be used for all uio_t work to follow,
6968348SEric.Yu@Sun.COM 			 * we return the original "uiop" in "suiop".
6978348SEric.Yu@Sun.COM 			 */
6988348SEric.Yu@Sun.COM 			suiop = uiop;
6998348SEric.Yu@Sun.COM 			*uiopp = (uio_t *)&sodp->sod_uioa;
7008348SEric.Yu@Sun.COM 			/*
7018348SEric.Yu@Sun.COM 			 * Before returning to the caller the passed in uio_t
7028348SEric.Yu@Sun.COM 			 * "uiop" will be updated via a call to uioafini()
7038348SEric.Yu@Sun.COM 			 * below.
7048348SEric.Yu@Sun.COM 			 *
7058348SEric.Yu@Sun.COM 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
7068348SEric.Yu@Sun.COM 			 * here as first we have to uioamove() any currently
7078348SEric.Yu@Sun.COM 			 * queued M_DATA mblk_t(s) so it will be done later.
7088348SEric.Yu@Sun.COM 			 */
7098348SEric.Yu@Sun.COM 		}
7108348SEric.Yu@Sun.COM 		/*
7118348SEric.Yu@Sun.COM 		 * In either uioainit() success or not case note the number
7128348SEric.Yu@Sun.COM 		 * of uio bytes the caller wants for sod framework and/or
7138348SEric.Yu@Sun.COM 		 * transport (e.g. TCP) strategy.
7148348SEric.Yu@Sun.COM 		 */
7158348SEric.Yu@Sun.COM 		sodp->sod_want = uiop->uio_resid;
7168348SEric.Yu@Sun.COM 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
7178348SEric.Yu@Sun.COM 		/*
7188348SEric.Yu@Sun.COM 		 * No uioa but still using sodirect so note the number of
7198348SEric.Yu@Sun.COM 		 * uio bytes the caller wants for sodirect framework and/or
7208348SEric.Yu@Sun.COM 		 * transport (e.g. TCP) strategy.
7218348SEric.Yu@Sun.COM 		 */
7228348SEric.Yu@Sun.COM 		sodp->sod_want = uiop->uio_resid;
7238348SEric.Yu@Sun.COM 	}
7248348SEric.Yu@Sun.COM 	mutex_exit(sodp->sod_lockp);
7258348SEric.Yu@Sun.COM 
7268348SEric.Yu@Sun.COM 	return (suiop);
7278348SEric.Yu@Sun.COM }
7288348SEric.Yu@Sun.COM 
7298348SEric.Yu@Sun.COM /*
7308348SEric.Yu@Sun.COM  * This function is called at the end of recvmsg(), it finializes all the I/OAT
7318348SEric.Yu@Sun.COM  * operations, and reset the uioa state to UIOA_ALLOC.
7328348SEric.Yu@Sun.COM  */
7338348SEric.Yu@Sun.COM int
7348348SEric.Yu@Sun.COM sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
7358348SEric.Yu@Sun.COM {
7368348SEric.Yu@Sun.COM 	int error = 0;
7378348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
7388348SEric.Yu@Sun.COM 	mblk_t *mp;
7398348SEric.Yu@Sun.COM 
7408348SEric.Yu@Sun.COM 	if (sodp == NULL) {
7418348SEric.Yu@Sun.COM 		return (0);
7428348SEric.Yu@Sun.COM 	}
7438348SEric.Yu@Sun.COM 
7448348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
7458348SEric.Yu@Sun.COM 	/* Finish any sodirect and uioa processing */
7468348SEric.Yu@Sun.COM 	if (suiop != NULL) {
7478348SEric.Yu@Sun.COM 		/* Finish any uioa_t processing */
7488348SEric.Yu@Sun.COM 
7498348SEric.Yu@Sun.COM 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
7508348SEric.Yu@Sun.COM 		error = uioafini(suiop, (uioa_t *)uiop);
7518348SEric.Yu@Sun.COM 		if ((mp = sodp->sod_uioafh) != NULL) {
7528348SEric.Yu@Sun.COM 			sodp->sod_uioafh = NULL;
7538348SEric.Yu@Sun.COM 			sodp->sod_uioaft = NULL;
7548348SEric.Yu@Sun.COM 			freemsg(mp);
7558348SEric.Yu@Sun.COM 		}
7568348SEric.Yu@Sun.COM 	}
7578348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_uioafh == NULL);
7588348SEric.Yu@Sun.COM 	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
7598348SEric.Yu@Sun.COM 		/* Awoke */
7608348SEric.Yu@Sun.COM 		sodp->sod_state &= SOD_WAKE_CLR;
7618348SEric.Yu@Sun.COM 		sodp->sod_state |= SOD_WAKE_NOT;
7628348SEric.Yu@Sun.COM 	}
7638348SEric.Yu@Sun.COM 	/* Last, clear sod_want value */
7648348SEric.Yu@Sun.COM 	sodp->sod_want = 0;
7658348SEric.Yu@Sun.COM 
7668348SEric.Yu@Sun.COM 	return (error);
7678348SEric.Yu@Sun.COM }
7688348SEric.Yu@Sun.COM 
7698348SEric.Yu@Sun.COM /*
7708348SEric.Yu@Sun.COM  * Schedule a uioamove() on a mblk. This is ususally called from
7718348SEric.Yu@Sun.COM  * protocols (e.g. TCP) on a I/OAT enabled sonode.
7728348SEric.Yu@Sun.COM  */
7738348SEric.Yu@Sun.COM mblk_t *
7748348SEric.Yu@Sun.COM sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
7758348SEric.Yu@Sun.COM {
7768348SEric.Yu@Sun.COM 	uioa_t *uioap = &sodp->sod_uioa;
7778348SEric.Yu@Sun.COM 	mblk_t *mp1 = mp;
7788348SEric.Yu@Sun.COM 	mblk_t *lmp = NULL;
7798348SEric.Yu@Sun.COM 
7808348SEric.Yu@Sun.COM 	ASSERT(DB_TYPE(mp) == M_DATA);
7818348SEric.Yu@Sun.COM 	ASSERT(msg_size == msgdsize(mp));
7828348SEric.Yu@Sun.COM 
7838348SEric.Yu@Sun.COM 	/* Caller must have lock held */
7848348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
7858348SEric.Yu@Sun.COM 
7868348SEric.Yu@Sun.COM 	if (uioap->uioa_state & UIOA_ENABLED) {
7878348SEric.Yu@Sun.COM 		/* Uioa is enabled */
7888348SEric.Yu@Sun.COM 
7898348SEric.Yu@Sun.COM 		if (msg_size > uioap->uio_resid) {
7908348SEric.Yu@Sun.COM 			/*
7918348SEric.Yu@Sun.COM 			 * There isn't enough uio space for the mblk_t chain
7928348SEric.Yu@Sun.COM 			 * so disable uioa such that this and any additional
7938348SEric.Yu@Sun.COM 			 * mblk_t data is handled by the socket and schedule
7948348SEric.Yu@Sun.COM 			 * the socket for wakeup to finish this uioa.
7958348SEric.Yu@Sun.COM 			 */
7968348SEric.Yu@Sun.COM 			uioap->uioa_state &= UIOA_CLR;
7978348SEric.Yu@Sun.COM 			uioap->uioa_state |= UIOA_FINI;
7988348SEric.Yu@Sun.COM 			if (sodp->sod_state & SOD_WAKE_NOT) {
7998348SEric.Yu@Sun.COM 				sodp->sod_state &= SOD_WAKE_CLR;
8008348SEric.Yu@Sun.COM 				sodp->sod_state |= SOD_WAKE_NEED;
8018348SEric.Yu@Sun.COM 			}
8028348SEric.Yu@Sun.COM 			return (mp);
8038348SEric.Yu@Sun.COM 		}
8048348SEric.Yu@Sun.COM 		do {
8058348SEric.Yu@Sun.COM 			uint32_t	len = MBLKL(mp1);
8068348SEric.Yu@Sun.COM 
8078348SEric.Yu@Sun.COM 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
8088348SEric.Yu@Sun.COM 				/* Scheduled, mark dblk_t as such */
8098348SEric.Yu@Sun.COM 				DB_FLAGS(mp1) |= DBLK_UIOA;
8108348SEric.Yu@Sun.COM 			} else {
8118348SEric.Yu@Sun.COM 				/* Error, turn off async processing */
8128348SEric.Yu@Sun.COM 				uioap->uioa_state &= UIOA_CLR;
8138348SEric.Yu@Sun.COM 				uioap->uioa_state |= UIOA_FINI;
8148348SEric.Yu@Sun.COM 				break;
8158348SEric.Yu@Sun.COM 			}
8168348SEric.Yu@Sun.COM 			lmp = mp1;
8178348SEric.Yu@Sun.COM 		} while ((mp1 = mp1->b_cont) != NULL);
8188348SEric.Yu@Sun.COM 
8198348SEric.Yu@Sun.COM 		if (mp1 != NULL || uioap->uio_resid == 0) {
8208348SEric.Yu@Sun.COM 			/*
8218348SEric.Yu@Sun.COM 			 * Not all mblk_t(s) uioamoved (error) or all uio
8228348SEric.Yu@Sun.COM 			 * space has been consumed so schedule the socket
8238348SEric.Yu@Sun.COM 			 * for wakeup to finish this uio.
8248348SEric.Yu@Sun.COM 			 */
8258348SEric.Yu@Sun.COM 			sodp->sod_state &= SOD_WAKE_CLR;
8268348SEric.Yu@Sun.COM 			sodp->sod_state |= SOD_WAKE_NEED;
8278348SEric.Yu@Sun.COM 
8288348SEric.Yu@Sun.COM 			/* Break the mblk chain if neccessary. */
8298348SEric.Yu@Sun.COM 			if (mp1 != NULL && lmp != NULL) {
8308348SEric.Yu@Sun.COM 				mp->b_next = mp1;
8318348SEric.Yu@Sun.COM 				lmp->b_cont = NULL;
8328348SEric.Yu@Sun.COM 			}
8338348SEric.Yu@Sun.COM 		}
8348348SEric.Yu@Sun.COM 	}
8358348SEric.Yu@Sun.COM 	return (mp1);
8368348SEric.Yu@Sun.COM }
8378348SEric.Yu@Sun.COM 
8388348SEric.Yu@Sun.COM /*
8398348SEric.Yu@Sun.COM  * This function is called on a mblk that thas been successfully uioamoved().
8408348SEric.Yu@Sun.COM  */
8418348SEric.Yu@Sun.COM void
8428348SEric.Yu@Sun.COM sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
8438348SEric.Yu@Sun.COM {
8448348SEric.Yu@Sun.COM 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
8458348SEric.Yu@Sun.COM 		/*
8468348SEric.Yu@Sun.COM 		 * A uioa flaged mblk_t chain, already uio processed,
8478348SEric.Yu@Sun.COM 		 * add it to the sodirect uioa pending free list.
8488348SEric.Yu@Sun.COM 		 *
8498348SEric.Yu@Sun.COM 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
8508348SEric.Yu@Sun.COM 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
8518348SEric.Yu@Sun.COM 		 */
8528348SEric.Yu@Sun.COM 		mblk_t	*bpt = sodp->sod_uioaft;
8538348SEric.Yu@Sun.COM 
8548348SEric.Yu@Sun.COM 		ASSERT(sodp != NULL);
8558348SEric.Yu@Sun.COM 
8568348SEric.Yu@Sun.COM 		/*
8578348SEric.Yu@Sun.COM 		 * Add first mblk_t of "bp" chain to current sodirect uioa
8588348SEric.Yu@Sun.COM 		 * free list tail mblk_t, if any, else empty list so new head.
8598348SEric.Yu@Sun.COM 		 */
8608348SEric.Yu@Sun.COM 		if (bpt == NULL)
8618348SEric.Yu@Sun.COM 			sodp->sod_uioafh = bp;
8628348SEric.Yu@Sun.COM 		else
8638348SEric.Yu@Sun.COM 			bpt->b_cont = bp;
8648348SEric.Yu@Sun.COM 
8658348SEric.Yu@Sun.COM 		/*
8668348SEric.Yu@Sun.COM 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
8678348SEric.Yu@Sun.COM 		 * each to reflect that uioamove() has consumed all data.
8688348SEric.Yu@Sun.COM 		 */
8698348SEric.Yu@Sun.COM 		bpt = bp;
8708348SEric.Yu@Sun.COM 		for (;;) {
8718348SEric.Yu@Sun.COM 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
8728348SEric.Yu@Sun.COM 
8738348SEric.Yu@Sun.COM 			bpt->b_rptr = bpt->b_wptr;
8748348SEric.Yu@Sun.COM 			if (bpt->b_cont == NULL)
8758348SEric.Yu@Sun.COM 				break;
8768348SEric.Yu@Sun.COM 			bpt = bpt->b_cont;
8778348SEric.Yu@Sun.COM 		}
8788348SEric.Yu@Sun.COM 		/* New sodirect uioa free list tail */
8798348SEric.Yu@Sun.COM 		sodp->sod_uioaft = bpt;
8808348SEric.Yu@Sun.COM 
8818348SEric.Yu@Sun.COM 		/* Only dequeue once with data returned per uioa_t */
8828348SEric.Yu@Sun.COM 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
8838348SEric.Yu@Sun.COM 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
8848348SEric.Yu@Sun.COM 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
8858348SEric.Yu@Sun.COM 		}
8868348SEric.Yu@Sun.COM 	}
8878348SEric.Yu@Sun.COM }
8888348SEric.Yu@Sun.COM 
8898348SEric.Yu@Sun.COM /*
8908348SEric.Yu@Sun.COM  * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
8918348SEric.Yu@Sun.COM  * this function on a non-STREAMS socket to schedule uioamove() on the data
8928348SEric.Yu@Sun.COM  * that has already queued in this socket.
8938348SEric.Yu@Sun.COM  */
8948348SEric.Yu@Sun.COM void
8958348SEric.Yu@Sun.COM sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
8968348SEric.Yu@Sun.COM {
8978348SEric.Yu@Sun.COM 	uioa_t	*uioap = (uioa_t *)uiop;
8988348SEric.Yu@Sun.COM 	mblk_t	*lbp;
8998348SEric.Yu@Sun.COM 	mblk_t	*wbp;
9008348SEric.Yu@Sun.COM 	mblk_t	*bp;
9018348SEric.Yu@Sun.COM 	int	len;
9028348SEric.Yu@Sun.COM 	int	error;
9038348SEric.Yu@Sun.COM 	boolean_t in_rcv_q = B_TRUE;
9048348SEric.Yu@Sun.COM 
9058348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
9068348SEric.Yu@Sun.COM 	ASSERT(&sodp->sod_uioa == uioap);
9078348SEric.Yu@Sun.COM 
9088348SEric.Yu@Sun.COM 	/*
9098348SEric.Yu@Sun.COM 	 * Walk first b_cont chain in sod_q
9108348SEric.Yu@Sun.COM 	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
9118348SEric.Yu@Sun.COM 	 */
9128348SEric.Yu@Sun.COM 	bp = so->so_rcv_q_head;
9138348SEric.Yu@Sun.COM 
9148348SEric.Yu@Sun.COM again:
9158348SEric.Yu@Sun.COM 	/* Walk the chain */
9168348SEric.Yu@Sun.COM 	lbp = NULL;
9178348SEric.Yu@Sun.COM 	wbp = bp;
9188348SEric.Yu@Sun.COM 
9198348SEric.Yu@Sun.COM 	do {
9208348SEric.Yu@Sun.COM 		if (bp == NULL)
9218348SEric.Yu@Sun.COM 			break;
9228348SEric.Yu@Sun.COM 
9238348SEric.Yu@Sun.COM 		if (wbp->b_datap->db_type != M_DATA) {
9248348SEric.Yu@Sun.COM 			/* Not M_DATA, no more uioa */
9258348SEric.Yu@Sun.COM 			goto nouioa;
9268348SEric.Yu@Sun.COM 		}
9278348SEric.Yu@Sun.COM 		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
9288348SEric.Yu@Sun.COM 			/* Have a M_DATA mblk_t with data */
9298348SEric.Yu@Sun.COM 			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
9308348SEric.Yu@Sun.COM 			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
9318348SEric.Yu@Sun.COM 				/* Not enough uio sapce, or beyond oobmark */
9328348SEric.Yu@Sun.COM 				goto nouioa;
9338348SEric.Yu@Sun.COM 			}
9348348SEric.Yu@Sun.COM 			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
9358348SEric.Yu@Sun.COM 			error = uioamove(wbp->b_rptr, len,
9368348SEric.Yu@Sun.COM 			    UIO_READ, uioap);
9378348SEric.Yu@Sun.COM 			if (!error) {
9388348SEric.Yu@Sun.COM 				/* Scheduled, mark dblk_t as such */
9398348SEric.Yu@Sun.COM 				wbp->b_datap->db_flags |= DBLK_UIOA;
9408348SEric.Yu@Sun.COM 			} else {
9418348SEric.Yu@Sun.COM 				/* Break the mblk chain */
9428348SEric.Yu@Sun.COM 				goto nouioa;
9438348SEric.Yu@Sun.COM 			}
9448348SEric.Yu@Sun.COM 		}
9458348SEric.Yu@Sun.COM 		/* Save last wbp processed */
9468348SEric.Yu@Sun.COM 		lbp = wbp;
9478348SEric.Yu@Sun.COM 	} while ((wbp = wbp->b_cont) != NULL);
9488348SEric.Yu@Sun.COM 
9498348SEric.Yu@Sun.COM 	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
9508348SEric.Yu@Sun.COM 		/*
9518348SEric.Yu@Sun.COM 		 * We get here only once to process the sonode dump area
9528348SEric.Yu@Sun.COM 		 * if so_rcv_q_head is NULL or all the mblks have been
9538348SEric.Yu@Sun.COM 		 * successfully uioamoved()ed.
9548348SEric.Yu@Sun.COM 		 */
9558348SEric.Yu@Sun.COM 		in_rcv_q = B_FALSE;
9568348SEric.Yu@Sun.COM 
9578348SEric.Yu@Sun.COM 		/* move to dump area */
9588348SEric.Yu@Sun.COM 		bp = so->so_rcv_head;
9598348SEric.Yu@Sun.COM 		goto again;
9608348SEric.Yu@Sun.COM 	}
9618348SEric.Yu@Sun.COM 
9628348SEric.Yu@Sun.COM 	return;
9638348SEric.Yu@Sun.COM 
9648348SEric.Yu@Sun.COM nouioa:
9658348SEric.Yu@Sun.COM 	/* No more uioa */
9668348SEric.Yu@Sun.COM 	uioap->uioa_state &= UIOA_CLR;
9678348SEric.Yu@Sun.COM 	uioap->uioa_state |= UIOA_FINI;
9688348SEric.Yu@Sun.COM 
9698348SEric.Yu@Sun.COM 	/*
9708348SEric.Yu@Sun.COM 	 * If we processed 1 or more mblk_t(s) then we need to split the
9718348SEric.Yu@Sun.COM 	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
9728348SEric.Yu@Sun.COM 	 * are in the current chain and the rest are in the following new
9738348SEric.Yu@Sun.COM 	 * chain.
9748348SEric.Yu@Sun.COM 	 */
9758348SEric.Yu@Sun.COM 	if (lbp != NULL) {
9768348SEric.Yu@Sun.COM 		/* New end of current chain */
9778348SEric.Yu@Sun.COM 		lbp->b_cont = NULL;
9788348SEric.Yu@Sun.COM 
9798348SEric.Yu@Sun.COM 		/* Insert new chain wbp after bp */
9808348SEric.Yu@Sun.COM 		if ((wbp->b_next = bp->b_next) == NULL) {
9818348SEric.Yu@Sun.COM 			/*
9828348SEric.Yu@Sun.COM 			 * No need to grab so_lock, since sod_lockp
9838348SEric.Yu@Sun.COM 			 * points to so_lock.
9848348SEric.Yu@Sun.COM 			 */
9858348SEric.Yu@Sun.COM 			if (in_rcv_q)
9868348SEric.Yu@Sun.COM 				so->so_rcv_q_last_head = wbp;
9878348SEric.Yu@Sun.COM 			else
9888348SEric.Yu@Sun.COM 				so->so_rcv_last_head = wbp;
9898348SEric.Yu@Sun.COM 		}
9908348SEric.Yu@Sun.COM 		bp->b_next = wbp;
9918348SEric.Yu@Sun.COM 		bp->b_next->b_prev = bp->b_prev;
9928348SEric.Yu@Sun.COM 		bp->b_prev = lbp;
9938348SEric.Yu@Sun.COM 	}
9948348SEric.Yu@Sun.COM }
9958348SEric.Yu@Sun.COM 
9968348SEric.Yu@Sun.COM /*
9978348SEric.Yu@Sun.COM  * Initialize sodirect data structures on a socket.
9988348SEric.Yu@Sun.COM  */
9998348SEric.Yu@Sun.COM void
10008348SEric.Yu@Sun.COM sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
10018348SEric.Yu@Sun.COM     sod_wakeup_func wake_func, kmutex_t *lockp)
10028348SEric.Yu@Sun.COM {
10038348SEric.Yu@Sun.COM 	sodirect_t	*sodp;
10048348SEric.Yu@Sun.COM 
10058348SEric.Yu@Sun.COM 	ASSERT(so->so_direct == NULL);
10068348SEric.Yu@Sun.COM 
10078348SEric.Yu@Sun.COM 	so->so_state |= SS_SODIRECT;
10088348SEric.Yu@Sun.COM 
10098348SEric.Yu@Sun.COM 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
10108348SEric.Yu@Sun.COM 	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
10118348SEric.Yu@Sun.COM 	sodp->sod_want = 0;
10128348SEric.Yu@Sun.COM 	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
10138348SEric.Yu@Sun.COM 	sodp->sod_enqueue = enq_func;
10148348SEric.Yu@Sun.COM 	sodp->sod_wakeup = wake_func;
10158348SEric.Yu@Sun.COM 	sodp->sod_uioafh = NULL;
10168348SEric.Yu@Sun.COM 	sodp->sod_uioaft = NULL;
10178348SEric.Yu@Sun.COM 	sodp->sod_lockp = lockp;
10188348SEric.Yu@Sun.COM 	/*
10198348SEric.Yu@Sun.COM 	 * Remainder of the sod_uioa members are left uninitialized
10208348SEric.Yu@Sun.COM 	 * but will be initialized later by uioainit() before uioa
10218348SEric.Yu@Sun.COM 	 * is enabled.
10228348SEric.Yu@Sun.COM 	 */
10238348SEric.Yu@Sun.COM 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
10248348SEric.Yu@Sun.COM 	so->so_direct = sodp;
10258348SEric.Yu@Sun.COM 	if (stp != NULL)
10268348SEric.Yu@Sun.COM 		stp->sd_sodirect = sodp;
10278348SEric.Yu@Sun.COM }
10288348SEric.Yu@Sun.COM 
10298348SEric.Yu@Sun.COM /*
10308348SEric.Yu@Sun.COM  * Init the sodirect kmem cache while sockfs is loading.
10318348SEric.Yu@Sun.COM  */
10328348SEric.Yu@Sun.COM void
10338348SEric.Yu@Sun.COM sod_init()
10348348SEric.Yu@Sun.COM {
10358348SEric.Yu@Sun.COM 	/* Allocate sodirect_t kmem_cache */
10368348SEric.Yu@Sun.COM 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
10378348SEric.Yu@Sun.COM 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
10388348SEric.Yu@Sun.COM }
10398348SEric.Yu@Sun.COM 
10408348SEric.Yu@Sun.COM ssize_t
10418348SEric.Yu@Sun.COM sod_uioa_mblk(struct sonode *so, mblk_t *mp)
10428348SEric.Yu@Sun.COM {
10438348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
10448348SEric.Yu@Sun.COM 
10458348SEric.Yu@Sun.COM 	ASSERT(sodp != NULL);
10468348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
10478348SEric.Yu@Sun.COM 
10488348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_state & SOD_ENABLED);
10498348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
10508348SEric.Yu@Sun.COM 
10518348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
10528348SEric.Yu@Sun.COM 
10538348SEric.Yu@Sun.COM 	if (mp == NULL && so->so_rcv_q_head != NULL) {
10548348SEric.Yu@Sun.COM 		mp = so->so_rcv_q_head;
10558348SEric.Yu@Sun.COM 		ASSERT(mp->b_prev != NULL);
10568348SEric.Yu@Sun.COM 		mp->b_prev = NULL;
10578348SEric.Yu@Sun.COM 		so->so_rcv_q_head = mp->b_next;
10588348SEric.Yu@Sun.COM 		if (so->so_rcv_q_head == NULL) {
10598348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head = NULL;
10608348SEric.Yu@Sun.COM 		}
10618348SEric.Yu@Sun.COM 		mp->b_next = NULL;
10628348SEric.Yu@Sun.COM 	}
10638348SEric.Yu@Sun.COM 
10648348SEric.Yu@Sun.COM 	sod_uioa_mblk_done(sodp, mp);
10658348SEric.Yu@Sun.COM 
10668348SEric.Yu@Sun.COM 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
10678348SEric.Yu@Sun.COM 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
10688348SEric.Yu@Sun.COM 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
10698348SEric.Yu@Sun.COM 		/* more arrived */
10708348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_q_head == NULL);
10718348SEric.Yu@Sun.COM 		mp = so->so_rcv_head;
10728348SEric.Yu@Sun.COM 		so->so_rcv_head = mp->b_next;
10738348SEric.Yu@Sun.COM 		if (so->so_rcv_head == NULL)
10748348SEric.Yu@Sun.COM 			so->so_rcv_last_head = NULL;
10758348SEric.Yu@Sun.COM 		mp->b_prev = mp->b_next = NULL;
10768348SEric.Yu@Sun.COM 		sod_uioa_mblk_done(sodp, mp);
10778348SEric.Yu@Sun.COM 	}
10788348SEric.Yu@Sun.COM 
10798348SEric.Yu@Sun.COM #ifdef DEBUG
10808348SEric.Yu@Sun.COM 	if (so->so_rcv_q_head != NULL) {
10818348SEric.Yu@Sun.COM 		mblk_t *m = so->so_rcv_q_head;
10828348SEric.Yu@Sun.COM 		while (m != NULL) {
10838348SEric.Yu@Sun.COM 			if (DB_FLAGS(m) & DBLK_UIOA) {
10848348SEric.Yu@Sun.COM 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
10858348SEric.Yu@Sun.COM 				    " in so_rcv_q_head.\n", (void *)m);
10868348SEric.Yu@Sun.COM 			}
10878348SEric.Yu@Sun.COM 			m = m->b_next;
10888348SEric.Yu@Sun.COM 		}
10898348SEric.Yu@Sun.COM 	}
10908348SEric.Yu@Sun.COM 	if (so->so_rcv_head != NULL) {
10918348SEric.Yu@Sun.COM 		mblk_t *m = so->so_rcv_head;
10928348SEric.Yu@Sun.COM 		while (m != NULL) {
10938348SEric.Yu@Sun.COM 			if (DB_FLAGS(m) & DBLK_UIOA) {
10948348SEric.Yu@Sun.COM 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
10958348SEric.Yu@Sun.COM 				    " in so_rcv_head.\n", (void *)m);
10968348SEric.Yu@Sun.COM 			}
10978348SEric.Yu@Sun.COM 			m = m->b_next;
10988348SEric.Yu@Sun.COM 		}
10998348SEric.Yu@Sun.COM 	}
11008348SEric.Yu@Sun.COM #endif
11018348SEric.Yu@Sun.COM 	return (sodp->sod_uioa.uioa_mbytes);
11028348SEric.Yu@Sun.COM }
1103