xref: /onnv-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 8348:4137e18bfaf0)
1*8348SEric.Yu@Sun.COM /*
2*8348SEric.Yu@Sun.COM  * CDDL HEADER START
3*8348SEric.Yu@Sun.COM  *
4*8348SEric.Yu@Sun.COM  * The contents of this file are subject to the terms of the
5*8348SEric.Yu@Sun.COM  * Common Development and Distribution License (the "License").
6*8348SEric.Yu@Sun.COM  * You may not use this file except in compliance with the License.
7*8348SEric.Yu@Sun.COM  *
8*8348SEric.Yu@Sun.COM  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*8348SEric.Yu@Sun.COM  * or http://www.opensolaris.org/os/licensing.
10*8348SEric.Yu@Sun.COM  * See the License for the specific language governing permissions
11*8348SEric.Yu@Sun.COM  * and limitations under the License.
12*8348SEric.Yu@Sun.COM  *
13*8348SEric.Yu@Sun.COM  * When distributing Covered Code, include this CDDL HEADER in each
14*8348SEric.Yu@Sun.COM  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*8348SEric.Yu@Sun.COM  * If applicable, add the following below this CDDL HEADER, with the
16*8348SEric.Yu@Sun.COM  * fields enclosed by brackets "[]" replaced with your own identifying
17*8348SEric.Yu@Sun.COM  * information: Portions Copyright [yyyy] [name of copyright owner]
18*8348SEric.Yu@Sun.COM  *
19*8348SEric.Yu@Sun.COM  * CDDL HEADER END
20*8348SEric.Yu@Sun.COM  */
21*8348SEric.Yu@Sun.COM 
22*8348SEric.Yu@Sun.COM /*
23*8348SEric.Yu@Sun.COM  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24*8348SEric.Yu@Sun.COM  * Use is subject to license terms.
25*8348SEric.Yu@Sun.COM  */
26*8348SEric.Yu@Sun.COM 
27*8348SEric.Yu@Sun.COM #include <sys/types.h>
28*8348SEric.Yu@Sun.COM #include <sys/param.h>
29*8348SEric.Yu@Sun.COM #include <sys/systm.h>
30*8348SEric.Yu@Sun.COM #include <sys/sysmacros.h>
31*8348SEric.Yu@Sun.COM #include <sys/debug.h>
32*8348SEric.Yu@Sun.COM #include <sys/cmn_err.h>
33*8348SEric.Yu@Sun.COM #include <sys/vfs.h>
34*8348SEric.Yu@Sun.COM #include <sys/policy.h>
35*8348SEric.Yu@Sun.COM #include <sys/modctl.h>
36*8348SEric.Yu@Sun.COM 
37*8348SEric.Yu@Sun.COM #include <sys/sunddi.h>
38*8348SEric.Yu@Sun.COM 
39*8348SEric.Yu@Sun.COM #include <sys/strsun.h>
40*8348SEric.Yu@Sun.COM #include <sys/stropts.h>
41*8348SEric.Yu@Sun.COM #include <sys/strsubr.h>
42*8348SEric.Yu@Sun.COM #include <sys/socket.h>
43*8348SEric.Yu@Sun.COM #include <sys/socketvar.h>
44*8348SEric.Yu@Sun.COM #include <sys/sodirect.h>
45*8348SEric.Yu@Sun.COM #include <sys/uio.h>
46*8348SEric.Yu@Sun.COM 
47*8348SEric.Yu@Sun.COM #include <inet/ipclassifier.h>
48*8348SEric.Yu@Sun.COM #include <fs/sockfs/sockcommon.h>
49*8348SEric.Yu@Sun.COM #include <fs/sockfs/nl7c.h>
50*8348SEric.Yu@Sun.COM #include <inet/ip.h>
51*8348SEric.Yu@Sun.COM 
/*
 * NOTE(review): these three variables are defined outside this file; no use
 * of them is visible in this part of the source -- confirm they are still
 * needed here.
 */
extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;

/*
 * kmem cache for sodirect_t objects; sonode_fini() returns a sonode's
 * so_direct to this cache.
 */
static struct kmem_cache *sock_sod_cache;
55*8348SEric.Yu@Sun.COM 
56*8348SEric.Yu@Sun.COM /*
57*8348SEric.Yu@Sun.COM  * Common socket access functions.
58*8348SEric.Yu@Sun.COM  *
59*8348SEric.Yu@Sun.COM  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
60*8348SEric.Yu@Sun.COM  * the socket_xxx() function should be used.
61*8348SEric.Yu@Sun.COM  */
62*8348SEric.Yu@Sun.COM 
63*8348SEric.Yu@Sun.COM /*
64*8348SEric.Yu@Sun.COM  * Try to create a new sonode of the requested <family, type, protocol>.
65*8348SEric.Yu@Sun.COM  */
66*8348SEric.Yu@Sun.COM /* ARGSUSED */
67*8348SEric.Yu@Sun.COM struct sonode *
68*8348SEric.Yu@Sun.COM socket_create(int family, int type, int protocol, char *devpath, char *mod,
69*8348SEric.Yu@Sun.COM     int flags, int version, struct cred *cr, int *errorp)
70*8348SEric.Yu@Sun.COM {
71*8348SEric.Yu@Sun.COM 	struct sonode *so;
72*8348SEric.Yu@Sun.COM 	struct sockparams *sp = NULL;
73*8348SEric.Yu@Sun.COM 
74*8348SEric.Yu@Sun.COM 	/*
75*8348SEric.Yu@Sun.COM 	 * Look for a sockparams entry that match the given criteria.
76*8348SEric.Yu@Sun.COM 	 * solookup() returns with the entry held.
77*8348SEric.Yu@Sun.COM 	 */
78*8348SEric.Yu@Sun.COM 	*errorp = solookup(family, type, protocol, &sp);
79*8348SEric.Yu@Sun.COM 	if (sp == NULL) {
80*8348SEric.Yu@Sun.COM 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
81*8348SEric.Yu@Sun.COM 		/*
82*8348SEric.Yu@Sun.COM 		 * There is no matching sockparams entry. An ephemeral entry is
83*8348SEric.Yu@Sun.COM 		 * created if the caller specifies a device or a socket module.
84*8348SEric.Yu@Sun.COM 		 */
85*8348SEric.Yu@Sun.COM 		if (devpath != NULL) {
86*8348SEric.Yu@Sun.COM 			sp = sockparams_hold_ephemeral_bydev(family, type,
87*8348SEric.Yu@Sun.COM 			    protocol, devpath, kmflags, errorp);
88*8348SEric.Yu@Sun.COM 		} else if (mod != NULL) {
89*8348SEric.Yu@Sun.COM 			sp = sockparams_hold_ephemeral_bymod(family, type,
90*8348SEric.Yu@Sun.COM 			    protocol, mod, kmflags, errorp);
91*8348SEric.Yu@Sun.COM 		} else {
92*8348SEric.Yu@Sun.COM 			return (NULL);
93*8348SEric.Yu@Sun.COM 		}
94*8348SEric.Yu@Sun.COM 
95*8348SEric.Yu@Sun.COM 		if (sp == NULL)
96*8348SEric.Yu@Sun.COM 			return (NULL);
97*8348SEric.Yu@Sun.COM 	}
98*8348SEric.Yu@Sun.COM 
99*8348SEric.Yu@Sun.COM 	ASSERT(sp->sp_smod_info != NULL);
100*8348SEric.Yu@Sun.COM 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
101*8348SEric.Yu@Sun.COM 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
102*8348SEric.Yu@Sun.COM 	    protocol, version, flags, errorp, cr);
103*8348SEric.Yu@Sun.COM 	if (so == NULL) {
104*8348SEric.Yu@Sun.COM 		SOCKPARAMS_DEC_REF(sp);
105*8348SEric.Yu@Sun.COM 	} else {
106*8348SEric.Yu@Sun.COM 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
107*8348SEric.Yu@Sun.COM 			/* Cannot fail, only bumps so_count */
108*8348SEric.Yu@Sun.COM 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
109*8348SEric.Yu@Sun.COM 		} else {
110*8348SEric.Yu@Sun.COM 			socket_destroy(so);
111*8348SEric.Yu@Sun.COM 			so = NULL;
112*8348SEric.Yu@Sun.COM 		}
113*8348SEric.Yu@Sun.COM 	}
114*8348SEric.Yu@Sun.COM 	return (so);
115*8348SEric.Yu@Sun.COM }
116*8348SEric.Yu@Sun.COM 
117*8348SEric.Yu@Sun.COM struct sonode *
118*8348SEric.Yu@Sun.COM socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
119*8348SEric.Yu@Sun.COM     sock_downcalls_t *dc, int flags, int *errorp)
120*8348SEric.Yu@Sun.COM {
121*8348SEric.Yu@Sun.COM 	struct sonode *so;
122*8348SEric.Yu@Sun.COM 	struct sockparams *sp;
123*8348SEric.Yu@Sun.COM 	struct cred *cr;
124*8348SEric.Yu@Sun.COM 
125*8348SEric.Yu@Sun.COM 	if ((cr = CRED()) == NULL)
126*8348SEric.Yu@Sun.COM 		cr = kcred;
127*8348SEric.Yu@Sun.COM 
128*8348SEric.Yu@Sun.COM 	sp = parent->so_sockparams;
129*8348SEric.Yu@Sun.COM 	ASSERT(sp != NULL);
130*8348SEric.Yu@Sun.COM 
131*8348SEric.Yu@Sun.COM 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
132*8348SEric.Yu@Sun.COM 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
133*8348SEric.Yu@Sun.COM 	    errorp, cr);
134*8348SEric.Yu@Sun.COM 	if (so != NULL) {
135*8348SEric.Yu@Sun.COM 		SOCKPARAMS_INC_REF(sp);
136*8348SEric.Yu@Sun.COM 
137*8348SEric.Yu@Sun.COM 		so->so_proto_handle = lh;
138*8348SEric.Yu@Sun.COM 		so->so_downcalls = dc;
139*8348SEric.Yu@Sun.COM 		/*
140*8348SEric.Yu@Sun.COM 		 * This function may be called in interrupt context, and CRED()
141*8348SEric.Yu@Sun.COM 		 * will be NULL. In this case, pass in kcred.
142*8348SEric.Yu@Sun.COM 		 */
143*8348SEric.Yu@Sun.COM 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
144*8348SEric.Yu@Sun.COM 			/* Cannot fail, only bumps so_count */
145*8348SEric.Yu@Sun.COM 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
146*8348SEric.Yu@Sun.COM 		} else  {
147*8348SEric.Yu@Sun.COM 			socket_destroy(so);
148*8348SEric.Yu@Sun.COM 			so = NULL;
149*8348SEric.Yu@Sun.COM 		}
150*8348SEric.Yu@Sun.COM 	}
151*8348SEric.Yu@Sun.COM 
152*8348SEric.Yu@Sun.COM 	return (so);
153*8348SEric.Yu@Sun.COM }
154*8348SEric.Yu@Sun.COM 
155*8348SEric.Yu@Sun.COM /*
156*8348SEric.Yu@Sun.COM  * Bind local endpoint.
157*8348SEric.Yu@Sun.COM  */
158*8348SEric.Yu@Sun.COM int
159*8348SEric.Yu@Sun.COM socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
160*8348SEric.Yu@Sun.COM     int flags, cred_t *cr)
161*8348SEric.Yu@Sun.COM {
162*8348SEric.Yu@Sun.COM 	return (SOP_BIND(so, name, namelen, flags, cr));
163*8348SEric.Yu@Sun.COM }
164*8348SEric.Yu@Sun.COM 
165*8348SEric.Yu@Sun.COM /*
166*8348SEric.Yu@Sun.COM  * Turn socket into a listen socket.
167*8348SEric.Yu@Sun.COM  */
168*8348SEric.Yu@Sun.COM int
169*8348SEric.Yu@Sun.COM socket_listen(struct sonode *so, int backlog, cred_t *cr)
170*8348SEric.Yu@Sun.COM {
171*8348SEric.Yu@Sun.COM 	if (backlog < 0) {
172*8348SEric.Yu@Sun.COM 		backlog = 0;
173*8348SEric.Yu@Sun.COM 	}
174*8348SEric.Yu@Sun.COM 
175*8348SEric.Yu@Sun.COM 	/*
176*8348SEric.Yu@Sun.COM 	 * Use the same qlimit as in BSD. BSD checks the qlimit
177*8348SEric.Yu@Sun.COM 	 * before queuing the next connection implying that a
178*8348SEric.Yu@Sun.COM 	 * listen(sock, 0) allows one connection to be queued.
179*8348SEric.Yu@Sun.COM 	 * BSD also uses 1.5 times the requested backlog.
180*8348SEric.Yu@Sun.COM 	 *
181*8348SEric.Yu@Sun.COM 	 * XNS Issue 4 required a strict interpretation of the backlog.
182*8348SEric.Yu@Sun.COM 	 * This has been waived subsequently for Issue 4 and the change
183*8348SEric.Yu@Sun.COM 	 * incorporated in XNS Issue 5. So we aren't required to do
184*8348SEric.Yu@Sun.COM 	 * anything special for XPG apps.
185*8348SEric.Yu@Sun.COM 	 */
186*8348SEric.Yu@Sun.COM 	if (backlog >= (INT_MAX - 1) / 3)
187*8348SEric.Yu@Sun.COM 		backlog = INT_MAX;
188*8348SEric.Yu@Sun.COM 	else
189*8348SEric.Yu@Sun.COM 		backlog = backlog * 3 / 2 + 1;
190*8348SEric.Yu@Sun.COM 
191*8348SEric.Yu@Sun.COM 	return (SOP_LISTEN(so, backlog, cr));
192*8348SEric.Yu@Sun.COM }
193*8348SEric.Yu@Sun.COM 
194*8348SEric.Yu@Sun.COM /*
195*8348SEric.Yu@Sun.COM  * Accept incoming connection.
196*8348SEric.Yu@Sun.COM  */
197*8348SEric.Yu@Sun.COM int
198*8348SEric.Yu@Sun.COM socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
199*8348SEric.Yu@Sun.COM {
200*8348SEric.Yu@Sun.COM 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
201*8348SEric.Yu@Sun.COM }
202*8348SEric.Yu@Sun.COM 
203*8348SEric.Yu@Sun.COM /*
204*8348SEric.Yu@Sun.COM  * Active open.
205*8348SEric.Yu@Sun.COM  */
206*8348SEric.Yu@Sun.COM int
207*8348SEric.Yu@Sun.COM socket_connect(struct sonode *so, const struct sockaddr *name,
208*8348SEric.Yu@Sun.COM     socklen_t namelen, int fflag, int flags, cred_t *cr)
209*8348SEric.Yu@Sun.COM {
210*8348SEric.Yu@Sun.COM 	int error;
211*8348SEric.Yu@Sun.COM 
212*8348SEric.Yu@Sun.COM 	/*
213*8348SEric.Yu@Sun.COM 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
214*8348SEric.Yu@Sun.COM 	 * connect to a null address. This is the portable method to
215*8348SEric.Yu@Sun.COM 	 * unconnect a socket.
216*8348SEric.Yu@Sun.COM 	 */
217*8348SEric.Yu@Sun.COM 	if ((namelen >= sizeof (sa_family_t)) &&
218*8348SEric.Yu@Sun.COM 	    (name->sa_family == AF_UNSPEC)) {
219*8348SEric.Yu@Sun.COM 		name = NULL;
220*8348SEric.Yu@Sun.COM 		namelen = 0;
221*8348SEric.Yu@Sun.COM 	}
222*8348SEric.Yu@Sun.COM 
223*8348SEric.Yu@Sun.COM 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
224*8348SEric.Yu@Sun.COM 
225*8348SEric.Yu@Sun.COM 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
226*8348SEric.Yu@Sun.COM 		/*
227*8348SEric.Yu@Sun.COM 		 * X/Open specification contains a requirement that
228*8348SEric.Yu@Sun.COM 		 * ENETUNREACH be returned but does not require
229*8348SEric.Yu@Sun.COM 		 * EHOSTUNREACH. In order to keep the test suite
230*8348SEric.Yu@Sun.COM 		 * happy we mess with the errno here.
231*8348SEric.Yu@Sun.COM 		 */
232*8348SEric.Yu@Sun.COM 		error = ENETUNREACH;
233*8348SEric.Yu@Sun.COM 	}
234*8348SEric.Yu@Sun.COM 
235*8348SEric.Yu@Sun.COM 	return (error);
236*8348SEric.Yu@Sun.COM }
237*8348SEric.Yu@Sun.COM 
238*8348SEric.Yu@Sun.COM /*
239*8348SEric.Yu@Sun.COM  * Get address of remote node.
240*8348SEric.Yu@Sun.COM  */
241*8348SEric.Yu@Sun.COM int
242*8348SEric.Yu@Sun.COM socket_getpeername(struct sonode *so, struct sockaddr *addr,
243*8348SEric.Yu@Sun.COM     socklen_t *addrlen, boolean_t accept, cred_t *cr)
244*8348SEric.Yu@Sun.COM {
245*8348SEric.Yu@Sun.COM 	ASSERT(*addrlen > 0);
246*8348SEric.Yu@Sun.COM 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
247*8348SEric.Yu@Sun.COM 
248*8348SEric.Yu@Sun.COM }
249*8348SEric.Yu@Sun.COM 
250*8348SEric.Yu@Sun.COM /*
251*8348SEric.Yu@Sun.COM  * Get local address.
252*8348SEric.Yu@Sun.COM  */
253*8348SEric.Yu@Sun.COM int
254*8348SEric.Yu@Sun.COM socket_getsockname(struct sonode *so, struct sockaddr *addr,
255*8348SEric.Yu@Sun.COM     socklen_t *addrlen, cred_t *cr)
256*8348SEric.Yu@Sun.COM {
257*8348SEric.Yu@Sun.COM 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
258*8348SEric.Yu@Sun.COM 
259*8348SEric.Yu@Sun.COM }
260*8348SEric.Yu@Sun.COM 
261*8348SEric.Yu@Sun.COM /*
262*8348SEric.Yu@Sun.COM  * Called from shutdown().
263*8348SEric.Yu@Sun.COM  */
264*8348SEric.Yu@Sun.COM int
265*8348SEric.Yu@Sun.COM socket_shutdown(struct sonode *so, int how, cred_t *cr)
266*8348SEric.Yu@Sun.COM {
267*8348SEric.Yu@Sun.COM 	return (SOP_SHUTDOWN(so, how, cr));
268*8348SEric.Yu@Sun.COM }
269*8348SEric.Yu@Sun.COM 
270*8348SEric.Yu@Sun.COM /*
271*8348SEric.Yu@Sun.COM  * Get socket options.
272*8348SEric.Yu@Sun.COM  */
273*8348SEric.Yu@Sun.COM /*ARGSUSED*/
274*8348SEric.Yu@Sun.COM int
275*8348SEric.Yu@Sun.COM socket_getsockopt(struct sonode *so, int level, int option_name,
276*8348SEric.Yu@Sun.COM     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
277*8348SEric.Yu@Sun.COM {
278*8348SEric.Yu@Sun.COM 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
279*8348SEric.Yu@Sun.COM 	    optlenp, flags, cr));
280*8348SEric.Yu@Sun.COM }
281*8348SEric.Yu@Sun.COM 
282*8348SEric.Yu@Sun.COM /*
283*8348SEric.Yu@Sun.COM  * Set socket options
284*8348SEric.Yu@Sun.COM  */
285*8348SEric.Yu@Sun.COM int
286*8348SEric.Yu@Sun.COM socket_setsockopt(struct sonode *so, int level, int option_name,
287*8348SEric.Yu@Sun.COM     const void *optval, t_uscalar_t optlen, cred_t *cr)
288*8348SEric.Yu@Sun.COM {
289*8348SEric.Yu@Sun.COM 	/* Caller allocates aligned optval, or passes null */
290*8348SEric.Yu@Sun.COM 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
291*8348SEric.Yu@Sun.COM 	/* If optval is null optlen is 0, and vice-versa */
292*8348SEric.Yu@Sun.COM 	ASSERT(optval != NULL || optlen == 0);
293*8348SEric.Yu@Sun.COM 	ASSERT(optlen != 0 || optval == NULL);
294*8348SEric.Yu@Sun.COM 
295*8348SEric.Yu@Sun.COM 	/* No options should be zero-length */
296*8348SEric.Yu@Sun.COM 	if (optlen == 0)
297*8348SEric.Yu@Sun.COM 		return (EINVAL);
298*8348SEric.Yu@Sun.COM 
299*8348SEric.Yu@Sun.COM 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
300*8348SEric.Yu@Sun.COM }
301*8348SEric.Yu@Sun.COM 
302*8348SEric.Yu@Sun.COM int
303*8348SEric.Yu@Sun.COM socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
304*8348SEric.Yu@Sun.COM     cred_t *cr)
305*8348SEric.Yu@Sun.COM {
306*8348SEric.Yu@Sun.COM 	int error = 0;
307*8348SEric.Yu@Sun.COM 	ssize_t orig_resid = uiop->uio_resid;
308*8348SEric.Yu@Sun.COM 
309*8348SEric.Yu@Sun.COM 	/*
310*8348SEric.Yu@Sun.COM 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
311*8348SEric.Yu@Sun.COM 	 */
312*8348SEric.Yu@Sun.COM 	if (so->so_family == AF_UNIX)
313*8348SEric.Yu@Sun.COM 		uiop->uio_extflg |= UIO_COPY_CACHED;
314*8348SEric.Yu@Sun.COM 	else
315*8348SEric.Yu@Sun.COM 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
316*8348SEric.Yu@Sun.COM 
317*8348SEric.Yu@Sun.COM 	error = SOP_SENDMSG(so, msg, uiop, cr);
318*8348SEric.Yu@Sun.COM 	switch (error) {
319*8348SEric.Yu@Sun.COM 	default:
320*8348SEric.Yu@Sun.COM 		break;
321*8348SEric.Yu@Sun.COM 	case EINTR:
322*8348SEric.Yu@Sun.COM 	case ETIME:
323*8348SEric.Yu@Sun.COM 	case EWOULDBLOCK:
324*8348SEric.Yu@Sun.COM 		/* We did a partial send */
325*8348SEric.Yu@Sun.COM 		if (uiop->uio_resid != orig_resid)
326*8348SEric.Yu@Sun.COM 			error = 0;
327*8348SEric.Yu@Sun.COM 		break;
328*8348SEric.Yu@Sun.COM 	case EPIPE:
329*8348SEric.Yu@Sun.COM 		if ((so->so_mode & SM_KERNEL) == 0)
330*8348SEric.Yu@Sun.COM 			tsignal(curthread, SIGPIPE);
331*8348SEric.Yu@Sun.COM 		break;
332*8348SEric.Yu@Sun.COM 	}
333*8348SEric.Yu@Sun.COM 
334*8348SEric.Yu@Sun.COM 	return (error);
335*8348SEric.Yu@Sun.COM }
336*8348SEric.Yu@Sun.COM 
337*8348SEric.Yu@Sun.COM int
338*8348SEric.Yu@Sun.COM socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
339*8348SEric.Yu@Sun.COM     struct cred *cr, mblk_t **mpp)
340*8348SEric.Yu@Sun.COM {
341*8348SEric.Yu@Sun.COM 	int error = 0;
342*8348SEric.Yu@Sun.COM 
343*8348SEric.Yu@Sun.COM 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
344*8348SEric.Yu@Sun.COM 	if (error == EPIPE) {
345*8348SEric.Yu@Sun.COM 		tsignal(curthread, SIGPIPE);
346*8348SEric.Yu@Sun.COM 	}
347*8348SEric.Yu@Sun.COM 	return (error);
348*8348SEric.Yu@Sun.COM }
349*8348SEric.Yu@Sun.COM 
350*8348SEric.Yu@Sun.COM int
351*8348SEric.Yu@Sun.COM socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
352*8348SEric.Yu@Sun.COM     cred_t *cr)
353*8348SEric.Yu@Sun.COM {
354*8348SEric.Yu@Sun.COM 	int error;
355*8348SEric.Yu@Sun.COM 	ssize_t orig_resid = uiop->uio_resid;
356*8348SEric.Yu@Sun.COM 
357*8348SEric.Yu@Sun.COM 	/*
358*8348SEric.Yu@Sun.COM 	 * Do not bypass the cache when reading data, as the application
359*8348SEric.Yu@Sun.COM 	 * is likely to access the data shortly.
360*8348SEric.Yu@Sun.COM 	 */
361*8348SEric.Yu@Sun.COM 	uiop->uio_extflg |= UIO_COPY_CACHED;
362*8348SEric.Yu@Sun.COM 
363*8348SEric.Yu@Sun.COM 	error = SOP_RECVMSG(so, msg, uiop, cr);
364*8348SEric.Yu@Sun.COM 
365*8348SEric.Yu@Sun.COM 	switch (error) {
366*8348SEric.Yu@Sun.COM 	case EINTR:
367*8348SEric.Yu@Sun.COM 	case ETIME:
368*8348SEric.Yu@Sun.COM 	case EWOULDBLOCK:
369*8348SEric.Yu@Sun.COM 		/* We did a partial read */
370*8348SEric.Yu@Sun.COM 		if (uiop->uio_resid != orig_resid)
371*8348SEric.Yu@Sun.COM 			error = 0;
372*8348SEric.Yu@Sun.COM 		break;
373*8348SEric.Yu@Sun.COM 	default:
374*8348SEric.Yu@Sun.COM 		break;
375*8348SEric.Yu@Sun.COM 	}
376*8348SEric.Yu@Sun.COM 	return (error);
377*8348SEric.Yu@Sun.COM }
378*8348SEric.Yu@Sun.COM 
379*8348SEric.Yu@Sun.COM int
380*8348SEric.Yu@Sun.COM socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
381*8348SEric.Yu@Sun.COM     struct cred *cr, int32_t *rvalp)
382*8348SEric.Yu@Sun.COM {
383*8348SEric.Yu@Sun.COM 	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
384*8348SEric.Yu@Sun.COM }
385*8348SEric.Yu@Sun.COM 
/*
 * Poll `so' for `events'.
 *
 * All arguments are passed unchanged to the sonode's poll operation, whose
 * result is returned.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);

	return (error);
}
392*8348SEric.Yu@Sun.COM 
393*8348SEric.Yu@Sun.COM int
394*8348SEric.Yu@Sun.COM socket_close(struct sonode *so, int flag, struct cred *cr)
395*8348SEric.Yu@Sun.COM {
396*8348SEric.Yu@Sun.COM 	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
397*8348SEric.Yu@Sun.COM }
398*8348SEric.Yu@Sun.COM 
399*8348SEric.Yu@Sun.COM int
400*8348SEric.Yu@Sun.COM socket_close_internal(struct sonode *so, int flag, cred_t *cr)
401*8348SEric.Yu@Sun.COM {
402*8348SEric.Yu@Sun.COM 	ASSERT(so->so_count == 0);
403*8348SEric.Yu@Sun.COM 
404*8348SEric.Yu@Sun.COM 	return (SOP_CLOSE(so, flag, cr));
405*8348SEric.Yu@Sun.COM }
406*8348SEric.Yu@Sun.COM 
/*
 * Destroy `so' through its vnode: mark the vnode invalid, then release
 * the hold on it.
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
413*8348SEric.Yu@Sun.COM 
414*8348SEric.Yu@Sun.COM /* ARGSUSED */
415*8348SEric.Yu@Sun.COM void
416*8348SEric.Yu@Sun.COM socket_destroy_internal(struct sonode *so, cred_t *cr)
417*8348SEric.Yu@Sun.COM {
418*8348SEric.Yu@Sun.COM 	struct sockparams *sp = so->so_sockparams;
419*8348SEric.Yu@Sun.COM 	ASSERT(so->so_count == 0 && sp != NULL);
420*8348SEric.Yu@Sun.COM 
421*8348SEric.Yu@Sun.COM 	sp->sp_smod_info->smod_sock_destroy_func(so);
422*8348SEric.Yu@Sun.COM 
423*8348SEric.Yu@Sun.COM 	SOCKPARAMS_DEC_REF(sp);
424*8348SEric.Yu@Sun.COM }
425*8348SEric.Yu@Sun.COM 
426*8348SEric.Yu@Sun.COM /*
427*8348SEric.Yu@Sun.COM  * TODO Once the common vnode ops is available, then the vnops argument
428*8348SEric.Yu@Sun.COM  * should be removed.
429*8348SEric.Yu@Sun.COM  */
430*8348SEric.Yu@Sun.COM /*ARGSUSED*/
431*8348SEric.Yu@Sun.COM int
432*8348SEric.Yu@Sun.COM sonode_constructor(void *buf, void *cdrarg, int kmflags)
433*8348SEric.Yu@Sun.COM {
434*8348SEric.Yu@Sun.COM 	struct sonode *so = buf;
435*8348SEric.Yu@Sun.COM 	struct vnode *vp;
436*8348SEric.Yu@Sun.COM 
437*8348SEric.Yu@Sun.COM 	vp = so->so_vnode = vn_alloc(kmflags);
438*8348SEric.Yu@Sun.COM 	if (vp == NULL) {
439*8348SEric.Yu@Sun.COM 		return (-1);
440*8348SEric.Yu@Sun.COM 	}
441*8348SEric.Yu@Sun.COM 	vp->v_data = so;
442*8348SEric.Yu@Sun.COM 	vn_setops(vp, socket_vnodeops);
443*8348SEric.Yu@Sun.COM 
444*8348SEric.Yu@Sun.COM 	so->so_priv 		= NULL;
445*8348SEric.Yu@Sun.COM 	so->so_oobmsg		= NULL;
446*8348SEric.Yu@Sun.COM 
447*8348SEric.Yu@Sun.COM 	so->so_proto_handle	= NULL;
448*8348SEric.Yu@Sun.COM 
449*8348SEric.Yu@Sun.COM 	so->so_peercred 	= NULL;
450*8348SEric.Yu@Sun.COM 
451*8348SEric.Yu@Sun.COM 	so->so_rcv_queued	= 0;
452*8348SEric.Yu@Sun.COM 	so->so_rcv_q_head 	= NULL;
453*8348SEric.Yu@Sun.COM 	so->so_rcv_q_last_head 	= NULL;
454*8348SEric.Yu@Sun.COM 	so->so_rcv_head		= NULL;
455*8348SEric.Yu@Sun.COM 	so->so_rcv_last_head	= NULL;
456*8348SEric.Yu@Sun.COM 	so->so_rcv_wanted	= 0;
457*8348SEric.Yu@Sun.COM 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
458*8348SEric.Yu@Sun.COM 	so->so_rcv_timer_tid	= 0;
459*8348SEric.Yu@Sun.COM 	so->so_rcv_thresh	= 0;
460*8348SEric.Yu@Sun.COM 
461*8348SEric.Yu@Sun.COM 	so->so_acceptq_head	= NULL;
462*8348SEric.Yu@Sun.COM 	so->so_acceptq_tail	= &so->so_acceptq_head;
463*8348SEric.Yu@Sun.COM 	so->so_acceptq_next	= NULL;
464*8348SEric.Yu@Sun.COM 	so->so_acceptq_len	= 0;
465*8348SEric.Yu@Sun.COM 	so->so_backlog		= 0;
466*8348SEric.Yu@Sun.COM 
467*8348SEric.Yu@Sun.COM 	so->so_snd_qfull	= B_FALSE;
468*8348SEric.Yu@Sun.COM 
469*8348SEric.Yu@Sun.COM 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
470*8348SEric.Yu@Sun.COM 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
471*8348SEric.Yu@Sun.COM 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
472*8348SEric.Yu@Sun.COM 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
473*8348SEric.Yu@Sun.COM 	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
474*8348SEric.Yu@Sun.COM 
475*8348SEric.Yu@Sun.COM 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
476*8348SEric.Yu@Sun.COM 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
477*8348SEric.Yu@Sun.COM 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
478*8348SEric.Yu@Sun.COM 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
479*8348SEric.Yu@Sun.COM 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
480*8348SEric.Yu@Sun.COM 
481*8348SEric.Yu@Sun.COM 	return (0);
482*8348SEric.Yu@Sun.COM }
483*8348SEric.Yu@Sun.COM 
484*8348SEric.Yu@Sun.COM /*ARGSUSED*/
485*8348SEric.Yu@Sun.COM void
486*8348SEric.Yu@Sun.COM sonode_destructor(void *buf, void *cdrarg)
487*8348SEric.Yu@Sun.COM {
488*8348SEric.Yu@Sun.COM 	struct sonode *so = buf;
489*8348SEric.Yu@Sun.COM 	struct vnode *vp = SOTOV(so);
490*8348SEric.Yu@Sun.COM 
491*8348SEric.Yu@Sun.COM 	ASSERT(so->so_priv == NULL);
492*8348SEric.Yu@Sun.COM 	ASSERT(so->so_peercred == NULL);
493*8348SEric.Yu@Sun.COM 
494*8348SEric.Yu@Sun.COM 	ASSERT(so->so_oobmsg == NULL);
495*8348SEric.Yu@Sun.COM 
496*8348SEric.Yu@Sun.COM 	ASSERT(so->so_rcv_q_head == NULL);
497*8348SEric.Yu@Sun.COM 
498*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_head == NULL);
499*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
500*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_next == NULL);
501*8348SEric.Yu@Sun.COM 
502*8348SEric.Yu@Sun.COM 	ASSERT(vp->v_data == so);
503*8348SEric.Yu@Sun.COM 	ASSERT(vn_matchops(vp, socket_vnodeops));
504*8348SEric.Yu@Sun.COM 
505*8348SEric.Yu@Sun.COM 	vn_free(vp);
506*8348SEric.Yu@Sun.COM 
507*8348SEric.Yu@Sun.COM 	mutex_destroy(&so->so_lock);
508*8348SEric.Yu@Sun.COM 	mutex_destroy(&so->so_acceptq_lock);
509*8348SEric.Yu@Sun.COM 	rw_destroy(&so->so_fallback_rwlock);
510*8348SEric.Yu@Sun.COM 
511*8348SEric.Yu@Sun.COM 	cv_destroy(&so->so_state_cv);
512*8348SEric.Yu@Sun.COM 	cv_destroy(&so->so_want_cv);
513*8348SEric.Yu@Sun.COM 	cv_destroy(&so->so_acceptq_cv);
514*8348SEric.Yu@Sun.COM 	cv_destroy(&so->so_snd_cv);
515*8348SEric.Yu@Sun.COM 	cv_destroy(&so->so_rcv_cv);
516*8348SEric.Yu@Sun.COM 	cv_destroy(&so->so_closing_cv);
517*8348SEric.Yu@Sun.COM }
518*8348SEric.Yu@Sun.COM 
519*8348SEric.Yu@Sun.COM void
520*8348SEric.Yu@Sun.COM sonode_init(struct sonode *so, struct sockparams *sp, int family,
521*8348SEric.Yu@Sun.COM     int type, int protocol, sonodeops_t *sops)
522*8348SEric.Yu@Sun.COM {
523*8348SEric.Yu@Sun.COM 	vnode_t *vp;
524*8348SEric.Yu@Sun.COM 
525*8348SEric.Yu@Sun.COM 	vp = SOTOV(so);
526*8348SEric.Yu@Sun.COM 
527*8348SEric.Yu@Sun.COM 	so->so_flag	= 0;
528*8348SEric.Yu@Sun.COM 
529*8348SEric.Yu@Sun.COM 	so->so_state	= 0;
530*8348SEric.Yu@Sun.COM 	so->so_mode	= 0;
531*8348SEric.Yu@Sun.COM 
532*8348SEric.Yu@Sun.COM 	so->so_count	= 0;
533*8348SEric.Yu@Sun.COM 
534*8348SEric.Yu@Sun.COM 	so->so_family	= family;
535*8348SEric.Yu@Sun.COM 	so->so_type	= type;
536*8348SEric.Yu@Sun.COM 	so->so_protocol	= protocol;
537*8348SEric.Yu@Sun.COM 
538*8348SEric.Yu@Sun.COM 	SOCK_CONNID_INIT(so->so_proto_connid);
539*8348SEric.Yu@Sun.COM 
540*8348SEric.Yu@Sun.COM 	so->so_options	= 0;
541*8348SEric.Yu@Sun.COM 	so->so_linger.l_onoff   = 0;
542*8348SEric.Yu@Sun.COM 	so->so_linger.l_linger = 0;
543*8348SEric.Yu@Sun.COM 	so->so_sndbuf	= 0;
544*8348SEric.Yu@Sun.COM 	so->so_error	= 0;
545*8348SEric.Yu@Sun.COM 	so->so_rcvtimeo	= 0;
546*8348SEric.Yu@Sun.COM 	so->so_sndtimeo = 0;
547*8348SEric.Yu@Sun.COM 
548*8348SEric.Yu@Sun.COM 	ASSERT(so->so_oobmsg == NULL);
549*8348SEric.Yu@Sun.COM 	so->so_oobmark	= 0;
550*8348SEric.Yu@Sun.COM 	so->so_pgrp	= 0;
551*8348SEric.Yu@Sun.COM 
552*8348SEric.Yu@Sun.COM 	ASSERT(so->so_peercred == NULL);
553*8348SEric.Yu@Sun.COM 
554*8348SEric.Yu@Sun.COM 	so->so_zoneid = getzoneid();
555*8348SEric.Yu@Sun.COM 
556*8348SEric.Yu@Sun.COM 	so->so_sockparams = sp;
557*8348SEric.Yu@Sun.COM 
558*8348SEric.Yu@Sun.COM 	so->so_ops = sops;
559*8348SEric.Yu@Sun.COM 
560*8348SEric.Yu@Sun.COM 	so->so_proto_handle = NULL;
561*8348SEric.Yu@Sun.COM 
562*8348SEric.Yu@Sun.COM 	so->so_downcalls = NULL;
563*8348SEric.Yu@Sun.COM 
564*8348SEric.Yu@Sun.COM 	so->so_copyflag = 0;
565*8348SEric.Yu@Sun.COM 
566*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_head == NULL);
567*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
568*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_next == NULL);
569*8348SEric.Yu@Sun.COM 
570*8348SEric.Yu@Sun.COM 	vn_reinit(vp);
571*8348SEric.Yu@Sun.COM 	vp->v_vfsp	= rootvfs;
572*8348SEric.Yu@Sun.COM 	vp->v_type	= VSOCK;
573*8348SEric.Yu@Sun.COM 	vp->v_rdev	= sockdev;
574*8348SEric.Yu@Sun.COM 
575*8348SEric.Yu@Sun.COM 	so->so_rcv_queued = 0;
576*8348SEric.Yu@Sun.COM 	so->so_rcv_q_head = NULL;
577*8348SEric.Yu@Sun.COM 	so->so_rcv_q_last_head = NULL;
578*8348SEric.Yu@Sun.COM 	so->so_rcv_head	= NULL;
579*8348SEric.Yu@Sun.COM 	so->so_rcv_last_head = NULL;
580*8348SEric.Yu@Sun.COM 
581*8348SEric.Yu@Sun.COM 	so->so_snd_qfull = B_FALSE;
582*8348SEric.Yu@Sun.COM 	so->so_minpsz = 0;
583*8348SEric.Yu@Sun.COM 
584*8348SEric.Yu@Sun.COM 	so->so_rcv_wakeup = B_FALSE;
585*8348SEric.Yu@Sun.COM 	so->so_snd_wakeup = B_FALSE;
586*8348SEric.Yu@Sun.COM 	so->so_flowctrld = B_FALSE;
587*8348SEric.Yu@Sun.COM 
588*8348SEric.Yu@Sun.COM 	so->so_pollev = 0;
589*8348SEric.Yu@Sun.COM 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
590*8348SEric.Yu@Sun.COM 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
591*8348SEric.Yu@Sun.COM 
592*8348SEric.Yu@Sun.COM 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
593*8348SEric.Yu@Sun.COM 	so->so_ksock_cb_arg = NULL;
594*8348SEric.Yu@Sun.COM 
595*8348SEric.Yu@Sun.COM 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
596*8348SEric.Yu@Sun.COM 
597*8348SEric.Yu@Sun.COM 	so->so_direct = NULL;
598*8348SEric.Yu@Sun.COM 
599*8348SEric.Yu@Sun.COM 	vn_exists(vp);
600*8348SEric.Yu@Sun.COM }
601*8348SEric.Yu@Sun.COM 
602*8348SEric.Yu@Sun.COM void
603*8348SEric.Yu@Sun.COM sonode_fini(struct sonode *so)
604*8348SEric.Yu@Sun.COM {
605*8348SEric.Yu@Sun.COM 	mblk_t *mp;
606*8348SEric.Yu@Sun.COM 	vnode_t *vp;
607*8348SEric.Yu@Sun.COM 
608*8348SEric.Yu@Sun.COM 	ASSERT(so->so_count == 0);
609*8348SEric.Yu@Sun.COM 
610*8348SEric.Yu@Sun.COM 	if (so->so_rcv_timer_tid) {
611*8348SEric.Yu@Sun.COM 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
612*8348SEric.Yu@Sun.COM 		(void) untimeout(so->so_rcv_timer_tid);
613*8348SEric.Yu@Sun.COM 		so->so_rcv_timer_tid = 0;
614*8348SEric.Yu@Sun.COM 	}
615*8348SEric.Yu@Sun.COM 
616*8348SEric.Yu@Sun.COM 	so_acceptq_flush(so);
617*8348SEric.Yu@Sun.COM 
618*8348SEric.Yu@Sun.COM #ifdef DEBUG
619*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
620*8348SEric.Yu@Sun.COM 	ASSERT(so_verify_oobstate(so));
621*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
622*8348SEric.Yu@Sun.COM #endif /* DEBUG */
623*8348SEric.Yu@Sun.COM 	if ((mp = so->so_oobmsg) != NULL) {
624*8348SEric.Yu@Sun.COM 		freemsg(mp);
625*8348SEric.Yu@Sun.COM 		so->so_oobmsg = NULL;
626*8348SEric.Yu@Sun.COM 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
627*8348SEric.Yu@Sun.COM 		    SS_RCVATMARK);
628*8348SEric.Yu@Sun.COM 	}
629*8348SEric.Yu@Sun.COM 
630*8348SEric.Yu@Sun.COM 	if (so->so_poll_list.ph_list != NULL) {
631*8348SEric.Yu@Sun.COM 		pollwakeup(&so->so_poll_list, POLLERR);
632*8348SEric.Yu@Sun.COM 		pollhead_clean(&so->so_poll_list);
633*8348SEric.Yu@Sun.COM 	}
634*8348SEric.Yu@Sun.COM 
635*8348SEric.Yu@Sun.COM 	if (so->so_direct != NULL) {
636*8348SEric.Yu@Sun.COM 		sodirect_t *sodp = so->so_direct;
637*8348SEric.Yu@Sun.COM 
638*8348SEric.Yu@Sun.COM 		ASSERT(sodp->sod_uioafh == NULL);
639*8348SEric.Yu@Sun.COM 
640*8348SEric.Yu@Sun.COM 		so->so_direct = NULL;
641*8348SEric.Yu@Sun.COM 		kmem_cache_free(sock_sod_cache, sodp);
642*8348SEric.Yu@Sun.COM 	}
643*8348SEric.Yu@Sun.COM 
644*8348SEric.Yu@Sun.COM 	vp = SOTOV(so);
645*8348SEric.Yu@Sun.COM 	vn_invalid(vp);
646*8348SEric.Yu@Sun.COM 
647*8348SEric.Yu@Sun.COM 	if (so->so_peercred != NULL) {
648*8348SEric.Yu@Sun.COM 		crfree(so->so_peercred);
649*8348SEric.Yu@Sun.COM 		so->so_peercred = NULL;
650*8348SEric.Yu@Sun.COM 	}
651*8348SEric.Yu@Sun.COM }
652*8348SEric.Yu@Sun.COM 
653*8348SEric.Yu@Sun.COM /*
654*8348SEric.Yu@Sun.COM  * This function is called at the beginning of recvmsg().
655*8348SEric.Yu@Sun.COM  *
656*8348SEric.Yu@Sun.COM  * If I/OAT is enabled on this sonode, initialize the uioa state machine
657*8348SEric.Yu@Sun.COM  * with state UIOA_ALLOC.
658*8348SEric.Yu@Sun.COM  */
659*8348SEric.Yu@Sun.COM uio_t *
660*8348SEric.Yu@Sun.COM sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
661*8348SEric.Yu@Sun.COM {
662*8348SEric.Yu@Sun.COM 	struct uio *suiop;
663*8348SEric.Yu@Sun.COM 	struct uio *uiop;
664*8348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
665*8348SEric.Yu@Sun.COM 
666*8348SEric.Yu@Sun.COM 	if (sodp == NULL)
667*8348SEric.Yu@Sun.COM 		return (NULL);
668*8348SEric.Yu@Sun.COM 
669*8348SEric.Yu@Sun.COM 	suiop = NULL;
670*8348SEric.Yu@Sun.COM 	uiop = *uiopp;
671*8348SEric.Yu@Sun.COM 
672*8348SEric.Yu@Sun.COM 	mutex_enter(sodp->sod_lockp);
673*8348SEric.Yu@Sun.COM 	if (uiop->uio_resid >= uioasync.mincnt &&
674*8348SEric.Yu@Sun.COM 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
675*8348SEric.Yu@Sun.COM 	    uioasync.enabled && !(flags & MSG_PEEK) &&
676*8348SEric.Yu@Sun.COM 	    !(so->so_state & SS_CANTRCVMORE)) {
677*8348SEric.Yu@Sun.COM 		/*
678*8348SEric.Yu@Sun.COM 		 * Big enough I/O for uioa min setup and an sodirect socket
679*8348SEric.Yu@Sun.COM 		 * and sodirect enabled and uioa enabled and I/O will be done
680*8348SEric.Yu@Sun.COM 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
681*8348SEric.Yu@Sun.COM 		 */
682*8348SEric.Yu@Sun.COM 		if (!uioainit(uiop, &sodp->sod_uioa)) {
683*8348SEric.Yu@Sun.COM 			/*
684*8348SEric.Yu@Sun.COM 			 * Successful uioainit() so the uio_t part of the
685*8348SEric.Yu@Sun.COM 			 * uioa_t will be used for all uio_t work to follow,
686*8348SEric.Yu@Sun.COM 			 * we return the original "uiop" in "suiop".
687*8348SEric.Yu@Sun.COM 			 */
688*8348SEric.Yu@Sun.COM 			suiop = uiop;
689*8348SEric.Yu@Sun.COM 			*uiopp = (uio_t *)&sodp->sod_uioa;
690*8348SEric.Yu@Sun.COM 			/*
691*8348SEric.Yu@Sun.COM 			 * Before returning to the caller the passed in uio_t
692*8348SEric.Yu@Sun.COM 			 * "uiop" will be updated via a call to uioafini()
693*8348SEric.Yu@Sun.COM 			 * below.
694*8348SEric.Yu@Sun.COM 			 *
695*8348SEric.Yu@Sun.COM 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
696*8348SEric.Yu@Sun.COM 			 * here as first we have to uioamove() any currently
697*8348SEric.Yu@Sun.COM 			 * queued M_DATA mblk_t(s) so it will be done later.
698*8348SEric.Yu@Sun.COM 			 */
699*8348SEric.Yu@Sun.COM 		}
700*8348SEric.Yu@Sun.COM 		/*
701*8348SEric.Yu@Sun.COM 		 * In either uioainit() success or not case note the number
702*8348SEric.Yu@Sun.COM 		 * of uio bytes the caller wants for sod framework and/or
703*8348SEric.Yu@Sun.COM 		 * transport (e.g. TCP) strategy.
704*8348SEric.Yu@Sun.COM 		 */
705*8348SEric.Yu@Sun.COM 		sodp->sod_want = uiop->uio_resid;
706*8348SEric.Yu@Sun.COM 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
707*8348SEric.Yu@Sun.COM 		/*
708*8348SEric.Yu@Sun.COM 		 * No uioa but still using sodirect so note the number of
709*8348SEric.Yu@Sun.COM 		 * uio bytes the caller wants for sodirect framework and/or
710*8348SEric.Yu@Sun.COM 		 * transport (e.g. TCP) strategy.
711*8348SEric.Yu@Sun.COM 		 */
712*8348SEric.Yu@Sun.COM 		sodp->sod_want = uiop->uio_resid;
713*8348SEric.Yu@Sun.COM 	}
714*8348SEric.Yu@Sun.COM 	mutex_exit(sodp->sod_lockp);
715*8348SEric.Yu@Sun.COM 
716*8348SEric.Yu@Sun.COM 	return (suiop);
717*8348SEric.Yu@Sun.COM }
718*8348SEric.Yu@Sun.COM 
719*8348SEric.Yu@Sun.COM /*
720*8348SEric.Yu@Sun.COM  * This function is called at the end of recvmsg(), it finializes all the I/OAT
721*8348SEric.Yu@Sun.COM  * operations, and reset the uioa state to UIOA_ALLOC.
722*8348SEric.Yu@Sun.COM  */
723*8348SEric.Yu@Sun.COM int
724*8348SEric.Yu@Sun.COM sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
725*8348SEric.Yu@Sun.COM {
726*8348SEric.Yu@Sun.COM 	int error = 0;
727*8348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
728*8348SEric.Yu@Sun.COM 	mblk_t *mp;
729*8348SEric.Yu@Sun.COM 
730*8348SEric.Yu@Sun.COM 	if (sodp == NULL) {
731*8348SEric.Yu@Sun.COM 		return (0);
732*8348SEric.Yu@Sun.COM 	}
733*8348SEric.Yu@Sun.COM 
734*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
735*8348SEric.Yu@Sun.COM 	/* Finish any sodirect and uioa processing */
736*8348SEric.Yu@Sun.COM 	if (suiop != NULL) {
737*8348SEric.Yu@Sun.COM 		/* Finish any uioa_t processing */
738*8348SEric.Yu@Sun.COM 
739*8348SEric.Yu@Sun.COM 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
740*8348SEric.Yu@Sun.COM 		error = uioafini(suiop, (uioa_t *)uiop);
741*8348SEric.Yu@Sun.COM 		if ((mp = sodp->sod_uioafh) != NULL) {
742*8348SEric.Yu@Sun.COM 			sodp->sod_uioafh = NULL;
743*8348SEric.Yu@Sun.COM 			sodp->sod_uioaft = NULL;
744*8348SEric.Yu@Sun.COM 			freemsg(mp);
745*8348SEric.Yu@Sun.COM 		}
746*8348SEric.Yu@Sun.COM 	}
747*8348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_uioafh == NULL);
748*8348SEric.Yu@Sun.COM 	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
749*8348SEric.Yu@Sun.COM 		/* Awoke */
750*8348SEric.Yu@Sun.COM 		sodp->sod_state &= SOD_WAKE_CLR;
751*8348SEric.Yu@Sun.COM 		sodp->sod_state |= SOD_WAKE_NOT;
752*8348SEric.Yu@Sun.COM 	}
753*8348SEric.Yu@Sun.COM 	/* Last, clear sod_want value */
754*8348SEric.Yu@Sun.COM 	sodp->sod_want = 0;
755*8348SEric.Yu@Sun.COM 
756*8348SEric.Yu@Sun.COM 	return (error);
757*8348SEric.Yu@Sun.COM }
758*8348SEric.Yu@Sun.COM 
759*8348SEric.Yu@Sun.COM /*
760*8348SEric.Yu@Sun.COM  * Schedule a uioamove() on a mblk. This is ususally called from
761*8348SEric.Yu@Sun.COM  * protocols (e.g. TCP) on a I/OAT enabled sonode.
762*8348SEric.Yu@Sun.COM  */
763*8348SEric.Yu@Sun.COM mblk_t *
764*8348SEric.Yu@Sun.COM sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
765*8348SEric.Yu@Sun.COM {
766*8348SEric.Yu@Sun.COM 	uioa_t *uioap = &sodp->sod_uioa;
767*8348SEric.Yu@Sun.COM 	mblk_t *mp1 = mp;
768*8348SEric.Yu@Sun.COM 	mblk_t *lmp = NULL;
769*8348SEric.Yu@Sun.COM 
770*8348SEric.Yu@Sun.COM 	ASSERT(DB_TYPE(mp) == M_DATA);
771*8348SEric.Yu@Sun.COM 	ASSERT(msg_size == msgdsize(mp));
772*8348SEric.Yu@Sun.COM 
773*8348SEric.Yu@Sun.COM 	/* Caller must have lock held */
774*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
775*8348SEric.Yu@Sun.COM 
776*8348SEric.Yu@Sun.COM 	if (uioap->uioa_state & UIOA_ENABLED) {
777*8348SEric.Yu@Sun.COM 		/* Uioa is enabled */
778*8348SEric.Yu@Sun.COM 
779*8348SEric.Yu@Sun.COM 		if (msg_size > uioap->uio_resid) {
780*8348SEric.Yu@Sun.COM 			/*
781*8348SEric.Yu@Sun.COM 			 * There isn't enough uio space for the mblk_t chain
782*8348SEric.Yu@Sun.COM 			 * so disable uioa such that this and any additional
783*8348SEric.Yu@Sun.COM 			 * mblk_t data is handled by the socket and schedule
784*8348SEric.Yu@Sun.COM 			 * the socket for wakeup to finish this uioa.
785*8348SEric.Yu@Sun.COM 			 */
786*8348SEric.Yu@Sun.COM 			uioap->uioa_state &= UIOA_CLR;
787*8348SEric.Yu@Sun.COM 			uioap->uioa_state |= UIOA_FINI;
788*8348SEric.Yu@Sun.COM 			if (sodp->sod_state & SOD_WAKE_NOT) {
789*8348SEric.Yu@Sun.COM 				sodp->sod_state &= SOD_WAKE_CLR;
790*8348SEric.Yu@Sun.COM 				sodp->sod_state |= SOD_WAKE_NEED;
791*8348SEric.Yu@Sun.COM 			}
792*8348SEric.Yu@Sun.COM 			return (mp);
793*8348SEric.Yu@Sun.COM 		}
794*8348SEric.Yu@Sun.COM 		do {
795*8348SEric.Yu@Sun.COM 			uint32_t	len = MBLKL(mp1);
796*8348SEric.Yu@Sun.COM 
797*8348SEric.Yu@Sun.COM 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
798*8348SEric.Yu@Sun.COM 				/* Scheduled, mark dblk_t as such */
799*8348SEric.Yu@Sun.COM 				DB_FLAGS(mp1) |= DBLK_UIOA;
800*8348SEric.Yu@Sun.COM 			} else {
801*8348SEric.Yu@Sun.COM 				/* Error, turn off async processing */
802*8348SEric.Yu@Sun.COM 				uioap->uioa_state &= UIOA_CLR;
803*8348SEric.Yu@Sun.COM 				uioap->uioa_state |= UIOA_FINI;
804*8348SEric.Yu@Sun.COM 				break;
805*8348SEric.Yu@Sun.COM 			}
806*8348SEric.Yu@Sun.COM 			lmp = mp1;
807*8348SEric.Yu@Sun.COM 		} while ((mp1 = mp1->b_cont) != NULL);
808*8348SEric.Yu@Sun.COM 
809*8348SEric.Yu@Sun.COM 		if (mp1 != NULL || uioap->uio_resid == 0) {
810*8348SEric.Yu@Sun.COM 			/*
811*8348SEric.Yu@Sun.COM 			 * Not all mblk_t(s) uioamoved (error) or all uio
812*8348SEric.Yu@Sun.COM 			 * space has been consumed so schedule the socket
813*8348SEric.Yu@Sun.COM 			 * for wakeup to finish this uio.
814*8348SEric.Yu@Sun.COM 			 */
815*8348SEric.Yu@Sun.COM 			sodp->sod_state &= SOD_WAKE_CLR;
816*8348SEric.Yu@Sun.COM 			sodp->sod_state |= SOD_WAKE_NEED;
817*8348SEric.Yu@Sun.COM 
818*8348SEric.Yu@Sun.COM 			/* Break the mblk chain if neccessary. */
819*8348SEric.Yu@Sun.COM 			if (mp1 != NULL && lmp != NULL) {
820*8348SEric.Yu@Sun.COM 				mp->b_next = mp1;
821*8348SEric.Yu@Sun.COM 				lmp->b_cont = NULL;
822*8348SEric.Yu@Sun.COM 			}
823*8348SEric.Yu@Sun.COM 		}
824*8348SEric.Yu@Sun.COM 	}
825*8348SEric.Yu@Sun.COM 	return (mp1);
826*8348SEric.Yu@Sun.COM }
827*8348SEric.Yu@Sun.COM 
828*8348SEric.Yu@Sun.COM /*
829*8348SEric.Yu@Sun.COM  * This function is called on a mblk that thas been successfully uioamoved().
830*8348SEric.Yu@Sun.COM  */
831*8348SEric.Yu@Sun.COM void
832*8348SEric.Yu@Sun.COM sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
833*8348SEric.Yu@Sun.COM {
834*8348SEric.Yu@Sun.COM 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
835*8348SEric.Yu@Sun.COM 		/*
836*8348SEric.Yu@Sun.COM 		 * A uioa flaged mblk_t chain, already uio processed,
837*8348SEric.Yu@Sun.COM 		 * add it to the sodirect uioa pending free list.
838*8348SEric.Yu@Sun.COM 		 *
839*8348SEric.Yu@Sun.COM 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
840*8348SEric.Yu@Sun.COM 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
841*8348SEric.Yu@Sun.COM 		 */
842*8348SEric.Yu@Sun.COM 		mblk_t	*bpt = sodp->sod_uioaft;
843*8348SEric.Yu@Sun.COM 
844*8348SEric.Yu@Sun.COM 		ASSERT(sodp != NULL);
845*8348SEric.Yu@Sun.COM 
846*8348SEric.Yu@Sun.COM 		/*
847*8348SEric.Yu@Sun.COM 		 * Add first mblk_t of "bp" chain to current sodirect uioa
848*8348SEric.Yu@Sun.COM 		 * free list tail mblk_t, if any, else empty list so new head.
849*8348SEric.Yu@Sun.COM 		 */
850*8348SEric.Yu@Sun.COM 		if (bpt == NULL)
851*8348SEric.Yu@Sun.COM 			sodp->sod_uioafh = bp;
852*8348SEric.Yu@Sun.COM 		else
853*8348SEric.Yu@Sun.COM 			bpt->b_cont = bp;
854*8348SEric.Yu@Sun.COM 
855*8348SEric.Yu@Sun.COM 		/*
856*8348SEric.Yu@Sun.COM 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
857*8348SEric.Yu@Sun.COM 		 * each to reflect that uioamove() has consumed all data.
858*8348SEric.Yu@Sun.COM 		 */
859*8348SEric.Yu@Sun.COM 		bpt = bp;
860*8348SEric.Yu@Sun.COM 		for (;;) {
861*8348SEric.Yu@Sun.COM 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
862*8348SEric.Yu@Sun.COM 
863*8348SEric.Yu@Sun.COM 			bpt->b_rptr = bpt->b_wptr;
864*8348SEric.Yu@Sun.COM 			if (bpt->b_cont == NULL)
865*8348SEric.Yu@Sun.COM 				break;
866*8348SEric.Yu@Sun.COM 			bpt = bpt->b_cont;
867*8348SEric.Yu@Sun.COM 		}
868*8348SEric.Yu@Sun.COM 		/* New sodirect uioa free list tail */
869*8348SEric.Yu@Sun.COM 		sodp->sod_uioaft = bpt;
870*8348SEric.Yu@Sun.COM 
871*8348SEric.Yu@Sun.COM 		/* Only dequeue once with data returned per uioa_t */
872*8348SEric.Yu@Sun.COM 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
873*8348SEric.Yu@Sun.COM 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
874*8348SEric.Yu@Sun.COM 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
875*8348SEric.Yu@Sun.COM 		}
876*8348SEric.Yu@Sun.COM 	}
877*8348SEric.Yu@Sun.COM }
878*8348SEric.Yu@Sun.COM 
879*8348SEric.Yu@Sun.COM /*
880*8348SEric.Yu@Sun.COM  * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
881*8348SEric.Yu@Sun.COM  * this function on a non-STREAMS socket to schedule uioamove() on the data
882*8348SEric.Yu@Sun.COM  * that has already queued in this socket.
883*8348SEric.Yu@Sun.COM  */
884*8348SEric.Yu@Sun.COM void
885*8348SEric.Yu@Sun.COM sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
886*8348SEric.Yu@Sun.COM {
887*8348SEric.Yu@Sun.COM 	uioa_t	*uioap = (uioa_t *)uiop;
888*8348SEric.Yu@Sun.COM 	mblk_t	*lbp;
889*8348SEric.Yu@Sun.COM 	mblk_t	*wbp;
890*8348SEric.Yu@Sun.COM 	mblk_t	*bp;
891*8348SEric.Yu@Sun.COM 	int	len;
892*8348SEric.Yu@Sun.COM 	int	error;
893*8348SEric.Yu@Sun.COM 	boolean_t in_rcv_q = B_TRUE;
894*8348SEric.Yu@Sun.COM 
895*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
896*8348SEric.Yu@Sun.COM 	ASSERT(&sodp->sod_uioa == uioap);
897*8348SEric.Yu@Sun.COM 
898*8348SEric.Yu@Sun.COM 	/*
899*8348SEric.Yu@Sun.COM 	 * Walk first b_cont chain in sod_q
900*8348SEric.Yu@Sun.COM 	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
901*8348SEric.Yu@Sun.COM 	 */
902*8348SEric.Yu@Sun.COM 	bp = so->so_rcv_q_head;
903*8348SEric.Yu@Sun.COM 
904*8348SEric.Yu@Sun.COM again:
905*8348SEric.Yu@Sun.COM 	/* Walk the chain */
906*8348SEric.Yu@Sun.COM 	lbp = NULL;
907*8348SEric.Yu@Sun.COM 	wbp = bp;
908*8348SEric.Yu@Sun.COM 
909*8348SEric.Yu@Sun.COM 	do {
910*8348SEric.Yu@Sun.COM 		if (bp == NULL)
911*8348SEric.Yu@Sun.COM 			break;
912*8348SEric.Yu@Sun.COM 
913*8348SEric.Yu@Sun.COM 		if (wbp->b_datap->db_type != M_DATA) {
914*8348SEric.Yu@Sun.COM 			/* Not M_DATA, no more uioa */
915*8348SEric.Yu@Sun.COM 			goto nouioa;
916*8348SEric.Yu@Sun.COM 		}
917*8348SEric.Yu@Sun.COM 		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
918*8348SEric.Yu@Sun.COM 			/* Have a M_DATA mblk_t with data */
919*8348SEric.Yu@Sun.COM 			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
920*8348SEric.Yu@Sun.COM 			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
921*8348SEric.Yu@Sun.COM 				/* Not enough uio sapce, or beyond oobmark */
922*8348SEric.Yu@Sun.COM 				goto nouioa;
923*8348SEric.Yu@Sun.COM 			}
924*8348SEric.Yu@Sun.COM 			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
925*8348SEric.Yu@Sun.COM 			error = uioamove(wbp->b_rptr, len,
926*8348SEric.Yu@Sun.COM 			    UIO_READ, uioap);
927*8348SEric.Yu@Sun.COM 			if (!error) {
928*8348SEric.Yu@Sun.COM 				/* Scheduled, mark dblk_t as such */
929*8348SEric.Yu@Sun.COM 				wbp->b_datap->db_flags |= DBLK_UIOA;
930*8348SEric.Yu@Sun.COM 			} else {
931*8348SEric.Yu@Sun.COM 				/* Break the mblk chain */
932*8348SEric.Yu@Sun.COM 				goto nouioa;
933*8348SEric.Yu@Sun.COM 			}
934*8348SEric.Yu@Sun.COM 		}
935*8348SEric.Yu@Sun.COM 		/* Save last wbp processed */
936*8348SEric.Yu@Sun.COM 		lbp = wbp;
937*8348SEric.Yu@Sun.COM 	} while ((wbp = wbp->b_cont) != NULL);
938*8348SEric.Yu@Sun.COM 
939*8348SEric.Yu@Sun.COM 	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
940*8348SEric.Yu@Sun.COM 		/*
941*8348SEric.Yu@Sun.COM 		 * We get here only once to process the sonode dump area
942*8348SEric.Yu@Sun.COM 		 * if so_rcv_q_head is NULL or all the mblks have been
943*8348SEric.Yu@Sun.COM 		 * successfully uioamoved()ed.
944*8348SEric.Yu@Sun.COM 		 */
945*8348SEric.Yu@Sun.COM 		in_rcv_q = B_FALSE;
946*8348SEric.Yu@Sun.COM 
947*8348SEric.Yu@Sun.COM 		/* move to dump area */
948*8348SEric.Yu@Sun.COM 		bp = so->so_rcv_head;
949*8348SEric.Yu@Sun.COM 		goto again;
950*8348SEric.Yu@Sun.COM 	}
951*8348SEric.Yu@Sun.COM 
952*8348SEric.Yu@Sun.COM 	return;
953*8348SEric.Yu@Sun.COM 
954*8348SEric.Yu@Sun.COM nouioa:
955*8348SEric.Yu@Sun.COM 	/* No more uioa */
956*8348SEric.Yu@Sun.COM 	uioap->uioa_state &= UIOA_CLR;
957*8348SEric.Yu@Sun.COM 	uioap->uioa_state |= UIOA_FINI;
958*8348SEric.Yu@Sun.COM 
959*8348SEric.Yu@Sun.COM 	/*
960*8348SEric.Yu@Sun.COM 	 * If we processed 1 or more mblk_t(s) then we need to split the
961*8348SEric.Yu@Sun.COM 	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
962*8348SEric.Yu@Sun.COM 	 * are in the current chain and the rest are in the following new
963*8348SEric.Yu@Sun.COM 	 * chain.
964*8348SEric.Yu@Sun.COM 	 */
965*8348SEric.Yu@Sun.COM 	if (lbp != NULL) {
966*8348SEric.Yu@Sun.COM 		/* New end of current chain */
967*8348SEric.Yu@Sun.COM 		lbp->b_cont = NULL;
968*8348SEric.Yu@Sun.COM 
969*8348SEric.Yu@Sun.COM 		/* Insert new chain wbp after bp */
970*8348SEric.Yu@Sun.COM 		if ((wbp->b_next = bp->b_next) == NULL) {
971*8348SEric.Yu@Sun.COM 			/*
972*8348SEric.Yu@Sun.COM 			 * No need to grab so_lock, since sod_lockp
973*8348SEric.Yu@Sun.COM 			 * points to so_lock.
974*8348SEric.Yu@Sun.COM 			 */
975*8348SEric.Yu@Sun.COM 			if (in_rcv_q)
976*8348SEric.Yu@Sun.COM 				so->so_rcv_q_last_head = wbp;
977*8348SEric.Yu@Sun.COM 			else
978*8348SEric.Yu@Sun.COM 				so->so_rcv_last_head = wbp;
979*8348SEric.Yu@Sun.COM 		}
980*8348SEric.Yu@Sun.COM 		bp->b_next = wbp;
981*8348SEric.Yu@Sun.COM 		bp->b_next->b_prev = bp->b_prev;
982*8348SEric.Yu@Sun.COM 		bp->b_prev = lbp;
983*8348SEric.Yu@Sun.COM 	}
984*8348SEric.Yu@Sun.COM }
985*8348SEric.Yu@Sun.COM 
986*8348SEric.Yu@Sun.COM /*
987*8348SEric.Yu@Sun.COM  * Initialize sodirect data structures on a socket.
988*8348SEric.Yu@Sun.COM  */
989*8348SEric.Yu@Sun.COM void
990*8348SEric.Yu@Sun.COM sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
991*8348SEric.Yu@Sun.COM     sod_wakeup_func wake_func, kmutex_t *lockp)
992*8348SEric.Yu@Sun.COM {
993*8348SEric.Yu@Sun.COM 	sodirect_t	*sodp;
994*8348SEric.Yu@Sun.COM 
995*8348SEric.Yu@Sun.COM 	ASSERT(so->so_direct == NULL);
996*8348SEric.Yu@Sun.COM 
997*8348SEric.Yu@Sun.COM 	so->so_state |= SS_SODIRECT;
998*8348SEric.Yu@Sun.COM 
999*8348SEric.Yu@Sun.COM 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
1000*8348SEric.Yu@Sun.COM 	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
1001*8348SEric.Yu@Sun.COM 	sodp->sod_want = 0;
1002*8348SEric.Yu@Sun.COM 	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
1003*8348SEric.Yu@Sun.COM 	sodp->sod_enqueue = enq_func;
1004*8348SEric.Yu@Sun.COM 	sodp->sod_wakeup = wake_func;
1005*8348SEric.Yu@Sun.COM 	sodp->sod_uioafh = NULL;
1006*8348SEric.Yu@Sun.COM 	sodp->sod_uioaft = NULL;
1007*8348SEric.Yu@Sun.COM 	sodp->sod_lockp = lockp;
1008*8348SEric.Yu@Sun.COM 	/*
1009*8348SEric.Yu@Sun.COM 	 * Remainder of the sod_uioa members are left uninitialized
1010*8348SEric.Yu@Sun.COM 	 * but will be initialized later by uioainit() before uioa
1011*8348SEric.Yu@Sun.COM 	 * is enabled.
1012*8348SEric.Yu@Sun.COM 	 */
1013*8348SEric.Yu@Sun.COM 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
1014*8348SEric.Yu@Sun.COM 	so->so_direct = sodp;
1015*8348SEric.Yu@Sun.COM 	if (stp != NULL)
1016*8348SEric.Yu@Sun.COM 		stp->sd_sodirect = sodp;
1017*8348SEric.Yu@Sun.COM }
1018*8348SEric.Yu@Sun.COM 
1019*8348SEric.Yu@Sun.COM /*
1020*8348SEric.Yu@Sun.COM  * Init the sodirect kmem cache while sockfs is loading.
1021*8348SEric.Yu@Sun.COM  */
1022*8348SEric.Yu@Sun.COM void
1023*8348SEric.Yu@Sun.COM sod_init()
1024*8348SEric.Yu@Sun.COM {
1025*8348SEric.Yu@Sun.COM 	/* Allocate sodirect_t kmem_cache */
1026*8348SEric.Yu@Sun.COM 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
1027*8348SEric.Yu@Sun.COM 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1028*8348SEric.Yu@Sun.COM }
1029*8348SEric.Yu@Sun.COM 
1030*8348SEric.Yu@Sun.COM ssize_t
1031*8348SEric.Yu@Sun.COM sod_uioa_mblk(struct sonode *so, mblk_t *mp)
1032*8348SEric.Yu@Sun.COM {
1033*8348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
1034*8348SEric.Yu@Sun.COM 
1035*8348SEric.Yu@Sun.COM 	ASSERT(sodp != NULL);
1036*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
1037*8348SEric.Yu@Sun.COM 
1038*8348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_state & SOD_ENABLED);
1039*8348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
1040*8348SEric.Yu@Sun.COM 
1041*8348SEric.Yu@Sun.COM 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
1042*8348SEric.Yu@Sun.COM 
1043*8348SEric.Yu@Sun.COM 	if (mp == NULL && so->so_rcv_q_head != NULL) {
1044*8348SEric.Yu@Sun.COM 		mp = so->so_rcv_q_head;
1045*8348SEric.Yu@Sun.COM 		ASSERT(mp->b_prev != NULL);
1046*8348SEric.Yu@Sun.COM 		mp->b_prev = NULL;
1047*8348SEric.Yu@Sun.COM 		so->so_rcv_q_head = mp->b_next;
1048*8348SEric.Yu@Sun.COM 		if (so->so_rcv_q_head == NULL) {
1049*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head = NULL;
1050*8348SEric.Yu@Sun.COM 		}
1051*8348SEric.Yu@Sun.COM 		mp->b_next = NULL;
1052*8348SEric.Yu@Sun.COM 	}
1053*8348SEric.Yu@Sun.COM 
1054*8348SEric.Yu@Sun.COM 	sod_uioa_mblk_done(sodp, mp);
1055*8348SEric.Yu@Sun.COM 
1056*8348SEric.Yu@Sun.COM 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
1057*8348SEric.Yu@Sun.COM 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
1058*8348SEric.Yu@Sun.COM 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
1059*8348SEric.Yu@Sun.COM 		/* more arrived */
1060*8348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_q_head == NULL);
1061*8348SEric.Yu@Sun.COM 		mp = so->so_rcv_head;
1062*8348SEric.Yu@Sun.COM 		so->so_rcv_head = mp->b_next;
1063*8348SEric.Yu@Sun.COM 		if (so->so_rcv_head == NULL)
1064*8348SEric.Yu@Sun.COM 			so->so_rcv_last_head = NULL;
1065*8348SEric.Yu@Sun.COM 		mp->b_prev = mp->b_next = NULL;
1066*8348SEric.Yu@Sun.COM 		sod_uioa_mblk_done(sodp, mp);
1067*8348SEric.Yu@Sun.COM 	}
1068*8348SEric.Yu@Sun.COM 
1069*8348SEric.Yu@Sun.COM #ifdef DEBUG
1070*8348SEric.Yu@Sun.COM 	if (so->so_rcv_q_head != NULL) {
1071*8348SEric.Yu@Sun.COM 		mblk_t *m = so->so_rcv_q_head;
1072*8348SEric.Yu@Sun.COM 		while (m != NULL) {
1073*8348SEric.Yu@Sun.COM 			if (DB_FLAGS(m) & DBLK_UIOA) {
1074*8348SEric.Yu@Sun.COM 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1075*8348SEric.Yu@Sun.COM 				    " in so_rcv_q_head.\n", (void *)m);
1076*8348SEric.Yu@Sun.COM 			}
1077*8348SEric.Yu@Sun.COM 			m = m->b_next;
1078*8348SEric.Yu@Sun.COM 		}
1079*8348SEric.Yu@Sun.COM 	}
1080*8348SEric.Yu@Sun.COM 	if (so->so_rcv_head != NULL) {
1081*8348SEric.Yu@Sun.COM 		mblk_t *m = so->so_rcv_head;
1082*8348SEric.Yu@Sun.COM 		while (m != NULL) {
1083*8348SEric.Yu@Sun.COM 			if (DB_FLAGS(m) & DBLK_UIOA) {
1084*8348SEric.Yu@Sun.COM 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1085*8348SEric.Yu@Sun.COM 				    " in so_rcv_head.\n", (void *)m);
1086*8348SEric.Yu@Sun.COM 			}
1087*8348SEric.Yu@Sun.COM 			m = m->b_next;
1088*8348SEric.Yu@Sun.COM 		}
1089*8348SEric.Yu@Sun.COM 	}
1090*8348SEric.Yu@Sun.COM #endif
1091*8348SEric.Yu@Sun.COM 	return (sodp->sod_uioa.uioa_mbytes);
1092*8348SEric.Yu@Sun.COM }
1093