xref: /onnv-gate/usr/src/uts/common/fs/sockfs/sockcommon_subr.c (revision 8348:4137e18bfaf0)
1*8348SEric.Yu@Sun.COM /*
2*8348SEric.Yu@Sun.COM  * CDDL HEADER START
3*8348SEric.Yu@Sun.COM  *
4*8348SEric.Yu@Sun.COM  * The contents of this file are subject to the terms of the
5*8348SEric.Yu@Sun.COM  * Common Development and Distribution License (the "License").
6*8348SEric.Yu@Sun.COM  * You may not use this file except in compliance with the License.
7*8348SEric.Yu@Sun.COM  *
8*8348SEric.Yu@Sun.COM  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*8348SEric.Yu@Sun.COM  * or http://www.opensolaris.org/os/licensing.
10*8348SEric.Yu@Sun.COM  * See the License for the specific language governing permissions
11*8348SEric.Yu@Sun.COM  * and limitations under the License.
12*8348SEric.Yu@Sun.COM  *
13*8348SEric.Yu@Sun.COM  * When distributing Covered Code, include this CDDL HEADER in each
14*8348SEric.Yu@Sun.COM  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*8348SEric.Yu@Sun.COM  * If applicable, add the following below this CDDL HEADER, with the
16*8348SEric.Yu@Sun.COM  * fields enclosed by brackets "[]" replaced with your own identifying
17*8348SEric.Yu@Sun.COM  * information: Portions Copyright [yyyy] [name of copyright owner]
18*8348SEric.Yu@Sun.COM  *
19*8348SEric.Yu@Sun.COM  * CDDL HEADER END
20*8348SEric.Yu@Sun.COM  */
21*8348SEric.Yu@Sun.COM 
22*8348SEric.Yu@Sun.COM /*
23*8348SEric.Yu@Sun.COM  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24*8348SEric.Yu@Sun.COM  * Use is subject to license terms.
25*8348SEric.Yu@Sun.COM  */
26*8348SEric.Yu@Sun.COM 
27*8348SEric.Yu@Sun.COM #include <sys/types.h>
28*8348SEric.Yu@Sun.COM #include <sys/param.h>
29*8348SEric.Yu@Sun.COM #include <sys/signal.h>
30*8348SEric.Yu@Sun.COM #include <sys/cmn_err.h>
31*8348SEric.Yu@Sun.COM 
32*8348SEric.Yu@Sun.COM #include <sys/stropts.h>
33*8348SEric.Yu@Sun.COM #include <sys/socket.h>
34*8348SEric.Yu@Sun.COM #include <sys/socketvar.h>
35*8348SEric.Yu@Sun.COM #include <sys/sockio.h>
36*8348SEric.Yu@Sun.COM #include <sys/sodirect.h>
37*8348SEric.Yu@Sun.COM #include <sys/strsubr.h>
38*8348SEric.Yu@Sun.COM #include <sys/strsun.h>
39*8348SEric.Yu@Sun.COM #include <sys/atomic.h>
40*8348SEric.Yu@Sun.COM 
41*8348SEric.Yu@Sun.COM #include <fs/sockfs/sockcommon.h>
42*8348SEric.Yu@Sun.COM #include <fs/sockfs/socktpi.h>
43*8348SEric.Yu@Sun.COM #include <sys/ddi.h>
44*8348SEric.Yu@Sun.COM #include <inet/ip.h>
45*8348SEric.Yu@Sun.COM #include <sys/time.h>
46*8348SEric.Yu@Sun.COM #include <sys/cmn_err.h>
47*8348SEric.Yu@Sun.COM 
#ifdef SOCK_TEST
/*
 * Symbols defined outside this file; they are only declared when the
 * file is built with SOCK_TEST.
 */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

/* Initial value of the so_mblk_pull_len global defined right below. */
#define	MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
/*
 * DEBUG-only switch: when it is B_TRUE, code in this file (for example
 * so_prepend_msg()) ASSERTs that so_check_length() holds.
 */
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif
60*8348SEric.Yu@Sun.COM 
61*8348SEric.Yu@Sun.COM int
62*8348SEric.Yu@Sun.COM so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
63*8348SEric.Yu@Sun.COM {
64*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
65*8348SEric.Yu@Sun.COM 	ASSERT(nso->so_acceptq_next == NULL);
66*8348SEric.Yu@Sun.COM 
67*8348SEric.Yu@Sun.COM 	*so->so_acceptq_tail = nso;
68*8348SEric.Yu@Sun.COM 	so->so_acceptq_tail = &nso->so_acceptq_next;
69*8348SEric.Yu@Sun.COM 	so->so_acceptq_len++;
70*8348SEric.Yu@Sun.COM 
71*8348SEric.Yu@Sun.COM 	/* Wakeup a single consumer */
72*8348SEric.Yu@Sun.COM 	cv_signal(&so->so_acceptq_cv);
73*8348SEric.Yu@Sun.COM 
74*8348SEric.Yu@Sun.COM 	return (so->so_acceptq_len);
75*8348SEric.Yu@Sun.COM }
76*8348SEric.Yu@Sun.COM 
77*8348SEric.Yu@Sun.COM /*
78*8348SEric.Yu@Sun.COM  * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
79*8348SEric.Yu@Sun.COM  *
80*8348SEric.Yu@Sun.COM  * Enqueue an incoming connection on a listening socket.
81*8348SEric.Yu@Sun.COM  *
82*8348SEric.Yu@Sun.COM  * Arguments:
83*8348SEric.Yu@Sun.COM  *   so	  - listening socket
84*8348SEric.Yu@Sun.COM  *   nso  - new connection
85*8348SEric.Yu@Sun.COM  *
86*8348SEric.Yu@Sun.COM  * Returns:
87*8348SEric.Yu@Sun.COM  *   Number of queued connections, including the new connection
88*8348SEric.Yu@Sun.COM  */
89*8348SEric.Yu@Sun.COM int
90*8348SEric.Yu@Sun.COM so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
91*8348SEric.Yu@Sun.COM {
92*8348SEric.Yu@Sun.COM 	int conns;
93*8348SEric.Yu@Sun.COM 
94*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_acceptq_lock);
95*8348SEric.Yu@Sun.COM 	conns = so_acceptq_enqueue_locked(so, nso);
96*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_acceptq_lock);
97*8348SEric.Yu@Sun.COM 
98*8348SEric.Yu@Sun.COM 	return (conns);
99*8348SEric.Yu@Sun.COM }
100*8348SEric.Yu@Sun.COM 
101*8348SEric.Yu@Sun.COM static int
102*8348SEric.Yu@Sun.COM so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
103*8348SEric.Yu@Sun.COM     struct sonode **nsop)
104*8348SEric.Yu@Sun.COM {
105*8348SEric.Yu@Sun.COM 	struct sonode *nso = NULL;
106*8348SEric.Yu@Sun.COM 
107*8348SEric.Yu@Sun.COM 	*nsop = NULL;
108*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
109*8348SEric.Yu@Sun.COM 	while ((nso = so->so_acceptq_head) == NULL) {
110*8348SEric.Yu@Sun.COM 		/*
111*8348SEric.Yu@Sun.COM 		 * No need to check so_error here, because it is not
112*8348SEric.Yu@Sun.COM 		 * possible for a listening socket to be reset or otherwise
113*8348SEric.Yu@Sun.COM 		 * disconnected.
114*8348SEric.Yu@Sun.COM 		 *
115*8348SEric.Yu@Sun.COM 		 * So now we just need check if it's ok to wait.
116*8348SEric.Yu@Sun.COM 		 */
117*8348SEric.Yu@Sun.COM 		if (dontblock)
118*8348SEric.Yu@Sun.COM 			return (EWOULDBLOCK);
119*8348SEric.Yu@Sun.COM 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
120*8348SEric.Yu@Sun.COM 			return (EINTR);
121*8348SEric.Yu@Sun.COM 
122*8348SEric.Yu@Sun.COM 		if (cv_wait_sig_swap(&so->so_acceptq_cv,
123*8348SEric.Yu@Sun.COM 		    &so->so_acceptq_lock) == 0)
124*8348SEric.Yu@Sun.COM 			return (EINTR);
125*8348SEric.Yu@Sun.COM 	}
126*8348SEric.Yu@Sun.COM 
127*8348SEric.Yu@Sun.COM 	ASSERT(nso != NULL);
128*8348SEric.Yu@Sun.COM 	so->so_acceptq_head = nso->so_acceptq_next;
129*8348SEric.Yu@Sun.COM 	nso->so_acceptq_next = NULL;
130*8348SEric.Yu@Sun.COM 
131*8348SEric.Yu@Sun.COM 	if (so->so_acceptq_head == NULL) {
132*8348SEric.Yu@Sun.COM 		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
133*8348SEric.Yu@Sun.COM 		so->so_acceptq_tail = &so->so_acceptq_head;
134*8348SEric.Yu@Sun.COM 	}
135*8348SEric.Yu@Sun.COM 	ASSERT(so->so_acceptq_len > 0);
136*8348SEric.Yu@Sun.COM 	--so->so_acceptq_len;
137*8348SEric.Yu@Sun.COM 
138*8348SEric.Yu@Sun.COM 	*nsop = nso;
139*8348SEric.Yu@Sun.COM 
140*8348SEric.Yu@Sun.COM 	return (0);
141*8348SEric.Yu@Sun.COM }
142*8348SEric.Yu@Sun.COM 
143*8348SEric.Yu@Sun.COM /*
144*8348SEric.Yu@Sun.COM  * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
145*8348SEric.Yu@Sun.COM  *
146*8348SEric.Yu@Sun.COM  * Pulls a connection off of the accept queue.
147*8348SEric.Yu@Sun.COM  *
148*8348SEric.Yu@Sun.COM  * Arguments:
149*8348SEric.Yu@Sun.COM  *   so	       - listening socket
150*8348SEric.Yu@Sun.COM  *   dontblock - indicate whether it's ok to sleep if there are no
151*8348SEric.Yu@Sun.COM  *		 connections on the queue
152*8348SEric.Yu@Sun.COM  *   nsop      - Value-return argument
153*8348SEric.Yu@Sun.COM  *
154*8348SEric.Yu@Sun.COM  * Return values:
155*8348SEric.Yu@Sun.COM  *   0 when a connection is successfully dequeued, in which case nsop
156*8348SEric.Yu@Sun.COM  *   is set to point to the new connection. Upon failure a non-zero
157*8348SEric.Yu@Sun.COM  *   value is returned, and the value of nsop is set to NULL.
158*8348SEric.Yu@Sun.COM  *
159*8348SEric.Yu@Sun.COM  * Note:
160*8348SEric.Yu@Sun.COM  *   so_acceptq_dequeue() may return prematurly if the socket is falling
161*8348SEric.Yu@Sun.COM  *   back to TPI.
162*8348SEric.Yu@Sun.COM  */
163*8348SEric.Yu@Sun.COM int
164*8348SEric.Yu@Sun.COM so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
165*8348SEric.Yu@Sun.COM     struct sonode **nsop)
166*8348SEric.Yu@Sun.COM {
167*8348SEric.Yu@Sun.COM 	int error;
168*8348SEric.Yu@Sun.COM 
169*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_acceptq_lock);
170*8348SEric.Yu@Sun.COM 	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
171*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_acceptq_lock);
172*8348SEric.Yu@Sun.COM 
173*8348SEric.Yu@Sun.COM 	return (error);
174*8348SEric.Yu@Sun.COM }
175*8348SEric.Yu@Sun.COM 
176*8348SEric.Yu@Sun.COM /*
177*8348SEric.Yu@Sun.COM  * void so_acceptq_flush(struct sonode *so)
178*8348SEric.Yu@Sun.COM  *
179*8348SEric.Yu@Sun.COM  * Removes all pending connections from a listening socket, and
180*8348SEric.Yu@Sun.COM  * frees the associated resources.
181*8348SEric.Yu@Sun.COM  *
182*8348SEric.Yu@Sun.COM  * Arguments
183*8348SEric.Yu@Sun.COM  *   so	    - listening socket
184*8348SEric.Yu@Sun.COM  *
185*8348SEric.Yu@Sun.COM  * Return values:
186*8348SEric.Yu@Sun.COM  *   None.
187*8348SEric.Yu@Sun.COM  *
188*8348SEric.Yu@Sun.COM  * Note:
189*8348SEric.Yu@Sun.COM  *   The caller has to ensure that no calls to so_acceptq_enqueue() or
190*8348SEric.Yu@Sun.COM  *   so_acceptq_dequeue() occur while the accept queue is being flushed.
191*8348SEric.Yu@Sun.COM  *   So either the socket needs to be in a state where no operations
192*8348SEric.Yu@Sun.COM  *   would come in, or so_lock needs to be obtained.
193*8348SEric.Yu@Sun.COM  */
194*8348SEric.Yu@Sun.COM void
195*8348SEric.Yu@Sun.COM so_acceptq_flush(struct sonode *so)
196*8348SEric.Yu@Sun.COM {
197*8348SEric.Yu@Sun.COM 	struct sonode *nso;
198*8348SEric.Yu@Sun.COM 
199*8348SEric.Yu@Sun.COM 	nso = so->so_acceptq_head;
200*8348SEric.Yu@Sun.COM 
201*8348SEric.Yu@Sun.COM 	while (nso != NULL) {
202*8348SEric.Yu@Sun.COM 		struct sonode *nnso = NULL;
203*8348SEric.Yu@Sun.COM 
204*8348SEric.Yu@Sun.COM 		nnso = nso->so_acceptq_next;
205*8348SEric.Yu@Sun.COM 		nso->so_acceptq_next = NULL;
206*8348SEric.Yu@Sun.COM 		/*
207*8348SEric.Yu@Sun.COM 		 * Since the socket is on the accept queue, there can
208*8348SEric.Yu@Sun.COM 		 * only be one reference. We drop the reference and
209*8348SEric.Yu@Sun.COM 		 * just blow off the socket.
210*8348SEric.Yu@Sun.COM 		 */
211*8348SEric.Yu@Sun.COM 		ASSERT(nso->so_count == 1);
212*8348SEric.Yu@Sun.COM 		nso->so_count--;
213*8348SEric.Yu@Sun.COM 		socket_destroy(nso);
214*8348SEric.Yu@Sun.COM 		nso = nnso;
215*8348SEric.Yu@Sun.COM 	}
216*8348SEric.Yu@Sun.COM 
217*8348SEric.Yu@Sun.COM 	so->so_acceptq_head = NULL;
218*8348SEric.Yu@Sun.COM 	so->so_acceptq_tail = &so->so_acceptq_head;
219*8348SEric.Yu@Sun.COM 	so->so_acceptq_len = 0;
220*8348SEric.Yu@Sun.COM }
221*8348SEric.Yu@Sun.COM 
222*8348SEric.Yu@Sun.COM int
223*8348SEric.Yu@Sun.COM so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
224*8348SEric.Yu@Sun.COM     sock_connid_t id)
225*8348SEric.Yu@Sun.COM {
226*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_lock));
227*8348SEric.Yu@Sun.COM 
228*8348SEric.Yu@Sun.COM 	/*
229*8348SEric.Yu@Sun.COM 	 * The protocol has notified us that a connection attempt is being
230*8348SEric.Yu@Sun.COM 	 * made, so before we wait for a notification to arrive we must
231*8348SEric.Yu@Sun.COM 	 * clear out any errors associated with earlier connection attempts.
232*8348SEric.Yu@Sun.COM 	 */
233*8348SEric.Yu@Sun.COM 	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
234*8348SEric.Yu@Sun.COM 		so->so_error = 0;
235*8348SEric.Yu@Sun.COM 
236*8348SEric.Yu@Sun.COM 	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
237*8348SEric.Yu@Sun.COM 		if (nonblock)
238*8348SEric.Yu@Sun.COM 			return (EINPROGRESS);
239*8348SEric.Yu@Sun.COM 
240*8348SEric.Yu@Sun.COM 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
241*8348SEric.Yu@Sun.COM 			return (EINTR);
242*8348SEric.Yu@Sun.COM 
243*8348SEric.Yu@Sun.COM 		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
244*8348SEric.Yu@Sun.COM 			return (EINTR);
245*8348SEric.Yu@Sun.COM 	}
246*8348SEric.Yu@Sun.COM 
247*8348SEric.Yu@Sun.COM 	if (so->so_error != 0)
248*8348SEric.Yu@Sun.COM 		return (sogeterr(so, B_TRUE));
249*8348SEric.Yu@Sun.COM 	/*
250*8348SEric.Yu@Sun.COM 	 * Under normal circumstances, so_error should contain an error
251*8348SEric.Yu@Sun.COM 	 * in case the connect failed. However, it is possible for another
252*8348SEric.Yu@Sun.COM 	 * thread to come in a consume the error, so generate a sensible
253*8348SEric.Yu@Sun.COM 	 * error in that case.
254*8348SEric.Yu@Sun.COM 	 */
255*8348SEric.Yu@Sun.COM 	if ((so->so_state & SS_ISCONNECTED) == 0)
256*8348SEric.Yu@Sun.COM 		return (ECONNREFUSED);
257*8348SEric.Yu@Sun.COM 
258*8348SEric.Yu@Sun.COM 	return (0);
259*8348SEric.Yu@Sun.COM }
260*8348SEric.Yu@Sun.COM 
261*8348SEric.Yu@Sun.COM /*
262*8348SEric.Yu@Sun.COM  * int so_wait_connected(struct sonode *so, boolean_t nonblock,
263*8348SEric.Yu@Sun.COM  *    sock_connid_t id)
264*8348SEric.Yu@Sun.COM  *
265*8348SEric.Yu@Sun.COM  * Wait until the socket is connected or an error has occured.
266*8348SEric.Yu@Sun.COM  *
267*8348SEric.Yu@Sun.COM  * Arguments:
268*8348SEric.Yu@Sun.COM  *   so	      - socket
269*8348SEric.Yu@Sun.COM  *   nonblock - indicate whether it's ok to sleep if the connection has
270*8348SEric.Yu@Sun.COM  *		not yet been established
271*8348SEric.Yu@Sun.COM  *   gen      - generation number that was returned by the protocol
272*8348SEric.Yu@Sun.COM  *		when the operation was started
273*8348SEric.Yu@Sun.COM  *
274*8348SEric.Yu@Sun.COM  * Returns:
275*8348SEric.Yu@Sun.COM  *   0 if the connection attempt was successful, or an error indicating why
276*8348SEric.Yu@Sun.COM  *   the connection attempt failed.
277*8348SEric.Yu@Sun.COM  */
278*8348SEric.Yu@Sun.COM int
279*8348SEric.Yu@Sun.COM so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
280*8348SEric.Yu@Sun.COM {
281*8348SEric.Yu@Sun.COM 	int error;
282*8348SEric.Yu@Sun.COM 
283*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
284*8348SEric.Yu@Sun.COM 	error = so_wait_connected_locked(so, nonblock, id);
285*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
286*8348SEric.Yu@Sun.COM 
287*8348SEric.Yu@Sun.COM 	return (error);
288*8348SEric.Yu@Sun.COM }
289*8348SEric.Yu@Sun.COM 
290*8348SEric.Yu@Sun.COM int
291*8348SEric.Yu@Sun.COM so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
292*8348SEric.Yu@Sun.COM {
293*8348SEric.Yu@Sun.COM 	int error;
294*8348SEric.Yu@Sun.COM 
295*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_lock));
296*8348SEric.Yu@Sun.COM 	while (so->so_snd_qfull) {
297*8348SEric.Yu@Sun.COM 		if (so->so_state & SS_CANTSENDMORE)
298*8348SEric.Yu@Sun.COM 			return (EPIPE);
299*8348SEric.Yu@Sun.COM 		if (dontblock)
300*8348SEric.Yu@Sun.COM 			return (EWOULDBLOCK);
301*8348SEric.Yu@Sun.COM 
302*8348SEric.Yu@Sun.COM 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
303*8348SEric.Yu@Sun.COM 			return (EINTR);
304*8348SEric.Yu@Sun.COM 
305*8348SEric.Yu@Sun.COM 		if (so->so_sndtimeo == 0) {
306*8348SEric.Yu@Sun.COM 			/*
307*8348SEric.Yu@Sun.COM 			 * Zero means disable timeout.
308*8348SEric.Yu@Sun.COM 			 */
309*8348SEric.Yu@Sun.COM 			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
310*8348SEric.Yu@Sun.COM 		} else {
311*8348SEric.Yu@Sun.COM 			clock_t now;
312*8348SEric.Yu@Sun.COM 
313*8348SEric.Yu@Sun.COM 			time_to_wait(&now, so->so_sndtimeo);
314*8348SEric.Yu@Sun.COM 			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
315*8348SEric.Yu@Sun.COM 			    now);
316*8348SEric.Yu@Sun.COM 		}
317*8348SEric.Yu@Sun.COM 		if (error == 0)
318*8348SEric.Yu@Sun.COM 			return (EINTR);
319*8348SEric.Yu@Sun.COM 		else if (error == -1)
320*8348SEric.Yu@Sun.COM 			return (ETIME);
321*8348SEric.Yu@Sun.COM 	}
322*8348SEric.Yu@Sun.COM 	return (0);
323*8348SEric.Yu@Sun.COM }
324*8348SEric.Yu@Sun.COM 
325*8348SEric.Yu@Sun.COM /*
326*8348SEric.Yu@Sun.COM  * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
327*8348SEric.Yu@Sun.COM  *
328*8348SEric.Yu@Sun.COM  * Wait for the transport to notify us about send buffers becoming
329*8348SEric.Yu@Sun.COM  * available.
330*8348SEric.Yu@Sun.COM  */
331*8348SEric.Yu@Sun.COM int
332*8348SEric.Yu@Sun.COM so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
333*8348SEric.Yu@Sun.COM {
334*8348SEric.Yu@Sun.COM 	int error = 0;
335*8348SEric.Yu@Sun.COM 
336*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
337*8348SEric.Yu@Sun.COM 	if (so->so_snd_qfull) {
338*8348SEric.Yu@Sun.COM 		so->so_snd_wakeup = B_TRUE;
339*8348SEric.Yu@Sun.COM 		error = so_snd_wait_qnotfull_locked(so, dontblock);
340*8348SEric.Yu@Sun.COM 		so->so_snd_wakeup = B_FALSE;
341*8348SEric.Yu@Sun.COM 	}
342*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
343*8348SEric.Yu@Sun.COM 
344*8348SEric.Yu@Sun.COM 	return (error);
345*8348SEric.Yu@Sun.COM }
346*8348SEric.Yu@Sun.COM 
347*8348SEric.Yu@Sun.COM void
348*8348SEric.Yu@Sun.COM so_snd_qfull(struct sonode *so)
349*8348SEric.Yu@Sun.COM {
350*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
351*8348SEric.Yu@Sun.COM 	so->so_snd_qfull = B_TRUE;
352*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
353*8348SEric.Yu@Sun.COM }
354*8348SEric.Yu@Sun.COM 
355*8348SEric.Yu@Sun.COM void
356*8348SEric.Yu@Sun.COM so_snd_qnotfull(struct sonode *so)
357*8348SEric.Yu@Sun.COM {
358*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
359*8348SEric.Yu@Sun.COM 	so->so_snd_qfull = B_FALSE;
360*8348SEric.Yu@Sun.COM 	/* wake up everyone waiting for buffers */
361*8348SEric.Yu@Sun.COM 	cv_broadcast(&so->so_snd_cv);
362*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
363*8348SEric.Yu@Sun.COM }
364*8348SEric.Yu@Sun.COM 
365*8348SEric.Yu@Sun.COM /*
366*8348SEric.Yu@Sun.COM  * Change the process/process group to which SIGIO is sent.
367*8348SEric.Yu@Sun.COM  */
368*8348SEric.Yu@Sun.COM int
369*8348SEric.Yu@Sun.COM socket_chgpgrp(struct sonode *so, pid_t pid)
370*8348SEric.Yu@Sun.COM {
371*8348SEric.Yu@Sun.COM 	int error;
372*8348SEric.Yu@Sun.COM 
373*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_lock));
374*8348SEric.Yu@Sun.COM 	if (pid != 0) {
375*8348SEric.Yu@Sun.COM 		/*
376*8348SEric.Yu@Sun.COM 		 * Permissions check by sending signal 0.
377*8348SEric.Yu@Sun.COM 		 * Note that when kill fails it does a
378*8348SEric.Yu@Sun.COM 		 * set_errno causing the system call to fail.
379*8348SEric.Yu@Sun.COM 		 */
380*8348SEric.Yu@Sun.COM 		error = kill(pid, 0);
381*8348SEric.Yu@Sun.COM 		if (error != 0) {
382*8348SEric.Yu@Sun.COM 			return (error);
383*8348SEric.Yu@Sun.COM 		}
384*8348SEric.Yu@Sun.COM 	}
385*8348SEric.Yu@Sun.COM 	so->so_pgrp = pid;
386*8348SEric.Yu@Sun.COM 	return (0);
387*8348SEric.Yu@Sun.COM }
388*8348SEric.Yu@Sun.COM 
389*8348SEric.Yu@Sun.COM 
390*8348SEric.Yu@Sun.COM /*
391*8348SEric.Yu@Sun.COM  * Generate a SIGIO, for 'writable' events include siginfo structure,
392*8348SEric.Yu@Sun.COM  * for read events just send the signal.
393*8348SEric.Yu@Sun.COM  */
394*8348SEric.Yu@Sun.COM /*ARGSUSED*/
395*8348SEric.Yu@Sun.COM static void
396*8348SEric.Yu@Sun.COM socket_sigproc(proc_t *proc, int event)
397*8348SEric.Yu@Sun.COM {
398*8348SEric.Yu@Sun.COM 	k_siginfo_t info;
399*8348SEric.Yu@Sun.COM 
400*8348SEric.Yu@Sun.COM 	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
401*8348SEric.Yu@Sun.COM 
402*8348SEric.Yu@Sun.COM 	if (event & SOCKETSIG_WRITE) {
403*8348SEric.Yu@Sun.COM 		info.si_signo = SIGPOLL;
404*8348SEric.Yu@Sun.COM 		info.si_code = POLL_OUT;
405*8348SEric.Yu@Sun.COM 		info.si_errno = 0;
406*8348SEric.Yu@Sun.COM 		info.si_fd = 0;
407*8348SEric.Yu@Sun.COM 		info.si_band = 0;
408*8348SEric.Yu@Sun.COM 		sigaddq(proc, NULL, &info, KM_NOSLEEP);
409*8348SEric.Yu@Sun.COM 	}
410*8348SEric.Yu@Sun.COM 	if (event & SOCKETSIG_READ) {
411*8348SEric.Yu@Sun.COM 		sigtoproc(proc, NULL, SIGPOLL);
412*8348SEric.Yu@Sun.COM 	}
413*8348SEric.Yu@Sun.COM 	if (event & SOCKETSIG_URG) {
414*8348SEric.Yu@Sun.COM 		sigtoproc(proc, NULL, SIGURG);
415*8348SEric.Yu@Sun.COM 	}
416*8348SEric.Yu@Sun.COM }
417*8348SEric.Yu@Sun.COM 
418*8348SEric.Yu@Sun.COM void
419*8348SEric.Yu@Sun.COM socket_sendsig(struct sonode *so, int event)
420*8348SEric.Yu@Sun.COM {
421*8348SEric.Yu@Sun.COM 	proc_t *proc;
422*8348SEric.Yu@Sun.COM 
423*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_lock));
424*8348SEric.Yu@Sun.COM 
425*8348SEric.Yu@Sun.COM 	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
426*8348SEric.Yu@Sun.COM 	    event != SOCKETSIG_URG)) {
427*8348SEric.Yu@Sun.COM 		return;
428*8348SEric.Yu@Sun.COM 	}
429*8348SEric.Yu@Sun.COM 
430*8348SEric.Yu@Sun.COM 	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
431*8348SEric.Yu@Sun.COM 
432*8348SEric.Yu@Sun.COM 	if (so->so_pgrp > 0) {
433*8348SEric.Yu@Sun.COM 		/*
434*8348SEric.Yu@Sun.COM 		 * XXX This unfortunately still generates
435*8348SEric.Yu@Sun.COM 		 * a signal when a fd is closed but
436*8348SEric.Yu@Sun.COM 		 * the proc is active.
437*8348SEric.Yu@Sun.COM 		 */
438*8348SEric.Yu@Sun.COM 		mutex_enter(&pidlock);
439*8348SEric.Yu@Sun.COM 		proc = prfind(so->so_pgrp);
440*8348SEric.Yu@Sun.COM 		if (proc == NULL) {
441*8348SEric.Yu@Sun.COM 			mutex_exit(&pidlock);
442*8348SEric.Yu@Sun.COM 			return;
443*8348SEric.Yu@Sun.COM 		}
444*8348SEric.Yu@Sun.COM 		mutex_enter(&proc->p_lock);
445*8348SEric.Yu@Sun.COM 		mutex_exit(&pidlock);
446*8348SEric.Yu@Sun.COM 		socket_sigproc(proc, event);
447*8348SEric.Yu@Sun.COM 		mutex_exit(&proc->p_lock);
448*8348SEric.Yu@Sun.COM 	} else {
449*8348SEric.Yu@Sun.COM 		/*
450*8348SEric.Yu@Sun.COM 		 * Send to process group. Hold pidlock across
451*8348SEric.Yu@Sun.COM 		 * calls to socket_sigproc().
452*8348SEric.Yu@Sun.COM 		 */
453*8348SEric.Yu@Sun.COM 		pid_t pgrp = -so->so_pgrp;
454*8348SEric.Yu@Sun.COM 
455*8348SEric.Yu@Sun.COM 		mutex_enter(&pidlock);
456*8348SEric.Yu@Sun.COM 		proc = pgfind(pgrp);
457*8348SEric.Yu@Sun.COM 		while (proc != NULL) {
458*8348SEric.Yu@Sun.COM 			mutex_enter(&proc->p_lock);
459*8348SEric.Yu@Sun.COM 			socket_sigproc(proc, event);
460*8348SEric.Yu@Sun.COM 			mutex_exit(&proc->p_lock);
461*8348SEric.Yu@Sun.COM 			proc = proc->p_pglink;
462*8348SEric.Yu@Sun.COM 		}
463*8348SEric.Yu@Sun.COM 		mutex_exit(&pidlock);
464*8348SEric.Yu@Sun.COM 	}
465*8348SEric.Yu@Sun.COM }
466*8348SEric.Yu@Sun.COM 
467*8348SEric.Yu@Sun.COM #define	MIN(a, b) ((a) < (b) ? (a) : (b))
468*8348SEric.Yu@Sun.COM /* Copy userdata into a new mblk_t */
469*8348SEric.Yu@Sun.COM mblk_t *
470*8348SEric.Yu@Sun.COM socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
471*8348SEric.Yu@Sun.COM     size_t tail_len, int *errorp)
472*8348SEric.Yu@Sun.COM {
473*8348SEric.Yu@Sun.COM 	mblk_t	*head = NULL, **tail = &head;
474*8348SEric.Yu@Sun.COM 
475*8348SEric.Yu@Sun.COM 	ASSERT(iosize == INFPSZ || iosize > 0);
476*8348SEric.Yu@Sun.COM 
477*8348SEric.Yu@Sun.COM 	if (iosize == INFPSZ || iosize > uiop->uio_resid)
478*8348SEric.Yu@Sun.COM 		iosize = uiop->uio_resid;
479*8348SEric.Yu@Sun.COM 
480*8348SEric.Yu@Sun.COM 	if (maxblk == INFPSZ)
481*8348SEric.Yu@Sun.COM 		maxblk = iosize;
482*8348SEric.Yu@Sun.COM 
483*8348SEric.Yu@Sun.COM 	/* Nothing to do in these cases, so we're done */
484*8348SEric.Yu@Sun.COM 	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
485*8348SEric.Yu@Sun.COM 		goto done;
486*8348SEric.Yu@Sun.COM 
487*8348SEric.Yu@Sun.COM 	/*
488*8348SEric.Yu@Sun.COM 	 * We will enter the loop below if iosize is 0; it will allocate an
489*8348SEric.Yu@Sun.COM 	 * empty message block and call uiomove(9F) which will just return.
490*8348SEric.Yu@Sun.COM 	 * We could avoid that with an extra check but would only slow
491*8348SEric.Yu@Sun.COM 	 * down the much more likely case where iosize is larger than 0.
492*8348SEric.Yu@Sun.COM 	 */
493*8348SEric.Yu@Sun.COM 	do {
494*8348SEric.Yu@Sun.COM 		ssize_t blocksize;
495*8348SEric.Yu@Sun.COM 		mblk_t	*mp;
496*8348SEric.Yu@Sun.COM 
497*8348SEric.Yu@Sun.COM 		blocksize = MIN(iosize, maxblk);
498*8348SEric.Yu@Sun.COM 		ASSERT(blocksize >= 0);
499*8348SEric.Yu@Sun.COM 		if ((mp = allocb(wroff + blocksize + tail_len,
500*8348SEric.Yu@Sun.COM 		    BPRI_MED)) == NULL) {
501*8348SEric.Yu@Sun.COM 			*errorp = ENOMEM;
502*8348SEric.Yu@Sun.COM 			return (head);
503*8348SEric.Yu@Sun.COM 		}
504*8348SEric.Yu@Sun.COM 		mp->b_rptr += wroff;
505*8348SEric.Yu@Sun.COM 		mp->b_wptr = mp->b_rptr + blocksize;
506*8348SEric.Yu@Sun.COM 
507*8348SEric.Yu@Sun.COM 		*tail = mp;
508*8348SEric.Yu@Sun.COM 		tail = &mp->b_cont;
509*8348SEric.Yu@Sun.COM 
510*8348SEric.Yu@Sun.COM 		/* uiomove(9F) either returns 0 or EFAULT */
511*8348SEric.Yu@Sun.COM 		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
512*8348SEric.Yu@Sun.COM 		    UIO_WRITE, uiop)) != 0) {
513*8348SEric.Yu@Sun.COM 			ASSERT(*errorp != ENOMEM);
514*8348SEric.Yu@Sun.COM 			freemsg(head);
515*8348SEric.Yu@Sun.COM 			return (NULL);
516*8348SEric.Yu@Sun.COM 		}
517*8348SEric.Yu@Sun.COM 
518*8348SEric.Yu@Sun.COM 		iosize -= blocksize;
519*8348SEric.Yu@Sun.COM 	} while (iosize > 0);
520*8348SEric.Yu@Sun.COM 
521*8348SEric.Yu@Sun.COM done:
522*8348SEric.Yu@Sun.COM 	*errorp = 0;
523*8348SEric.Yu@Sun.COM 	return (head);
524*8348SEric.Yu@Sun.COM }
525*8348SEric.Yu@Sun.COM 
526*8348SEric.Yu@Sun.COM mblk_t *
527*8348SEric.Yu@Sun.COM socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
528*8348SEric.Yu@Sun.COM {
529*8348SEric.Yu@Sun.COM 	int error;
530*8348SEric.Yu@Sun.COM 	ptrdiff_t n;
531*8348SEric.Yu@Sun.COM 	mblk_t *nmp;
532*8348SEric.Yu@Sun.COM 
533*8348SEric.Yu@Sun.COM 	ASSERT(mp->b_wptr >= mp->b_rptr);
534*8348SEric.Yu@Sun.COM 
535*8348SEric.Yu@Sun.COM 	/*
536*8348SEric.Yu@Sun.COM 	 * max_read is the offset of the oobmark and read can not go pass
537*8348SEric.Yu@Sun.COM 	 * the oobmark.
538*8348SEric.Yu@Sun.COM 	 */
539*8348SEric.Yu@Sun.COM 	if (max_read == INFPSZ || max_read > uiop->uio_resid)
540*8348SEric.Yu@Sun.COM 		max_read = uiop->uio_resid;
541*8348SEric.Yu@Sun.COM 
542*8348SEric.Yu@Sun.COM 	do {
543*8348SEric.Yu@Sun.COM 		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
544*8348SEric.Yu@Sun.COM 			ASSERT(n > 0);
545*8348SEric.Yu@Sun.COM 
546*8348SEric.Yu@Sun.COM 			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
547*8348SEric.Yu@Sun.COM 			if (error != 0) {
548*8348SEric.Yu@Sun.COM 				freemsg(mp);
549*8348SEric.Yu@Sun.COM 				*errorp = error;
550*8348SEric.Yu@Sun.COM 				return (NULL);
551*8348SEric.Yu@Sun.COM 			}
552*8348SEric.Yu@Sun.COM 		}
553*8348SEric.Yu@Sun.COM 
554*8348SEric.Yu@Sun.COM 		mp->b_rptr += n;
555*8348SEric.Yu@Sun.COM 		max_read -= n;
556*8348SEric.Yu@Sun.COM 		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
557*8348SEric.Yu@Sun.COM 			/*
558*8348SEric.Yu@Sun.COM 			 * get rid of zero length mblks
559*8348SEric.Yu@Sun.COM 			 */
560*8348SEric.Yu@Sun.COM 			nmp = mp;
561*8348SEric.Yu@Sun.COM 			mp = mp->b_cont;
562*8348SEric.Yu@Sun.COM 			freeb(nmp);
563*8348SEric.Yu@Sun.COM 		}
564*8348SEric.Yu@Sun.COM 	} while (mp != NULL && max_read > 0);
565*8348SEric.Yu@Sun.COM 
566*8348SEric.Yu@Sun.COM 	*errorp = 0;
567*8348SEric.Yu@Sun.COM 	return (mp);
568*8348SEric.Yu@Sun.COM }
569*8348SEric.Yu@Sun.COM 
570*8348SEric.Yu@Sun.COM static void
571*8348SEric.Yu@Sun.COM so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
572*8348SEric.Yu@Sun.COM {
573*8348SEric.Yu@Sun.COM 	ASSERT(last_tail != NULL);
574*8348SEric.Yu@Sun.COM 	mp->b_next = so->so_rcv_q_head;
575*8348SEric.Yu@Sun.COM 	mp->b_prev = last_tail;
576*8348SEric.Yu@Sun.COM 	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
577*8348SEric.Yu@Sun.COM 
578*8348SEric.Yu@Sun.COM 	if (so->so_rcv_q_head == NULL) {
579*8348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_q_last_head == NULL);
580*8348SEric.Yu@Sun.COM 		so->so_rcv_q_last_head = mp;
581*8348SEric.Yu@Sun.COM #ifdef DEBUG
582*8348SEric.Yu@Sun.COM 	} else {
583*8348SEric.Yu@Sun.COM 		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
584*8348SEric.Yu@Sun.COM #endif
585*8348SEric.Yu@Sun.COM 	}
586*8348SEric.Yu@Sun.COM 	so->so_rcv_q_head = mp;
587*8348SEric.Yu@Sun.COM 
588*8348SEric.Yu@Sun.COM #ifdef DEBUG
589*8348SEric.Yu@Sun.COM 	if (so_debug_length) {
590*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
591*8348SEric.Yu@Sun.COM 		ASSERT(so_check_length(so));
592*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
593*8348SEric.Yu@Sun.COM 	}
594*8348SEric.Yu@Sun.COM #endif
595*8348SEric.Yu@Sun.COM }
596*8348SEric.Yu@Sun.COM 
597*8348SEric.Yu@Sun.COM static void
598*8348SEric.Yu@Sun.COM process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
599*8348SEric.Yu@Sun.COM {
600*8348SEric.Yu@Sun.COM 	ASSERT(mp_head->b_prev != NULL);
601*8348SEric.Yu@Sun.COM 	if (so->so_rcv_q_head  == NULL) {
602*8348SEric.Yu@Sun.COM 		so->so_rcv_q_head = mp_head;
603*8348SEric.Yu@Sun.COM 		so->so_rcv_q_last_head = mp_last_head;
604*8348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
605*8348SEric.Yu@Sun.COM 	} else {
606*8348SEric.Yu@Sun.COM 		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
607*8348SEric.Yu@Sun.COM 		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));
608*8348SEric.Yu@Sun.COM 
609*8348SEric.Yu@Sun.COM 		if (mp_head->b_next == NULL &&
610*8348SEric.Yu@Sun.COM 		    DB_TYPE(mp_head) == M_DATA &&
611*8348SEric.Yu@Sun.COM 		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
612*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
613*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
614*8348SEric.Yu@Sun.COM 			mp_head->b_prev = NULL;
615*8348SEric.Yu@Sun.COM 		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
616*8348SEric.Yu@Sun.COM 			/*
617*8348SEric.Yu@Sun.COM 			 * Append to last_head if more than one mblks, and both
618*8348SEric.Yu@Sun.COM 			 * mp_head and last_head are I/OAT mblks.
619*8348SEric.Yu@Sun.COM 			 */
620*8348SEric.Yu@Sun.COM 			ASSERT(mp_head->b_next != NULL);
621*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
622*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
623*8348SEric.Yu@Sun.COM 			mp_head->b_prev = NULL;
624*8348SEric.Yu@Sun.COM 
625*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_next = mp_head->b_next;
626*8348SEric.Yu@Sun.COM 			mp_head->b_next = NULL;
627*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head = mp_last_head;
628*8348SEric.Yu@Sun.COM 		} else {
629*8348SEric.Yu@Sun.COM #ifdef DEBUG
630*8348SEric.Yu@Sun.COM 			{
631*8348SEric.Yu@Sun.COM 				mblk_t *tmp_mblk;
632*8348SEric.Yu@Sun.COM 				tmp_mblk = mp_head;
633*8348SEric.Yu@Sun.COM 				while (tmp_mblk != NULL) {
634*8348SEric.Yu@Sun.COM 					ASSERT(tmp_mblk->b_prev != NULL);
635*8348SEric.Yu@Sun.COM 					tmp_mblk = tmp_mblk->b_next;
636*8348SEric.Yu@Sun.COM 				}
637*8348SEric.Yu@Sun.COM 			}
638*8348SEric.Yu@Sun.COM #endif
639*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_next = mp_head;
640*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head = mp_last_head;
641*8348SEric.Yu@Sun.COM 		}
642*8348SEric.Yu@Sun.COM 	}
643*8348SEric.Yu@Sun.COM }
644*8348SEric.Yu@Sun.COM 
/*
 * Dequeue (or, with MSG_PEEK, duplicate) the first message of the socket's
 * receive queue and move its data into `uiop'.
 *
 * Messages that were queued on so_rcv_head are first detached under so_lock
 * and handed to process_new_message(); the message that is consumed is the
 * one found at so_rcv_q_head afterwards.
 *
 *	mctlp	*mctlp is set to NULL first.  If the message begins with
 *		non-M_DATA mblks they are split off and returned through
 *		*mctlp.  With MSG_DUPCTRL, when unread data is put back, a
 *		duplicate of those mblks is returned instead and the
 *		originals are re-attached to the data that is put back.
 *	uiop	destination of the data; uio_fmode is also consulted for
 *		FNDELAY/FNONBLOCK.
 *	rvalp	r_val1 receives the `more' flags (MOREDATA when mblks of the
 *		message remained after the copy, except for a zero-length
 *		remainder after a partial read).
 *	flags	MSG_PEEK, MSG_TRUNC, MSG_DUPCTRL and MSG_DONTWAIT are examined.
 *
 * Returns 0 or an errno value: a failure from strwaitbuf() or
 * socopyoutuio(), the result of sogeterr() when so_error is set and nothing
 * could be read, EWOULDBLOCK for a non-blocking call with no data (turned
 * back into 0 at done: if the sodirect uioa still holds bytes), EINTR or
 * ETIME from the wait for data.  When SS_CANTRCVMORE is set and there is
 * nothing to read the function returns 0 without copying anything.
 */
645*8348SEric.Yu@Sun.COM int
646*8348SEric.Yu@Sun.COM so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
647*8348SEric.Yu@Sun.COM     rval_t *rvalp, int flags)
648*8348SEric.Yu@Sun.COM {
649*8348SEric.Yu@Sun.COM 	mblk_t	*mp, *nmp;
650*8348SEric.Yu@Sun.COM 	mblk_t	*savemp, *savemptail;
651*8348SEric.Yu@Sun.COM 	mblk_t	*new_msg_head;
652*8348SEric.Yu@Sun.COM 	mblk_t	*new_msg_last_head;
653*8348SEric.Yu@Sun.COM 	mblk_t	*last_tail;
654*8348SEric.Yu@Sun.COM 	boolean_t partial_read;
655*8348SEric.Yu@Sun.COM 	boolean_t reset_atmark = B_FALSE;
656*8348SEric.Yu@Sun.COM 	int more = 0;
657*8348SEric.Yu@Sun.COM 	int error;
658*8348SEric.Yu@Sun.COM 	ssize_t oobmark;
659*8348SEric.Yu@Sun.COM 	sodirect_t *sodp = so->so_direct;
660*8348SEric.Yu@Sun.COM 
661*8348SEric.Yu@Sun.COM 	partial_read = B_FALSE;
662*8348SEric.Yu@Sun.COM 	*mctlp = NULL;
	/*
	 * "again" restarts with so_lock dropped; "again1" is the entry point
	 * for restarts that already hold so_lock.
	 */
663*8348SEric.Yu@Sun.COM again:
664*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
665*8348SEric.Yu@Sun.COM again1:
666*8348SEric.Yu@Sun.COM #ifdef DEBUG
667*8348SEric.Yu@Sun.COM 	if (so_debug_length) {
668*8348SEric.Yu@Sun.COM 		ASSERT(so_check_length(so));
669*8348SEric.Yu@Sun.COM 	}
670*8348SEric.Yu@Sun.COM #endif
671*8348SEric.Yu@Sun.COM 	/*
672*8348SEric.Yu@Sun.COM 	 * First move messages from the dump area to processing area
673*8348SEric.Yu@Sun.COM 	 */
674*8348SEric.Yu@Sun.COM 	if (sodp != NULL) {
675*8348SEric.Yu@Sun.COM 		/* No need to grab sod_lockp since it pointers to so_lock */
676*8348SEric.Yu@Sun.COM 		if (sodp->sod_state & SOD_ENABLED) {
677*8348SEric.Yu@Sun.COM 			ASSERT(sodp->sod_lockp == &so->so_lock);
678*8348SEric.Yu@Sun.COM 
679*8348SEric.Yu@Sun.COM 			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
680*8348SEric.Yu@Sun.COM 				/* nothing to uioamove */
681*8348SEric.Yu@Sun.COM 				sodp = NULL;
682*8348SEric.Yu@Sun.COM 			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
683*8348SEric.Yu@Sun.COM 				sodp->sod_uioa.uioa_state &= UIOA_CLR;
684*8348SEric.Yu@Sun.COM 				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
685*8348SEric.Yu@Sun.COM 				/*
686*8348SEric.Yu@Sun.COM 				 * try to uioamove() the data that
687*8348SEric.Yu@Sun.COM 				 * has already queued.
688*8348SEric.Yu@Sun.COM 				 */
689*8348SEric.Yu@Sun.COM 				sod_uioa_so_init(so, sodp, uiop);
690*8348SEric.Yu@Sun.COM 			}
691*8348SEric.Yu@Sun.COM 		} else {
692*8348SEric.Yu@Sun.COM 			sodp = NULL;
693*8348SEric.Yu@Sun.COM 		}
694*8348SEric.Yu@Sun.COM 	}
	/*
	 * Detach the whole so_rcv_head list and snapshot so_oobmark while
	 * so_lock is still held.
	 */
695*8348SEric.Yu@Sun.COM 	new_msg_head = so->so_rcv_head;
696*8348SEric.Yu@Sun.COM 	new_msg_last_head = so->so_rcv_last_head;
697*8348SEric.Yu@Sun.COM 	so->so_rcv_head = NULL;
698*8348SEric.Yu@Sun.COM 	so->so_rcv_last_head = NULL;
699*8348SEric.Yu@Sun.COM 	oobmark = so->so_oobmark;
700*8348SEric.Yu@Sun.COM 	/*
701*8348SEric.Yu@Sun.COM 	 * We can release the lock as there can only be one reader
702*8348SEric.Yu@Sun.COM 	 */
703*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
704*8348SEric.Yu@Sun.COM 
	/*
	 * Remember that the socket was at the mark before this read; the
	 * mark state is cleared near the end if data was then consumed.
	 */
705*8348SEric.Yu@Sun.COM 	if (so->so_state & SS_RCVATMARK) {
706*8348SEric.Yu@Sun.COM 		reset_atmark = B_TRUE;
707*8348SEric.Yu@Sun.COM 	}
708*8348SEric.Yu@Sun.COM 	if (new_msg_head != NULL) {
709*8348SEric.Yu@Sun.COM 		process_new_message(so, new_msg_head, new_msg_last_head);
710*8348SEric.Yu@Sun.COM 	}
711*8348SEric.Yu@Sun.COM 	savemp = savemptail = NULL;
712*8348SEric.Yu@Sun.COM 	rvalp->r_val1 = 0;
713*8348SEric.Yu@Sun.COM 	error = 0;
714*8348SEric.Yu@Sun.COM 	mp = so->so_rcv_q_head;
715*8348SEric.Yu@Sun.COM 
	/*
	 * Consume the head message only if so_rcv_timer_tid is 0 or at least
	 * so_rcv_thresh bytes are queued.
	 * NOTE(review): so_rcv_timer_tid presumably identifies a pending
	 * receive timer - confirm against the code that sets it.
	 */
716*8348SEric.Yu@Sun.COM 	if (mp != NULL &&
717*8348SEric.Yu@Sun.COM 	    (so->so_rcv_timer_tid == 0 ||
718*8348SEric.Yu@Sun.COM 	    so->so_rcv_queued >= so->so_rcv_thresh)) {
719*8348SEric.Yu@Sun.COM 		partial_read = B_FALSE;
720*8348SEric.Yu@Sun.COM 
721*8348SEric.Yu@Sun.COM 		if (flags & MSG_PEEK) {
			/*
			 * Peeking works on a duplicate (or, failing that, a
			 * copy) so the queued message is left untouched.  If
			 * neither can be allocated, wait for buffers and
			 * start over.
			 * NOTE(review): the error return below bypasses the
			 * done: label - confirm no sodirect cleanup is needed.
			 */
722*8348SEric.Yu@Sun.COM 			if ((nmp = dupmsg(mp)) == NULL &&
723*8348SEric.Yu@Sun.COM 			    (nmp = copymsg(mp)) == NULL) {
724*8348SEric.Yu@Sun.COM 				size_t size = msgsize(mp);
725*8348SEric.Yu@Sun.COM 
726*8348SEric.Yu@Sun.COM 				error = strwaitbuf(size, BPRI_HI);
727*8348SEric.Yu@Sun.COM 				if (error) {
728*8348SEric.Yu@Sun.COM 					return (error);
729*8348SEric.Yu@Sun.COM 				}
730*8348SEric.Yu@Sun.COM 				goto again;
731*8348SEric.Yu@Sun.COM 			}
732*8348SEric.Yu@Sun.COM 			mp = nmp;
733*8348SEric.Yu@Sun.COM 		} else {
			/*
			 * Unlink the head message from so_rcv_q_head.  Its
			 * b_prev value is kept in last_tail and handed to
			 * so_prepend_msg() if a remainder is put back.
			 */
734*8348SEric.Yu@Sun.COM 			ASSERT(mp->b_prev != NULL);
735*8348SEric.Yu@Sun.COM 			last_tail = mp->b_prev;
736*8348SEric.Yu@Sun.COM 			mp->b_prev = NULL;
737*8348SEric.Yu@Sun.COM 			so->so_rcv_q_head = mp->b_next;
738*8348SEric.Yu@Sun.COM 			if (so->so_rcv_q_head == NULL) {
739*8348SEric.Yu@Sun.COM 				so->so_rcv_q_last_head = NULL;
740*8348SEric.Yu@Sun.COM 			}
741*8348SEric.Yu@Sun.COM 			mp->b_next = NULL;
742*8348SEric.Yu@Sun.COM 		}
743*8348SEric.Yu@Sun.COM 
744*8348SEric.Yu@Sun.COM 		ASSERT(mctlp != NULL);
745*8348SEric.Yu@Sun.COM 		/*
746*8348SEric.Yu@Sun.COM 		 * First process PROTO or PCPROTO blocks, if any.
747*8348SEric.Yu@Sun.COM 		 */
748*8348SEric.Yu@Sun.COM 		if (DB_TYPE(mp) != M_DATA) {
749*8348SEric.Yu@Sun.COM 			*mctlp = mp;
750*8348SEric.Yu@Sun.COM 			savemp = mp;
751*8348SEric.Yu@Sun.COM 			savemptail = mp;
752*8348SEric.Yu@Sun.COM 			ASSERT(DB_TYPE(mp) == M_PROTO ||
753*8348SEric.Yu@Sun.COM 			    DB_TYPE(mp) == M_PCPROTO);
754*8348SEric.Yu@Sun.COM 			while (mp->b_cont != NULL &&
755*8348SEric.Yu@Sun.COM 			    DB_TYPE(mp->b_cont) != M_DATA) {
756*8348SEric.Yu@Sun.COM 				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
757*8348SEric.Yu@Sun.COM 				    DB_TYPE(mp->b_cont) == M_PCPROTO);
758*8348SEric.Yu@Sun.COM 				mp = mp->b_cont;
759*8348SEric.Yu@Sun.COM 				savemptail = mp;
760*8348SEric.Yu@Sun.COM 			}
			/*
			 * Cut the chain after the last non-data mblk:
			 * savemp..savemptail is the control part, mp the
			 * data part that follows it.
			 */
761*8348SEric.Yu@Sun.COM 			mp = savemptail->b_cont;
762*8348SEric.Yu@Sun.COM 			savemptail->b_cont = NULL;
763*8348SEric.Yu@Sun.COM 		}
764*8348SEric.Yu@Sun.COM 
765*8348SEric.Yu@Sun.COM 		ASSERT(DB_TYPE(mp) == M_DATA);
766*8348SEric.Yu@Sun.COM 		/*
767*8348SEric.Yu@Sun.COM 		 * Now process DATA blocks, if any. Note that for sodirect
768*8348SEric.Yu@Sun.COM 		 * enabled socket, uio_resid can be 0.
769*8348SEric.Yu@Sun.COM 		 */
770*8348SEric.Yu@Sun.COM 		if (uiop->uio_resid >= 0) {
771*8348SEric.Yu@Sun.COM 			ssize_t copied = 0;
772*8348SEric.Yu@Sun.COM 
773*8348SEric.Yu@Sun.COM 			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
774*8348SEric.Yu@Sun.COM 				mutex_enter(sodp->sod_lockp);
775*8348SEric.Yu@Sun.COM 				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
776*8348SEric.Yu@Sun.COM 				copied = sod_uioa_mblk(so, mp);
777*8348SEric.Yu@Sun.COM 				if (copied > 0)
778*8348SEric.Yu@Sun.COM 					partial_read = B_TRUE;
779*8348SEric.Yu@Sun.COM 				mutex_exit(sodp->sod_lockp);
780*8348SEric.Yu@Sun.COM 				/* mark this mblk as processed */
781*8348SEric.Yu@Sun.COM 				mp = NULL;
782*8348SEric.Yu@Sun.COM 			} else {
783*8348SEric.Yu@Sun.COM 				ssize_t oldresid = uiop->uio_resid;
784*8348SEric.Yu@Sun.COM 
				/*
				 * If the first mblk is shorter than
				 * so_mblk_pull_len, try to pull the whole
				 * message into one mblk; on success mp is
				 * then also the tail.
				 */
785*8348SEric.Yu@Sun.COM 				if (MBLKL(mp) < so_mblk_pull_len) {
786*8348SEric.Yu@Sun.COM 					if (pullupmsg(mp, -1) == 1) {
787*8348SEric.Yu@Sun.COM 						last_tail = mp;
788*8348SEric.Yu@Sun.COM 					}
789*8348SEric.Yu@Sun.COM 				}
790*8348SEric.Yu@Sun.COM 				/*
791*8348SEric.Yu@Sun.COM 				 * Can not read beyond the oobmark
792*8348SEric.Yu@Sun.COM 				 */
793*8348SEric.Yu@Sun.COM 				mp = socopyoutuio(mp, uiop,
794*8348SEric.Yu@Sun.COM 				    oobmark == 0 ? INFPSZ : oobmark, &error);
795*8348SEric.Yu@Sun.COM 				if (error != 0) {
796*8348SEric.Yu@Sun.COM 					freemsg(*mctlp);
797*8348SEric.Yu@Sun.COM 					*mctlp = NULL;
798*8348SEric.Yu@Sun.COM 					more = 0;
799*8348SEric.Yu@Sun.COM 					goto done;
800*8348SEric.Yu@Sun.COM 				}
801*8348SEric.Yu@Sun.COM 				ASSERT(oldresid >= uiop->uio_resid);
802*8348SEric.Yu@Sun.COM 				copied = oldresid - uiop->uio_resid;
803*8348SEric.Yu@Sun.COM 				if (oldresid > uiop->uio_resid)
804*8348SEric.Yu@Sun.COM 					partial_read = B_TRUE;
805*8348SEric.Yu@Sun.COM 			}
806*8348SEric.Yu@Sun.COM 			ASSERT(copied >= 0);
			/*
			 * Account for the bytes consumed (never when
			 * peeking): shrink so_rcv_queued, advance towards the
			 * OOB mark, and reopen flow control once the queue
			 * has drained below so_rcvlowat.
			 */
807*8348SEric.Yu@Sun.COM 			if (copied > 0 && !(flags & MSG_PEEK)) {
808*8348SEric.Yu@Sun.COM 				mutex_enter(&so->so_lock);
809*8348SEric.Yu@Sun.COM 				so->so_rcv_queued -= copied;
810*8348SEric.Yu@Sun.COM 				ASSERT(so->so_oobmark >= 0);
811*8348SEric.Yu@Sun.COM 				if (so->so_oobmark > 0) {
812*8348SEric.Yu@Sun.COM 					so->so_oobmark -= copied;
813*8348SEric.Yu@Sun.COM 					ASSERT(so->so_oobmark >= 0);
814*8348SEric.Yu@Sun.COM 					if (so->so_oobmark == 0) {
815*8348SEric.Yu@Sun.COM 						ASSERT(so->so_state &
816*8348SEric.Yu@Sun.COM 						    SS_OOBPEND);
817*8348SEric.Yu@Sun.COM 						so->so_oobmark = 0;
818*8348SEric.Yu@Sun.COM 						so->so_state |= SS_RCVATMARK;
819*8348SEric.Yu@Sun.COM 					}
820*8348SEric.Yu@Sun.COM 				}
821*8348SEric.Yu@Sun.COM 				if (so->so_flowctrld && so->so_rcv_queued <
822*8348SEric.Yu@Sun.COM 				    so->so_rcvlowat) {
823*8348SEric.Yu@Sun.COM 					so->so_flowctrld = B_FALSE;
824*8348SEric.Yu@Sun.COM 					mutex_exit(&so->so_lock);
825*8348SEric.Yu@Sun.COM 					/*
826*8348SEric.Yu@Sun.COM 					 * open up flow control
827*8348SEric.Yu@Sun.COM 					 */
828*8348SEric.Yu@Sun.COM 					(*so->so_downcalls->sd_clr_flowctrl)
829*8348SEric.Yu@Sun.COM 					    (so->so_proto_handle);
830*8348SEric.Yu@Sun.COM 				} else {
831*8348SEric.Yu@Sun.COM 					mutex_exit(&so->so_lock);
832*8348SEric.Yu@Sun.COM 				}
833*8348SEric.Yu@Sun.COM 			}
834*8348SEric.Yu@Sun.COM 		}
835*8348SEric.Yu@Sun.COM 		if (mp != NULL) { /* more data blocks in msg */
836*8348SEric.Yu@Sun.COM 			more |= MOREDATA;
			/*
			 * Three outcomes for the remainder: drop it (peek or
			 * truncate), drop it silently when it is an empty
			 * tail of a partially read message, or put it back
			 * at the front of the queue.
			 */
837*8348SEric.Yu@Sun.COM 			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
838*8348SEric.Yu@Sun.COM 				if (flags & MSG_TRUNC) {
839*8348SEric.Yu@Sun.COM 					mutex_enter(&so->so_lock);
840*8348SEric.Yu@Sun.COM 					so->so_rcv_queued -= msgdsize(mp);
841*8348SEric.Yu@Sun.COM 					mutex_exit(&so->so_lock);
842*8348SEric.Yu@Sun.COM 				}
843*8348SEric.Yu@Sun.COM 				freemsg(mp);
844*8348SEric.Yu@Sun.COM 			} else if (partial_read && !somsghasdata(mp)) {
845*8348SEric.Yu@Sun.COM 				/*
846*8348SEric.Yu@Sun.COM 				 * Avoid queuing a zero-length tail part of
847*8348SEric.Yu@Sun.COM 				 * a message. partial_read == 1 indicates that
848*8348SEric.Yu@Sun.COM 				 * we read some of the message.
849*8348SEric.Yu@Sun.COM 				 */
850*8348SEric.Yu@Sun.COM 				freemsg(mp);
851*8348SEric.Yu@Sun.COM 				more &= ~MOREDATA;
852*8348SEric.Yu@Sun.COM 			} else {
853*8348SEric.Yu@Sun.COM 				if (savemp != NULL &&
854*8348SEric.Yu@Sun.COM 				    (flags & MSG_DUPCTRL)) {
855*8348SEric.Yu@Sun.COM 					mblk_t *nmp;
856*8348SEric.Yu@Sun.COM 					/*
857*8348SEric.Yu@Sun.COM 					 * There should only be non data mblks
858*8348SEric.Yu@Sun.COM 					 */
859*8348SEric.Yu@Sun.COM 					ASSERT(DB_TYPE(savemp) != M_DATA &&
860*8348SEric.Yu@Sun.COM 					    DB_TYPE(savemptail) != M_DATA);
861*8348SEric.Yu@Sun.COM try_again:
862*8348SEric.Yu@Sun.COM 					if ((nmp = dupmsg(savemp)) == NULL &&
863*8348SEric.Yu@Sun.COM 					    (nmp = copymsg(savemp)) == NULL) {
864*8348SEric.Yu@Sun.COM 
865*8348SEric.Yu@Sun.COM 						size_t size = msgsize(savemp);
866*8348SEric.Yu@Sun.COM 
867*8348SEric.Yu@Sun.COM 						error = strwaitbuf(size,
868*8348SEric.Yu@Sun.COM 						    BPRI_HI);
869*8348SEric.Yu@Sun.COM 						if (error != 0) {
870*8348SEric.Yu@Sun.COM 							/*
871*8348SEric.Yu@Sun.COM 							 * In case we
872*8348SEric.Yu@Sun.COM 							 * cannot copy
873*8348SEric.Yu@Sun.COM 							 * control data
874*8348SEric.Yu@Sun.COM 							 * free the remaining
875*8348SEric.Yu@Sun.COM 							 * data.
876*8348SEric.Yu@Sun.COM 							 */
877*8348SEric.Yu@Sun.COM 							freemsg(mp);
878*8348SEric.Yu@Sun.COM 							goto done;
879*8348SEric.Yu@Sun.COM 						}
880*8348SEric.Yu@Sun.COM 						goto try_again;
881*8348SEric.Yu@Sun.COM 					}
882*8348SEric.Yu@Sun.COM 
883*8348SEric.Yu@Sun.COM 					ASSERT(nmp != NULL);
884*8348SEric.Yu@Sun.COM 					ASSERT(DB_TYPE(nmp) != M_DATA);
					/*
					 * Re-attach the original control
					 * mblks in front of the remaining
					 * data and give the caller the
					 * duplicate instead.
					 */
885*8348SEric.Yu@Sun.COM 					savemptail->b_cont = mp;
886*8348SEric.Yu@Sun.COM 					*mctlp = nmp;
887*8348SEric.Yu@Sun.COM 					mp = savemp;
888*8348SEric.Yu@Sun.COM 				}
889*8348SEric.Yu@Sun.COM 				/*
890*8348SEric.Yu@Sun.COM 				 * putback mp
891*8348SEric.Yu@Sun.COM 				 */
892*8348SEric.Yu@Sun.COM 				so_prepend_msg(so, mp, last_tail);
893*8348SEric.Yu@Sun.COM 			}
894*8348SEric.Yu@Sun.COM 		}
895*8348SEric.Yu@Sun.COM 
896*8348SEric.Yu@Sun.COM 		/* fast check so_rcv_head if there is more data */
897*8348SEric.Yu@Sun.COM 		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
898*8348SEric.Yu@Sun.COM 		    *mctlp == NULL && uiop->uio_resid > 0 &&
899*8348SEric.Yu@Sun.COM 		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
900*8348SEric.Yu@Sun.COM 			goto again;
901*8348SEric.Yu@Sun.COM 		}
902*8348SEric.Yu@Sun.COM 	} else if (!partial_read) {
		/*
		 * Nothing was consumed on this pass and nothing was read on
		 * an earlier one: report a pending socket error, or decide
		 * whether to fail, return empty-handed or sleep.
		 * NOTE(review): the error return below bypasses the done:
		 * label - confirm no sodirect cleanup is needed.
		 */
903*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
904*8348SEric.Yu@Sun.COM 		if (so->so_error != 0) {
905*8348SEric.Yu@Sun.COM 			error = sogeterr(so, !(flags & MSG_PEEK));
906*8348SEric.Yu@Sun.COM 			mutex_exit(&so->so_lock);
907*8348SEric.Yu@Sun.COM 			return (error);
908*8348SEric.Yu@Sun.COM 		}
909*8348SEric.Yu@Sun.COM 		/*
910*8348SEric.Yu@Sun.COM 		 * No pending data. Return right away for nonblocking
911*8348SEric.Yu@Sun.COM 		 * socket, otherwise sleep waiting for data.
912*8348SEric.Yu@Sun.COM 		 */
913*8348SEric.Yu@Sun.COM 		if (!(so->so_state & SS_CANTRCVMORE)) {
914*8348SEric.Yu@Sun.COM 			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
915*8348SEric.Yu@Sun.COM 			    (flags & MSG_DONTWAIT)) {
916*8348SEric.Yu@Sun.COM 				error = EWOULDBLOCK;
917*8348SEric.Yu@Sun.COM 			} else {
918*8348SEric.Yu@Sun.COM 				if (so->so_state & (SS_CLOSING |
919*8348SEric.Yu@Sun.COM 				    SS_FALLBACK_PENDING)) {
920*8348SEric.Yu@Sun.COM 					mutex_exit(&so->so_lock);
921*8348SEric.Yu@Sun.COM 					error = EINTR;
922*8348SEric.Yu@Sun.COM 					goto done;
923*8348SEric.Yu@Sun.COM 				}
924*8348SEric.Yu@Sun.COM 
				/*
				 * New messages showed up on so_rcv_head while
				 * the lock was dropped; process them instead
				 * of sleeping (so_lock is still held here).
				 */
925*8348SEric.Yu@Sun.COM 				if (so->so_rcv_head != NULL) {
926*8348SEric.Yu@Sun.COM 					goto again1;
927*8348SEric.Yu@Sun.COM 				}
928*8348SEric.Yu@Sun.COM 				so->so_rcv_wakeup = B_TRUE;
929*8348SEric.Yu@Sun.COM 				so->so_rcv_wanted = uiop->uio_resid;
930*8348SEric.Yu@Sun.COM 				if (so->so_rcvtimeo == 0) {
931*8348SEric.Yu@Sun.COM 					/*
932*8348SEric.Yu@Sun.COM 					 * Zero means disable timeout.
933*8348SEric.Yu@Sun.COM 					 */
934*8348SEric.Yu@Sun.COM 					error = cv_wait_sig(&so->so_rcv_cv,
935*8348SEric.Yu@Sun.COM 					    &so->so_lock);
936*8348SEric.Yu@Sun.COM 				} else {
937*8348SEric.Yu@Sun.COM 					clock_t now;
938*8348SEric.Yu@Sun.COM 					time_to_wait(&now, so->so_rcvtimeo);
939*8348SEric.Yu@Sun.COM 					error = cv_timedwait_sig(&so->so_rcv_cv,
940*8348SEric.Yu@Sun.COM 					    &so->so_lock, now);
941*8348SEric.Yu@Sun.COM 				}
942*8348SEric.Yu@Sun.COM 				so->so_rcv_wakeup = B_FALSE;
943*8348SEric.Yu@Sun.COM 				so->so_rcv_wanted = 0;
944*8348SEric.Yu@Sun.COM 
				/*
				 * Map the wait result: 0 becomes EINTR, -1
				 * becomes ETIME, and any other value means
				 * go back and look at the queues again.
				 * NOTE(review): presumably 0 means a signal
				 * and -1 a timeout, per condvar(9F); also
				 * confirm that ETIME (rather than EAGAIN) is
				 * what callers expect for a receive timeout.
				 */
945*8348SEric.Yu@Sun.COM 				if (error == 0) {
946*8348SEric.Yu@Sun.COM 					error = EINTR;
947*8348SEric.Yu@Sun.COM 				} else if (error == -1) {
948*8348SEric.Yu@Sun.COM 					error = ETIME;
949*8348SEric.Yu@Sun.COM 				} else {
950*8348SEric.Yu@Sun.COM 					goto again1;
951*8348SEric.Yu@Sun.COM 				}
952*8348SEric.Yu@Sun.COM 			}
953*8348SEric.Yu@Sun.COM 		}
954*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
955*8348SEric.Yu@Sun.COM 	}
956*8348SEric.Yu@Sun.COM 	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
957*8348SEric.Yu@Sun.COM 		/*
958*8348SEric.Yu@Sun.COM 		 * We are passed the mark, update state
959*8348SEric.Yu@Sun.COM 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
960*8348SEric.Yu@Sun.COM 		 * The draft Posix socket spec states that the mark should
961*8348SEric.Yu@Sun.COM 		 * not be cleared when peeking. We follow the latter.
962*8348SEric.Yu@Sun.COM 		 */
963*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
964*8348SEric.Yu@Sun.COM 		ASSERT(so_verify_oobstate(so));
965*8348SEric.Yu@Sun.COM 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
966*8348SEric.Yu@Sun.COM 		freemsg(so->so_oobmsg);
967*8348SEric.Yu@Sun.COM 		so->so_oobmsg = NULL;
968*8348SEric.Yu@Sun.COM 		ASSERT(so_verify_oobstate(so));
969*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
970*8348SEric.Yu@Sun.COM 	}
971*8348SEric.Yu@Sun.COM 	ASSERT(so->so_rcv_wakeup == B_FALSE);
972*8348SEric.Yu@Sun.COM done:
	/*
	 * Common exit: if sodirect uioa was enabled for this call, finish it
	 * and, when uioa_mbytes is non-zero, subtract what
	 * sod_uioa_mblk(so, NULL) reports from so_rcv_queued and do not
	 * report EWOULDBLOCK.
	 */
973*8348SEric.Yu@Sun.COM 	if (sodp != NULL) {
974*8348SEric.Yu@Sun.COM 		mutex_enter(sodp->sod_lockp);
975*8348SEric.Yu@Sun.COM 		if ((sodp->sod_state & SOD_ENABLED) &&
976*8348SEric.Yu@Sun.COM 		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
977*8348SEric.Yu@Sun.COM 			SOD_UIOAFINI(sodp);
978*8348SEric.Yu@Sun.COM 			if (sodp->sod_uioa.uioa_mbytes > 0) {
979*8348SEric.Yu@Sun.COM 				ASSERT(so->so_rcv_q_head != NULL ||
980*8348SEric.Yu@Sun.COM 				    so->so_rcv_head != NULL);
981*8348SEric.Yu@Sun.COM 				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
982*8348SEric.Yu@Sun.COM 				if (error == EWOULDBLOCK)
983*8348SEric.Yu@Sun.COM 					error = 0;
984*8348SEric.Yu@Sun.COM 			}
985*8348SEric.Yu@Sun.COM 		}
986*8348SEric.Yu@Sun.COM 		mutex_exit(sodp->sod_lockp);
987*8348SEric.Yu@Sun.COM 	}
988*8348SEric.Yu@Sun.COM #ifdef DEBUG
989*8348SEric.Yu@Sun.COM 	if (so_debug_length) {
990*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
991*8348SEric.Yu@Sun.COM 		ASSERT(so_check_length(so));
992*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
993*8348SEric.Yu@Sun.COM 	}
994*8348SEric.Yu@Sun.COM #endif
995*8348SEric.Yu@Sun.COM 	rvalp->r_val1 = more;
996*8348SEric.Yu@Sun.COM 	return (error);
997*8348SEric.Yu@Sun.COM }
998*8348SEric.Yu@Sun.COM 
999*8348SEric.Yu@Sun.COM void
1000*8348SEric.Yu@Sun.COM so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
1001*8348SEric.Yu@Sun.COM {
1002*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_lock));
1003*8348SEric.Yu@Sun.COM 
1004*8348SEric.Yu@Sun.COM #ifdef DEBUG
1005*8348SEric.Yu@Sun.COM 	if (so_debug_length) {
1006*8348SEric.Yu@Sun.COM 		ASSERT(so_check_length(so));
1007*8348SEric.Yu@Sun.COM 	}
1008*8348SEric.Yu@Sun.COM #endif
1009*8348SEric.Yu@Sun.COM 	so->so_rcv_queued += msg_size;
1010*8348SEric.Yu@Sun.COM 
1011*8348SEric.Yu@Sun.COM 	if (so->so_rcv_head == NULL) {
1012*8348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_last_head == NULL);
1013*8348SEric.Yu@Sun.COM 		so->so_rcv_head = mp;
1014*8348SEric.Yu@Sun.COM 		so->so_rcv_last_head = mp;
1015*8348SEric.Yu@Sun.COM 	} else if ((DB_TYPE(mp) == M_DATA &&
1016*8348SEric.Yu@Sun.COM 	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
1017*8348SEric.Yu@Sun.COM 	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
1018*8348SEric.Yu@Sun.COM 	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
1019*8348SEric.Yu@Sun.COM 		/* Added to the end */
1020*8348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_last_head != NULL);
1021*8348SEric.Yu@Sun.COM 		ASSERT(so->so_rcv_last_head->b_prev != NULL);
1022*8348SEric.Yu@Sun.COM 		so->so_rcv_last_head->b_prev->b_cont = mp;
1023*8348SEric.Yu@Sun.COM 	} else {
1024*8348SEric.Yu@Sun.COM 		/* Start a new end */
1025*8348SEric.Yu@Sun.COM 		so->so_rcv_last_head->b_next = mp;
1026*8348SEric.Yu@Sun.COM 		so->so_rcv_last_head = mp;
1027*8348SEric.Yu@Sun.COM 	}
1028*8348SEric.Yu@Sun.COM 	while (mp->b_cont != NULL)
1029*8348SEric.Yu@Sun.COM 		mp = mp->b_cont;
1030*8348SEric.Yu@Sun.COM 
1031*8348SEric.Yu@Sun.COM 	so->so_rcv_last_head->b_prev = mp;
1032*8348SEric.Yu@Sun.COM #ifdef DEBUG
1033*8348SEric.Yu@Sun.COM 	if (so_debug_length) {
1034*8348SEric.Yu@Sun.COM 		ASSERT(so_check_length(so));
1035*8348SEric.Yu@Sun.COM 	}
1036*8348SEric.Yu@Sun.COM #endif
1037*8348SEric.Yu@Sun.COM }
1038*8348SEric.Yu@Sun.COM 
1039*8348SEric.Yu@Sun.COM /*
1040*8348SEric.Yu@Sun.COM  * Return B_TRUE if there is data in the message, B_FALSE otherwise.
1041*8348SEric.Yu@Sun.COM  */
1042*8348SEric.Yu@Sun.COM boolean_t
1043*8348SEric.Yu@Sun.COM somsghasdata(mblk_t *mp)
1044*8348SEric.Yu@Sun.COM {
1045*8348SEric.Yu@Sun.COM 	for (; mp; mp = mp->b_cont)
1046*8348SEric.Yu@Sun.COM 		if (mp->b_datap->db_type == M_DATA) {
1047*8348SEric.Yu@Sun.COM 			ASSERT(mp->b_wptr >= mp->b_rptr);
1048*8348SEric.Yu@Sun.COM 			if (mp->b_wptr > mp->b_rptr)
1049*8348SEric.Yu@Sun.COM 				return (B_TRUE);
1050*8348SEric.Yu@Sun.COM 		}
1051*8348SEric.Yu@Sun.COM 	return (B_FALSE);
1052*8348SEric.Yu@Sun.COM }
1053*8348SEric.Yu@Sun.COM 
1054*8348SEric.Yu@Sun.COM /*
1055*8348SEric.Yu@Sun.COM  * Flush the read side of sockfs.
1056*8348SEric.Yu@Sun.COM  *
1057*8348SEric.Yu@Sun.COM  * The caller must be sure that a reader is not already active when the
1058*8348SEric.Yu@Sun.COM  * buffer is being flushed.
1059*8348SEric.Yu@Sun.COM  */
1060*8348SEric.Yu@Sun.COM void
1061*8348SEric.Yu@Sun.COM so_rcv_flush(struct sonode *so)
1062*8348SEric.Yu@Sun.COM {
1063*8348SEric.Yu@Sun.COM 	mblk_t  *mp;
1064*8348SEric.Yu@Sun.COM 
1065*8348SEric.Yu@Sun.COM 	ASSERT(MUTEX_HELD(&so->so_lock));
1066*8348SEric.Yu@Sun.COM 
1067*8348SEric.Yu@Sun.COM 	if (so->so_oobmsg != NULL) {
1068*8348SEric.Yu@Sun.COM 		freemsg(so->so_oobmsg);
1069*8348SEric.Yu@Sun.COM 		so->so_oobmsg = NULL;
1070*8348SEric.Yu@Sun.COM 		so->so_oobmark = 0;
1071*8348SEric.Yu@Sun.COM 		so->so_state &=
1072*8348SEric.Yu@Sun.COM 		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
1073*8348SEric.Yu@Sun.COM 	}
1074*8348SEric.Yu@Sun.COM 
1075*8348SEric.Yu@Sun.COM 	/*
1076*8348SEric.Yu@Sun.COM 	 * Free messages sitting in the send and recv queue
1077*8348SEric.Yu@Sun.COM 	 */
1078*8348SEric.Yu@Sun.COM 	while (so->so_rcv_q_head != NULL) {
1079*8348SEric.Yu@Sun.COM 		mp = so->so_rcv_q_head;
1080*8348SEric.Yu@Sun.COM 		so->so_rcv_q_head = mp->b_next;
1081*8348SEric.Yu@Sun.COM 		mp->b_next = mp->b_prev = NULL;
1082*8348SEric.Yu@Sun.COM 		freemsg(mp);
1083*8348SEric.Yu@Sun.COM 	}
1084*8348SEric.Yu@Sun.COM 	while (so->so_rcv_head != NULL) {
1085*8348SEric.Yu@Sun.COM 		mp = so->so_rcv_head;
1086*8348SEric.Yu@Sun.COM 		so->so_rcv_head = mp->b_next;
1087*8348SEric.Yu@Sun.COM 		mp->b_next = mp->b_prev = NULL;
1088*8348SEric.Yu@Sun.COM 		freemsg(mp);
1089*8348SEric.Yu@Sun.COM 	}
1090*8348SEric.Yu@Sun.COM 	so->so_rcv_queued = 0;
1091*8348SEric.Yu@Sun.COM 	so->so_rcv_q_head = NULL;
1092*8348SEric.Yu@Sun.COM 	so->so_rcv_q_last_head = NULL;
1093*8348SEric.Yu@Sun.COM 	so->so_rcv_head = NULL;
1094*8348SEric.Yu@Sun.COM 	so->so_rcv_last_head = NULL;
1095*8348SEric.Yu@Sun.COM }
1096*8348SEric.Yu@Sun.COM 
1097*8348SEric.Yu@Sun.COM /*
1098*8348SEric.Yu@Sun.COM  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
1099*8348SEric.Yu@Sun.COM  */
1100*8348SEric.Yu@Sun.COM int
1101*8348SEric.Yu@Sun.COM sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
1102*8348SEric.Yu@Sun.COM     boolean_t oob_inline)
1103*8348SEric.Yu@Sun.COM {
1104*8348SEric.Yu@Sun.COM 	mblk_t		*mp, *nmp;
1105*8348SEric.Yu@Sun.COM 	int		error;
1106*8348SEric.Yu@Sun.COM 
1107*8348SEric.Yu@Sun.COM 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
1108*8348SEric.Yu@Sun.COM 	    flags));
1109*8348SEric.Yu@Sun.COM 
1110*8348SEric.Yu@Sun.COM 	if (msg != NULL) {
1111*8348SEric.Yu@Sun.COM 		/*
1112*8348SEric.Yu@Sun.COM 		 * There is never any oob data with addresses or control since
1113*8348SEric.Yu@Sun.COM 		 * the T_EXDATA_IND does not carry any options.
1114*8348SEric.Yu@Sun.COM 		 */
1115*8348SEric.Yu@Sun.COM 		msg->msg_controllen = 0;
1116*8348SEric.Yu@Sun.COM 		msg->msg_namelen = 0;
1117*8348SEric.Yu@Sun.COM 		msg->msg_flags = 0;
1118*8348SEric.Yu@Sun.COM 	}
1119*8348SEric.Yu@Sun.COM 
1120*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1121*8348SEric.Yu@Sun.COM 	ASSERT(so_verify_oobstate(so));
1122*8348SEric.Yu@Sun.COM 	if (oob_inline ||
1123*8348SEric.Yu@Sun.COM 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
1124*8348SEric.Yu@Sun.COM 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
1125*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1126*8348SEric.Yu@Sun.COM 		return (EINVAL);
1127*8348SEric.Yu@Sun.COM 	}
1128*8348SEric.Yu@Sun.COM 	if (!(so->so_state & SS_HAVEOOBDATA)) {
1129*8348SEric.Yu@Sun.COM 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
1130*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1131*8348SEric.Yu@Sun.COM 		return (EWOULDBLOCK);
1132*8348SEric.Yu@Sun.COM 	}
1133*8348SEric.Yu@Sun.COM 	ASSERT(so->so_oobmsg != NULL);
1134*8348SEric.Yu@Sun.COM 	mp = so->so_oobmsg;
1135*8348SEric.Yu@Sun.COM 	if (flags & MSG_PEEK) {
1136*8348SEric.Yu@Sun.COM 		/*
1137*8348SEric.Yu@Sun.COM 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
1138*8348SEric.Yu@Sun.COM 		 * Instead we revert to the consolidation private
1139*8348SEric.Yu@Sun.COM 		 * allocb_wait plus bcopy.
1140*8348SEric.Yu@Sun.COM 		 */
1141*8348SEric.Yu@Sun.COM 		mblk_t *mp1;
1142*8348SEric.Yu@Sun.COM 
1143*8348SEric.Yu@Sun.COM 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
1144*8348SEric.Yu@Sun.COM 		ASSERT(mp1);
1145*8348SEric.Yu@Sun.COM 
1146*8348SEric.Yu@Sun.COM 		while (mp != NULL) {
1147*8348SEric.Yu@Sun.COM 			ssize_t size;
1148*8348SEric.Yu@Sun.COM 
1149*8348SEric.Yu@Sun.COM 			size = MBLKL(mp);
1150*8348SEric.Yu@Sun.COM 			bcopy(mp->b_rptr, mp1->b_wptr, size);
1151*8348SEric.Yu@Sun.COM 			mp1->b_wptr += size;
1152*8348SEric.Yu@Sun.COM 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
1153*8348SEric.Yu@Sun.COM 			mp = mp->b_cont;
1154*8348SEric.Yu@Sun.COM 		}
1155*8348SEric.Yu@Sun.COM 		mp = mp1;
1156*8348SEric.Yu@Sun.COM 	} else {
1157*8348SEric.Yu@Sun.COM 		/*
1158*8348SEric.Yu@Sun.COM 		 * Update the state indicating that the data has been consumed.
1159*8348SEric.Yu@Sun.COM 		 * Keep SS_OOBPEND set until data is consumed past the mark.
1160*8348SEric.Yu@Sun.COM 		 */
1161*8348SEric.Yu@Sun.COM 		so->so_oobmsg = NULL;
1162*8348SEric.Yu@Sun.COM 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
1163*8348SEric.Yu@Sun.COM 	}
1164*8348SEric.Yu@Sun.COM 	ASSERT(so_verify_oobstate(so));
1165*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
1166*8348SEric.Yu@Sun.COM 
1167*8348SEric.Yu@Sun.COM 	error = 0;
1168*8348SEric.Yu@Sun.COM 	nmp = mp;
1169*8348SEric.Yu@Sun.COM 	while (nmp != NULL && uiop->uio_resid > 0) {
1170*8348SEric.Yu@Sun.COM 		ssize_t n = MBLKL(nmp);
1171*8348SEric.Yu@Sun.COM 
1172*8348SEric.Yu@Sun.COM 		n = MIN(n, uiop->uio_resid);
1173*8348SEric.Yu@Sun.COM 		if (n > 0)
1174*8348SEric.Yu@Sun.COM 			error = uiomove(nmp->b_rptr, n,
1175*8348SEric.Yu@Sun.COM 			    UIO_READ, uiop);
1176*8348SEric.Yu@Sun.COM 		if (error)
1177*8348SEric.Yu@Sun.COM 			break;
1178*8348SEric.Yu@Sun.COM 		nmp = nmp->b_cont;
1179*8348SEric.Yu@Sun.COM 	}
1180*8348SEric.Yu@Sun.COM 	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
1181*8348SEric.Yu@Sun.COM 	freemsg(mp);
1182*8348SEric.Yu@Sun.COM 	return (error);
1183*8348SEric.Yu@Sun.COM }
1184*8348SEric.Yu@Sun.COM 
1185*8348SEric.Yu@Sun.COM /*
1186*8348SEric.Yu@Sun.COM  * Allocate and initializ sonode
1187*8348SEric.Yu@Sun.COM  */
1188*8348SEric.Yu@Sun.COM /* ARGSUSED */
1189*8348SEric.Yu@Sun.COM struct sonode *
1190*8348SEric.Yu@Sun.COM socket_sonode_create(struct sockparams *sp, int family, int type,
1191*8348SEric.Yu@Sun.COM     int protocol, int version, int sflags, int *errorp, struct cred *cr)
1192*8348SEric.Yu@Sun.COM {
1193*8348SEric.Yu@Sun.COM 	sonode_t *so;
1194*8348SEric.Yu@Sun.COM 	int	kmflags;
1195*8348SEric.Yu@Sun.COM 
1196*8348SEric.Yu@Sun.COM 	/*
1197*8348SEric.Yu@Sun.COM 	 * Choose the right set of sonodeops based on the upcall and
1198*8348SEric.Yu@Sun.COM 	 * down call version that the protocol has provided
1199*8348SEric.Yu@Sun.COM 	 */
1200*8348SEric.Yu@Sun.COM 	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
1201*8348SEric.Yu@Sun.COM 	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
1202*8348SEric.Yu@Sun.COM 		/*
1203*8348SEric.Yu@Sun.COM 		 * mismatch
1204*8348SEric.Yu@Sun.COM 		 */
1205*8348SEric.Yu@Sun.COM #ifdef DEBUG
1206*8348SEric.Yu@Sun.COM 		cmn_err(CE_CONT, "protocol and socket module version mismatch");
1207*8348SEric.Yu@Sun.COM #endif
1208*8348SEric.Yu@Sun.COM 		*errorp = EINVAL;
1209*8348SEric.Yu@Sun.COM 		return (NULL);
1210*8348SEric.Yu@Sun.COM 	}
1211*8348SEric.Yu@Sun.COM 
1212*8348SEric.Yu@Sun.COM 	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1213*8348SEric.Yu@Sun.COM 
1214*8348SEric.Yu@Sun.COM 	so = kmem_cache_alloc(socket_cache, kmflags);
1215*8348SEric.Yu@Sun.COM 	if (so == NULL) {
1216*8348SEric.Yu@Sun.COM 		*errorp = ENOMEM;
1217*8348SEric.Yu@Sun.COM 		return (NULL);
1218*8348SEric.Yu@Sun.COM 	}
1219*8348SEric.Yu@Sun.COM 
1220*8348SEric.Yu@Sun.COM 	sonode_init(so, sp, family, type, protocol, &so_sonodeops);
1221*8348SEric.Yu@Sun.COM 
1222*8348SEric.Yu@Sun.COM 	if (version == SOV_DEFAULT)
1223*8348SEric.Yu@Sun.COM 		version = so_default_version;
1224*8348SEric.Yu@Sun.COM 
1225*8348SEric.Yu@Sun.COM 	so->so_version = (short)version;
1226*8348SEric.Yu@Sun.COM 
1227*8348SEric.Yu@Sun.COM 	/*
1228*8348SEric.Yu@Sun.COM 	 * set the default values to be INFPSZ
1229*8348SEric.Yu@Sun.COM 	 * if a protocol desires it can change the value later
1230*8348SEric.Yu@Sun.COM 	 */
1231*8348SEric.Yu@Sun.COM 	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
1232*8348SEric.Yu@Sun.COM 	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
1233*8348SEric.Yu@Sun.COM 	so->so_proto_props.sopp_maxpsz = INFPSZ;
1234*8348SEric.Yu@Sun.COM 	so->so_proto_props.sopp_maxblk = INFPSZ;
1235*8348SEric.Yu@Sun.COM 
1236*8348SEric.Yu@Sun.COM 	return (so);
1237*8348SEric.Yu@Sun.COM }
1238*8348SEric.Yu@Sun.COM 
1239*8348SEric.Yu@Sun.COM int
1240*8348SEric.Yu@Sun.COM socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
1241*8348SEric.Yu@Sun.COM {
1242*8348SEric.Yu@Sun.COM 	int error = 0;
1243*8348SEric.Yu@Sun.COM 
1244*8348SEric.Yu@Sun.COM 	if (pso != NULL) {
1245*8348SEric.Yu@Sun.COM 		/*
1246*8348SEric.Yu@Sun.COM 		 * We have a passive open, so inherit basic state from
1247*8348SEric.Yu@Sun.COM 		 * the parent (listener).
1248*8348SEric.Yu@Sun.COM 		 *
1249*8348SEric.Yu@Sun.COM 		 * No need to grab the new sonode's lock, since there is no
1250*8348SEric.Yu@Sun.COM 		 * one that can have a reference to it.
1251*8348SEric.Yu@Sun.COM 		 */
1252*8348SEric.Yu@Sun.COM 		mutex_enter(&pso->so_lock);
1253*8348SEric.Yu@Sun.COM 
1254*8348SEric.Yu@Sun.COM 		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
1255*8348SEric.Yu@Sun.COM 		so->so_pgrp = pso->so_pgrp;
1256*8348SEric.Yu@Sun.COM 		so->so_rcvtimeo = pso->so_rcvtimeo;
1257*8348SEric.Yu@Sun.COM 		so->so_sndtimeo = pso->so_sndtimeo;
1258*8348SEric.Yu@Sun.COM 		/*
1259*8348SEric.Yu@Sun.COM 		 * Make note of the socket level options. TCP and IP level
1260*8348SEric.Yu@Sun.COM 		 * options are already inherited. We could do all this after
1261*8348SEric.Yu@Sun.COM 		 * accept is successful but doing it here simplifies code and
1262*8348SEric.Yu@Sun.COM 		 * no harm done for error case.
1263*8348SEric.Yu@Sun.COM 		 */
1264*8348SEric.Yu@Sun.COM 		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
1265*8348SEric.Yu@Sun.COM 		    SO_KEEPALIVE| SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1266*8348SEric.Yu@Sun.COM 		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1267*8348SEric.Yu@Sun.COM 		so->so_proto_props = pso->so_proto_props;
1268*8348SEric.Yu@Sun.COM 		so->so_mode = pso->so_mode;
1269*8348SEric.Yu@Sun.COM 
1270*8348SEric.Yu@Sun.COM 		mutex_exit(&pso->so_lock);
1271*8348SEric.Yu@Sun.COM 
1272*8348SEric.Yu@Sun.COM 		if (uioasync.enabled) {
1273*8348SEric.Yu@Sun.COM 			sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
1274*8348SEric.Yu@Sun.COM 		}
1275*8348SEric.Yu@Sun.COM 		return (0);
1276*8348SEric.Yu@Sun.COM 	} else {
1277*8348SEric.Yu@Sun.COM 		struct sockparams *sp = so->so_sockparams;
1278*8348SEric.Yu@Sun.COM 		sock_upcalls_t *upcalls_to_use;
1279*8348SEric.Yu@Sun.COM 
1280*8348SEric.Yu@Sun.COM 		/*
1281*8348SEric.Yu@Sun.COM 		 * Based on the version number select the right upcalls to
1282*8348SEric.Yu@Sun.COM 		 * pass down. Currently we only have one version so choose
1283*8348SEric.Yu@Sun.COM 		 * default
1284*8348SEric.Yu@Sun.COM 		 */
1285*8348SEric.Yu@Sun.COM 		upcalls_to_use = &so_upcalls;
1286*8348SEric.Yu@Sun.COM 
1287*8348SEric.Yu@Sun.COM 		/* active open, so create a lower handle */
1288*8348SEric.Yu@Sun.COM 		so->so_proto_handle =
1289*8348SEric.Yu@Sun.COM 		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
1290*8348SEric.Yu@Sun.COM 		    so->so_type, so->so_protocol, &so->so_downcalls,
1291*8348SEric.Yu@Sun.COM 		    &so->so_mode, &error, flags, cr);
1292*8348SEric.Yu@Sun.COM 
1293*8348SEric.Yu@Sun.COM 		if (so->so_proto_handle == NULL) {
1294*8348SEric.Yu@Sun.COM 			ASSERT(error != 0);
1295*8348SEric.Yu@Sun.COM 			/*
1296*8348SEric.Yu@Sun.COM 			 * To be safe; if a lower handle cannot be created, and
1297*8348SEric.Yu@Sun.COM 			 * the proto does not give a reason why, assume there
1298*8348SEric.Yu@Sun.COM 			 * was a lack of memory.
1299*8348SEric.Yu@Sun.COM 			 */
1300*8348SEric.Yu@Sun.COM 			return ((error == 0) ? ENOMEM : error);
1301*8348SEric.Yu@Sun.COM 		}
1302*8348SEric.Yu@Sun.COM 		ASSERT(so->so_downcalls != NULL);
1303*8348SEric.Yu@Sun.COM 		ASSERT(so->so_downcalls->sd_send != NULL ||
1304*8348SEric.Yu@Sun.COM 		    so->so_downcalls->sd_send_uio != NULL);
1305*8348SEric.Yu@Sun.COM 		if (so->so_downcalls->sd_recv_uio != NULL) {
1306*8348SEric.Yu@Sun.COM 			ASSERT(so->so_downcalls->sd_poll != NULL);
1307*8348SEric.Yu@Sun.COM 			so->so_pollev |= SO_POLLEV_ALWAYS;
1308*8348SEric.Yu@Sun.COM 		}
1309*8348SEric.Yu@Sun.COM 
1310*8348SEric.Yu@Sun.COM 		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
1311*8348SEric.Yu@Sun.COM 		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
1312*8348SEric.Yu@Sun.COM 
1313*8348SEric.Yu@Sun.COM 		/* Wildcard */
1314*8348SEric.Yu@Sun.COM 
1315*8348SEric.Yu@Sun.COM 		/*
1316*8348SEric.Yu@Sun.COM 		 * FIXME No need for this, the protocol can deal with it in
1317*8348SEric.Yu@Sun.COM 		 * sd_create(). Should update ICMP.
1318*8348SEric.Yu@Sun.COM 		 */
1319*8348SEric.Yu@Sun.COM 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
1320*8348SEric.Yu@Sun.COM 			int protocol = so->so_protocol;
1321*8348SEric.Yu@Sun.COM 			int error;
1322*8348SEric.Yu@Sun.COM 			/*
1323*8348SEric.Yu@Sun.COM 			 * Issue SO_PROTOTYPE setsockopt.
1324*8348SEric.Yu@Sun.COM 			 */
1325*8348SEric.Yu@Sun.COM 			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
1326*8348SEric.Yu@Sun.COM 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
1327*8348SEric.Yu@Sun.COM 			if (error) {
1328*8348SEric.Yu@Sun.COM 				(void) (*so->so_downcalls->sd_close)
1329*8348SEric.Yu@Sun.COM 				    (so->so_proto_handle, 0, cr);
1330*8348SEric.Yu@Sun.COM 
1331*8348SEric.Yu@Sun.COM 				mutex_enter(&so->so_lock);
1332*8348SEric.Yu@Sun.COM 				so_rcv_flush(so);
1333*8348SEric.Yu@Sun.COM 				mutex_exit(&so->so_lock);
1334*8348SEric.Yu@Sun.COM 				/*
1335*8348SEric.Yu@Sun.COM 				 * Setsockopt often fails with ENOPROTOOPT but
1336*8348SEric.Yu@Sun.COM 				 * socket() should fail with
1337*8348SEric.Yu@Sun.COM 				 * EPROTONOSUPPORT/EPROTOTYPE.
1338*8348SEric.Yu@Sun.COM 				 */
1339*8348SEric.Yu@Sun.COM 				return (EPROTONOSUPPORT);
1340*8348SEric.Yu@Sun.COM 			}
1341*8348SEric.Yu@Sun.COM 		}
1342*8348SEric.Yu@Sun.COM 		return (0);
1343*8348SEric.Yu@Sun.COM 	}
1344*8348SEric.Yu@Sun.COM }
1345*8348SEric.Yu@Sun.COM 
1346*8348SEric.Yu@Sun.COM /*
1347*8348SEric.Yu@Sun.COM  * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1348*8348SEric.Yu@Sun.COM  *         struct cred *cr, int32_t *rvalp)
1349*8348SEric.Yu@Sun.COM  *
1350*8348SEric.Yu@Sun.COM  * Handle ioctls that manipulate basic socket state; non-blocking,
1351*8348SEric.Yu@Sun.COM  * async, etc.
1352*8348SEric.Yu@Sun.COM  *
1353*8348SEric.Yu@Sun.COM  * Returns:
1354*8348SEric.Yu@Sun.COM  *   < 0  - ioctl was not handle
1355*8348SEric.Yu@Sun.COM  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1356*8348SEric.Yu@Sun.COM  *
1357*8348SEric.Yu@Sun.COM  * Notes:
1358*8348SEric.Yu@Sun.COM  *   Assumes the standard receive buffer is used to obtain info for
1359*8348SEric.Yu@Sun.COM  *   NREAD.
1360*8348SEric.Yu@Sun.COM  */
1361*8348SEric.Yu@Sun.COM /* ARGSUSED */
1362*8348SEric.Yu@Sun.COM int
1363*8348SEric.Yu@Sun.COM socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1364*8348SEric.Yu@Sun.COM     struct cred *cr, int32_t *rvalp)
1365*8348SEric.Yu@Sun.COM {
1366*8348SEric.Yu@Sun.COM 	switch (cmd) {
1367*8348SEric.Yu@Sun.COM 	case FIONBIO: {
1368*8348SEric.Yu@Sun.COM 		int32_t value;
1369*8348SEric.Yu@Sun.COM 
1370*8348SEric.Yu@Sun.COM 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
1371*8348SEric.Yu@Sun.COM 		    (mode & (int)FKIOCTL)))
1372*8348SEric.Yu@Sun.COM 			return (EFAULT);
1373*8348SEric.Yu@Sun.COM 
1374*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
1375*8348SEric.Yu@Sun.COM 		if (value) {
1376*8348SEric.Yu@Sun.COM 			so->so_state |= SS_NDELAY;
1377*8348SEric.Yu@Sun.COM 		} else {
1378*8348SEric.Yu@Sun.COM 			so->so_state &= ~SS_NDELAY;
1379*8348SEric.Yu@Sun.COM 		}
1380*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1381*8348SEric.Yu@Sun.COM 		return (0);
1382*8348SEric.Yu@Sun.COM 	}
1383*8348SEric.Yu@Sun.COM 	case FIOASYNC: {
1384*8348SEric.Yu@Sun.COM 		int32_t value;
1385*8348SEric.Yu@Sun.COM 
1386*8348SEric.Yu@Sun.COM 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
1387*8348SEric.Yu@Sun.COM 		    (mode & (int)FKIOCTL)))
1388*8348SEric.Yu@Sun.COM 			return (EFAULT);
1389*8348SEric.Yu@Sun.COM 
1390*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
1391*8348SEric.Yu@Sun.COM 
1392*8348SEric.Yu@Sun.COM 		if (value) {
1393*8348SEric.Yu@Sun.COM 			/* Turn on SIGIO */
1394*8348SEric.Yu@Sun.COM 			so->so_state |= SS_ASYNC;
1395*8348SEric.Yu@Sun.COM 		} else {
1396*8348SEric.Yu@Sun.COM 			/* Turn off SIGIO */
1397*8348SEric.Yu@Sun.COM 			so->so_state &= ~SS_ASYNC;
1398*8348SEric.Yu@Sun.COM 		}
1399*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1400*8348SEric.Yu@Sun.COM 
1401*8348SEric.Yu@Sun.COM 		return (0);
1402*8348SEric.Yu@Sun.COM 	}
1403*8348SEric.Yu@Sun.COM 
1404*8348SEric.Yu@Sun.COM 	case SIOCSPGRP:
1405*8348SEric.Yu@Sun.COM 	case FIOSETOWN: {
1406*8348SEric.Yu@Sun.COM 		int error;
1407*8348SEric.Yu@Sun.COM 		pid_t pid;
1408*8348SEric.Yu@Sun.COM 
1409*8348SEric.Yu@Sun.COM 		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
1410*8348SEric.Yu@Sun.COM 		    (mode & (int)FKIOCTL)))
1411*8348SEric.Yu@Sun.COM 			return (EFAULT);
1412*8348SEric.Yu@Sun.COM 
1413*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
1414*8348SEric.Yu@Sun.COM 		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
1415*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1416*8348SEric.Yu@Sun.COM 		return (error);
1417*8348SEric.Yu@Sun.COM 	}
1418*8348SEric.Yu@Sun.COM 	case SIOCGPGRP:
1419*8348SEric.Yu@Sun.COM 	case FIOGETOWN:
1420*8348SEric.Yu@Sun.COM 		if (so_copyout(&so->so_pgrp, (void *)arg,
1421*8348SEric.Yu@Sun.COM 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
1422*8348SEric.Yu@Sun.COM 			return (EFAULT);
1423*8348SEric.Yu@Sun.COM 
1424*8348SEric.Yu@Sun.COM 		return (0);
1425*8348SEric.Yu@Sun.COM 	case SIOCATMARK: {
1426*8348SEric.Yu@Sun.COM 		int retval;
1427*8348SEric.Yu@Sun.COM 
1428*8348SEric.Yu@Sun.COM 		/*
1429*8348SEric.Yu@Sun.COM 		 * Only protocols that support urgent data can handle ATMARK.
1430*8348SEric.Yu@Sun.COM 		 */
1431*8348SEric.Yu@Sun.COM 		if ((so->so_mode & SM_EXDATA) == 0)
1432*8348SEric.Yu@Sun.COM 			return (EINVAL);
1433*8348SEric.Yu@Sun.COM 
1434*8348SEric.Yu@Sun.COM 		/*
1435*8348SEric.Yu@Sun.COM 		 * If the protocol is maintaining its own buffer, then the
1436*8348SEric.Yu@Sun.COM 		 * request must be passed down.
1437*8348SEric.Yu@Sun.COM 		 */
1438*8348SEric.Yu@Sun.COM 		if (so->so_downcalls->sd_recv_uio != NULL)
1439*8348SEric.Yu@Sun.COM 			return (-1);
1440*8348SEric.Yu@Sun.COM 
1441*8348SEric.Yu@Sun.COM 		retval = (so->so_state & SS_RCVATMARK) != 0;
1442*8348SEric.Yu@Sun.COM 
1443*8348SEric.Yu@Sun.COM 		if (so_copyout(&retval, (void *)arg, sizeof (int),
1444*8348SEric.Yu@Sun.COM 		    (mode & (int)FKIOCTL))) {
1445*8348SEric.Yu@Sun.COM 			return (EFAULT);
1446*8348SEric.Yu@Sun.COM 		}
1447*8348SEric.Yu@Sun.COM 		return (0);
1448*8348SEric.Yu@Sun.COM 	}
1449*8348SEric.Yu@Sun.COM 
1450*8348SEric.Yu@Sun.COM 	case FIONREAD: {
1451*8348SEric.Yu@Sun.COM 		int retval;
1452*8348SEric.Yu@Sun.COM 
1453*8348SEric.Yu@Sun.COM 		/*
1454*8348SEric.Yu@Sun.COM 		 * If the protocol is maintaining its own buffer, then the
1455*8348SEric.Yu@Sun.COM 		 * request must be passed down.
1456*8348SEric.Yu@Sun.COM 		 */
1457*8348SEric.Yu@Sun.COM 		if (so->so_downcalls->sd_recv_uio != NULL)
1458*8348SEric.Yu@Sun.COM 			return (-1);
1459*8348SEric.Yu@Sun.COM 
1460*8348SEric.Yu@Sun.COM 		retval = MIN(so->so_rcv_queued, INT_MAX);
1461*8348SEric.Yu@Sun.COM 
1462*8348SEric.Yu@Sun.COM 		if (so_copyout(&retval, (void *)arg,
1463*8348SEric.Yu@Sun.COM 		    sizeof (retval), (mode & (int)FKIOCTL))) {
1464*8348SEric.Yu@Sun.COM 			return (EFAULT);
1465*8348SEric.Yu@Sun.COM 		}
1466*8348SEric.Yu@Sun.COM 		return (0);
1467*8348SEric.Yu@Sun.COM 	}
1468*8348SEric.Yu@Sun.COM 
1469*8348SEric.Yu@Sun.COM 	case _I_GETPEERCRED: {
1470*8348SEric.Yu@Sun.COM 		int error = 0;
1471*8348SEric.Yu@Sun.COM 
1472*8348SEric.Yu@Sun.COM 		if ((mode & FKIOCTL) == 0)
1473*8348SEric.Yu@Sun.COM 			return (EINVAL);
1474*8348SEric.Yu@Sun.COM 
1475*8348SEric.Yu@Sun.COM 		mutex_enter(&so->so_lock);
1476*8348SEric.Yu@Sun.COM 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1477*8348SEric.Yu@Sun.COM 			error = ENOTSUP;
1478*8348SEric.Yu@Sun.COM 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
1479*8348SEric.Yu@Sun.COM 			error = ENOTCONN;
1480*8348SEric.Yu@Sun.COM 		} else if (so->so_peercred != NULL) {
1481*8348SEric.Yu@Sun.COM 			k_peercred_t *kp = (k_peercred_t *)arg;
1482*8348SEric.Yu@Sun.COM 			kp->pc_cr = so->so_peercred;
1483*8348SEric.Yu@Sun.COM 			kp->pc_cpid = so->so_cpid;
1484*8348SEric.Yu@Sun.COM 			crhold(so->so_peercred);
1485*8348SEric.Yu@Sun.COM 		} else {
1486*8348SEric.Yu@Sun.COM 			error = EINVAL;
1487*8348SEric.Yu@Sun.COM 		}
1488*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1489*8348SEric.Yu@Sun.COM 		return (error);
1490*8348SEric.Yu@Sun.COM 	}
1491*8348SEric.Yu@Sun.COM 	default:
1492*8348SEric.Yu@Sun.COM 		return (-1);
1493*8348SEric.Yu@Sun.COM 	}
1494*8348SEric.Yu@Sun.COM }
1495*8348SEric.Yu@Sun.COM 
1496*8348SEric.Yu@Sun.COM /*
1497*8348SEric.Yu@Sun.COM  * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified
1498*8348SEric.Yu@Sun.COM  * then the socket will fall back to TPI.
1499*8348SEric.Yu@Sun.COM  *
1500*8348SEric.Yu@Sun.COM  * Returns:
1501*8348SEric.Yu@Sun.COM  *   < 0  - ioctl was not handle
1502*8348SEric.Yu@Sun.COM  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1503*8348SEric.Yu@Sun.COM  */
1504*8348SEric.Yu@Sun.COM int
1505*8348SEric.Yu@Sun.COM socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1506*8348SEric.Yu@Sun.COM     struct cred *cr, int32_t *rvalp)
1507*8348SEric.Yu@Sun.COM {
1508*8348SEric.Yu@Sun.COM 	switch (cmd) {
1509*8348SEric.Yu@Sun.COM 	case _I_INSERT:
1510*8348SEric.Yu@Sun.COM 	case _I_REMOVE:
1511*8348SEric.Yu@Sun.COM 	case I_FIND:
1512*8348SEric.Yu@Sun.COM 	case I_LIST:
1513*8348SEric.Yu@Sun.COM 		return (EOPNOTSUPP);
1514*8348SEric.Yu@Sun.COM 
1515*8348SEric.Yu@Sun.COM 	case I_PUSH:
1516*8348SEric.Yu@Sun.COM 	case I_POP: {
1517*8348SEric.Yu@Sun.COM 		int retval;
1518*8348SEric.Yu@Sun.COM 
1519*8348SEric.Yu@Sun.COM 		if ((retval = so_tpi_fallback(so, cr)) == 0) {
1520*8348SEric.Yu@Sun.COM 			/* Reissue the ioctl */
1521*8348SEric.Yu@Sun.COM 			ASSERT(so->so_rcv_q_head == NULL);
1522*8348SEric.Yu@Sun.COM 			return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1523*8348SEric.Yu@Sun.COM 		}
1524*8348SEric.Yu@Sun.COM 		return (retval);
1525*8348SEric.Yu@Sun.COM 	}
1526*8348SEric.Yu@Sun.COM 	case I_LOOK:
1527*8348SEric.Yu@Sun.COM 		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1528*8348SEric.Yu@Sun.COM 		    (mode & (int)FKIOCTL))) {
1529*8348SEric.Yu@Sun.COM 			return (EFAULT);
1530*8348SEric.Yu@Sun.COM 		}
1531*8348SEric.Yu@Sun.COM 		return (0);
1532*8348SEric.Yu@Sun.COM 	default:
1533*8348SEric.Yu@Sun.COM 		return (-1);
1534*8348SEric.Yu@Sun.COM 	}
1535*8348SEric.Yu@Sun.COM }
1536*8348SEric.Yu@Sun.COM 
1537*8348SEric.Yu@Sun.COM int
1538*8348SEric.Yu@Sun.COM socket_getopt_common(struct sonode *so, int level, int option_name,
1539*8348SEric.Yu@Sun.COM     void *optval, socklen_t *optlenp)
1540*8348SEric.Yu@Sun.COM {
1541*8348SEric.Yu@Sun.COM 	if (level != SOL_SOCKET)
1542*8348SEric.Yu@Sun.COM 		return (-1);
1543*8348SEric.Yu@Sun.COM 
1544*8348SEric.Yu@Sun.COM 	switch (option_name) {
1545*8348SEric.Yu@Sun.COM 	case SO_ERROR:
1546*8348SEric.Yu@Sun.COM 	case SO_DOMAIN:
1547*8348SEric.Yu@Sun.COM 	case SO_TYPE:
1548*8348SEric.Yu@Sun.COM 	case SO_ACCEPTCONN: {
1549*8348SEric.Yu@Sun.COM 		int32_t value;
1550*8348SEric.Yu@Sun.COM 		socklen_t optlen = *optlenp;
1551*8348SEric.Yu@Sun.COM 
1552*8348SEric.Yu@Sun.COM 		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1553*8348SEric.Yu@Sun.COM 			return (EINVAL);
1554*8348SEric.Yu@Sun.COM 		}
1555*8348SEric.Yu@Sun.COM 
1556*8348SEric.Yu@Sun.COM 		switch (option_name) {
1557*8348SEric.Yu@Sun.COM 		case SO_ERROR:
1558*8348SEric.Yu@Sun.COM 			mutex_enter(&so->so_lock);
1559*8348SEric.Yu@Sun.COM 			value = sogeterr(so, B_TRUE);
1560*8348SEric.Yu@Sun.COM 			mutex_exit(&so->so_lock);
1561*8348SEric.Yu@Sun.COM 			break;
1562*8348SEric.Yu@Sun.COM 		case SO_DOMAIN:
1563*8348SEric.Yu@Sun.COM 			value = so->so_family;
1564*8348SEric.Yu@Sun.COM 			break;
1565*8348SEric.Yu@Sun.COM 		case SO_TYPE:
1566*8348SEric.Yu@Sun.COM 			value = so->so_type;
1567*8348SEric.Yu@Sun.COM 			break;
1568*8348SEric.Yu@Sun.COM 		case SO_ACCEPTCONN:
1569*8348SEric.Yu@Sun.COM 			if (so->so_state & SS_ACCEPTCONN)
1570*8348SEric.Yu@Sun.COM 				value = SO_ACCEPTCONN;
1571*8348SEric.Yu@Sun.COM 			else
1572*8348SEric.Yu@Sun.COM 				value = 0;
1573*8348SEric.Yu@Sun.COM 			break;
1574*8348SEric.Yu@Sun.COM 		}
1575*8348SEric.Yu@Sun.COM 
1576*8348SEric.Yu@Sun.COM 		bcopy(&value, optval, sizeof (value));
1577*8348SEric.Yu@Sun.COM 		*optlenp = sizeof (value);
1578*8348SEric.Yu@Sun.COM 
1579*8348SEric.Yu@Sun.COM 		return (0);
1580*8348SEric.Yu@Sun.COM 	}
1581*8348SEric.Yu@Sun.COM 	case SO_SNDTIMEO:
1582*8348SEric.Yu@Sun.COM 	case SO_RCVTIMEO: {
1583*8348SEric.Yu@Sun.COM 		clock_t value;
1584*8348SEric.Yu@Sun.COM 		socklen_t optlen = *optlenp;
1585*8348SEric.Yu@Sun.COM 
1586*8348SEric.Yu@Sun.COM 		if (optlen < (t_uscalar_t)sizeof (struct timeval)) {
1587*8348SEric.Yu@Sun.COM 			return (EINVAL);
1588*8348SEric.Yu@Sun.COM 		}
1589*8348SEric.Yu@Sun.COM 		if (option_name == SO_RCVTIMEO)
1590*8348SEric.Yu@Sun.COM 			value = drv_hztousec(so->so_rcvtimeo);
1591*8348SEric.Yu@Sun.COM 		else
1592*8348SEric.Yu@Sun.COM 			value = drv_hztousec(so->so_sndtimeo);
1593*8348SEric.Yu@Sun.COM 		((struct timeval *)(optval))->tv_sec = value / (1000 * 1000);
1594*8348SEric.Yu@Sun.COM 		((struct timeval *)(optval))->tv_usec = value % (1000 * 1000);
1595*8348SEric.Yu@Sun.COM 		*optlenp = sizeof (struct timeval);
1596*8348SEric.Yu@Sun.COM 		return (0);
1597*8348SEric.Yu@Sun.COM 	}
1598*8348SEric.Yu@Sun.COM 	case SO_DEBUG:
1599*8348SEric.Yu@Sun.COM 	case SO_REUSEADDR:
1600*8348SEric.Yu@Sun.COM 	case SO_KEEPALIVE:
1601*8348SEric.Yu@Sun.COM 	case SO_DONTROUTE:
1602*8348SEric.Yu@Sun.COM 	case SO_BROADCAST:
1603*8348SEric.Yu@Sun.COM 	case SO_USELOOPBACK:
1604*8348SEric.Yu@Sun.COM 	case SO_OOBINLINE:
1605*8348SEric.Yu@Sun.COM 	case SO_SNDBUF:
1606*8348SEric.Yu@Sun.COM 	case SO_RCVBUF:
1607*8348SEric.Yu@Sun.COM #ifdef notyet
1608*8348SEric.Yu@Sun.COM 	case SO_SNDLOWAT:
1609*8348SEric.Yu@Sun.COM 	case SO_RCVLOWAT:
1610*8348SEric.Yu@Sun.COM #endif /* notyet */
1611*8348SEric.Yu@Sun.COM 	case SO_DGRAM_ERRIND: {
1612*8348SEric.Yu@Sun.COM 		socklen_t optlen = *optlenp;
1613*8348SEric.Yu@Sun.COM 
1614*8348SEric.Yu@Sun.COM 		if (optlen < (t_uscalar_t)sizeof (int32_t))
1615*8348SEric.Yu@Sun.COM 			return (EINVAL);
1616*8348SEric.Yu@Sun.COM 		break;
1617*8348SEric.Yu@Sun.COM 	}
1618*8348SEric.Yu@Sun.COM 	case SO_LINGER: {
1619*8348SEric.Yu@Sun.COM 		socklen_t optlen = *optlenp;
1620*8348SEric.Yu@Sun.COM 
1621*8348SEric.Yu@Sun.COM 		if (optlen < (t_uscalar_t)sizeof (struct linger))
1622*8348SEric.Yu@Sun.COM 			return (EINVAL);
1623*8348SEric.Yu@Sun.COM 		break;
1624*8348SEric.Yu@Sun.COM 	}
1625*8348SEric.Yu@Sun.COM 	case SO_SND_BUFINFO: {
1626*8348SEric.Yu@Sun.COM 		socklen_t optlen = *optlenp;
1627*8348SEric.Yu@Sun.COM 
1628*8348SEric.Yu@Sun.COM 		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1629*8348SEric.Yu@Sun.COM 			return (EINVAL);
1630*8348SEric.Yu@Sun.COM 		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1631*8348SEric.Yu@Sun.COM 		    (so->so_proto_props).sopp_wroff;
1632*8348SEric.Yu@Sun.COM 		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1633*8348SEric.Yu@Sun.COM 		    (so->so_proto_props).sopp_maxblk;
1634*8348SEric.Yu@Sun.COM 		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1635*8348SEric.Yu@Sun.COM 		    (so->so_proto_props).sopp_maxpsz;
1636*8348SEric.Yu@Sun.COM 		((struct so_snd_bufinfo *)(optval))->sbi_tail =
1637*8348SEric.Yu@Sun.COM 		    (so->so_proto_props).sopp_tail;
1638*8348SEric.Yu@Sun.COM 		*optlenp = sizeof (struct so_snd_bufinfo);
1639*8348SEric.Yu@Sun.COM 		return (0);
1640*8348SEric.Yu@Sun.COM 	}
1641*8348SEric.Yu@Sun.COM 	default:
1642*8348SEric.Yu@Sun.COM 		break;
1643*8348SEric.Yu@Sun.COM 	}
1644*8348SEric.Yu@Sun.COM 
1645*8348SEric.Yu@Sun.COM 	/* Unknown Option */
1646*8348SEric.Yu@Sun.COM 	return (-1);
1647*8348SEric.Yu@Sun.COM }
1648*8348SEric.Yu@Sun.COM 
1649*8348SEric.Yu@Sun.COM void
1650*8348SEric.Yu@Sun.COM socket_sonode_destroy(struct sonode *so)
1651*8348SEric.Yu@Sun.COM {
1652*8348SEric.Yu@Sun.COM 	sonode_fini(so);
1653*8348SEric.Yu@Sun.COM 	kmem_cache_free(socket_cache, so);
1654*8348SEric.Yu@Sun.COM }
1655*8348SEric.Yu@Sun.COM 
1656*8348SEric.Yu@Sun.COM int
1657*8348SEric.Yu@Sun.COM so_zcopy_wait(struct sonode *so)
1658*8348SEric.Yu@Sun.COM {
1659*8348SEric.Yu@Sun.COM 	int error = 0;
1660*8348SEric.Yu@Sun.COM 
1661*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1662*8348SEric.Yu@Sun.COM 	while (!(so->so_copyflag & STZCNOTIFY)) {
1663*8348SEric.Yu@Sun.COM 		if (so->so_state & SS_CLOSING) {
1664*8348SEric.Yu@Sun.COM 			mutex_exit(&so->so_lock);
1665*8348SEric.Yu@Sun.COM 			return (EINTR);
1666*8348SEric.Yu@Sun.COM 		}
1667*8348SEric.Yu@Sun.COM 		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
1668*8348SEric.Yu@Sun.COM 			error = EINTR;
1669*8348SEric.Yu@Sun.COM 			break;
1670*8348SEric.Yu@Sun.COM 		}
1671*8348SEric.Yu@Sun.COM 	}
1672*8348SEric.Yu@Sun.COM 	so->so_copyflag &= ~STZCNOTIFY;
1673*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
1674*8348SEric.Yu@Sun.COM 	return (error);
1675*8348SEric.Yu@Sun.COM }
1676*8348SEric.Yu@Sun.COM 
1677*8348SEric.Yu@Sun.COM void
1678*8348SEric.Yu@Sun.COM so_timer_callback(void *arg)
1679*8348SEric.Yu@Sun.COM {
1680*8348SEric.Yu@Sun.COM 	struct sonode *so = (struct sonode *)arg;
1681*8348SEric.Yu@Sun.COM 
1682*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1683*8348SEric.Yu@Sun.COM 
1684*8348SEric.Yu@Sun.COM 	so->so_rcv_timer_tid = 0;
1685*8348SEric.Yu@Sun.COM 	if (so->so_rcv_queued > 0) {
1686*8348SEric.Yu@Sun.COM 		so_notify_data(so, so->so_rcv_queued);
1687*8348SEric.Yu@Sun.COM 	} else {
1688*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1689*8348SEric.Yu@Sun.COM 	}
1690*8348SEric.Yu@Sun.COM }
1691*8348SEric.Yu@Sun.COM 
#ifdef DEBUG
/*
 * Debug-only consistency check: sum the data bytes of every message on
 * both receive lists (so_rcv_q_head and so_rcv_head) and compare the
 * total against so_rcv_queued. The caller must hold so_lock.
 *
 * Returns B_TRUE when the two agree, B_FALSE otherwise.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	/* Messages are chained through b_next on each list. */
	for (mp = so->so_rcv_q_head; mp != NULL; mp = mp->b_next)
		len += msgdsize(mp);

	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
		len += msgdsize(mp);

	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
#endif
1719*8348SEric.Yu@Sun.COM 
1720*8348SEric.Yu@Sun.COM int
1721*8348SEric.Yu@Sun.COM so_get_mod_version(struct sockparams *sp)
1722*8348SEric.Yu@Sun.COM {
1723*8348SEric.Yu@Sun.COM 	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
1724*8348SEric.Yu@Sun.COM 	return (sp->sp_smod_info->smod_version);
1725*8348SEric.Yu@Sun.COM }
1726*8348SEric.Yu@Sun.COM 
1727*8348SEric.Yu@Sun.COM /*
1728*8348SEric.Yu@Sun.COM  * so_start_fallback()
1729*8348SEric.Yu@Sun.COM  *
1730*8348SEric.Yu@Sun.COM  * Block new socket operations from coming in, and wait for active operations
1731*8348SEric.Yu@Sun.COM  * to complete. Threads that are sleeping will be woken up so they can get
1732*8348SEric.Yu@Sun.COM  * out of the way.
1733*8348SEric.Yu@Sun.COM  *
1734*8348SEric.Yu@Sun.COM  * The caller must be a reader on so_fallback_rwlock.
1735*8348SEric.Yu@Sun.COM  */
1736*8348SEric.Yu@Sun.COM static boolean_t
1737*8348SEric.Yu@Sun.COM so_start_fallback(struct sonode *so)
1738*8348SEric.Yu@Sun.COM {
1739*8348SEric.Yu@Sun.COM 	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1740*8348SEric.Yu@Sun.COM 
1741*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1742*8348SEric.Yu@Sun.COM 	if (so->so_state & SS_FALLBACK_PENDING) {
1743*8348SEric.Yu@Sun.COM 		mutex_exit(&so->so_lock);
1744*8348SEric.Yu@Sun.COM 		return (B_FALSE);
1745*8348SEric.Yu@Sun.COM 	}
1746*8348SEric.Yu@Sun.COM 	so->so_state |= SS_FALLBACK_PENDING;
1747*8348SEric.Yu@Sun.COM 	/*
1748*8348SEric.Yu@Sun.COM 	 * Poke all threads that might be sleeping. Any operation that comes
1749*8348SEric.Yu@Sun.COM 	 * in after the cv_broadcast will observe the fallback pending flag
1750*8348SEric.Yu@Sun.COM 	 * which cause the call to return where it would normally sleep.
1751*8348SEric.Yu@Sun.COM 	 */
1752*8348SEric.Yu@Sun.COM 	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
1753*8348SEric.Yu@Sun.COM 	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
1754*8348SEric.Yu@Sun.COM 	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
1755*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_acceptq_lock);
1756*8348SEric.Yu@Sun.COM 	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
1757*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_acceptq_lock);
1758*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
1759*8348SEric.Yu@Sun.COM 
1760*8348SEric.Yu@Sun.COM 	/*
1761*8348SEric.Yu@Sun.COM 	 * The main reason for the rw_tryupgrade call is to provide
1762*8348SEric.Yu@Sun.COM 	 * observability during the fallback process. We want to
1763*8348SEric.Yu@Sun.COM 	 * be able to see if there are pending operations.
1764*8348SEric.Yu@Sun.COM 	 */
1765*8348SEric.Yu@Sun.COM 	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1766*8348SEric.Yu@Sun.COM 		/*
1767*8348SEric.Yu@Sun.COM 		 * It is safe to drop and reaquire the fallback lock, because
1768*8348SEric.Yu@Sun.COM 		 * we are guaranteed that another fallback cannot take place.
1769*8348SEric.Yu@Sun.COM 		 */
1770*8348SEric.Yu@Sun.COM 		rw_exit(&so->so_fallback_rwlock);
1771*8348SEric.Yu@Sun.COM 		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1772*8348SEric.Yu@Sun.COM 		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1773*8348SEric.Yu@Sun.COM 		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1774*8348SEric.Yu@Sun.COM 	}
1775*8348SEric.Yu@Sun.COM 
1776*8348SEric.Yu@Sun.COM 	return (B_TRUE);
1777*8348SEric.Yu@Sun.COM }
1778*8348SEric.Yu@Sun.COM 
1779*8348SEric.Yu@Sun.COM /*
1780*8348SEric.Yu@Sun.COM  * so_end_fallback()
1781*8348SEric.Yu@Sun.COM  *
1782*8348SEric.Yu@Sun.COM  * Allow socket opertions back in.
1783*8348SEric.Yu@Sun.COM  *
1784*8348SEric.Yu@Sun.COM  * The caller must be a writer on so_fallback_rwlock.
1785*8348SEric.Yu@Sun.COM  */
1786*8348SEric.Yu@Sun.COM static void
1787*8348SEric.Yu@Sun.COM so_end_fallback(struct sonode *so)
1788*8348SEric.Yu@Sun.COM {
1789*8348SEric.Yu@Sun.COM 	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
1790*8348SEric.Yu@Sun.COM 
1791*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1792*8348SEric.Yu@Sun.COM 	so->so_state &= ~SS_FALLBACK_PENDING;
1793*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
1794*8348SEric.Yu@Sun.COM 
1795*8348SEric.Yu@Sun.COM 	rw_downgrade(&so->so_fallback_rwlock);
1796*8348SEric.Yu@Sun.COM }
1797*8348SEric.Yu@Sun.COM 
1798*8348SEric.Yu@Sun.COM /*
1799*8348SEric.Yu@Sun.COM  * so_quiesced_cb()
1800*8348SEric.Yu@Sun.COM  *
1801*8348SEric.Yu@Sun.COM  * Callback passed to the protocol during fallback. It is called once
1802*8348SEric.Yu@Sun.COM  * the endpoint is quiescent.
1803*8348SEric.Yu@Sun.COM  *
1804*8348SEric.Yu@Sun.COM  * No requests from the user, no notifications from the protocol, so it
1805*8348SEric.Yu@Sun.COM  * is safe to synchronize the state. Data can also be moved without
1806*8348SEric.Yu@Sun.COM  * risk for reordering.
1807*8348SEric.Yu@Sun.COM  *
1808*8348SEric.Yu@Sun.COM  * NOTE: urgent data is dropped on the floor.
1809*8348SEric.Yu@Sun.COM  *
1810*8348SEric.Yu@Sun.COM  * We do not need to hold so_lock, since there can be only one thread
1811*8348SEric.Yu@Sun.COM  * operating on the sonode.
1812*8348SEric.Yu@Sun.COM  */
1813*8348SEric.Yu@Sun.COM static void
1814*8348SEric.Yu@Sun.COM so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
1815*8348SEric.Yu@Sun.COM     struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
1816*8348SEric.Yu@Sun.COM     struct sockaddr *faddr, socklen_t faddrlen, short opts)
1817*8348SEric.Yu@Sun.COM {
1818*8348SEric.Yu@Sun.COM 	struct sonode *so = (struct sonode *)sock_handle;
1819*8348SEric.Yu@Sun.COM 
1820*8348SEric.Yu@Sun.COM 	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
1821*8348SEric.Yu@Sun.COM 
1822*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1823*8348SEric.Yu@Sun.COM 	SOCKET_TIMER_CANCEL(so);
1824*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
1825*8348SEric.Yu@Sun.COM 	/*
1826*8348SEric.Yu@Sun.COM 	 * Move data to the STREAM head.
1827*8348SEric.Yu@Sun.COM 	 */
1828*8348SEric.Yu@Sun.COM 	if (so->so_rcv_head != NULL) {
1829*8348SEric.Yu@Sun.COM 		if (so->so_rcv_q_last_head == NULL)
1830*8348SEric.Yu@Sun.COM 			so->so_rcv_q_head = so->so_rcv_head;
1831*8348SEric.Yu@Sun.COM 		else
1832*8348SEric.Yu@Sun.COM 			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
1833*8348SEric.Yu@Sun.COM 		so->so_rcv_q_last_head = so->so_rcv_last_head;
1834*8348SEric.Yu@Sun.COM 	}
1835*8348SEric.Yu@Sun.COM 
1836*8348SEric.Yu@Sun.COM 	while (so->so_rcv_q_head != NULL) {
1837*8348SEric.Yu@Sun.COM 		mblk_t *mp = so->so_rcv_q_head;
1838*8348SEric.Yu@Sun.COM 		size_t mlen = msgdsize(mp);
1839*8348SEric.Yu@Sun.COM 
1840*8348SEric.Yu@Sun.COM 		so->so_rcv_q_head = mp->b_next;
1841*8348SEric.Yu@Sun.COM 		mp->b_next = NULL;
1842*8348SEric.Yu@Sun.COM 		mp->b_prev = NULL;
1843*8348SEric.Yu@Sun.COM 		so->so_rcv_queued -= mlen;
1844*8348SEric.Yu@Sun.COM 		putnext(q, mp);
1845*8348SEric.Yu@Sun.COM 	}
1846*8348SEric.Yu@Sun.COM 	ASSERT(so->so_rcv_queued == 0);
1847*8348SEric.Yu@Sun.COM 	so->so_rcv_head = NULL;
1848*8348SEric.Yu@Sun.COM 	so->so_rcv_last_head = NULL;
1849*8348SEric.Yu@Sun.COM 	so->so_rcv_q_head = NULL;
1850*8348SEric.Yu@Sun.COM 	so->so_rcv_q_last_head = NULL;
1851*8348SEric.Yu@Sun.COM 
1852*8348SEric.Yu@Sun.COM #ifdef DEBUG
1853*8348SEric.Yu@Sun.COM 	if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
1854*8348SEric.Yu@Sun.COM 		cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
1855*8348SEric.Yu@Sun.COM 	}
1856*8348SEric.Yu@Sun.COM #endif
1857*8348SEric.Yu@Sun.COM 	if (so->so_oobmsg != NULL) {
1858*8348SEric.Yu@Sun.COM 		freemsg(so->so_oobmsg);
1859*8348SEric.Yu@Sun.COM 		so->so_oobmsg = NULL;
1860*8348SEric.Yu@Sun.COM 	}
1861*8348SEric.Yu@Sun.COM 	so->so_oobmark = 0;
1862*8348SEric.Yu@Sun.COM 
1863*8348SEric.Yu@Sun.COM 	ASSERT(so->so_rcv_queued == 0);
1864*8348SEric.Yu@Sun.COM }
1865*8348SEric.Yu@Sun.COM 
1866*8348SEric.Yu@Sun.COM /*
1867*8348SEric.Yu@Sun.COM  * so_tpi_fallback()
1868*8348SEric.Yu@Sun.COM  *
1869*8348SEric.Yu@Sun.COM  * This is fallback initation routine; things start here.
1870*8348SEric.Yu@Sun.COM  *
1871*8348SEric.Yu@Sun.COM  * Basic strategy:
1872*8348SEric.Yu@Sun.COM  *   o Block new socket operations from coming in
1873*8348SEric.Yu@Sun.COM  *   o Allocate/initate info needed by TPI
1874*8348SEric.Yu@Sun.COM  *   o Quiesce the connection, at which point we sync
1875*8348SEric.Yu@Sun.COM  *     state and move data
1876*8348SEric.Yu@Sun.COM  *   o Change operations (sonodeops) associated with the socket
1877*8348SEric.Yu@Sun.COM  *   o Unblock threads waiting for the fallback to finish
1878*8348SEric.Yu@Sun.COM  */
1879*8348SEric.Yu@Sun.COM int
1880*8348SEric.Yu@Sun.COM so_tpi_fallback(struct sonode *so, struct cred *cr)
1881*8348SEric.Yu@Sun.COM {
1882*8348SEric.Yu@Sun.COM 	int error;
1883*8348SEric.Yu@Sun.COM 	queue_t *q;
1884*8348SEric.Yu@Sun.COM 	struct sockparams *sp;
1885*8348SEric.Yu@Sun.COM 	struct sockparams *newsp;
1886*8348SEric.Yu@Sun.COM 	so_proto_fallback_func_t fbfunc;
1887*8348SEric.Yu@Sun.COM 	boolean_t direct;
1888*8348SEric.Yu@Sun.COM 
1889*8348SEric.Yu@Sun.COM 	error = 0;
1890*8348SEric.Yu@Sun.COM 	sp = so->so_sockparams;
1891*8348SEric.Yu@Sun.COM 	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
1892*8348SEric.Yu@Sun.COM 
1893*8348SEric.Yu@Sun.COM 	/*
1894*8348SEric.Yu@Sun.COM 	 * Fallback can only happen if there is a device associated
1895*8348SEric.Yu@Sun.COM 	 * with the sonode, and the socket module has a fallback function.
1896*8348SEric.Yu@Sun.COM 	 */
1897*8348SEric.Yu@Sun.COM 	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
1898*8348SEric.Yu@Sun.COM 		return (EINVAL);
1899*8348SEric.Yu@Sun.COM 
1900*8348SEric.Yu@Sun.COM 	/*
1901*8348SEric.Yu@Sun.COM 	 * Initiate fallback; upon success we know that no new requests
1902*8348SEric.Yu@Sun.COM 	 * will come in from the user.
1903*8348SEric.Yu@Sun.COM 	 */
1904*8348SEric.Yu@Sun.COM 	if (!so_start_fallback(so))
1905*8348SEric.Yu@Sun.COM 		return (EAGAIN);
1906*8348SEric.Yu@Sun.COM 
1907*8348SEric.Yu@Sun.COM 	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
1908*8348SEric.Yu@Sun.COM 	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
1909*8348SEric.Yu@Sun.COM 	    KM_SLEEP, &error);
1910*8348SEric.Yu@Sun.COM 	if (error != 0)
1911*8348SEric.Yu@Sun.COM 		goto out;
1912*8348SEric.Yu@Sun.COM 
1913*8348SEric.Yu@Sun.COM 	if (so->so_direct != NULL) {
1914*8348SEric.Yu@Sun.COM 		sodirect_t *sodp = so->so_direct;
1915*8348SEric.Yu@Sun.COM 		mutex_enter(sodp->sod_lockp);
1916*8348SEric.Yu@Sun.COM 
1917*8348SEric.Yu@Sun.COM 		so->so_direct->sod_state &= ~SOD_ENABLED;
1918*8348SEric.Yu@Sun.COM 		so->so_state &= ~SS_SODIRECT;
1919*8348SEric.Yu@Sun.COM 		ASSERT(sodp->sod_uioafh == NULL);
1920*8348SEric.Yu@Sun.COM 		mutex_exit(sodp->sod_lockp);
1921*8348SEric.Yu@Sun.COM 	}
1922*8348SEric.Yu@Sun.COM 
1923*8348SEric.Yu@Sun.COM 	/* Turn sonode into a TPI socket */
1924*8348SEric.Yu@Sun.COM 	q = sotpi_convert_sonode(so, newsp, &direct, cr);
1925*8348SEric.Yu@Sun.COM 	if (q == NULL) {
1926*8348SEric.Yu@Sun.COM 		zcmn_err(getzoneid(), CE_WARN,
1927*8348SEric.Yu@Sun.COM 		    "Failed to convert socket to TPI. Pid = %d\n",
1928*8348SEric.Yu@Sun.COM 		    curproc->p_pid);
1929*8348SEric.Yu@Sun.COM 		SOCKPARAMS_DEC_REF(newsp);
1930*8348SEric.Yu@Sun.COM 		error = EINVAL;
1931*8348SEric.Yu@Sun.COM 		goto out;
1932*8348SEric.Yu@Sun.COM 	}
1933*8348SEric.Yu@Sun.COM 
1934*8348SEric.Yu@Sun.COM 	/*
1935*8348SEric.Yu@Sun.COM 	 * Now tell the protocol to start using TPI. so_quiesced_cb be
1936*8348SEric.Yu@Sun.COM 	 * called once it's safe to synchronize state.
1937*8348SEric.Yu@Sun.COM 	 */
1938*8348SEric.Yu@Sun.COM 	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
1939*8348SEric.Yu@Sun.COM 	/* FIXME assumes this cannot fail. TCP can fail to enter squeue */
1940*8348SEric.Yu@Sun.COM 	(*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
1941*8348SEric.Yu@Sun.COM 	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
1942*8348SEric.Yu@Sun.COM 
1943*8348SEric.Yu@Sun.COM 	/*
1944*8348SEric.Yu@Sun.COM 	 * Free all pending connection indications, i.e., socket_accept() has
1945*8348SEric.Yu@Sun.COM 	 * not yet pulled the connection of the queue. The transport sent
1946*8348SEric.Yu@Sun.COM 	 * a T_CONN_IND message for each pending connection to the STREAM head.
1947*8348SEric.Yu@Sun.COM 	 */
1948*8348SEric.Yu@Sun.COM 	so_acceptq_flush(so);
1949*8348SEric.Yu@Sun.COM 
1950*8348SEric.Yu@Sun.COM 	mutex_enter(&so->so_lock);
1951*8348SEric.Yu@Sun.COM 	so->so_state |= SS_FALLBACK_COMP;
1952*8348SEric.Yu@Sun.COM 	mutex_exit(&so->so_lock);
1953*8348SEric.Yu@Sun.COM 
1954*8348SEric.Yu@Sun.COM 	/*
1955*8348SEric.Yu@Sun.COM 	 * Swap the sonode ops. Socket opertations that come in once this
1956*8348SEric.Yu@Sun.COM 	 * is done will proceed without blocking.
1957*8348SEric.Yu@Sun.COM 	 */
1958*8348SEric.Yu@Sun.COM 	so->so_ops = &sotpi_sonodeops;
1959*8348SEric.Yu@Sun.COM 
1960*8348SEric.Yu@Sun.COM 	/*
1961*8348SEric.Yu@Sun.COM 	 * Wake up any threads stuck in poll. This is needed since the poll
1962*8348SEric.Yu@Sun.COM 	 * head changes when the fallback happens (moves from the sonode to
1963*8348SEric.Yu@Sun.COM 	 * the STREAMS head).
1964*8348SEric.Yu@Sun.COM 	 */
1965*8348SEric.Yu@Sun.COM 	pollwakeup(&so->so_poll_list, POLLERR);
1966*8348SEric.Yu@Sun.COM out:
1967*8348SEric.Yu@Sun.COM 	so_end_fallback(so);
1968*8348SEric.Yu@Sun.COM 
1969*8348SEric.Yu@Sun.COM 	return (error);
1970*8348SEric.Yu@Sun.COM }
1971