1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/buf.h>
31 #include <sys/conf.h>
32 #include <sys/cred.h>
33 #include <sys/kmem.h>
34 #include <sys/kmem_impl.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/strsun.h>
48 #include <sys/suntpi.h>
49 #include <sys/ddi.h>
50 #include <sys/esunddi.h>
51 #include <sys/flock.h>
52 #include <sys/modctl.h>
53 #include <sys/vtrace.h>
54 #include <sys/cmn_err.h>
55 #include <sys/pathname.h>
56
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
59 #include <sys/sockio.h>
60 #include <netinet/in.h>
61 #include <sys/un.h>
62 #include <sys/strsun.h>
63
64 #include <sys/tiuser.h>
65 #define _SUN_TPI_VERSION 2
66 #include <sys/tihdr.h>
67 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
68
69 #include <c2/audit.h>
70
71 #include <inet/common.h>
72 #include <inet/ip.h>
73 #include <inet/ip6.h>
74 #include <inet/tcp.h>
75 #include <inet/udp_impl.h>
76
77 #include <sys/zone.h>
78
79 #include <fs/sockfs/nl7c.h>
80 #include <fs/sockfs/nl7curi.h>
81
82 #include <fs/sockfs/sockcommon.h>
83 #include <fs/sockfs/socktpi.h>
84 #include <fs/sockfs/socktpi_impl.h>
85
86 /*
87 * Possible failures when memory can't be allocated. The documented behavior:
88 *
89 *               5.5:                4.X:           XNET:
90 * accept:       ENOMEM/ENOSR/EINTR  - (EINTR)      ENOMEM/ENOBUFS/ENOSR/
91 *                                                  EINTR
92 * (4.X does not document EINTR but returns it)
93 * bind:         ENOSR               -              ENOBUFS/ENOSR
94 * connect:      EINTR               EINTR          ENOBUFS/ENOSR/EINTR
95 * getpeername:  ENOMEM/ENOSR        ENOBUFS (-)    ENOBUFS/ENOSR
96 * getsockname:  ENOMEM/ENOSR        ENOBUFS (-)    ENOBUFS/ENOSR
97 * (4.X getpeername and getsockname do not fail in practice)
98 * getsockopt:   ENOMEM/ENOSR        -              ENOBUFS/ENOSR
99 * listen:       -                   -              ENOBUFS
100 * recv:         ENOMEM/ENOSR/EINTR  EINTR          ENOBUFS/ENOMEM/ENOSR/
101 *                                                  EINTR
102 * send:         ENOMEM/ENOSR/EINTR  ENOBUFS/EINTR  ENOBUFS/ENOMEM/ENOSR/
103 *                                                  EINTR
104 * setsockopt:   ENOMEM/ENOSR        -              ENOBUFS/ENOMEM/ENOSR
105 * shutdown:     ENOMEM/ENOSR        -              ENOBUFS/ENOSR
106 * socket:       ENOMEM/ENOSR        ENOBUFS        ENOBUFS/ENOMEM/ENOSR
107 * socketpair:   ENOMEM/ENOSR        -              ENOBUFS/ENOMEM/ENOSR
108 *
109 * Resolution. When allocation fails:
110 * recv: return EINTR
111 * send: return EINTR
112 * connect, accept: EINTR
113 * bind, listen, shutdown (unbind, unix_close, disconnect): sleep
114 * socket, socketpair: ENOBUFS
115 * getpeername, getsockname: sleep
116 * getsockopt, setsockopt: sleep
117 */
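/*
 * Illustrative sketch (not part of this file's logic): given the
 * resolution above, a user-level caller that wants to be robust against
 * the EINTR returned when sockfs cannot allocate memory simply retries
 * the call, e.g.
 *
 *	ssize_t n;
 *	do {
 *		n = recv(fd, buf, sizeof (buf), 0);
 *	} while (n == -1 && errno == EINTR);
 *
 * whereas socket()/socketpair() callers see ENOBUFS and should treat it
 * as a resource shortage rather than retry blindly.
 */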
118
119 #ifdef SOCK_TEST
120 /*
121 * Variables that make sockfs do something other than the standard TPI
122 * for the AF_INET transports.
123 *
124 * solisten_tpi_tcp:
125 * TCP can handle a O_T_BIND_REQ with an increased backlog even though
126 * the transport is already bound. This is needed to avoid losing the
127 * port number should listen() do a T_UNBIND_REQ followed by a
128 * O_T_BIND_REQ.
129 *
130 * soconnect_tpi_udp:
131 * UDP and ICMP can handle a T_CONN_REQ.
132 * This is needed so that the sequence connect(), getsockname()
133 * returns the local IP address used to send packets to the
134 * connected-to destination.
135 *
136 * soconnect_tpi_tcp:
137 * TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
138 * Set this to non-zero to send TPI conformant messages to TCP in this
139 * respect. This is a performance optimization.
140 *
141 * soaccept_tpi_tcp:
142 * TCP can handle a T_CONN_REQ without the acceptor being bound.
143 * This is a performance optimization that has been picked up in XTI.
144 *
145 * soaccept_tpi_multioptions:
146 * When inheriting SOL_SOCKET options from the listener to the accepting
147 * socket send them as a single message for AF_INET{,6}.
148 */
149 int solisten_tpi_tcp = 0;
150 int soconnect_tpi_udp = 0;
151 int soconnect_tpi_tcp = 0;
152 int soaccept_tpi_tcp = 0;
153 int soaccept_tpi_multioptions = 1;
154 #else /* SOCK_TEST */
155 #define soconnect_tpi_tcp 0
156 #define soconnect_tpi_udp 0
157 #define solisten_tpi_tcp 0
158 #define soaccept_tpi_tcp 0
159 #define soaccept_tpi_multioptions 1
160 #endif /* SOCK_TEST */
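/*
 * The knobs above exist only in SOCK_TEST (debug) builds.  A sketch of
 * flipping one at run time on such a build, assuming mdb -kw can reach
 * the sockfs symbols (illustration only):
 *
 *	# echo 'soconnect_tpi_tcp/W 1' | mdb -kw
 */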
161
162 #ifdef SOCK_TEST
163 extern int do_useracc;
164 extern clock_t sock_test_timelimit;
165 #endif /* SOCK_TEST */
166
167 /*
168 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
169 * applications working. Turn on this flag to disable these checks.
170 */
171 int xnet_skip_checks = 0;
172 int xnet_check_print = 0;
173 int xnet_truncate_print = 0;
174
175 static void sotpi_destroy(struct sonode *);
176 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
177 int, int *, cred_t *cr);
178
179 static boolean_t sotpi_info_create(struct sonode *, int);
180 static void sotpi_info_init(struct sonode *);
181 static void sotpi_info_fini(struct sonode *);
182 static void sotpi_info_destroy(struct sonode *);
183
184 /*
185 * Do direct function calls to the transport layer below; this also
186 * allows the transport to utilize the read-side synchronous stream
187 * interface if necessary. This is a /etc/system tunable that must
188 * not be modified on a running system. By default this is enabled
189 * for performance reasons and may be disabled for debugging purposes.
190 */
191 boolean_t socktpi_direct = B_TRUE;
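/*
 * A minimal sketch of how the tunable above would typically be disabled
 * from /etc/system (assuming the usual "set module:variable" syntax and
 * that sockfs is the owning module); a reboot is required since the
 * value must not change on a running system:
 *
 *	set sockfs:socktpi_direct = 0
 */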
192
193 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
194
195 extern void sigintr(k_sigset_t *, int);
196 extern void sigunintr(k_sigset_t *);
197
198 static int sotpi_unbind(struct sonode *, int);
199
200 /* TPI sockfs sonode operations */
201 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
202 int);
203 static int sotpi_accept(struct sonode *, int, struct cred *,
204 struct sonode **);
205 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
206 int, struct cred *);
207 static int sotpi_listen(struct sonode *, int, struct cred *);
208 static int sotpi_connect(struct sonode *, struct sockaddr *,
209 socklen_t, int, int, struct cred *);
210 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
211 struct uio *, struct cred *);
212 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
213 struct uio *, struct cred *);
214 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
215 struct cred *, mblk_t **);
216 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
217 struct uio *, void *, t_uscalar_t, int);
218 static int sodgram_direct(struct sonode *, struct sockaddr *,
219 socklen_t, struct uio *, int);
220 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
221 socklen_t *, boolean_t, struct cred *);
222 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
223 socklen_t *, struct cred *);
224 static int sotpi_shutdown(struct sonode *, int, struct cred *);
225 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
226 socklen_t *, int, struct cred *);
227 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
228 socklen_t, struct cred *);
229 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
230 int32_t *);
231 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
232 struct cred *, int32_t *);
233 static int sotpi_poll(struct sonode *, short, int, short *,
234 struct pollhead **);
235 static int sotpi_close(struct sonode *, int, struct cred *);
236
237 static int i_sotpi_info_constructor(sotpi_info_t *);
238 static void i_sotpi_info_destructor(sotpi_info_t *);
239
240 sonodeops_t sotpi_sonodeops = {
241 sotpi_init, /* sop_init */
242 sotpi_accept, /* sop_accept */
243 sotpi_bind, /* sop_bind */
244 sotpi_listen, /* sop_listen */
245 sotpi_connect, /* sop_connect */
246 sotpi_recvmsg, /* sop_recvmsg */
247 sotpi_sendmsg, /* sop_sendmsg */
248 sotpi_sendmblk, /* sop_sendmblk */
249 sotpi_getpeername, /* sop_getpeername */
250 sotpi_getsockname, /* sop_getsockname */
251 sotpi_shutdown, /* sop_shutdown */
252 sotpi_getsockopt, /* sop_getsockopt */
253 sotpi_setsockopt, /* sop_setsockopt */
254 sotpi_ioctl, /* sop_ioctl */
255 sotpi_poll, /* sop_poll */
256 sotpi_close, /* sop_close */
257 };
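/*
 * Illustrative note: the generic sockfs code reaches the functions above
 * through the so_ops pointer that sotpi_create() installs, conceptually
 * along the lines of (sketch, not the literal dispatch macro):
 *
 *	error = (*so->so_ops->sop_bind)(so, name, namelen, flags, cr);
 *
 * so every entry must cope with a sonode initialized only by
 * sotpi_create()/sotpi_init() below.
 */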
258
259 /*
260 * Return a TPI socket vnode.
261 *
262 * Note that sockets assume that the driver will clone (either itself
263 * or by using the clone driver) i.e. a socket() call will always
264 * result in a new vnode being created.
265 */
266
267 /*
268 * Common create code for socket and accept. If tso is set, the values
269 * from that node are used instead of issuing a T_INFO_REQ.
270 */
271
272 /* ARGSUSED */
273 static struct sonode *
274 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
275 int version, int sflags, int *errorp, cred_t *cr)
276 {
277 struct sonode *so;
278 kmem_cache_t *cp;
279 int sfamily = family;
280
281 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
282
283 if (family == AF_NCA) {
284 /*
285 * The request is for an NCA socket so for NL7C use the
286 * INET domain instead and mark NL7C_AF_NCA below.
287 */
288 family = AF_INET;
289 /*
290 * NL7C is not supported in the non-global zone,
291 * we enforce this restriction here.
292 */
293 if (getzoneid() != GLOBAL_ZONEID) {
294 *errorp = ENOTSUP;
295 return (NULL);
296 }
297 }
298
299 /*
300 * To be compatible with the old TPI socket implementation, ignore
301 * the sleep flag (sflags) passed in.
302 */
303 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
304 so = kmem_cache_alloc(cp, KM_SLEEP);
305 if (so == NULL) {
306 *errorp = ENOMEM;
307 return (NULL);
308 }
309
310 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
311 sotpi_info_init(so);
312
313 if (sfamily == AF_NCA) {
314 SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
315 }
316
317 if (version == SOV_DEFAULT)
318 version = so_default_version;
319
320 so->so_version = (short)version;
321 *errorp = 0;
322
323 return (so);
324 }
325
326 static void
327 sotpi_destroy(struct sonode *so)
328 {
329 kmem_cache_t *cp;
330 struct sockparams *origsp;
331
332 /*
333 * If there is a new dealloc function (ie. smod_destroy_func),
334 * then it should check the correctness of the ops.
335 */
336
337 ASSERT(so->so_ops == &sotpi_sonodeops);
338
339 origsp = SOTOTPI(so)->sti_orig_sp;
340
341 sotpi_info_fini(so);
342
343 if (so->so_state & SS_FALLBACK_COMP) {
344 /*
345 * A fallback happened, which means that a sotpi_info_t struct
346 * was allocated (as opposed to being allocated from the TPI
347 * sonode cache). Therefore we explicitly free the struct
348 * here.
349 */
350 sotpi_info_destroy(so);
351 ASSERT(origsp != NULL);
352
353 origsp->sp_smod_info->smod_sock_destroy_func(so);
354 SOCKPARAMS_DEC_REF(origsp);
355 } else {
356 sonode_fini(so);
357 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
358 socktpi_cache;
359 kmem_cache_free(cp, so);
360 }
361 }
362
363 /* ARGSUSED1 */
364 int
365 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
366 {
367 major_t maj;
368 dev_t newdev;
369 struct vnode *vp;
370 int error = 0;
371 struct stdata *stp;
372
373 sotpi_info_t *sti = SOTOTPI(so);
374
375 dprint(1, ("sotpi_init()\n"));
376
377 /*
378 * Overwrite the sleep flag passed in; that is OK
379 * as the TPI socket does not honor the sleep flag.
380 */
381 flags |= FREAD|FWRITE;
382
383 /*
384 * Record in so_flag that it is a clone.
385 */
386 if (getmajor(sti->sti_dev) == clone_major)
387 so->so_flag |= SOCLONE;
388
389 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
390 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
391 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
392 so->so_protocol == IPPROTO_IP)) {
393 /* Tell tcp or udp that it's talking to sockets */
394 flags |= SO_SOCKSTR;
395
396 /*
397 * Here we indicate to socktpi_open() our attempt to
398 * make direct calls between sockfs and transport.
399 * The final decision is left to socktpi_open().
400 */
401 sti->sti_direct = 1;
402
403 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
404 if (so->so_type == SOCK_STREAM && tso != NULL) {
405 if (SOTOTPI(tso)->sti_direct) {
406 /*
407 * Inherit sti_direct from listener and pass
408 * SO_ACCEPTOR open flag to tcp, indicating
409 * that this is an accept fast-path instance.
410 */
411 flags |= SO_ACCEPTOR;
412 } else {
413 /*
414 * sti_direct is not set on listener, meaning
415 * that the listener has been converted from
416 * a socket to a stream. Ensure that the
417 * acceptor inherits these settings.
418 */
419 sti->sti_direct = 0;
420 flags &= ~SO_SOCKSTR;
421 }
422 }
423 }
424
425 /*
426 * Tell local transport that it is talking to sockets.
427 */
428 if (so->so_family == AF_UNIX) {
429 flags |= SO_SOCKSTR;
430 }
431
432 vp = SOTOV(so);
433 newdev = vp->v_rdev;
434 maj = getmajor(newdev);
435 ASSERT(STREAMSTAB(maj));
436
437 error = stropen(vp, &newdev, flags, cr);
438
439 stp = vp->v_stream;
440 if (error == 0) {
441 if (so->so_flag & SOCLONE)
442 ASSERT(newdev != vp->v_rdev);
443 mutex_enter(&so->so_lock);
444 sti->sti_dev = newdev;
445 vp->v_rdev = newdev;
446 mutex_exit(&so->so_lock);
447
448 if (stp->sd_flag & STRISTTY) {
449 /*
450 * this is a post SVR4 tty driver - a socket can not
451 * be a controlling terminal. Fail the open.
452 */
453 (void) sotpi_close(so, flags, cr);
454 return (ENOTTY); /* XXX */
455 }
456
457 ASSERT(stp->sd_wrq != NULL);
458 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
459
460 /*
461 * If caller is interested in doing direct function call
462 * interface to/from transport module, probe the module
463 * directly beneath the streamhead to see if it qualifies.
464 *
465 * We turn off the direct interface when qualifications fail.
466 * In the acceptor case, we simply turn off the sti_direct
467 * flag on the socket. We do the fallback after the accept
468 * has completed, before the new socket is returned to the
469 * application.
470 */
471 if (sti->sti_direct) {
472 queue_t *tq = stp->sd_wrq->q_next;
473
474 /*
475 * sti_direct is currently supported and tested
476 * only for tcp/udp; this is the main reason to
477 * have the following assertions.
478 */
479 ASSERT(so->so_family == AF_INET ||
480 so->so_family == AF_INET6);
481 ASSERT(so->so_protocol == IPPROTO_UDP ||
482 so->so_protocol == IPPROTO_TCP ||
483 so->so_protocol == IPPROTO_IP);
484 ASSERT(so->so_type == SOCK_DGRAM ||
485 so->so_type == SOCK_STREAM);
486
487 /*
488 * Abort direct call interface if the module directly
489 * underneath the stream head is not defined with the
490 * _D_DIRECT flag. This could happen in the tcp or
491 * udp case, when some other module is autopushed
492 * above it, or for some reasons the expected module
493 * isn't purely D_MP (which is the main requirement).
494 */
495 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
496 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
497 int rval;
498
499 /* Continue on without direct calls */
500 sti->sti_direct = 0;
501
502 /*
503 * Cannot issue ioctl on fallback socket since
504 * there is no conn associated with the queue.
505 * The fallback downcall will notify the proto
506 * of the change.
507 */
508 if (!(flags & SO_ACCEPTOR) &&
509 !(flags & SO_FALLBACK)) {
510 if ((error = strioctl(vp,
511 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
512 cr, &rval)) != 0) {
513 (void) sotpi_close(so, flags,
514 cr);
515 return (error);
516 }
517 }
518 }
519 }
520
521 if (flags & SO_FALLBACK) {
522 /*
523 * The stream created does not have a conn.
524 * do stream set up after conn has been assigned
525 */
526 return (error);
527 }
528 if (error = so_strinit(so, tso)) {
529 (void) sotpi_close(so, flags, cr);
530 return (error);
531 }
532
533 /* Wildcard */
534 if (so->so_protocol != so->so_sockparams->sp_protocol) {
535 int protocol = so->so_protocol;
536 /*
537 * Issue SO_PROTOTYPE setsockopt.
538 */
539 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
540 &protocol, (t_uscalar_t)sizeof (protocol), cr);
541 if (error != 0) {
542 (void) sotpi_close(so, flags, cr);
543 /*
544 * Setsockopt often fails with ENOPROTOOPT but
545 * socket() should fail with
546 * EPROTONOSUPPORT/EPROTOTYPE.
547 */
548 return (EPROTONOSUPPORT);
549 }
550 }
551
552 } else {
553 /*
554 * While the same socket cannot be reopened (unlike specfs),
555 * the stream head sets STREOPENFAIL when the autopush fails.
556 */
557 if ((stp != NULL) &&
558 (stp->sd_flag & STREOPENFAIL)) {
559 /*
560 * Open failed part way through.
561 */
562 mutex_enter(&stp->sd_lock);
563 stp->sd_flag &= ~STREOPENFAIL;
564 mutex_exit(&stp->sd_lock);
565 (void) sotpi_close(so, flags, cr);
566 return (error);
567 /*NOTREACHED*/
568 }
569 ASSERT(stp == NULL);
570 }
571 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
572 "sockfs open:maj %d vp %p so %p error %d",
573 maj, vp, so, error);
574 return (error);
575 }
576
577 /*
578 * Bind the socket to an unspecified address in sockfs only.
579 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
580 * required in all cases.
581 */
582 static void
583 so_automatic_bind(struct sonode *so)
584 {
585 sotpi_info_t *sti = SOTOTPI(so);
586 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
587
588 ASSERT(MUTEX_HELD(&so->so_lock));
589 ASSERT(!(so->so_state & SS_ISBOUND));
590 ASSERT(sti->sti_unbind_mp);
591
592 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
593 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
594 sti->sti_laddr_sa->sa_family = so->so_family;
595 so->so_state |= SS_ISBOUND;
596 }
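/*
 * User-level view of the optimization above (sketch only, error handling
 * omitted): an AF_INET connect() without a prior bind() still leaves the
 * socket with a bound local endpoint, observable via getsockname():
 *
 *	struct sockaddr_in sin;
 *	socklen_t len = sizeof (sin);
 *	(void) connect(fd, (struct sockaddr *)&dst, sizeof (dst));
 *	(void) getsockname(fd, (struct sockaddr *)&sin, &len);
 *	... sin.sin_port now holds the transport-chosen local port ...
 */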
597
598
599 /*
600 * bind the socket.
601 *
602 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
603 * are passed in we allow rebinding. Note that for backwards compatibility
604 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
605 * Thus the rebinding code is currently not executed.
606 *
607 * The constraints for rebinding are:
608 * - it is a SOCK_DGRAM, or
609 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
610 * and no listen() has been done.
611 * This rebinding code was added based on some language in the XNET book
612 * about not returning EINVAL if the protocol allows rebinding. However,
613 * this language is not present in the POSIX socket draft. Thus maybe the
614 * rebinding logic should be deleted from the source.
615 *
616 * A null "name" can be used to unbind the socket if:
617 * - it is a SOCK_DGRAM, or
618 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
619 * and no listen() has been done.
620 */
621 /* ARGSUSED */
622 static int
623 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
624 socklen_t namelen, int backlog, int flags, struct cred *cr)
625 {
626 struct T_bind_req bind_req;
627 struct T_bind_ack *bind_ack;
628 int error = 0;
629 mblk_t *mp;
630 void *addr;
631 t_uscalar_t addrlen;
632 int unbind_on_err = 1;
633 boolean_t clear_acceptconn_on_err = B_FALSE;
634 boolean_t restore_backlog_on_err = B_FALSE;
635 int save_so_backlog;
636 t_scalar_t PRIM_type = O_T_BIND_REQ;
637 boolean_t tcp_udp_xport;
638 void *nl7c = NULL;
639 sotpi_info_t *sti = SOTOTPI(so);
640
641 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
642 (void *)so, (void *)name, namelen, backlog, flags,
643 pr_state(so->so_state, so->so_mode)));
644
645 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
646
647 if (!(flags & _SOBIND_LOCK_HELD)) {
648 mutex_enter(&so->so_lock);
649 so_lock_single(so); /* Set SOLOCKED */
650 } else {
651 ASSERT(MUTEX_HELD(&so->so_lock));
652 ASSERT(so->so_flag & SOLOCKED);
653 }
654
655 /*
656 * Make sure that there is a preallocated unbind_req message
657 * before binding. This message is allocated when the socket is
658 * created but it might have been consumed.
659 */
660 if (sti->sti_unbind_mp == NULL) {
661 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
662 /* NOTE: holding so_lock while sleeping */
663 sti->sti_unbind_mp =
664 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
665 cr);
666 }
667
668 if (flags & _SOBIND_REBIND) {
669 /*
670 * Called from solisten after doing an sotpi_unbind() or
671 * potentially without the unbind (latter for AF_INET{,6}).
672 */
673 ASSERT(name == NULL && namelen == 0);
674
675 if (so->so_family == AF_UNIX) {
676 ASSERT(sti->sti_ux_bound_vp);
677 addr = &sti->sti_ux_laddr;
678 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
679 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
680 "addr 0x%p, vp %p\n",
681 addrlen,
682 (void *)((struct so_ux_addr *)addr)->soua_vp,
683 (void *)sti->sti_ux_bound_vp));
684 } else {
685 addr = sti->sti_laddr_sa;
686 addrlen = (t_uscalar_t)sti->sti_laddr_len;
687 }
688 } else if (flags & _SOBIND_UNSPEC) {
689 ASSERT(name == NULL && namelen == 0);
690
691 /*
692 * The caller checked SS_ISBOUND but not necessarily
693 * under so_lock
694 */
695 if (so->so_state & SS_ISBOUND) {
696 /* No error */
697 goto done;
698 }
699
700 /* Set an initial local address */
701 switch (so->so_family) {
702 case AF_UNIX:
703 /*
704 * Use an address with same size as struct sockaddr
705 * just like BSD.
706 */
707 sti->sti_laddr_len =
708 (socklen_t)sizeof (struct sockaddr);
709 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
710 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
711 sti->sti_laddr_sa->sa_family = so->so_family;
712
713 /*
714 * Pass down an address with the implicit bind
715 * magic number and the rest all zeros.
716 * The transport will return a unique address.
717 */
718 sti->sti_ux_laddr.soua_vp = NULL;
719 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
720 addr = &sti->sti_ux_laddr;
721 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
722 break;
723
724 case AF_INET:
725 case AF_INET6:
726 /*
727 * An unspecified bind in TPI has a NULL address.
728 * Set the address in sockfs to have the sa_family.
729 */
730 sti->sti_laddr_len = (so->so_family == AF_INET) ?
731 (socklen_t)sizeof (sin_t) :
732 (socklen_t)sizeof (sin6_t);
733 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
734 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
735 sti->sti_laddr_sa->sa_family = so->so_family;
736 addr = NULL;
737 addrlen = 0;
738 break;
739
740 default:
741 /*
742 * An unspecified bind in TPI has a NULL address.
743 * Set the address in sockfs to be zero length.
744 *
745 * Can not assume there is a sa_family for all
746 * protocol families. For example, AF_X25 does not
747 * have a family field.
748 */
749 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
750 sti->sti_laddr_len = 0; /* XXX correct? */
751 addr = NULL;
752 addrlen = 0;
753 break;
754 }
755
756 } else {
757 if (so->so_state & SS_ISBOUND) {
758 /*
759 * If it is ok to rebind the socket, first unbind
760 * with the transport. A rebind to the NULL address
761 * is interpreted as an unbind.
762 * Note that a bind to NULL in BSD does unbind the
763 * socket but it fails with EINVAL.
764 * Note that regular sockets set SOV_SOCKBSD i.e.
765 * _SOBIND_SOCKBSD gets set here hence no type of
766 * socket does currently allow rebinding.
767 *
768 * If the name is NULL just do an unbind.
769 */
770 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
771 name != NULL) {
772 error = EINVAL;
773 unbind_on_err = 0;
774 eprintsoline(so, error);
775 goto done;
776 }
777 if ((so->so_mode & SM_CONNREQUIRED) &&
778 (so->so_state & SS_CANTREBIND)) {
779 error = EINVAL;
780 unbind_on_err = 0;
781 eprintsoline(so, error);
782 goto done;
783 }
784 error = sotpi_unbind(so, 0);
785 if (error) {
786 eprintsoline(so, error);
787 goto done;
788 }
789 ASSERT(!(so->so_state & SS_ISBOUND));
790 if (name == NULL) {
791 so->so_state &=
792 ~(SS_ISCONNECTED|SS_ISCONNECTING);
793 goto done;
794 }
795 }
796
797 /* X/Open requires this check */
798 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
799 if (xnet_check_print) {
800 printf("sockfs: X/Open bind state check "
801 "caused EINVAL\n");
802 }
803 error = EINVAL;
804 goto done;
805 }
806
807 switch (so->so_family) {
808 case AF_UNIX:
809 /*
810 * All AF_UNIX addresses are nul terminated
811 * when copied in (copyin_name), so the minimum
812 * length is 3 bytes.
813 */
814 if (name == NULL ||
815 (ssize_t)namelen <= sizeof (short) + 1) {
816 error = EISDIR;
817 eprintsoline(so, error);
818 goto done;
819 }
820 /*
821 * Verify so_family matches the bound family.
822 * BSD does not check this for AF_UNIX resulting
823 * in funny mknods.
824 */
825 if (name->sa_family != so->so_family) {
826 error = EAFNOSUPPORT;
827 goto done;
828 }
829 break;
830 case AF_INET:
831 if (name == NULL) {
832 error = EINVAL;
833 eprintsoline(so, error);
834 goto done;
835 }
836 if ((size_t)namelen != sizeof (sin_t)) {
837 error = name->sa_family != so->so_family ?
838 EAFNOSUPPORT : EINVAL;
839 eprintsoline(so, error);
840 goto done;
841 }
842 if ((flags & _SOBIND_XPG4_2) &&
843 (name->sa_family != so->so_family)) {
844 /*
845 * This check has to be made for X/Open
846 * sockets; however, application failures have
847 * been observed when it is applied to
848 * all sockets.
849 */
850 error = EAFNOSUPPORT;
851 eprintsoline(so, error);
852 goto done;
853 }
854 /*
855 * Force a zero sa_family to match so_family.
856 *
857 * Some programs like inetd(1M) don't set the
858 * family field. Other programs leave
859 * sin_family set to garbage - SunOS 4.X does
860 * not check the family field on a bind.
861 * We use the family field that
862 * was passed in to the socket() call.
863 */
864 name->sa_family = so->so_family;
865 break;
866
867 case AF_INET6: {
868 #ifdef DEBUG
869 sin6_t *sin6 = (sin6_t *)name;
870 #endif /* DEBUG */
871
872 if (name == NULL) {
873 error = EINVAL;
874 eprintsoline(so, error);
875 goto done;
876 }
877 if ((size_t)namelen != sizeof (sin6_t)) {
878 error = name->sa_family != so->so_family ?
879 EAFNOSUPPORT : EINVAL;
880 eprintsoline(so, error);
881 goto done;
882 }
883 if (name->sa_family != so->so_family) {
884 /*
885 * With IPv6 we require the family to match
886 * unlike in IPv4.
887 */
888 error = EAFNOSUPPORT;
889 eprintsoline(so, error);
890 goto done;
891 }
892 #ifdef DEBUG
893 /*
894 * Verify that apps don't forget to clear
895 * sin6_scope_id etc
896 */
897 if (sin6->sin6_scope_id != 0 &&
898 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
899 zcmn_err(getzoneid(), CE_WARN,
900 "bind with uninitialized sin6_scope_id "
901 "(%d) on socket. Pid = %d\n",
902 (int)sin6->sin6_scope_id,
903 (int)curproc->p_pid);
904 }
905 if (sin6->__sin6_src_id != 0) {
906 zcmn_err(getzoneid(), CE_WARN,
907 "bind with uninitialized __sin6_src_id "
908 "(%d) on socket. Pid = %d\n",
909 (int)sin6->__sin6_src_id,
910 (int)curproc->p_pid);
911 }
912 #endif /* DEBUG */
913 break;
914 }
915 default:
916 /*
917 * Don't do any length or sa_family check to allow
918 * non-sockaddr style addresses.
919 */
920 if (name == NULL) {
921 error = EINVAL;
922 eprintsoline(so, error);
923 goto done;
924 }
925 break;
926 }
927
928 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
929 error = ENAMETOOLONG;
930 eprintsoline(so, error);
931 goto done;
932 }
933 /*
934 * Save local address.
935 */
936 sti->sti_laddr_len = (socklen_t)namelen;
937 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
938 bcopy(name, sti->sti_laddr_sa, namelen);
939
940 addr = sti->sti_laddr_sa;
941 addrlen = (t_uscalar_t)sti->sti_laddr_len;
942 switch (so->so_family) {
943 case AF_INET6:
944 case AF_INET:
945 break;
946 case AF_UNIX: {
947 struct sockaddr_un *soun =
948 (struct sockaddr_un *)sti->sti_laddr_sa;
949 struct vnode *vp, *rvp;
950 struct vattr vattr;
951
952 ASSERT(sti->sti_ux_bound_vp == NULL);
953 /*
954 * Create vnode for the specified path name.
955 * Keep vnode held with a reference in sti_ux_bound_vp.
956 * Use the vnode pointer as the address used in the
957 * bind with the transport.
958 *
959 * The mode used is 0777 modified by the process
960 * umask (see va_mode below).
961 */
962 /* MAXPATHLEN + soun_family + nul termination */
963 if (sti->sti_laddr_len >
964 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
965 error = ENAMETOOLONG;
966 eprintsoline(so, error);
967 goto done;
968 }
969 vattr.va_type = VSOCK;
970 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
971 vattr.va_mask = AT_TYPE|AT_MODE;
972 /* NOTE: holding so_lock */
973 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
974 EXCL, 0, &vp, CRMKNOD, 0, 0);
975 if (error) {
976 if (error == EEXIST)
977 error = EADDRINUSE;
978 eprintsoline(so, error);
979 goto done;
980 }
981 /*
982 * Establish pointer from the underlying filesystem
983 * vnode to the socket node.
984 * sti_ux_bound_vp and v_stream->sd_vnode form the
985 * cross-linkage between the underlying filesystem
986 * node and the socket node.
987 */
988
989 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
990 VN_HOLD(rvp);
991 VN_RELE(vp);
992 vp = rvp;
993 }
994
995 ASSERT(SOTOV(so)->v_stream);
996 mutex_enter(&vp->v_lock);
997 vp->v_stream = SOTOV(so)->v_stream;
998 sti->sti_ux_bound_vp = vp;
999 mutex_exit(&vp->v_lock);
1000
1001 /*
1002 * Use the vnode pointer value as a unique address
1003 * (together with the magic number to avoid conflicts
1004 * with implicit binds) in the transport provider.
1005 */
1006 sti->sti_ux_laddr.soua_vp =
1007 (void *)sti->sti_ux_bound_vp;
1008 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1009 addr = &sti->sti_ux_laddr;
1010 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1011 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1012 addrlen,
1013 (void *)((struct so_ux_addr *)addr)->soua_vp));
1014 break;
1015 }
1016 } /* end switch (so->so_family) */
1017 }
1018
1019 /*
1020 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1021 * the transport can start passing up T_CONN_IND messages
1022 * as soon as it receives the bind req and strsock_proto()
1023 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1024 */
1025 if (flags & _SOBIND_LISTEN) {
1026 if ((so->so_state & SS_ACCEPTCONN) == 0)
1027 clear_acceptconn_on_err = B_TRUE;
1028 save_so_backlog = so->so_backlog;
1029 restore_backlog_on_err = B_TRUE;
1030 so->so_state |= SS_ACCEPTCONN;
1031 so->so_backlog = backlog;
1032 }
1033
1034 /*
1035 * If NL7C addr(s) have been configured check for addr/port match,
1036 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1037 *
1038 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1039 * family sockets only. If match mark as such.
1040 */
1041 if (nl7c_enabled && ((addr != NULL &&
1042 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1043 (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1044 sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1045 /*
1046 * NL7C is not supported in non-global zones,
1047 * we enforce this restriction here.
1048 */
1049 if (so->so_zoneid == GLOBAL_ZONEID) {
1050 /* An NL7C socket, mark it */
1051 sti->sti_nl7c_flags |= NL7C_ENABLED;
1052 if (nl7c == NULL) {
1053 /*
1054 * Was an AF_NCA bind() so add it to the
1055 * addr list for reporting purposes.
1056 */
1057 nl7c = nl7c_add_addr(addr, addrlen);
1058 }
1059 } else
1060 nl7c = NULL;
1061 }
1062
1063 /*
1064 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1065 * for other transports we will send in a O_T_BIND_REQ.
1066 */
1067 if (tcp_udp_xport &&
1068 (so->so_family == AF_INET || so->so_family == AF_INET6))
1069 PRIM_type = T_BIND_REQ;
1070
1071 bind_req.PRIM_type = PRIM_type;
1072 bind_req.ADDR_length = addrlen;
1073 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1074 bind_req.CONIND_number = backlog;
1075 /* NOTE: holding so_lock while sleeping */
1076 mp = soallocproto2(&bind_req, sizeof (bind_req),
1077 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1078 sti->sti_laddr_valid = 0;
1079
1080 /* Done using sti_laddr_sa - can drop the lock */
1081 mutex_exit(&so->so_lock);
1082
1083 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1084 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1085 if (error) {
1086 eprintsoline(so, error);
1087 mutex_enter(&so->so_lock);
1088 goto done;
1089 }
1090
1091 mutex_enter(&so->so_lock);
1092 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1093 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1094 if (error) {
1095 eprintsoline(so, error);
1096 goto done;
1097 }
1098 ASSERT(mp);
1099 /*
1100 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1101 * strsock_proto while the lock was dropped above, the bind
1102 * is allowed to complete.
1103 */
1104
1105 /* Mark as bound. This will be undone if we detect errors below. */
1106 if (flags & _SOBIND_NOXLATE) {
1107 ASSERT(so->so_family == AF_UNIX);
1108 sti->sti_faddr_noxlate = 1;
1109 }
1110 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1111 so->so_state |= SS_ISBOUND;
1112 ASSERT(sti->sti_unbind_mp);
1113
1114 /* note that we've already set SS_ACCEPTCONN above */
1115
1116 /*
1117 * Recompute addrlen - an unspecified bind sent down an
1118 * address of length zero but we expect the appropriate length
1119 * in return.
1120 */
1121 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1122 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1123
1124 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1125 /*
1126 * The alignment restriction is really too strict but
1127 * we want enough alignment to inspect the fields of
1128 * a sockaddr_in.
1129 */
1130 addr = sogetoff(mp, bind_ack->ADDR_offset,
1131 bind_ack->ADDR_length,
1132 __TPI_ALIGN_SIZE);
1133 if (addr == NULL) {
1134 freemsg(mp);
1135 error = EPROTO;
1136 eprintsoline(so, error);
1137 goto done;
1138 }
1139 if (!(flags & _SOBIND_UNSPEC)) {
1140 /*
1141 * Verify that the transport didn't return something we
1142 * did not want e.g. an address other than what we asked for.
1143 *
1144 * NOTE: These checks would go away if/when we switch to
1145 * using the new TPI (in which the transport would fail
1146 * the request instead of assigning a different address).
1147 *
1148 * NOTE2: For protocols that we don't know (i.e. any
1149 * other than AF_INET6, AF_INET and AF_UNIX), we
1150 * cannot know if the transport should be expected to
1151 * return the same address as that requested.
1152 *
1153 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1154 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1155 *
1156 * For example, in the case of netatalk it may be
1157 * inappropriate for the transport to return the
1158 * requested address (as it may have allocated a local
1159 * port number in behaviour similar to that of an
1160 * AF_INET bind request with a port number of zero).
1161 *
1162 * Given the definition of O_T_BIND_REQ, where the
1163 * transport may bind to an address other than the
1164 * requested address, it's not possible to determine
1165 * whether a returned address that differs from the
1166 * requested address is a reason to fail (because the
1167 * requested address was not available) or succeed
1168 * (because the transport allocated an appropriate
1169 * address and/or port).
1170 *
1171 * sockfs currently requires that the transport return
1172 * the requested address in the T_BIND_ACK, unless
1173 * there is code here to allow for any discrepancy.
1174 * Such code exists for AF_INET and AF_INET6.
1175 *
1176 * Netatalk chooses to return the requested address
1177 * rather than the (correct) allocated address. This
1178 * means that netatalk violates the TPI specification
1179 * (and would not function correctly if used from a
1180 * TLI application), but it does mean that it works
1181 * with sockfs.
1182 *
1183 * As noted above, using the newer XTI bind primitive
1184 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1185 * allow sockfs to be more sure about whether or not
1186 * the bind request had succeeded (as transports are
1187 * not permitted to bind to a different address than
1188 * that requested - they must return failure).
1189 * Unfortunately, support for T_BIND_REQ may not be
1190 * present in all transport implementations (netatalk,
1191 * for example, doesn't have it), making the
1192 * transition difficult.
1193 */
1194 if (bind_ack->ADDR_length != addrlen) {
1195 /* Assumes that the requested address was in use */
1196 freemsg(mp);
1197 error = EADDRINUSE;
1198 eprintsoline(so, error);
1199 goto done;
1200 }
1201
1202 switch (so->so_family) {
1203 case AF_INET6:
1204 case AF_INET: {
1205 sin_t *rname, *aname;
1206
1207 rname = (sin_t *)addr;
1208 aname = (sin_t *)sti->sti_laddr_sa;
1209
1210 /*
1211 * Take advantage of the alignment
1212 * of sin_port and sin6_port which fall
1213 * in the same place in their data structures.
1214 * Just use sin_port for either address family.
1215 *
1216 * This may become a problem if (heaven forbid)
1217 * there's a separate ipv6port_reserved... :-P
1218 *
1219 * Binding to port 0 has the semantics of letting
1220 * the transport bind to any port.
1221 *
1222 * If the transport is TCP or UDP since we had sent
1223 * a T_BIND_REQ we would not get a port other than
1224 * what we asked for.
1225 */
1226 if (tcp_udp_xport) {
1227 /*
1228 * Pick up the new port number if we bound to
1229 * port 0.
1230 */
1231 if (aname->sin_port == 0)
1232 aname->sin_port = rname->sin_port;
1233 sti->sti_laddr_valid = 1;
1234 break;
1235 }
1236 if (aname->sin_port != 0 &&
1237 aname->sin_port != rname->sin_port) {
1238 freemsg(mp);
1239 error = EADDRINUSE;
1240 eprintsoline(so, error);
1241 goto done;
1242 }
1243 /*
1244 * Pick up the new port number if we bound to port 0.
1245 */
1246 aname->sin_port = rname->sin_port;
1247
1248 /*
1249 * Unfortunately, addresses aren't _quite_ the same.
1250 */
1251 if (so->so_family == AF_INET) {
1252 if (aname->sin_addr.s_addr !=
1253 rname->sin_addr.s_addr) {
1254 freemsg(mp);
1255 error = EADDRNOTAVAIL;
1256 eprintsoline(so, error);
1257 goto done;
1258 }
1259 } else {
1260 sin6_t *rname6 = (sin6_t *)rname;
1261 sin6_t *aname6 = (sin6_t *)aname;
1262
1263 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1264 &rname6->sin6_addr)) {
1265 freemsg(mp);
1266 error = EADDRNOTAVAIL;
1267 eprintsoline(so, error);
1268 goto done;
1269 }
1270 }
1271 break;
1272 }
1273 case AF_UNIX:
1274 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1275 freemsg(mp);
1276 error = EADDRINUSE;
1277 eprintsoline(so, error);
1278 eprintso(so,
1279 ("addrlen %d, addr 0x%x, vp %p\n",
1280 addrlen, *((int *)addr),
1281 (void *)sti->sti_ux_bound_vp));
1282 goto done;
1283 }
1284 sti->sti_laddr_valid = 1;
1285 break;
1286 default:
1287 /*
1288 * NOTE: This assumes that addresses can be
1289 * byte-compared for equivalence.
1290 */
1291 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1292 freemsg(mp);
1293 error = EADDRINUSE;
1294 eprintsoline(so, error);
1295 goto done;
1296 }
1297 /*
1298 * Don't mark sti_laddr_valid, as we cannot be
1299 * sure that the returned address is the real
1300 * bound address when talking to an unknown
1301 * transport.
1302 */
1303 break;
1304 }
1305 } else {
1306 /*
1307 * Save the returned address for getsockname.
1308 * Needed for an unspecified bind unless the transport supports
1309 * the TI_GETMYNAME ioctl.
1310 * Do this for AF_INET{,6} even though they do, as
1311 * caching the info here gives much better performance than
1312 * a TPI/STREAMS trip to the transport for getsockname.
1313 * Any family which can't do so for some reason _must_ _not_ set
1314 * sti_laddr_valid here, so that the caching version of
1315 * getsockname does not break.
1316 */
1317 switch (so->so_family) {
1318 case AF_UNIX:
1319 /*
1320 * Record the address bound with the transport
1321 * for use by socketpair.
1322 */
1323 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1324 sti->sti_laddr_valid = 1;
1325 break;
1326 case AF_INET:
1327 case AF_INET6:
1328 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1329 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1330 sti->sti_laddr_valid = 1;
1331 break;
1332 default:
1333 /*
1334 * Don't mark sti_laddr_valid, as we cannot be
1335 * sure that the returned address is the real
1336 * bound address when talking to an unknown
1337 * transport.
1338 */
1339 break;
1340 }
1341 }
1342
1343 if (nl7c != NULL) {
1344 /* Register listen()er sonode pointer with NL7C */
1345 nl7c_listener_addr(nl7c, so);
1346 }
1347
1348 freemsg(mp);
1349
1350 done:
1351 if (error) {
1352 /* reset state & backlog to values held on entry */
1353 if (clear_acceptconn_on_err == B_TRUE)
1354 so->so_state &= ~SS_ACCEPTCONN;
1355 if (restore_backlog_on_err == B_TRUE)
1356 so->so_backlog = save_so_backlog;
1357
1358 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1359 int err;
1360
1361 err = sotpi_unbind(so, 0);
1362 /* LINTED - statement has no consequent: if */
1363 if (err) {
1364 eprintsoline(so, error);
1365 } else {
1366 ASSERT(!(so->so_state & SS_ISBOUND));
1367 }
1368 }
1369 }
1370 if (!(flags & _SOBIND_LOCK_HELD)) {
1371 so_unlock_single(so, SOLOCKED);
1372 mutex_exit(&so->so_lock);
1373 } else {
1374 ASSERT(MUTEX_HELD(&so->so_lock));
1375 ASSERT(so->so_flag & SOLOCKED);
1376 }
1377 return (error);
1378 }
1379
1380 /* bind the socket */
1381 static int
1382 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1383 int flags, struct cred *cr)
1384 {
1385 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1386 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1387
1388 flags &= ~_SOBIND_SOCKETPAIR;
1389 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1390 }
1391
1392 /*
1393 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1394 * address, or when listen needs to unbind and bind.
1395 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1396 * so that a sobind can pick them up.
1397 */
1398 static int
1399 sotpi_unbind(struct sonode *so, int flags)
1400 {
1401 struct T_unbind_req unbind_req;
1402 int error = 0;
1403 mblk_t *mp;
1404 sotpi_info_t *sti = SOTOTPI(so);
1405
1406 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1407 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1408
1409 ASSERT(MUTEX_HELD(&so->so_lock));
1410 ASSERT(so->so_flag & SOLOCKED);
1411
1412 if (!(so->so_state & SS_ISBOUND)) {
1413 error = EINVAL;
1414 eprintsoline(so, error);
1415 goto done;
1416 }
1417
1418 mutex_exit(&so->so_lock);
1419
1420 /*
1421 * Flush the read and write side (except stream head read queue)
1422 * and send down T_UNBIND_REQ.
1423 */
1424 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1425
1426 unbind_req.PRIM_type = T_UNBIND_REQ;
1427 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1428 0, _ALLOC_SLEEP, CRED());
1429 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1430 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1431 mutex_enter(&so->so_lock);
1432 if (error) {
1433 eprintsoline(so, error);
1434 goto done;
1435 }
1436
1437 error = sowaitokack(so, T_UNBIND_REQ);
1438 if (error) {
1439 eprintsoline(so, error);
1440 goto done;
1441 }
1442
1443 /*
1444 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1445 * strsock_proto while the lock was dropped above, the unbind
1446 * is allowed to complete.
1447 */
1448 if (!(flags & _SOUNBIND_REBIND)) {
1449 /*
1450 * Clear out bound address.
1451 */
1452 vnode_t *vp;
1453
1454 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1455 sti->sti_ux_bound_vp = NULL;
1456 vn_rele_stream(vp);
1457 }
1458 /* Clear out address */
1459 sti->sti_laddr_len = 0;
1460 }
1461 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1462 sti->sti_laddr_valid = 0;
1463
1464 done:
1465
1466 /* If the caller held the lock don't release it here */
1467 ASSERT(MUTEX_HELD(&so->so_lock));
1468 ASSERT(so->so_flag & SOLOCKED);
1469
1470 return (error);
1471 }
1472
1473 /*
1474 * listen on the socket.
1475 * For TPI conforming transports this has to first unbind with the transport
1476 * and then bind again using the new backlog.
1477 */
1478 /* ARGSUSED */
1479 int
1480 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1481 {
1482 int error = 0;
1483 sotpi_info_t *sti = SOTOTPI(so);
1484
1485 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1486 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1487
1488 if (sti->sti_serv_type == T_CLTS)
1489 return (EOPNOTSUPP);
1490
1491 /*
1492 * If the socket is ready to accept connections already, then
1493 * return without doing anything. This avoids a problem where
1494 * a second listen() call fails if a connection is pending, which
1495 * would leave the socket unbound. Only when we are not unbinding
1496 * with the transport can we safely increase the backlog.
1497 */
1498 if (so->so_state & SS_ACCEPTCONN &&
1499 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1500 /*CONSTCOND*/
1501 !solisten_tpi_tcp))
1502 return (0);
1503
1504 if (so->so_state & SS_ISCONNECTED)
1505 return (EINVAL);
1506
1507 mutex_enter(&so->so_lock);
1508 so_lock_single(so); /* Set SOLOCKED */
1509
1510 /*
1511 * If the listen doesn't change the backlog we do nothing.
1512 * This avoids an EPROTO error from the transport.
1513 */
1514 if ((so->so_state & SS_ACCEPTCONN) &&
1515 so->so_backlog == backlog)
1516 goto done;
1517
1518 if (!(so->so_state & SS_ISBOUND)) {
1519 /*
1520 * Must have been explicitly bound in the UNIX domain.
1521 */
1522 if (so->so_family == AF_UNIX) {
1523 error = EINVAL;
1524 goto done;
1525 }
1526 error = sotpi_bindlisten(so, NULL, 0, backlog,
1527 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1528 } else if (backlog > 0) {
1529 /*
1530 * AF_INET{,6} hack to avoid losing the port.
1531 * Assumes that all AF_INET{,6} transports can handle a
1532 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1533 * has already bound thus it is possible to avoid the unbind.
1534 */
1535 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1536 /*CONSTCOND*/
1537 !solisten_tpi_tcp)) {
1538 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1539 if (error)
1540 goto done;
1541 }
1542 error = sotpi_bindlisten(so, NULL, 0, backlog,
1543 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1544 } else {
1545 so->so_state |= SS_ACCEPTCONN;
1546 so->so_backlog = backlog;
1547 }
1548 if (error)
1549 goto done;
1550 ASSERT(so->so_state & SS_ACCEPTCONN);
1551 done:
1552 so_unlock_single(so, SOLOCKED);
1553 mutex_exit(&so->so_lock);
1554 return (error);
1555 }
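/*
 * User-level consequence of the logic above (sketch, error handling
 * omitted): for AF_INET{,6} a listen() without a preceding bind() is
 * accepted because sotpi_listen() performs the unspecified bind itself,
 * while AF_UNIX returns EINVAL unless the socket was explicitly bound:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	if (listen(fd, 5) == 0) {
 *		... the transport chose an ephemeral port for us ...
 *	}
 */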
1556
1557 /*
1558 * Disconnect either a specified seqno or all (-1).
1559 * The former is used on listening sockets only.
1560 *
1561 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1562 * the current use of sodisconnect(seqno == -1) is only for shutdown
1563 * so there is no point (and potentially incorrect) to unbind.
1564 */
1565 static int
1566 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1567 {
1568 struct T_discon_req discon_req;
1569 int error = 0;
1570 mblk_t *mp;
1571
1572 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1573 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1574
1575 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1576 mutex_enter(&so->so_lock);
1577 so_lock_single(so); /* Set SOLOCKED */
1578 } else {
1579 ASSERT(MUTEX_HELD(&so->so_lock));
1580 ASSERT(so->so_flag & SOLOCKED);
1581 }
1582
1583 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1584 error = EINVAL;
1585 eprintsoline(so, error);
1586 goto done;
1587 }
1588
1589 mutex_exit(&so->so_lock);
1590 /*
1591 * Flush the write side (unless this is a listener)
1592 * and then send down a T_DISCON_REQ.
1593 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1594 * and other messages.)
1595 */
1596 if (!(so->so_state & SS_ACCEPTCONN))
1597 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1598
1599 discon_req.PRIM_type = T_DISCON_REQ;
1600 discon_req.SEQ_number = seqno;
1601 mp = soallocproto1(&discon_req, sizeof (discon_req),
1602 0, _ALLOC_SLEEP, CRED());
1603 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1604 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1605 mutex_enter(&so->so_lock);
1606 if (error) {
1607 eprintsoline(so, error);
1608 goto done;
1609 }
1610
1611 error = sowaitokack(so, T_DISCON_REQ);
1612 if (error) {
1613 eprintsoline(so, error);
1614 goto done;
1615 }
1616 /*
1617 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1618 * strsock_proto while the lock was dropped above, the disconnect
1619 * is allowed to complete. However, it is not possible to
1620 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1621 */
1622 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1623 SOTOTPI(so)->sti_laddr_valid = 0;
1624 SOTOTPI(so)->sti_faddr_valid = 0;
1625 done:
1626 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1627 so_unlock_single(so, SOLOCKED);
1628 mutex_exit(&so->so_lock);
1629 } else {
1630 /* If the caller held the lock don't release it here */
1631 ASSERT(MUTEX_HELD(&so->so_lock));
1632 ASSERT(so->so_flag & SOLOCKED);
1633 }
1634 return (error);
1635 }
1636
1637 /* ARGSUSED */
1638 int
1639 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1640 struct sonode **nsop)
1641 {
1642 struct T_conn_ind *conn_ind;
1643 struct T_conn_res *conn_res;
1644 int error = 0;
1645 mblk_t *mp, *ack_mp;
1646 struct sonode *nso;
1647 vnode_t *nvp;
1648 void *src;
1649 t_uscalar_t srclen;
1650 void *opt;
1651 t_uscalar_t optlen;
1652 t_scalar_t PRIM_type;
1653 t_scalar_t SEQ_number;
1654 size_t sinlen;
1655 sotpi_info_t *sti = SOTOTPI(so);
1656 sotpi_info_t *nsti;
1657
1658 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1659 (void *)so, fflag, (void *)nsop,
1660 pr_state(so->so_state, so->so_mode)));
1661
1662 /*
1663 * Defer single-threading the accepting socket until
1664 * the T_CONN_IND has been received and parsed and the
1665 * new sonode has been opened.
1666 */
1667
1668 /* Check that we are not already connected */
1669 if ((so->so_state & SS_ACCEPTCONN) == 0)
1670 goto conn_bad;
1671 again:
1672 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1673 goto e_bad;
1674
1675 ASSERT(mp != NULL);
1676 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1677
1678 /*
1679 * Save SEQ_number for error paths.
1680 */
1681 SEQ_number = conn_ind->SEQ_number;
1682
1683 srclen = conn_ind->SRC_length;
1684 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1685 if (src == NULL) {
1686 error = EPROTO;
1687 freemsg(mp);
1688 eprintsoline(so, error);
1689 goto disconnect_unlocked;
1690 }
1691 optlen = conn_ind->OPT_length;
1692 switch (so->so_family) {
1693 case AF_INET:
1694 case AF_INET6:
1695 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1696 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1697 &opt, conn_ind->OPT_length);
1698 } else {
1699 /*
1700 * The transport (in this case TCP) hasn't sent up
1701 * a pointer to an instance for the accept fast-path.
1702 * Disable fast-path completely because the call to
1703 * sotpi_create() below would otherwise create an
1704 * incomplete TCP instance, which would lead to
1705 * problems when sockfs sends a normal T_CONN_RES
1706 * message down the new stream.
1707 */
1708 if (sti->sti_direct) {
1709 int rval;
1710 /*
1711 * For consistency we inform tcp to disable
1712 * direct interface on the listener, though
1713 * we can certainly live without doing this
1714 * because no data will ever travel upstream
1715 * on the listening socket.
1716 */
1717 sti->sti_direct = 0;
1718 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1719 0, 0, K_TO_K, cr, &rval);
1720 }
1721 opt = NULL;
1722 optlen = 0;
1723 }
1724 break;
1725 case AF_UNIX:
1726 default:
1727 if (optlen != 0) {
1728 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1729 __TPI_ALIGN_SIZE);
1730 if (opt == NULL) {
1731 error = EPROTO;
1732 freemsg(mp);
1733 eprintsoline(so, error);
1734 goto disconnect_unlocked;
1735 }
1736 }
1737 if (so->so_family == AF_UNIX) {
1738 if (!sti->sti_faddr_noxlate) {
1739 src = NULL;
1740 srclen = 0;
1741 }
1742 /* Extract src address from options */
1743 if (optlen != 0)
1744 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1745 }
1746 break;
1747 }
1748
1749 /*
1750 * Create the new socket.
1751 */
1752 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1753 if (nso == NULL) {
1754 ASSERT(error != 0);
1755 /*
1756 * Accept can not fail with ENOBUFS. sotpi_create
1757 * sleeps waiting for memory until a signal is caught
1758 * so return EINTR.
1759 */
1760 freemsg(mp);
1761 if (error == ENOBUFS)
1762 error = EINTR;
1763 goto e_disc_unl;
1764 }
1765 nvp = SOTOV(nso);
1766 nsti = SOTOTPI(nso);
1767
1768 #ifdef DEBUG
1769 /*
1770 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1771 * it's inherited early to allow debugging of the accept code itself.
1772 */
1773 nso->so_options |= so->so_options & SO_DEBUG;
1774 #endif /* DEBUG */
1775
1776 /*
1777 * Save the SRC address from the T_CONN_IND
1778 * for getpeername to work on AF_UNIX and on transports that do not
1779 * support TI_GETPEERNAME.
1780 *
1781 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1782 * copyin_name().
1783 */
1784 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1785 error = EINVAL;
1786 freemsg(mp);
1787 eprintsoline(so, error);
1788 goto disconnect_vp_unlocked;
1789 }
1790 nsti->sti_faddr_len = (socklen_t)srclen;
1791 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1792 bcopy(src, nsti->sti_faddr_sa, srclen);
1793 nsti->sti_faddr_valid = 1;
1794
1795 /*
1796 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1797 */
1798 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1799 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1800 cred_t *cr;
1801 pid_t cpid;
1802
1803 cr = msg_getcred(mp, &cpid);
1804 if (cr != NULL) {
1805 crhold(cr);
1806 nso->so_peercred = cr;
1807 nso->so_cpid = cpid;
1808 }
1809 freemsg(mp);
1810
1811 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1812 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1813 if (mp == NULL) {
1814 /*
1815 * Accept can not fail with ENOBUFS.
1816 * A signal was caught so return EINTR.
1817 */
1818 error = EINTR;
1819 eprintsoline(so, error);
1820 goto disconnect_vp_unlocked;
1821 }
1822 conn_res = (struct T_conn_res *)mp->b_rptr;
1823 } else {
1824 /*
1825 * For efficiency reasons we use msg_extractcred; no crhold
1826 * needed since db_credp is cleared (i.e., we move the cred
1827 * from the message to so_peercred).
1828 */
1829 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1830
1831 mp->b_rptr = DB_BASE(mp);
1832 conn_res = (struct T_conn_res *)mp->b_rptr;
1833 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1834
1835 mblk_setcred(mp, cr, curproc->p_pid);
1836 }
1837
1838 /*
1839 * The new socket must be bound at least in sockfs and, except for
1840 * AF_INET (or AF_INET6), it also has to be bound in the transport provider.
1841 * We set the local address in the sonode from the T_OK_ACK of the
1842 * T_CONN_RES. For this reason the address we bind to here isn't
1843 * important.
1844 */
1845 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1846 /*CONSTCOND*/
1847 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1848 /*
1849 * Optimization for AF_INET{,6} transports
1850 * that can handle a T_CONN_RES without being bound.
1851 */
1852 mutex_enter(&nso->so_lock);
1853 so_automatic_bind(nso);
1854 mutex_exit(&nso->so_lock);
1855 } else {
1856 /* Perform NULL bind with the transport provider. */
1857 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1858 cr)) != 0) {
1859 ASSERT(error != ENOBUFS);
1860 freemsg(mp);
1861 eprintsoline(nso, error);
1862 goto disconnect_vp_unlocked;
1863 }
1864 }
1865
1866 /*
1867 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1868 * so that any data arriving on the new socket will cause the
1869 * appropriate signals to be delivered for the new socket.
1870 *
1871 * No other thread (except strsock_proto and strsock_misc)
1872 * can access the new socket thus we relax the locking.
1873 */
1874 nso->so_pgrp = so->so_pgrp;
1875 nso->so_state |= so->so_state & SS_ASYNC;
1876 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1877
1878 if (nso->so_pgrp != 0) {
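		/*
		 * Propagate signal ownership to the new socket. If this
		 * fails it is not fatal for accept; simply drop the
		 * inherited process group and continue.
		 */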
1879 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1880 eprintsoline(nso, error);
1881 error = 0;
1882 nso->so_pgrp = 0;
1883 }
1884 }
1885
	/*
	 * Make note of the socket-level options. TCP and IP level options
	 * are already inherited. We could do all this after accept has
	 * succeeded, but doing it here simplifies the code and does no harm
	 * in the error case.
	 */
1892 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1893 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1894 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1895 nso->so_sndbuf = so->so_sndbuf;
1896 nso->so_rcvbuf = so->so_rcvbuf;
1897 if (nso->so_options & SO_LINGER)
1898 nso->so_linger = so->so_linger;
1899
1900 /*
1901 * Note that the following sti_direct code path should be
1902 * removed once we are confident that the direct sockets
1903 * do not result in any degradation.
1904 */
1905 if (sti->sti_direct) {
1906
1907 ASSERT(opt != NULL);
1908
1909 conn_res->OPT_length = optlen;
1910 conn_res->OPT_offset = MBLKL(mp);
1911 bcopy(&opt, mp->b_wptr, optlen);
1912 mp->b_wptr += optlen;
1913 conn_res->PRIM_type = T_CONN_RES;
1914 conn_res->ACCEPTOR_id = 0;
1915 PRIM_type = T_CONN_RES;
1916
1917 /* Send down the T_CONN_RES on acceptor STREAM */
1918 error = kstrputmsg(SOTOV(nso), mp, NULL,
1919 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1920 if (error) {
1921 mutex_enter(&so->so_lock);
1922 so_lock_single(so);
1923 eprintsoline(so, error);
1924 goto disconnect_vp;
1925 }
1926 mutex_enter(&nso->so_lock);
1927 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1928 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1929 if (error) {
1930 mutex_exit(&nso->so_lock);
1931 mutex_enter(&so->so_lock);
1932 so_lock_single(so);
1933 eprintsoline(so, error);
1934 goto disconnect_vp;
1935 }
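		/*
		 * In the sti_direct case the transport appends the new
		 * socket's local sin/sin6 address immediately after the
		 * T_ok_ack; copy it so a later getsockname() does not need
		 * a TI_GETMYNAME round trip.
		 */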
1936 if (nso->so_family == AF_INET) {
1937 sin_t *sin;
1938
1939 sin = (sin_t *)(ack_mp->b_rptr +
1940 sizeof (struct T_ok_ack));
1941 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1942 nsti->sti_laddr_len = sizeof (sin_t);
1943 } else {
1944 sin6_t *sin6;
1945
1946 sin6 = (sin6_t *)(ack_mp->b_rptr +
1947 sizeof (struct T_ok_ack));
1948 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1949 nsti->sti_laddr_len = sizeof (sin6_t);
1950 }
1951 freemsg(ack_mp);
1952
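		/*
		 * The option passed with the T_CONN_IND identifies the
		 * transport's endpoint for this connection; record it as
		 * the lower handle for the new socket.
		 */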
1953 nso->so_state |= SS_ISCONNECTED;
1954 nso->so_proto_handle = (sock_lower_handle_t)opt;
1955 nsti->sti_laddr_valid = 1;
1956
1957 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
			/*
			 * An NL7C-marked listener, so the new socket
			 * inherits the listener's NL7C state, except
			 * for NL7C_POLLIN.
			 *
			 * Only call NL7C to process the new socket if
			 * the listen socket allows blocking i/o.
			 */
1966 nsti->sti_nl7c_flags =
1967 sti->sti_nl7c_flags & (~NL7C_POLLIN);
1968 if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
				/*
				 * Nonblocking accept(): just make the state
				 * persist, deferring processing to the
				 * read-side syscall (e.g. read).
				 */
1974 nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1975 } else if (nl7c_process(nso, B_FALSE)) {
1976 /*
1977 * NL7C has completed processing on the
1978 * socket, close the socket and back to
1979 * the top to await the next T_CONN_IND.
1980 */
1981 mutex_exit(&nso->so_lock);
1982 (void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1983 cr, NULL);
1984 VN_RELE(nvp);
1985 goto again;
1986 }
1987 /* Pass the new socket out */
1988 }
1989
1990 mutex_exit(&nso->so_lock);
1991
1992 /*
1993 * It's possible, through the use of autopush for example,
1994 * that the acceptor stream may not support sti_direct
1995 * semantics. If the new socket does not support sti_direct
1996 * we issue a _SIOCSOCKFALLBACK to inform the transport
1997 * as we would in the I_PUSH case.
1998 */
1999 if (nsti->sti_direct == 0) {
2000 int rval;
2001
2002 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2003 0, 0, K_TO_K, cr, &rval)) != 0) {
2004 mutex_enter(&so->so_lock);
2005 so_lock_single(so);
2006 eprintsoline(so, error);
2007 goto disconnect_vp;
2008 }
2009 }
2010
2011 /*
2012 * Pass out new socket.
2013 */
2014 if (nsop != NULL)
2015 *nsop = nso;
2016
2017 return (0);
2018 }
2019
/*
 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
 * which don't support the FireEngine accept fast-path. It is also
 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
 * again. Neither sockfs nor TCP attempts to find out whether some other
 * random module has been inserted in between (in which case we
 * should follow TLI accept behaviour). We blindly assume the worst
 * case and revert to the old behaviour, i.e. TCP will not send us
 * any option (eager) and the accept should happen on the listener
 * queue. Any queued T_CONN_INDs have already had their options removed
 * by so_sock2_stream() when "sockmod" was I_POP'd.
 */
2032 /*
2033 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2034 */
2035 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
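		/*
		 * The transport did not supply an acceptor id, so use
		 * O_T_CONN_RES and identify the acceptor stream by its
		 * read queue pointer (ILP32) or its device minor number
		 * (LP64).
		 */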
2036 #ifdef _ILP32
2037 queue_t *q;
2038
2039 /*
2040 * Find read queue in driver
2041 * Can safely do this since we "own" nso/nvp.
2042 */
2043 q = strvp2wq(nvp)->q_next;
2044 while (SAMESTR(q))
2045 q = q->q_next;
2046 q = RD(q);
2047 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2048 #else
2049 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2050 #endif /* _ILP32 */
2051 conn_res->PRIM_type = O_T_CONN_RES;
2052 PRIM_type = O_T_CONN_RES;
2053 } else {
2054 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2055 conn_res->PRIM_type = T_CONN_RES;
2056 PRIM_type = T_CONN_RES;
2057 }
2058 conn_res->SEQ_number = SEQ_number;
2059 conn_res->OPT_length = 0;
2060 conn_res->OPT_offset = 0;
2061
2062 mutex_enter(&so->so_lock);
2063 so_lock_single(so); /* Set SOLOCKED */
2064 mutex_exit(&so->so_lock);
2065
2066 error = kstrputmsg(SOTOV(so), mp, NULL,
2067 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2068 mutex_enter(&so->so_lock);
2069 if (error) {
2070 eprintsoline(so, error);
2071 goto disconnect_vp;
2072 }
2073 error = sowaitprim(so, PRIM_type, T_OK_ACK,
2074 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2075 if (error) {
2076 eprintsoline(so, error);
2077 goto disconnect_vp;
2078 }
2079 mutex_exit(&so->so_lock);
	/*
	 * If there is a sin/sin6 appended onto the T_OK_ACK, use
	 * that to set the local address. If it is not present
	 * then we zero out the address and don't set the
	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
	 * the pathname from the listening socket.
	 * In the case where this is TCP or an AF_UNIX socket the
	 * client side may have queued data or a T_ORDREL in the
	 * transport. Having now sent the T_CONN_RES we may receive
	 * those queued messages at any time. Hold the acceptor's
	 * so_lock until its state and laddr are finalized.
	 */
2092 mutex_enter(&nso->so_lock);
2093 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2096 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2097 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2098 nsti->sti_laddr_len = sinlen;
2099 nsti->sti_laddr_valid = 1;
2100 } else if (nso->so_family == AF_UNIX) {
2101 ASSERT(so->so_family == AF_UNIX);
2102 nsti->sti_laddr_len = sti->sti_laddr_len;
2103 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2104 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2105 nsti->sti_laddr_len);
2106 nsti->sti_laddr_valid = 1;
2107 } else {
2108 nsti->sti_laddr_len = sti->sti_laddr_len;
2109 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2110 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2111 nsti->sti_laddr_sa->sa_family = nso->so_family;
2112 }
2113 nso->so_state |= SS_ISCONNECTED;
2114 mutex_exit(&nso->so_lock);
2115
2116 freemsg(ack_mp);
2117
2118 mutex_enter(&so->so_lock);
2119 so_unlock_single(so, SOLOCKED);
2120 mutex_exit(&so->so_lock);
2121
2122 /*
2123 * Pass out new socket.
2124 */
2125 if (nsop != NULL)
2126 *nsop = nso;
2127
2128 return (0);
2129
2130
2131 eproto_disc_unl:
2132 error = EPROTO;
2133 e_disc_unl:
2134 eprintsoline(so, error);
2135 goto disconnect_unlocked;
2136
2137 pr_disc_vp_unl:
2138 eprintsoline(so, error);
2139 disconnect_vp_unlocked:
2140 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2141 VN_RELE(nvp);
2142 disconnect_unlocked:
2143 (void) sodisconnect(so, SEQ_number, 0);
2144 return (error);
2145
2146 pr_disc_vp:
2147 eprintsoline(so, error);
2148 disconnect_vp:
2149 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2150 so_unlock_single(so, SOLOCKED);
2151 mutex_exit(&so->so_lock);
2152 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2153 VN_RELE(nvp);
2154 return (error);
2155
2156 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2157 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2158 ? EOPNOTSUPP : EINVAL;
2159 e_bad:
2160 eprintsoline(so, error);
2161 return (error);
2162 }
2163
2164 /*
2165 * connect a socket.
2166 *
2167 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2168 * unconnect (by specifying a null address).
2169 */
2170 int
sotpi_connect(struct sonode *so,
2172 struct sockaddr *name,
2173 socklen_t namelen,
2174 int fflag,
2175 int flags,
2176 struct cred *cr)
2177 {
2178 struct T_conn_req conn_req;
2179 int error = 0;
2180 mblk_t *mp;
2181 void *src;
2182 socklen_t srclen;
2183 void *addr;
2184 socklen_t addrlen;
2185 boolean_t need_unlock;
2186 sotpi_info_t *sti = SOTOTPI(so);
2187
2188 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2189 (void *)so, (void *)name, namelen, fflag, flags,
2190 pr_state(so->so_state, so->so_mode)));
2191
2192 /*
2193 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2194 * avoid sleeping for memory with SOLOCKED held.
2195 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2196 * + sizeof (struct T_opthdr).
2197 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2198 * exceed sti_faddr_maxlen).
2199 */
2200 mp = soallocproto(sizeof (struct T_conn_req) +
2201 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2202 cr);
2203 if (mp == NULL) {
2204 /*
2205 * Connect can not fail with ENOBUFS. A signal was
2206 * caught so return EINTR.
2207 */
2208 error = EINTR;
2209 eprintsoline(so, error);
2210 return (error);
2211 }
2212
2213 mutex_enter(&so->so_lock);
2214 /*
2215 * Make sure there is a preallocated T_unbind_req message
2216 * before any binding. This message is allocated when the
2217 * socket is created. Since another thread can consume
2218 * so_unbind_mp by the time we return from so_lock_single(),
2219 * we should check the availability of so_unbind_mp after
2220 * we return from so_lock_single().
2221 */
2222
2223 so_lock_single(so); /* Set SOLOCKED */
2224 need_unlock = B_TRUE;
2225
2226 if (sti->sti_unbind_mp == NULL) {
2227 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2228 /* NOTE: holding so_lock while sleeping */
2229 sti->sti_unbind_mp =
2230 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2231 if (sti->sti_unbind_mp == NULL) {
2232 error = EINTR;
2233 goto done;
2234 }
2235 }
2236
2237 /*
2238 * Can't have done a listen before connecting.
2239 */
2240 if (so->so_state & SS_ACCEPTCONN) {
2241 error = EOPNOTSUPP;
2242 goto done;
2243 }
2244
2245 /*
2246 * Must be bound with the transport
2247 */
2248 if (!(so->so_state & SS_ISBOUND)) {
2249 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2250 /*CONSTCOND*/
2251 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2252 /*
2253 * Optimization for AF_INET{,6} transports
2254 * that can handle a T_CONN_REQ without being bound.
2255 */
2256 so_automatic_bind(so);
2257 } else {
2258 error = sotpi_bind(so, NULL, 0,
2259 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2260 if (error)
2261 goto done;
2262 }
2263 ASSERT(so->so_state & SS_ISBOUND);
2264 flags |= _SOCONNECT_DID_BIND;
2265 }
2266
2267 /*
2268 * Handle a connect to a name parameter of type AF_UNSPEC like a
2269 * connect to a null address. This is the portable method to
2270 * unconnect a socket.
2271 */
2272 if ((namelen >= sizeof (sa_family_t)) &&
2273 (name->sa_family == AF_UNSPEC)) {
2274 name = NULL;
2275 namelen = 0;
2276 }
2277
2278 /*
2279 * Check that we are not already connected.
2280 * A connection-oriented socket cannot be reconnected.
2281 * A connected connection-less socket can be
2282 * - connected to a different address by a subsequent connect
2283 * - "unconnected" by a connect to the NULL address
2284 */
2285 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2286 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2287 if (so->so_mode & SM_CONNREQUIRED) {
2288 /* Connection-oriented socket */
2289 error = so->so_state & SS_ISCONNECTED ?
2290 EISCONN : EALREADY;
2291 goto done;
2292 }
2293 /* Connection-less socket */
2294 if (name == NULL) {
2295 /*
2296 * Remove the connected state and clear SO_DGRAM_ERRIND
2297 * since it was set when the socket was connected.
2298 * If this is UDP also send down a T_DISCON_REQ.
2299 */
2300 int val;
2301
2302 if ((so->so_family == AF_INET ||
2303 so->so_family == AF_INET6) &&
2304 (so->so_type == SOCK_DGRAM ||
2305 so->so_type == SOCK_RAW) &&
2306 /*CONSTCOND*/
2307 !soconnect_tpi_udp) {
2308 /* XXX What about implicitly unbinding here? */
2309 error = sodisconnect(so, -1,
2310 _SODISCONNECT_LOCK_HELD);
2311 } else {
2312 so->so_state &=
2313 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2314 sti->sti_faddr_valid = 0;
2315 sti->sti_faddr_len = 0;
2316 }
2317
2318 /* Remove SOLOCKED since setsockopt will grab it */
2319 so_unlock_single(so, SOLOCKED);
2320 mutex_exit(&so->so_lock);
2321
2322 val = 0;
2323 (void) sotpi_setsockopt(so, SOL_SOCKET,
2324 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2325 cr);
2326
2327 mutex_enter(&so->so_lock);
2328 so_lock_single(so); /* Set SOLOCKED */
2329 goto done;
2330 }
2331 }
2332 ASSERT(so->so_state & SS_ISBOUND);
2333
2334 if (name == NULL || namelen == 0) {
2335 error = EINVAL;
2336 goto done;
2337 }
2338 /*
2339 * Mark the socket if sti_faddr_sa represents the transport level
2340 * address.
2341 */
2342 if (flags & _SOCONNECT_NOXLATE) {
2343 struct sockaddr_ux *soaddr_ux;
2344
2345 ASSERT(so->so_family == AF_UNIX);
2346 if (namelen != sizeof (struct sockaddr_ux)) {
2347 error = EINVAL;
2348 goto done;
2349 }
2350 soaddr_ux = (struct sockaddr_ux *)name;
2351 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2352 namelen = sizeof (soaddr_ux->sou_addr);
2353 sti->sti_faddr_noxlate = 1;
2354 }
2355
2356 /*
2357 * Length and family checks.
2358 */
2359 error = so_addr_verify(so, name, namelen);
2360 if (error)
2361 goto bad;
2362
2363 /*
2364 * Save foreign address. Needed for AF_UNIX as well as
2365 * transport providers that do not support TI_GETPEERNAME.
2366 * Also used for cached foreign address for TCP and UDP.
2367 */
2368 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2369 error = EINVAL;
2370 goto done;
2371 }
2372 sti->sti_faddr_len = (socklen_t)namelen;
2373 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2374 bcopy(name, sti->sti_faddr_sa, namelen);
2375 sti->sti_faddr_valid = 1;
2376
2377 if (so->so_family == AF_UNIX) {
2378 if (sti->sti_faddr_noxlate) {
2379 /*
2380 * Already have a transport internal address. Do not
2381 * pass any (transport internal) source address.
2382 */
2383 addr = sti->sti_faddr_sa;
2384 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2385 src = NULL;
2386 srclen = 0;
2387 } else {
2388 /*
2389 * Pass the sockaddr_un source address as an option
2390 * and translate the remote address.
2391 * Holding so_lock thus sti_laddr_sa can not change.
2392 */
2393 src = sti->sti_laddr_sa;
2394 srclen = (t_uscalar_t)sti->sti_laddr_len;
2395 dprintso(so, 1,
2396 ("sotpi_connect UNIX: srclen %d, src %p\n",
2397 srclen, src));
2398 error = so_ux_addr_xlate(so,
2399 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2400 (flags & _SOCONNECT_XPG4_2),
2401 &addr, &addrlen);
2402 if (error)
2403 goto bad;
2404 }
2405 } else {
2406 addr = sti->sti_faddr_sa;
2407 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2408 src = NULL;
2409 srclen = 0;
2410 }
2411 /*
2412 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2413 * option which asks the transport provider to send T_UDERR_IND
2414 * messages. These T_UDERR_IND messages are used to return connected
2415 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2416 *
2417 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2418 * we send down a T_CONN_REQ. This is needed to let the
2419 * transport assign a local address that is consistent with
2420 * the remote address. Applications depend on a getsockname()
2421 * after a connect() to retrieve the "source" IP address for
2422 * the connected socket. Invalidate the cached local address
2423 * to force getsockname() to enquire of the transport.
2424 */
2425 if (!(so->so_mode & SM_CONNREQUIRED)) {
2426 /*
2427 * Datagram socket.
2428 */
2429 int32_t val;
2430
2431 so_unlock_single(so, SOLOCKED);
2432 mutex_exit(&so->so_lock);
2433
2434 val = 1;
2435 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2436 &val, (t_uscalar_t)sizeof (val), cr);
2437
2438 mutex_enter(&so->so_lock);
2439 so_lock_single(so); /* Set SOLOCKED */
2440 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2441 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2442 soconnect_tpi_udp) {
2443 soisconnected(so);
2444 goto done;
2445 }
2446 /*
2447 * Send down T_CONN_REQ etc.
2448 * Clear fflag to avoid returning EWOULDBLOCK.
2449 */
2450 fflag = 0;
2451 ASSERT(so->so_family != AF_UNIX);
2452 sti->sti_laddr_valid = 0;
2453 } else if (sti->sti_laddr_len != 0) {
2454 /*
2455 * If the local address or port was "any" then it may be
2456 * changed by the transport as a result of the
2457 * connect. Invalidate the cached version if we have one.
2458 */
2459 switch (so->so_family) {
2460 case AF_INET:
2461 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2462 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2463 INADDR_ANY ||
2464 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2465 sti->sti_laddr_valid = 0;
2466 break;
2467
2468 case AF_INET6:
2469 ASSERT(sti->sti_laddr_len ==
2470 (socklen_t)sizeof (sin6_t));
2471 if (IN6_IS_ADDR_UNSPECIFIED(
2472 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2473 IN6_IS_ADDR_V4MAPPED_ANY(
2474 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2475 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2476 sti->sti_laddr_valid = 0;
2477 break;
2478
2479 default:
2480 break;
2481 }
2482 }
2483
2484 /*
2485 * Check for failure of an earlier call
2486 */
2487 if (so->so_error != 0)
2488 goto so_bad;
2489
2490 /*
2491 * Send down T_CONN_REQ. Message was allocated above.
2492 */
2493 conn_req.PRIM_type = T_CONN_REQ;
2494 conn_req.DEST_length = addrlen;
2495 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2496 if (srclen == 0) {
2497 conn_req.OPT_length = 0;
2498 conn_req.OPT_offset = 0;
2499 soappendmsg(mp, &conn_req, sizeof (conn_req));
2500 soappendmsg(mp, addr, addrlen);
2501 } else {
		/*
		 * There is an AF_UNIX sockaddr_un to include as a source
		 * address option.
		 */
2506 struct T_opthdr toh;
2507
2508 toh.level = SOL_SOCKET;
2509 toh.name = SO_SRCADDR;
2510 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2511 toh.status = 0;
2512 conn_req.OPT_length =
2513 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2514 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2515 _TPI_ALIGN_TOPT(addrlen));
2516
2517 soappendmsg(mp, &conn_req, sizeof (conn_req));
2518 soappendmsg(mp, addr, addrlen);
2519 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2520 soappendmsg(mp, &toh, sizeof (toh));
2521 soappendmsg(mp, src, srclen);
2522 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2523 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2524 }
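	/*
	 * The resulting T_CONN_REQ is laid out as follows (sketch):
	 *
	 *	struct T_conn_req
	 *	destination address	(addrlen bytes, padded out to
	 *				 _TPI_ALIGN_TOPT(addrlen))
	 *	struct T_opthdr		(SOL_SOCKET, SO_SRCADDR; only
	 *				 present when srclen != 0)
	 *	source sockaddr_un	(srclen bytes, padded out to
	 *				 _TPI_ALIGN_TOPT(srclen))
	 */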
2525 /*
2526 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2527 * in order to have the right state when the T_CONN_CON shows up.
2528 */
2529 soisconnecting(so);
2530 mutex_exit(&so->so_lock);
2531
2532 if (AU_AUDITING())
2533 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2534
2535 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2536 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2537 mp = NULL;
2538 mutex_enter(&so->so_lock);
2539 if (error != 0)
2540 goto bad;
2541
2542 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2543 goto bad;
2544
2545 /* Allow other threads to access the socket */
2546 so_unlock_single(so, SOLOCKED);
2547 need_unlock = B_FALSE;
2548
2549 /*
2550 * Wait until we get a T_CONN_CON or an error
2551 */
2552 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2553 so_lock_single(so); /* Set SOLOCKED */
2554 need_unlock = B_TRUE;
2555 }
2556
2557 done:
2558 freemsg(mp);
2559 switch (error) {
2560 case EINPROGRESS:
2561 case EALREADY:
2562 case EISCONN:
2563 case EINTR:
2564 /* Non-fatal errors */
2565 sti->sti_laddr_valid = 0;
2566 /* FALLTHRU */
2567 case 0:
2568 break;
2569 default:
2570 ASSERT(need_unlock);
2571 /*
2572 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2573 * and invalidate local-address cache
2574 */
2575 so->so_state &= ~SS_ISCONNECTING;
2576 sti->sti_laddr_valid = 0;
2577 /* A discon_ind might have already unbound us */
2578 if ((flags & _SOCONNECT_DID_BIND) &&
2579 (so->so_state & SS_ISBOUND)) {
2580 int err;
2581
2582 err = sotpi_unbind(so, 0);
2583 /* LINTED - statement has no conseq */
2584 if (err) {
2585 eprintsoline(so, err);
2586 }
2587 }
2588 break;
2589 }
2590 if (need_unlock)
2591 so_unlock_single(so, SOLOCKED);
2592 mutex_exit(&so->so_lock);
2593 return (error);
2594
2595 so_bad: error = sogeterr(so, B_TRUE);
2596 bad: eprintsoline(so, error);
2597 goto done;
2598 }
2599
2600 /* ARGSUSED */
2601 int
sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2603 {
2604 struct T_ordrel_req ordrel_req;
2605 mblk_t *mp;
2606 uint_t old_state, state_change;
2607 int error = 0;
2608 sotpi_info_t *sti = SOTOTPI(so);
2609
2610 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2611 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2612
2613 mutex_enter(&so->so_lock);
2614 so_lock_single(so); /* Set SOLOCKED */
2615
	/*
	 * SunOS 4.X has no check for datagram sockets.
	 * 5.X checks that the socket is connected (ENOTCONN).
	 * X/Open requires that we check the connected state.
	 */
2621 if (!(so->so_state & SS_ISCONNECTED)) {
2622 if (!xnet_skip_checks) {
2623 error = ENOTCONN;
2624 if (xnet_check_print) {
2625 printf("sockfs: X/Open shutdown check "
2626 "caused ENOTCONN\n");
2627 }
2628 }
2629 goto done;
2630 }
2631 /*
2632 * Record the current state and then perform any state changes.
2633 * Then use the difference between the old and new states to
2634 * determine which messages need to be sent.
2635 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2636 * duplicate calls to shutdown().
2637 */
2638 old_state = so->so_state;
2639
2640 switch (how) {
2641 case 0:
2642 socantrcvmore(so);
2643 break;
2644 case 1:
2645 socantsendmore(so);
2646 break;
2647 case 2:
2648 socantsendmore(so);
2649 socantrcvmore(so);
2650 break;
2651 default:
2652 error = EINVAL;
2653 goto done;
2654 }
2655
2656 /*
2657 * Assumes that the SS_CANT* flags are never cleared in the above code.
2658 */
2659 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2660 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2661 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
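	/*
	 * For example, a socket that was already shut down for reading
	 * (old_state contains SS_CANTRCVMORE) and is now shut down with
	 * how == 2 yields state_change == SS_CANTSENDMORE, so only the
	 * write-side teardown below is performed.
	 */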
2662
2663 switch (state_change) {
2664 case 0:
2665 dprintso(so, 1,
2666 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2667 so->so_state));
2668 goto done;
2669
2670 case SS_CANTRCVMORE:
2671 mutex_exit(&so->so_lock);
2672 strseteof(SOTOV(so), 1);
2673 /*
2674 * strseteof takes care of read side wakeups,
2675 * pollwakeups, and signals.
2676 */
2677 /*
2678 * Get the read lock before flushing data to avoid problems
2679 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2680 */
2681 mutex_enter(&so->so_lock);
2682 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2683 mutex_exit(&so->so_lock);
2684
2685 /* Flush read side queue */
2686 strflushrq(SOTOV(so), FLUSHALL);
2687
2688 mutex_enter(&so->so_lock);
2689 so_unlock_read(so); /* Clear SOREADLOCKED */
2690 break;
2691
2692 case SS_CANTSENDMORE:
2693 mutex_exit(&so->so_lock);
2694 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2695 mutex_enter(&so->so_lock);
2696 break;
2697
2698 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2699 mutex_exit(&so->so_lock);
2700 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2701 strseteof(SOTOV(so), 1);
2702 /*
2703 * strseteof takes care of read side wakeups,
2704 * pollwakeups, and signals.
2705 */
2706 /*
2707 * Get the read lock before flushing data to avoid problems
2708 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2709 */
2710 mutex_enter(&so->so_lock);
2711 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2712 mutex_exit(&so->so_lock);
2713
2714 /* Flush read side queue */
2715 strflushrq(SOTOV(so), FLUSHALL);
2716
2717 mutex_enter(&so->so_lock);
2718 so_unlock_read(so); /* Clear SOREADLOCKED */
2719 break;
2720 }
2721
2722 ASSERT(MUTEX_HELD(&so->so_lock));
2723
2724 /*
2725 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2726 * was set due to this call and the new state has both of them set:
2727 * Send the AF_UNIX close indication
2728 * For T_COTS send a discon_ind
2729 *
2730 * If cantsend was set due to this call:
2731 * For T_COTSORD send an ordrel_ind
2732 *
2733 * Note that for T_CLTS there is no message sent here.
2734 */
2735 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2736 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2737 /*
2738 * For SunOS 4.X compatibility we tell the other end
2739 * that we are unable to receive at this point.
2740 */
2741 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2742 so_unix_close(so);
2743
2744 if (sti->sti_serv_type == T_COTS)
2745 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2746 }
2747 if ((state_change & SS_CANTSENDMORE) &&
2748 (sti->sti_serv_type == T_COTS_ORD)) {
2749 /* Send an orderly release */
2750 ordrel_req.PRIM_type = T_ORDREL_REQ;
2751
2752 mutex_exit(&so->so_lock);
2753 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2754 0, _ALLOC_SLEEP, cr);
2755 /*
2756 * Send down the T_ORDREL_REQ even if there is flow control.
2757 * This prevents shutdown from blocking.
2758 * Note that there is no T_OK_ACK for ordrel_req.
2759 */
2760 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2761 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2762 mutex_enter(&so->so_lock);
2763 if (error) {
2764 eprintsoline(so, error);
2765 goto done;
2766 }
2767 }
2768
2769 done:
2770 so_unlock_single(so, SOLOCKED);
2771 mutex_exit(&so->so_lock);
2772 return (error);
2773 }
2774
2775 /*
2776 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2777 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2778 * that we have closed.
2779 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2780 * T_UNITDATA_REQ containing the same option.
2781 *
 * For SOCK_DGRAM half-connections (somebody connected to this end
 * but this end is not connected) we don't know where to send any
 * SO_UNIX_CLOSE.
2785 *
2786 * We have to ignore stream head errors just in case there has been
2787 * a shutdown(output).
2788 * Ignore any flow control to try to get the message more quickly to the peer.
2789 * While locally ignoring flow control solves the problem when there
2790 * is only the loopback transport on the stream it would not provide
2791 * the correct AF_UNIX socket semantics when one or more modules have
2792 * been pushed.
2793 */
2794 void
so_unix_close(struct sonode *so)
2796 {
2797 int error;
2798 struct T_opthdr toh;
2799 mblk_t *mp;
2800 sotpi_info_t *sti = SOTOTPI(so);
2801
2802 ASSERT(MUTEX_HELD(&so->so_lock));
2803
2804 ASSERT(so->so_family == AF_UNIX);
2805
2806 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2807 (SS_ISCONNECTED|SS_ISBOUND))
2808 return;
2809
2810 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2811 (void *)so, pr_state(so->so_state, so->so_mode)));
2812
2813 toh.level = SOL_SOCKET;
2814 toh.name = SO_UNIX_CLOSE;
2815
2816 /* zero length + header */
2817 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2818 toh.status = 0;
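	/*
	 * The option carries no value; its mere presence (toh.len equals
	 * sizeof (struct T_opthdr)) tells the peer that this end has closed.
	 */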
2819
2820 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2821 struct T_optdata_req tdr;
2822
2823 tdr.PRIM_type = T_OPTDATA_REQ;
2824 tdr.DATA_flag = 0;
2825
2826 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2827 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2828
2829 /* NOTE: holding so_lock while sleeping */
2830 mp = soallocproto2(&tdr, sizeof (tdr),
2831 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2832 } else {
2833 struct T_unitdata_req tudr;
2834 void *addr;
2835 socklen_t addrlen;
2836 void *src;
2837 socklen_t srclen;
2838 struct T_opthdr toh2;
2839 t_scalar_t size;
2840
		/* Connected DGRAM socket */
2842
2843 /*
2844 * For AF_UNIX the destination address is translated to
2845 * an internal name and the source address is passed as
2846 * an option.
2847 */
2848 /*
2849 * Length and family checks.
2850 */
2851 error = so_addr_verify(so, sti->sti_faddr_sa,
2852 (t_uscalar_t)sti->sti_faddr_len);
2853 if (error) {
2854 eprintsoline(so, error);
2855 return;
2856 }
2857 if (sti->sti_faddr_noxlate) {
2858 /*
2859 * Already have a transport internal address. Do not
2860 * pass any (transport internal) source address.
2861 */
2862 addr = sti->sti_faddr_sa;
2863 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2864 src = NULL;
2865 srclen = 0;
2866 } else {
2867 /*
2868 * Pass the sockaddr_un source address as an option
2869 * and translate the remote address.
2870 * Holding so_lock thus sti_laddr_sa can not change.
2871 */
2872 src = sti->sti_laddr_sa;
2873 srclen = (socklen_t)sti->sti_laddr_len;
2874 dprintso(so, 1,
2875 ("so_ux_close: srclen %d, src %p\n",
2876 srclen, src));
2877 error = so_ux_addr_xlate(so,
2878 sti->sti_faddr_sa,
2879 (socklen_t)sti->sti_faddr_len, 0,
2880 &addr, &addrlen);
2881 if (error) {
2882 eprintsoline(so, error);
2883 return;
2884 }
2885 }
2886 tudr.PRIM_type = T_UNITDATA_REQ;
2887 tudr.DEST_length = addrlen;
2888 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2889 if (srclen == 0) {
2890 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2891 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2892 _TPI_ALIGN_TOPT(addrlen));
2893
2894 size = tudr.OPT_offset + tudr.OPT_length;
2895 /* NOTE: holding so_lock while sleeping */
2896 mp = soallocproto2(&tudr, sizeof (tudr),
2897 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2898 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2899 soappendmsg(mp, &toh, sizeof (toh));
2900 } else {
			/*
			 * There is an AF_UNIX sockaddr_un to include as a
			 * source address option.
			 */
2905 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2906 _TPI_ALIGN_TOPT(srclen));
2907 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2908 _TPI_ALIGN_TOPT(addrlen));
2909
2910 toh2.level = SOL_SOCKET;
2911 toh2.name = SO_SRCADDR;
2912 toh2.len = (t_uscalar_t)(srclen +
2913 sizeof (struct T_opthdr));
2914 toh2.status = 0;
2915
2916 size = tudr.OPT_offset + tudr.OPT_length;
2917
2918 /* NOTE: holding so_lock while sleeping */
2919 mp = soallocproto2(&tudr, sizeof (tudr),
2920 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2921 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2922 soappendmsg(mp, &toh, sizeof (toh));
2923 soappendmsg(mp, &toh2, sizeof (toh2));
2924 soappendmsg(mp, src, srclen);
2925 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2926 }
2927 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2928 }
2929 mutex_exit(&so->so_lock);
2930 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2931 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2932 mutex_enter(&so->so_lock);
2933 }
2934
2935 /*
2936 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2937 * In addition, the caller typically verifies that there is some
2938 * potential state to clear by checking
2939 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2940 * before calling this routine.
2941 * Note that such a check can be made without holding so_lock since
2942 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2943 * decrements sti_oobsigcnt.
2944 *
2945 * When data is read *after* the point that all pending
2946 * oob data has been consumed the oob indication is cleared.
2947 *
2948 * This logic keeps select/poll returning POLLRDBAND and
2949 * SIOCATMARK returning true until we have read past
2950 * the mark.
2951 */
2952 static void
sorecv_update_oobstate(struct sonode *so)
2954 {
2955 sotpi_info_t *sti = SOTOTPI(so);
2956
2957 mutex_enter(&so->so_lock);
2958 ASSERT(so_verify_oobstate(so));
2959 dprintso(so, 1,
2960 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2961 sti->sti_oobsigcnt,
2962 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2963 if (sti->sti_oobsigcnt == 0) {
2964 /* No more pending oob indications */
2965 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2966 freemsg(so->so_oobmsg);
2967 so->so_oobmsg = NULL;
2968 }
2969 ASSERT(so_verify_oobstate(so));
2970 mutex_exit(&so->so_lock);
2971 }
2972
2973 /*
2974 * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2975 */
2976 static int
nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2978 {
2979 sotpi_info_t *sti = SOTOTPI(so);
2980 int error = 0;
2981 mblk_t *tmp = NULL;
2982 mblk_t *pmp = NULL;
2983 mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2984
2985 ASSERT(nmp != NULL);
2986
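	/*
	 * Walk the chain of mblks saved by NL7C: nmp is the current mblk,
	 * pmp remembers the last fully consumed M_DATA mblk (so the
	 * consumed prefix can be freed below), and tmp is the tail of the
	 * non-data chain handed back to the caller via *rmp.
	 */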
2987 while (nmp != NULL && uiop->uio_resid > 0) {
2988 ssize_t n;
2989
2990 if (DB_TYPE(nmp) == M_DATA) {
2991 /*
2992 * We have some data, uiomove up to resid bytes.
2993 */
2994 n = MIN(MBLKL(nmp), uiop->uio_resid);
2995 if (n > 0)
2996 error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2997 nmp->b_rptr += n;
2998 if (nmp->b_rptr == nmp->b_wptr) {
2999 pmp = nmp;
3000 nmp = nmp->b_cont;
3001 }
3002 if (error)
3003 break;
		} else {
			/*
			 * We only handle M_DATA; save anything else for the
			 * caller to handle.
			 */
3008 if (pmp != NULL) {
3009 pmp->b_cont = nmp->b_cont;
3010 }
3011 nmp->b_cont = NULL;
3012 if (*rmp == NULL) {
3013 *rmp = nmp;
3014 } else {
3015 tmp->b_cont = nmp;
3016 }
3017 nmp = nmp->b_cont;
3018 tmp = nmp;
3019 }
3020 }
3021 if (pmp != NULL) {
3022 /* Free any mblk_t(s) which we have consumed */
3023 pmp->b_cont = NULL;
3024 freemsg(sti->sti_nl7c_rcv_mp);
3025 }
3026 if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3027 /* Last mblk_t so return the saved kstrgetmsg() rval/error */
3028 if (error == 0) {
3029 rval_t *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3030
3031 error = p->r_v.r_v2;
3032 p->r_v.r_v2 = 0;
3033 }
3034 rp->r_vals = sti->sti_nl7c_rcv_rval;
3035 sti->sti_nl7c_rcv_rval = 0;
3036 } else {
3037 /* More mblk_t(s) to process so no rval to return */
3038 rp->r_vals = 0;
3039 }
3040 return (error);
3041 }
3042 /*
3043 * Receive the next message on the queue.
3044 * If msg_controllen is non-zero when called the caller is interested in
3045 * any received control info (options).
3046 * If msg_namelen is non-zero when called the caller is interested in
3047 * any received source address.
3048 * The routine returns with msg_control and msg_name pointing to
3049 * kmem_alloc'ed memory which the caller has to free.
3050 */
3051 /* ARGSUSED */
3052 int
sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3054 struct cred *cr)
3055 {
3056 union T_primitives *tpr;
3057 mblk_t *mp;
3058 uchar_t pri;
3059 int pflag, opflag;
3060 void *control;
3061 t_uscalar_t controllen;
3062 t_uscalar_t namelen;
3063 int so_state = so->so_state; /* Snapshot */
3064 ssize_t saved_resid;
3065 rval_t rval;
3066 int flags;
3067 clock_t timout;
3068 int error = 0;
3069 sotpi_info_t *sti = SOTOTPI(so);
3070
3071 flags = msg->msg_flags;
3072 msg->msg_flags = 0;
3073
3074 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3075 (void *)so, (void *)msg, flags,
3076 pr_state(so->so_state, so->so_mode), so->so_error));
3077
3078 if (so->so_version == SOV_STREAM) {
3079 so_update_attrs(so, SOACC);
3080 /* The imaginary "sockmod" has been popped - act as a stream */
3081 return (strread(SOTOV(so), uiop, cr));
3082 }
3083
3084 /*
3085 * If we are not connected because we have never been connected
3086 * we return ENOTCONN. If we have been connected (but are no longer
3087 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3088 * the EOF.
3089 *
3090 * An alternative would be to post an ENOTCONN error in stream head
3091 * (read+write) and clear it when we're connected. However, that error
3092 * would cause incorrect poll/select behavior!
3093 */
3094 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3095 (so->so_mode & SM_CONNREQUIRED)) {
3096 return (ENOTCONN);
3097 }
3098
3099 /*
3100 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3101 * after checking that the read queue is empty) and returns zero.
3102 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3103 * is zero.
3104 */
3105
3106 if (flags & MSG_OOB) {
3107 /* Check that the transport supports OOB */
3108 if (!(so->so_mode & SM_EXDATA))
3109 return (EOPNOTSUPP);
3110 so_update_attrs(so, SOACC);
3111 return (sorecvoob(so, msg, uiop, flags,
3112 (so->so_options & SO_OOBINLINE)));
3113 }
3114
3115 so_update_attrs(so, SOACC);
3116
3117 /*
3118 * Set msg_controllen and msg_namelen to zero here to make it
3119 * simpler in the cases that no control or name is returned.
3120 */
3121 controllen = msg->msg_controllen;
3122 namelen = msg->msg_namelen;
3123 msg->msg_controllen = 0;
3124 msg->msg_namelen = 0;
3125
3126 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3127 namelen, controllen));
3128
3129 mutex_enter(&so->so_lock);
3130 /*
3131 * If an NL7C enabled socket and not waiting for write data.
3132 */
3133 if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3134 NL7C_ENABLED) {
3135 if (sti->sti_nl7c_uri) {
3136 /* Close uri processing for a previous request */
3137 nl7c_close(so);
3138 }
3139 if ((so_state & SS_CANTRCVMORE) &&
3140 sti->sti_nl7c_rcv_mp == NULL) {
3141 /* Nothing to process, EOF */
3142 mutex_exit(&so->so_lock);
3143 return (0);
3144 } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3145 /* Persistent NL7C socket, try to process request */
3146 boolean_t ret;
3147
3148 ret = nl7c_process(so,
3149 (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3150 rval.r_vals = sti->sti_nl7c_rcv_rval;
3151 error = rval.r_v.r_v2;
3152 if (error) {
3153 /* Error of some sort, return it */
3154 mutex_exit(&so->so_lock);
3155 return (error);
3156 }
3157 if (sti->sti_nl7c_flags &&
3158 ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3159 /*
3160 * Still an NL7C socket and no data
3161 * to pass up to the caller.
3162 */
3163 mutex_exit(&so->so_lock);
3164 if (ret) {
3165 /* EOF */
3166 return (0);
3167 } else {
3168 /* Need more data */
3169 return (EAGAIN);
3170 }
3171 }
3172 } else {
3173 /*
3174 * Not persistent so no further NL7C processing.
3175 */
3176 sti->sti_nl7c_flags = 0;
3177 }
3178 }
3179 /*
3180 * Only one reader is allowed at any given time. This is needed
3181 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3182 *
	 * This is slightly different from BSD behavior in that it fails with
3184 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3185 * is single-threaded using sblock(), which is dropped while waiting
3186 * for data to appear. The difference shows up e.g. if one
3187 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3188 * does use nonblocking io and different threads are reading each
3189 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3190 * in this case as long as the read queue doesn't get empty.
3191 * In this implementation the thread using nonblocking io can
3192 * get an EWOULDBLOCK error due to the blocking thread executing
3193 * e.g. in the uiomove in kstrgetmsg.
3194 * This difference is not believed to be significant.
3195 */
3196 /* Set SOREADLOCKED */
3197 error = so_lock_read_intr(so,
3198 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3199 mutex_exit(&so->so_lock);
3200 if (error)
3201 return (error);
3202
3203 /*
3204 * Tell kstrgetmsg to not inspect the stream head errors until all
3205 * queued data has been consumed.
3206 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3207 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3208 *
3209 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3210 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3211 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3212 */
3213 pflag = MSG_ANY | MSG_DELAYERROR;
3214 if (flags & MSG_PEEK) {
3215 pflag |= MSG_IPEEK;
3216 flags &= ~MSG_WAITALL;
3217 }
3218 if (so->so_mode & SM_ATOMIC)
3219 pflag |= MSG_DISCARDTAIL;
3220
3221 if (flags & MSG_DONTWAIT)
3222 timout = 0;
3223 else
3224 timout = -1;
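	/* Remember the base flags so MSG_WAITALL retries can restore them. */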
3225 opflag = pflag;
3226 retry:
3227 saved_resid = uiop->uio_resid;
3228 pri = 0;
3229 mp = NULL;
3230 if (sti->sti_nl7c_rcv_mp != NULL) {
3231 /* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3232 error = nl7c_sorecv(so, &mp, uiop, &rval);
3233 } else {
3234 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3235 timout, &rval);
3236 }
3237 if (error != 0) {
3238 /* kstrgetmsg returns ETIME when timeout expires */
3239 if (error == ETIME)
3240 error = EWOULDBLOCK;
3241 goto out;
3242 }
3243 /*
3244 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3245 * For non-datagrams MOREDATA is used to set MSG_EOR.
3246 */
3247 ASSERT(!(rval.r_val1 & MORECTL));
3248 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3249 msg->msg_flags |= MSG_TRUNC;
3250
3251 if (mp == NULL) {
3252 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
		/*
		 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
		 * The draft POSIX socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
3258 if ((so->so_state &
3259 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3260 (uiop->uio_resid != saved_resid) &&
3261 !(flags & MSG_PEEK)) {
3262 sorecv_update_oobstate(so);
3263 }
3264
3265 mutex_enter(&so->so_lock);
3266 /* Set MSG_EOR based on MOREDATA */
3267 if (!(rval.r_val1 & MOREDATA)) {
3268 if (so->so_state & SS_SAVEDEOR) {
3269 msg->msg_flags |= MSG_EOR;
3270 so->so_state &= ~SS_SAVEDEOR;
3271 }
3272 }
3273 /*
3274 * If some data was received (i.e. not EOF) and the
3275 * read/recv* has not been satisfied wait for some more.
3276 */
3277 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3278 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3279 mutex_exit(&so->so_lock);
3280 pflag = opflag | MSG_NOMARK;
3281 goto retry;
3282 }
3283 goto out_locked;
3284 }
3285
3286 /* strsock_proto has already verified length and alignment */
3287 tpr = (union T_primitives *)mp->b_rptr;
3288 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3289
3290 switch (tpr->type) {
3291 case T_DATA_IND: {
3292 if ((so->so_state &
3293 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3294 (uiop->uio_resid != saved_resid) &&
3295 !(flags & MSG_PEEK)) {
3296 sorecv_update_oobstate(so);
3297 }
3298
3299 /*
3300 * Set msg_flags to MSG_EOR based on
3301 * MORE_flag and MOREDATA.
3302 */
3303 mutex_enter(&so->so_lock);
3304 so->so_state &= ~SS_SAVEDEOR;
3305 if (!(tpr->data_ind.MORE_flag & 1)) {
3306 if (!(rval.r_val1 & MOREDATA))
3307 msg->msg_flags |= MSG_EOR;
3308 else
3309 so->so_state |= SS_SAVEDEOR;
3310 }
3311 freemsg(mp);
3312 /*
3313 * If some data was received (i.e. not EOF) and the
3314 * read/recv* has not been satisfied wait for some more.
3315 */
3316 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3317 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3318 mutex_exit(&so->so_lock);
3319 pflag = opflag | MSG_NOMARK;
3320 goto retry;
3321 }
3322 goto out_locked;
3323 }
3324 case T_UNITDATA_IND: {
3325 void *addr;
3326 t_uscalar_t addrlen;
3327 void *abuf;
3328 t_uscalar_t optlen;
3329 void *opt;
3330
3331 if ((so->so_state &
3332 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3333 (uiop->uio_resid != saved_resid) &&
3334 !(flags & MSG_PEEK)) {
3335 sorecv_update_oobstate(so);
3336 }
3337
3338 if (namelen != 0) {
3339 /* Caller wants source address */
3340 addrlen = tpr->unitdata_ind.SRC_length;
3341 addr = sogetoff(mp,
3342 tpr->unitdata_ind.SRC_offset,
3343 addrlen, 1);
3344 if (addr == NULL) {
3345 freemsg(mp);
3346 error = EPROTO;
3347 eprintsoline(so, error);
3348 goto out;
3349 }
3350 if (so->so_family == AF_UNIX) {
				/*
				 * Cannot use the transport-level address.
				 * If there is a SO_SRCADDR option carrying
				 * the socket-level address it will be
				 * extracted below.
				 */
3357 addr = NULL;
3358 addrlen = 0;
3359 }
3360 }
3361 optlen = tpr->unitdata_ind.OPT_length;
3362 if (optlen != 0) {
3363 t_uscalar_t ncontrollen;
3364
3365 /*
3366 * Extract any source address option.
3367 * Determine how large cmsg buffer is needed.
3368 */
3369 opt = sogetoff(mp,
3370 tpr->unitdata_ind.OPT_offset,
3371 optlen, __TPI_ALIGN_SIZE);
3372
3373 if (opt == NULL) {
3374 freemsg(mp);
3375 error = EPROTO;
3376 eprintsoline(so, error);
3377 goto out;
3378 }
3379 if (so->so_family == AF_UNIX)
3380 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3381 ncontrollen = so_cmsglen(mp, opt, optlen,
3382 !(flags & MSG_XPG4_2));
3383 if (controllen != 0)
3384 controllen = ncontrollen;
3385 else if (ncontrollen != 0)
3386 msg->msg_flags |= MSG_CTRUNC;
3387 } else {
3388 controllen = 0;
3389 }
3390
3391 if (namelen != 0) {
3392 /*
3393 * Return address to caller.
3394 * Caller handles truncation if length
3395 * exceeds msg_namelen.
3396 * NOTE: AF_UNIX NUL termination is ensured by
3397 * the sender's copyin_name().
3398 */
3399 abuf = kmem_alloc(addrlen, KM_SLEEP);
3400
3401 bcopy(addr, abuf, addrlen);
3402 msg->msg_name = abuf;
3403 msg->msg_namelen = addrlen;
3404 }
3405
3406 if (controllen != 0) {
3407 /*
3408 * Return control msg to caller.
3409 * Caller handles truncation if length
3410 * exceeds msg_controllen.
3411 */
3412 control = kmem_zalloc(controllen, KM_SLEEP);
3413
3414 error = so_opt2cmsg(mp, opt, optlen,
3415 !(flags & MSG_XPG4_2),
3416 control, controllen);
3417 if (error) {
3418 freemsg(mp);
3419 if (msg->msg_namelen != 0)
3420 kmem_free(msg->msg_name,
3421 msg->msg_namelen);
3422 kmem_free(control, controllen);
3423 eprintsoline(so, error);
3424 goto out;
3425 }
3426 msg->msg_control = control;
3427 msg->msg_controllen = controllen;
3428 }
3429
3430 freemsg(mp);
3431 goto out;
3432 }
3433 case T_OPTDATA_IND: {
3434 struct T_optdata_req *tdr;
3435 void *opt;
3436 t_uscalar_t optlen;
3437
3438 if ((so->so_state &
3439 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3440 (uiop->uio_resid != saved_resid) &&
3441 !(flags & MSG_PEEK)) {
3442 sorecv_update_oobstate(so);
3443 }
3444
3445 tdr = (struct T_optdata_req *)mp->b_rptr;
3446 optlen = tdr->OPT_length;
3447 if (optlen != 0) {
3448 t_uscalar_t ncontrollen;
3449 /*
3450 * Determine how large cmsg buffer is needed.
3451 */
3452 opt = sogetoff(mp,
3453 tpr->optdata_ind.OPT_offset,
3454 optlen, __TPI_ALIGN_SIZE);
3455
3456 if (opt == NULL) {
3457 freemsg(mp);
3458 error = EPROTO;
3459 eprintsoline(so, error);
3460 goto out;
3461 }
3462
3463 ncontrollen = so_cmsglen(mp, opt, optlen,
3464 !(flags & MSG_XPG4_2));
3465 if (controllen != 0)
3466 controllen = ncontrollen;
3467 else if (ncontrollen != 0)
3468 msg->msg_flags |= MSG_CTRUNC;
3469 } else {
3470 controllen = 0;
3471 }
3472
3473 if (controllen != 0) {
3474 /*
3475 * Return control msg to caller.
3476 * Caller handles truncation if length
3477 * exceeds msg_controllen.
3478 */
3479 control = kmem_zalloc(controllen, KM_SLEEP);
3480
3481 error = so_opt2cmsg(mp, opt, optlen,
3482 !(flags & MSG_XPG4_2),
3483 control, controllen);
3484 if (error) {
3485 freemsg(mp);
3486 kmem_free(control, controllen);
3487 eprintsoline(so, error);
3488 goto out;
3489 }
3490 msg->msg_control = control;
3491 msg->msg_controllen = controllen;
3492 }
3493
3494 /*
3495 * Set msg_flags to MSG_EOR based on
3496 * DATA_flag and MOREDATA.
3497 */
3498 mutex_enter(&so->so_lock);
3499 so->so_state &= ~SS_SAVEDEOR;
3500 if (!(tpr->data_ind.MORE_flag & 1)) {
3501 if (!(rval.r_val1 & MOREDATA))
3502 msg->msg_flags |= MSG_EOR;
3503 else
3504 so->so_state |= SS_SAVEDEOR;
3505 }
3506 freemsg(mp);
3507 /*
3508 * If some data was received (i.e. not EOF) and the
3509 * read/recv* has not been satisfied wait for some more.
3510 * Not possible to wait if control info was received.
3511 */
3512 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3513 controllen == 0 &&
3514 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3515 mutex_exit(&so->so_lock);
3516 pflag = opflag | MSG_NOMARK;
3517 goto retry;
3518 }
3519 goto out_locked;
3520 }
3521 case T_EXDATA_IND: {
3522 dprintso(so, 1,
3523 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3524 "state %s\n",
3525 sti->sti_oobsigcnt, sti->sti_oobcnt,
3526 saved_resid - uiop->uio_resid,
3527 pr_state(so->so_state, so->so_mode)));
3528 /*
3529 * kstrgetmsg handles MSGMARK so there is nothing to
3530 * inspect in the T_EXDATA_IND.
3531 * strsock_proto makes the stream head queue the T_EXDATA_IND
3532 * as a separate message with no M_DATA component. Furthermore,
3533 * the stream head does not consolidate M_DATA messages onto
3534 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3535 * remains a message by itself. This is needed since MSGMARK
3536 * marks both the whole message as well as the last byte
3537 * of the message.
3538 */
3539 freemsg(mp);
3540 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3541 if (flags & MSG_PEEK) {
3542 /*
3543 * Even though we are peeking we consume the
3544 * T_EXDATA_IND thereby moving the mark information
3545 * to SS_RCVATMARK. Then the oob code below will
3546 * retry the peeking kstrgetmsg.
3547 * Note that the stream head read queue is
3548 * never flushed without holding SOREADLOCKED
3549 * thus the T_EXDATA_IND can not disappear
3550 * underneath us.
3551 */
3552 dprintso(so, 1,
3553 ("sotpi_recvmsg: consume EXDATA_IND "
3554 "counts %d/%d state %s\n",
3555 sti->sti_oobsigcnt,
3556 sti->sti_oobcnt,
3557 pr_state(so->so_state, so->so_mode)));
3558
3559 pflag = MSG_ANY | MSG_DELAYERROR;
3560 if (so->so_mode & SM_ATOMIC)
3561 pflag |= MSG_DISCARDTAIL;
3562
3563 pri = 0;
3564 mp = NULL;
3565
3566 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3567 &pri, &pflag, (clock_t)-1, &rval);
3568 ASSERT(uiop->uio_resid == saved_resid);
3569
3570 if (error) {
3571 #ifdef SOCK_DEBUG
3572 if (error != EWOULDBLOCK && error != EINTR) {
3573 eprintsoline(so, error);
3574 }
3575 #endif /* SOCK_DEBUG */
3576 goto out;
3577 }
3578 ASSERT(mp);
3579 tpr = (union T_primitives *)mp->b_rptr;
3580 ASSERT(tpr->type == T_EXDATA_IND);
3581 freemsg(mp);
3582 } /* end "if (flags & MSG_PEEK)" */
3583
3584 /*
3585 * Decrement the number of queued and pending oob.
3586 *
3587 * SS_RCVATMARK is cleared when we read past a mark.
3588 * SS_HAVEOOBDATA is cleared when we've read past the
3589 * last mark.
3590 * SS_OOBPEND is cleared if we've read past the last
3591 * mark and no (new) SIGURG has been posted.
3592 */
3593 mutex_enter(&so->so_lock);
3594 ASSERT(so_verify_oobstate(so));
3595 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3596 ASSERT(sti->sti_oobsigcnt > 0);
3597 sti->sti_oobsigcnt--;
3598 ASSERT(sti->sti_oobcnt > 0);
3599 sti->sti_oobcnt--;
		/*
		 * The T_EXDATA_IND has been removed from the stream
		 * head, but we have not read data past the mark, so
		 * sockfs needs to track that the socket is still at
		 * the mark.
		 *
		 * Since no data was received, call kstrgetmsg again to
		 * wait for data.
		 */
3608 so->so_state |= SS_RCVATMARK;
3609 mutex_exit(&so->so_lock);
3610 dprintso(so, 1,
3611 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3612 sti->sti_oobsigcnt, sti->sti_oobcnt,
3613 pr_state(so->so_state, so->so_mode)));
3614 pflag = opflag;
3615 goto retry;
3616 }
3617 default:
3618 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3619 (void *)so, tpr->type, (void *)mp);
3620 ASSERT(0);
3621 freemsg(mp);
3622 error = EPROTO;
3623 eprintsoline(so, error);
3624 goto out;
3625 }
3626 /* NOTREACHED */
3627 out:
3628 mutex_enter(&so->so_lock);
3629 out_locked:
3630 so_unlock_read(so); /* Clear SOREADLOCKED */
3631 mutex_exit(&so->so_lock);
3632 return (error);
3633 }
3634
3635 /*
3636 * Sending data with options on a datagram socket.
3637 * Assumes caller has verified that SS_ISBOUND etc. are set.
3638 */
3639 static int
sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3641 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3642 {
3643 struct T_unitdata_req tudr;
3644 mblk_t *mp;
3645 int error;
3646 void *addr;
3647 socklen_t addrlen;
3648 void *src;
3649 socklen_t srclen;
3650 ssize_t len;
3651 int size;
3652 struct T_opthdr toh;
3653 struct fdbuf *fdbuf;
3654 t_uscalar_t optlen;
3655 void *fds;
3656 int fdlen;
3657 sotpi_info_t *sti = SOTOTPI(so);
3658
3659 ASSERT(name && namelen);
3660 ASSERT(control && controllen);
3661
3662 len = uiop->uio_resid;
3663 if (len > (ssize_t)sti->sti_tidu_size) {
3664 return (EMSGSIZE);
3665 }
3666
3667 /*
3668 * For AF_UNIX the destination address is translated to an internal
3669 * name and the source address is passed as an option.
3670 * Also, file descriptors are passed as file pointers in an
3671 * option.
3672 */
3673
3674 /*
3675 * Length and family checks.
3676 */
3677 error = so_addr_verify(so, name, namelen);
3678 if (error) {
3679 eprintsoline(so, error);
3680 return (error);
3681 }
3682 if (so->so_family == AF_UNIX) {
3683 if (sti->sti_faddr_noxlate) {
3684 /*
3685 * Already have a transport internal address. Do not
3686 * pass any (transport internal) source address.
3687 */
3688 addr = name;
3689 addrlen = namelen;
3690 src = NULL;
3691 srclen = 0;
3692 } else {
3693 /*
3694 * Pass the sockaddr_un source address as an option
3695 * and translate the remote address.
3696 *
3697 * Note that this code does not prevent sti_laddr_sa
3698 * from changing while it is being used. Thus
3699 * if an unbind+bind occurs concurrently with this
3700 * send the peer might see a partially new and a
3701 * partially old "from" address.
3702 */
3703 src = sti->sti_laddr_sa;
3704 srclen = (t_uscalar_t)sti->sti_laddr_len;
3705 dprintso(so, 1,
3706 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3707 srclen, src));
3708 error = so_ux_addr_xlate(so, name, namelen,
3709 (flags & MSG_XPG4_2),
3710 &addr, &addrlen);
3711 if (error) {
3712 eprintsoline(so, error);
3713 return (error);
3714 }
3715 }
3716 } else {
3717 addr = name;
3718 addrlen = namelen;
3719 src = NULL;
3720 srclen = 0;
3721 }
3722 optlen = so_optlen(control, controllen,
3723 !(flags & MSG_XPG4_2));
3724 tudr.PRIM_type = T_UNITDATA_REQ;
3725 tudr.DEST_length = addrlen;
3726 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3727 if (srclen != 0)
3728 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3729 _TPI_ALIGN_TOPT(srclen));
3730 else
3731 tudr.OPT_length = optlen;
3732 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3733 _TPI_ALIGN_TOPT(addrlen));
3734
3735 size = tudr.OPT_offset + tudr.OPT_length;
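	/*
	 * Message layout from here on (sketch): T_unitdata_req, the
	 * (possibly translated) destination address padded to
	 * _TPI_ALIGN_TOPT(addrlen), then the option area holding an
	 * optional SO_FILEP option (passed file descriptors), an optional
	 * SO_SRCADDR option (the AF_UNIX source address), and finally the
	 * cmsgs converted by so_cmsg2opt().
	 */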
3736
3737 /*
3738 * File descriptors only when SM_FDPASSING set.
3739 */
3740 error = so_getfdopt(control, controllen,
3741 !(flags & MSG_XPG4_2), &fds, &fdlen);
3742 if (error)
3743 return (error);
3744 if (fdlen != -1) {
3745 if (!(so->so_mode & SM_FDPASSING))
3746 return (EOPNOTSUPP);
3747
3748 error = fdbuf_create(fds, fdlen, &fdbuf);
3749 if (error)
3750 return (error);
3751 mp = fdbuf_allocmsg(size, fdbuf);
3752 } else {
3753 mp = soallocproto(size, _ALLOC_INTR, CRED());
3754 if (mp == NULL) {
3755 /*
3756 * Caught a signal waiting for memory.
3757 * Let send* return EINTR.
3758 */
3759 return (EINTR);
3760 }
3761 }
3762 soappendmsg(mp, &tudr, sizeof (tudr));
3763 soappendmsg(mp, addr, addrlen);
3764 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3765
3766 if (fdlen != -1) {
3767 ASSERT(fdbuf != NULL);
3768 toh.level = SOL_SOCKET;
3769 toh.name = SO_FILEP;
3770 toh.len = fdbuf->fd_size +
3771 (t_uscalar_t)sizeof (struct T_opthdr);
3772 toh.status = 0;
3773 soappendmsg(mp, &toh, sizeof (toh));
3774 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3775 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3776 }
3777 if (srclen != 0) {
3778 /*
3779 * There is an AF_UNIX sockaddr_un to include as a source
3780 * address option.
3781 */
3782 toh.level = SOL_SOCKET;
3783 toh.name = SO_SRCADDR;
3784 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3785 toh.status = 0;
3786 soappendmsg(mp, &toh, sizeof (toh));
3787 soappendmsg(mp, src, srclen);
3788 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3789 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3790 }
3791 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3792 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3793 /* At most 3 bytes left in the message */
3794 ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3795 ASSERT(MBLKL(mp) <= (ssize_t)size);
3796
3797 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3798 if (AU_AUDITING())
3799 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3800
3801 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3802 #ifdef SOCK_DEBUG
3803 if (error) {
3804 eprintsoline(so, error);
3805 }
3806 #endif /* SOCK_DEBUG */
3807 return (error);
3808 }
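
/*
 * Illustrative userland counterpart (not part of this file): a minimal,
 * hedged sketch of passing a file descriptor with sendmsg(3SOCKET) over
 * an AF_UNIX datagram socket, which is the case sosend_dgramcmsg() above
 * handles by turning the SCM_RIGHTS control message into a SO_FILEP
 * option.  The names "sock", "fd_to_send" and "dst" are hypothetical.
 *
 *    char byte = 'x';
 *    struct iovec iov;
 *    struct msghdr msg;
 *    union { struct cmsghdr c; char buf[CMSG_SPACE(sizeof (int))]; } u;
 *    struct cmsghdr *cm;
 *
 *    iov.iov_base = &byte;
 *    iov.iov_len = 1;
 *    (void) memset(&msg, 0, sizeof (msg));
 *    msg.msg_name = (void *)&dst;
 *    msg.msg_namelen = sizeof (dst);
 *    msg.msg_iov = &iov;
 *    msg.msg_iovlen = 1;
 *    msg.msg_control = u.buf;
 *    msg.msg_controllen = sizeof (u.buf);
 *    cm = CMSG_FIRSTHDR(&msg);
 *    cm->cmsg_level = SOL_SOCKET;
 *    cm->cmsg_type = SCM_RIGHTS;
 *    cm->cmsg_len = CMSG_LEN(sizeof (int));
 *    (void) memcpy(CMSG_DATA(cm), &fd_to_send, sizeof (int));
 *    (void) sendmsg(sock, &msg, 0);
 */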
3809
3810 /*
3811 * Sending data with options on a connected stream socket.
3812 * Assumes caller has verified that SS_ISCONNECTED is set.
3813 */
3814 static int
3815 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3816 t_uscalar_t controllen, int flags)
3817 {
3818 struct T_optdata_req tdr;
3819 mblk_t *mp;
3820 int error;
3821 ssize_t iosize;
3822 int size;
3823 struct fdbuf *fdbuf;
3824 t_uscalar_t optlen;
3825 void *fds;
3826 int fdlen;
3827 struct T_opthdr toh;
3828 sotpi_info_t *sti = SOTOTPI(so);
3829
3830 dprintso(so, 1,
3831 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3832
3833 /*
3834 * Has to be bound and connected. However, since no locks are
3835 * held, the state could have changed after sotpi_sendmsg checked it;
3836 * thus it is not possible to ASSERT on the state.
3837 */
3838
3839 /* Options on connection-oriented only when SM_OPTDATA set. */
3840 if (!(so->so_mode & SM_OPTDATA))
3841 return (EOPNOTSUPP);
3842
3843 do {
3844 /*
3845 * Set the MORE flag if uio_resid does not fit in this
3846 * message or if the caller passed in "more".
3847 * Error for transports with zero tidu_size.
3848 */
3849 tdr.PRIM_type = T_OPTDATA_REQ;
3850 iosize = sti->sti_tidu_size;
3851 if (iosize <= 0)
3852 return (EMSGSIZE);
3853 if (uiop->uio_resid > iosize) {
3854 tdr.DATA_flag = 1;
3855 } else {
3856 if (more)
3857 tdr.DATA_flag = 1;
3858 else
3859 tdr.DATA_flag = 0;
3860 iosize = uiop->uio_resid;
3861 }
3862 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3863 tdr.DATA_flag, iosize));
3864
3865 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3866 tdr.OPT_length = optlen;
3867 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3868
3869 size = (int)sizeof (tdr) + optlen;
3870 /*
3871 * File descriptors only when SM_FDPASSING set.
3872 */
3873 error = so_getfdopt(control, controllen,
3874 !(flags & MSG_XPG4_2), &fds, &fdlen);
3875 if (error)
3876 return (error);
3877 if (fdlen != -1) {
3878 if (!(so->so_mode & SM_FDPASSING))
3879 return (EOPNOTSUPP);
3880
3881 error = fdbuf_create(fds, fdlen, &fdbuf);
3882 if (error)
3883 return (error);
3884 mp = fdbuf_allocmsg(size, fdbuf);
3885 } else {
3886 mp = soallocproto(size, _ALLOC_INTR, CRED());
3887 if (mp == NULL) {
3888 /*
3889 * Caught a signal waiting for memory.
3890 * Let send* return EINTR.
3891 */
3892 return (EINTR);
3893 }
3894 }
3895 soappendmsg(mp, &tdr, sizeof (tdr));
3896
3897 if (fdlen != -1) {
3898 ASSERT(fdbuf != NULL);
3899 toh.level = SOL_SOCKET;
3900 toh.name = SO_FILEP;
3901 toh.len = fdbuf->fd_size +
3902 (t_uscalar_t)sizeof (struct T_opthdr);
3903 toh.status = 0;
3904 soappendmsg(mp, &toh, sizeof (toh));
3905 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3906 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3907 }
3908 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3909 /* At most 3 bytes left in the message */
3910 ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3911 ASSERT(MBLKL(mp) <= (ssize_t)size);
3912
3913 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3914
3915 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3916 0, MSG_BAND, 0);
3917 if (error) {
3918 eprintsoline(so, error);
3919 return (error);
3920 }
3921 control = NULL;
3922 if (uiop->uio_resid > 0) {
3923 /*
3924 * Recheck for fatal errors. Fail write even though
3925 * some data have been written. This is consistent
3926 * with strwrite semantics and BSD sockets semantics.
3927 */
3928 if (so->so_state & SS_CANTSENDMORE) {
3929 eprintsoline(so, error);
3930 return (EPIPE);
3931 }
3932 if (so->so_error != 0) {
3933 mutex_enter(&so->so_lock);
3934 error = sogeterr(so, B_TRUE);
3935 mutex_exit(&so->so_lock);
3936 if (error != 0) {
3937 eprintsoline(so, error);
3938 return (error);
3939 }
3940 }
3941 }
3942 } while (uiop->uio_resid > 0);
3943 return (0);
3944 }
3945
3946 /*
3947 * Sending data on a datagram socket.
3948 * Assumes caller has verified that SS_ISBOUND etc. are set.
3949 *
3950 * For AF_UNIX the destination address is translated to an internal
3951 * name and the source address is passed as an option.
3952 */
3953 int
3954 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3955 struct uio *uiop, int flags)
3956 {
3957 struct T_unitdata_req tudr;
3958 mblk_t *mp;
3959 int error;
3960 void *addr;
3961 socklen_t addrlen;
3962 void *src;
3963 socklen_t srclen;
3964 ssize_t len;
3965 sotpi_info_t *sti = SOTOTPI(so);
3966
3967 ASSERT(name != NULL && namelen != 0);
3968
3969 len = uiop->uio_resid;
3970 if (len > sti->sti_tidu_size) {
3971 error = EMSGSIZE;
3972 goto done;
3973 }
3974
3975 /* Length and family checks */
3976 error = so_addr_verify(so, name, namelen);
3977 if (error != 0)
3978 goto done;
3979
3980 if (sti->sti_direct)
3981 return (sodgram_direct(so, name, namelen, uiop, flags));
3982
3983 if (so->so_family == AF_UNIX) {
3984 if (sti->sti_faddr_noxlate) {
3985 /*
3986 * Already have a transport internal address. Do not
3987 * pass any (transport internal) source address.
3988 */
3989 addr = name;
3990 addrlen = namelen;
3991 src = NULL;
3992 srclen = 0;
3993 } else {
3994 /*
3995 * Pass the sockaddr_un source address as an option
3996 * and translate the remote address.
3997 *
3998 * Note that this code does not prevent sti_laddr_sa
3999 * from changing while it is being used. Thus
4000 * if an unbind+bind occurs concurrently with this
4001 * send the peer might see a partially new and a
4002 * partially old "from" address.
4003 */
4004 src = sti->sti_laddr_sa;
4005 srclen = (socklen_t)sti->sti_laddr_len;
4006 dprintso(so, 1,
4007 ("sosend_dgram UNIX: srclen %d, src %p\n",
4008 srclen, src));
4009 error = so_ux_addr_xlate(so, name, namelen,
4010 (flags & MSG_XPG4_2),
4011 &addr, &addrlen);
4012 if (error) {
4013 eprintsoline(so, error);
4014 goto done;
4015 }
4016 }
4017 } else {
4018 addr = name;
4019 addrlen = namelen;
4020 src = NULL;
4021 srclen = 0;
4022 }
4023 tudr.PRIM_type = T_UNITDATA_REQ;
4024 tudr.DEST_length = addrlen;
4025 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4026 if (srclen == 0) {
4027 tudr.OPT_length = 0;
4028 tudr.OPT_offset = 0;
4029
4030 mp = soallocproto2(&tudr, sizeof (tudr),
4031 addr, addrlen, 0, _ALLOC_INTR, CRED());
4032 if (mp == NULL) {
4033 /*
4034 * Caught a signal waiting for memory.
4035 * Let send* return EINTR.
4036 */
4037 error = EINTR;
4038 goto done;
4039 }
4040 } else {
4041 /*
4042 * There is an AF_UNIX sockaddr_un to include as a source
4043 * address option.
4044 */
4045 struct T_opthdr toh;
4046 ssize_t size;
4047
4048 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4049 _TPI_ALIGN_TOPT(srclen));
4050 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4051 _TPI_ALIGN_TOPT(addrlen));
4052
4053 toh.level = SOL_SOCKET;
4054 toh.name = SO_SRCADDR;
4055 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4056 toh.status = 0;
4057
4058 size = tudr.OPT_offset + tudr.OPT_length;
4059 mp = soallocproto2(&tudr, sizeof (tudr),
4060 addr, addrlen, size, _ALLOC_INTR, CRED());
4061 if (mp == NULL) {
4062 /*
4063 * Caught a signal waiting for memory.
4064 * Let send* return EINTR.
4065 */
4066 error = EINTR;
4067 goto done;
4068 }
4069 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4070 soappendmsg(mp, &toh, sizeof (toh));
4071 soappendmsg(mp, src, srclen);
4072 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4073 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4074 }
4075
4076 if (AU_AUDITING())
4077 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4078
4079 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4080 done:
4081 #ifdef SOCK_DEBUG
4082 if (error) {
4083 eprintsoline(so, error);
4084 }
4085 #endif /* SOCK_DEBUG */
4086 return (error);
4087 }
4088
4089 /*
4090 * Sending data on a connected stream socket.
4091 * Assumes caller has verified that SS_ISCONNECTED is set.
4092 */
4093 int
4094 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4095 int sflag)
4096 {
4097 struct T_data_req tdr;
4098 mblk_t *mp;
4099 int error;
4100 ssize_t iosize;
4101 sotpi_info_t *sti = SOTOTPI(so);
4102
4103 dprintso(so, 1,
4104 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4105 (void *)so, uiop->uio_resid, prim, sflag));
4106
4107 /*
4108 * Has to be bound and connected. However, since no locks are
4109 * held, the state could have changed after sotpi_sendmsg checked it;
4110 * thus it is not possible to ASSERT on the state.
4111 */
4112
4113 do {
4114 /*
4115 * Set the MORE flag if uio_resid does not fit in this
4116 * message or if the caller passed in "more".
4117 * Error for transports with zero tidu_size.
4118 */
4119 tdr.PRIM_type = prim;
4120 iosize = sti->sti_tidu_size;
4121 if (iosize <= 0)
4122 return (EMSGSIZE);
4123 if (uiop->uio_resid > iosize) {
4124 tdr.MORE_flag = 1;
4125 } else {
4126 if (more)
4127 tdr.MORE_flag = 1;
4128 else
4129 tdr.MORE_flag = 0;
4130 iosize = uiop->uio_resid;
4131 }
4132 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4133 prim, tdr.MORE_flag, iosize));
4134 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4135 if (mp == NULL) {
4136 /*
4137 * Caught a signal waiting for memory.
4138 * Let send* return EINTR.
4139 */
4140 return (EINTR);
4141 }
4142
4143 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4144 0, sflag | MSG_BAND, 0);
4145 if (error) {
4146 eprintsoline(so, error);
4147 return (error);
4148 }
4149 if (uiop->uio_resid > 0) {
4150 /*
4151 * Recheck for fatal errors. Fail write even though
4152 * some data have been written. This is consistent
4153 * with strwrite semantics and BSD sockets semantics.
4154 */
4155 if (so->so_state & SS_CANTSENDMORE) {
4156 eprintsoline(so, error);
4157 return (EPIPE);
4158 }
4159 if (so->so_error != 0) {
4160 mutex_enter(&so->so_lock);
4161 error = sogeterr(so, B_TRUE);
4162 mutex_exit(&so->so_lock);
4163 if (error != 0) {
4164 eprintsoline(so, error);
4165 return (error);
4166 }
4167 }
4168 }
4169 } while (uiop->uio_resid > 0);
4170 return (0);
4171 }
4172
4173 /*
4174 * Check the state for errors and call the appropriate send function.
4175 *
4176 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4177 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4178 * after sending the message.
4179 */
4180 static int
4181 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4182 struct cred *cr)
4183 {
4184 int so_state;
4185 int so_mode;
4186 int error;
4187 struct sockaddr *name;
4188 t_uscalar_t namelen;
4189 int dontroute;
4190 int flags;
4191 sotpi_info_t *sti = SOTOTPI(so);
4192
4193 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4194 (void *)so, (void *)msg, msg->msg_flags,
4195 pr_state(so->so_state, so->so_mode), so->so_error));
4196
4197 if (so->so_version == SOV_STREAM) {
4198 /* The imaginary "sockmod" has been popped - act as a stream */
4199 so_update_attrs(so, SOMOD);
4200 return (strwrite(SOTOV(so), uiop, cr));
4201 }
4202
4203 mutex_enter(&so->so_lock);
4204 so_state = so->so_state;
4205
4206 if (so_state & SS_CANTSENDMORE) {
4207 mutex_exit(&so->so_lock);
4208 return (EPIPE);
4209 }
4210
4211 if (so->so_error != 0) {
4212 error = sogeterr(so, B_TRUE);
4213 if (error != 0) {
4214 mutex_exit(&so->so_lock);
4215 return (error);
4216 }
4217 }
4218
4219 name = (struct sockaddr *)msg->msg_name;
4220 namelen = msg->msg_namelen;
4221
4222 so_mode = so->so_mode;
4223
4224 if (name == NULL) {
4225 if (!(so_state & SS_ISCONNECTED)) {
4226 mutex_exit(&so->so_lock);
4227 if (so_mode & SM_CONNREQUIRED)
4228 return (ENOTCONN);
4229 else
4230 return (EDESTADDRREQ);
4231 }
4232 if (so_mode & SM_CONNREQUIRED) {
4233 name = NULL;
4234 namelen = 0;
4235 } else {
4236 /*
4237 * Note that this code does not prevent sti_faddr_sa
4238 * from changing while it is being used. Thus
4239 * if an "unconnect"+connect occurs concurrently with
4240 * this send the datagram might be delivered to a
4241 * garbled address.
4242 */
4243 ASSERT(sti->sti_faddr_sa);
4244 name = sti->sti_faddr_sa;
4245 namelen = (t_uscalar_t)sti->sti_faddr_len;
4246 }
4247 } else {
4248 if (!(so_state & SS_ISCONNECTED) &&
4249 (so_mode & SM_CONNREQUIRED)) {
4250 /* Required but not connected */
4251 mutex_exit(&so->so_lock);
4252 return (ENOTCONN);
4253 }
4254 /*
4255 * Ignore the address on connection-oriented sockets.
4256 * Just like BSD this code does not generate an error for
4257 * TCP (a CONNREQUIRED socket) when sending to an address
4258 * passed in with sendto/sendmsg. Instead the data is
4259 * delivered on the connection as if no address had been
4260 * supplied.
4261 */
4262 if ((so_state & SS_ISCONNECTED) &&
4263 !(so_mode & SM_CONNREQUIRED)) {
4264 mutex_exit(&so->so_lock);
4265 return (EISCONN);
4266 }
4267 if (!(so_state & SS_ISBOUND)) {
4268 so_lock_single(so); /* Set SOLOCKED */
4269 error = sotpi_bind(so, NULL, 0,
4270 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4271 so_unlock_single(so, SOLOCKED);
4272 if (error) {
4273 mutex_exit(&so->so_lock);
4274 eprintsoline(so, error);
4275 return (error);
4276 }
4277 }
4278 /*
4279 * Handle delayed datagram errors. These are only queued
4280 * when the application sets SO_DGRAM_ERRIND.
4281 * Return the error if we are sending to the address
4282 * that was returned in the last T_UDERROR_IND.
4283 * If sending to some other address discard the delayed
4284 * error indication.
4285 */
4286 if (sti->sti_delayed_error) {
4287 struct T_uderror_ind *tudi;
4288 void *addr;
4289 t_uscalar_t addrlen;
4290 boolean_t match = B_FALSE;
4291
4292 ASSERT(sti->sti_eaddr_mp);
4293 error = sti->sti_delayed_error;
4294 sti->sti_delayed_error = 0;
4295 tudi =
4296 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4297 addrlen = tudi->DEST_length;
4298 addr = sogetoff(sti->sti_eaddr_mp,
4299 tudi->DEST_offset, addrlen, 1);
4300 ASSERT(addr); /* Checked by strsock_proto */
4301 switch (so->so_family) {
4302 case AF_INET: {
4303 /* Compare just IP address and port */
4304 sin_t *sin1 = (sin_t *)name;
4305 sin_t *sin2 = (sin_t *)addr;
4306
4307 if (addrlen == sizeof (sin_t) &&
4308 namelen == addrlen &&
4309 sin1->sin_port == sin2->sin_port &&
4310 sin1->sin_addr.s_addr ==
4311 sin2->sin_addr.s_addr)
4312 match = B_TRUE;
4313 break;
4314 }
4315 case AF_INET6: {
4316 /* Compare just IP address and port. Not flow */
4317 sin6_t *sin1 = (sin6_t *)name;
4318 sin6_t *sin2 = (sin6_t *)addr;
4319
4320 if (addrlen == sizeof (sin6_t) &&
4321 namelen == addrlen &&
4322 sin1->sin6_port == sin2->sin6_port &&
4323 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4324 &sin2->sin6_addr))
4325 match = B_TRUE;
4326 break;
4327 }
4328 case AF_UNIX:
4329 default:
4330 if (namelen == addrlen &&
4331 bcmp(name, addr, namelen) == 0)
4332 match = B_TRUE;
4333 }
4334 if (match) {
4335 freemsg(sti->sti_eaddr_mp);
4336 sti->sti_eaddr_mp = NULL;
4337 mutex_exit(&so->so_lock);
4338 #ifdef DEBUG
4339 dprintso(so, 0,
4340 ("sockfs delayed error %d for %s\n",
4341 error,
4342 pr_addr(so->so_family, name, namelen)));
4343 #endif /* DEBUG */
4344 return (error);
4345 }
4346 freemsg(sti->sti_eaddr_mp);
4347 sti->sti_eaddr_mp = NULL;
4348 }
4349 }
4350 mutex_exit(&so->so_lock);
4351
4352 flags = msg->msg_flags;
4353 dontroute = 0;
4354 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4355 uint32_t val;
4356
4357 val = 1;
4358 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4359 &val, (t_uscalar_t)sizeof (val), cr);
4360 if (error)
4361 return (error);
4362 dontroute = 1;
4363 }
4364
4365 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4366 error = EOPNOTSUPP;
4367 goto done;
4368 }
4369 if (msg->msg_controllen != 0) {
4370 if (!(so_mode & SM_CONNREQUIRED)) {
4371 so_update_attrs(so, SOMOD);
4372 error = sosend_dgramcmsg(so, name, namelen, uiop,
4373 msg->msg_control, msg->msg_controllen, flags);
4374 } else {
4375 if (flags & MSG_OOB) {
4376 /* Can't generate T_EXDATA_REQ with options */
4377 error = EOPNOTSUPP;
4378 goto done;
4379 }
4380 so_update_attrs(so, SOMOD);
4381 error = sosend_svccmsg(so, uiop,
4382 !(flags & MSG_EOR),
4383 msg->msg_control, msg->msg_controllen,
4384 flags);
4385 }
4386 goto done;
4387 }
4388
4389 so_update_attrs(so, SOMOD);
4390 if (!(so_mode & SM_CONNREQUIRED)) {
4391 /*
4392 * If there is no SO_DONTROUTE to turn off return immediately
4393 * from send_dgram. This can allow tail-call optimizations.
4394 */
4395 if (!dontroute) {
4396 return (sosend_dgram(so, name, namelen, uiop, flags));
4397 }
4398 error = sosend_dgram(so, name, namelen, uiop, flags);
4399 } else {
4400 t_scalar_t prim;
4401 int sflag;
4402
4403 /* Ignore msg_name in the connected state */
4404 if (flags & MSG_OOB) {
4405 prim = T_EXDATA_REQ;
4406 /*
4407 * Send down T_EXDATA_REQ even if there is flow
4408 * control for data.
4409 */
4410 sflag = MSG_IGNFLOW;
4411 } else {
4412 if (so_mode & SM_BYTESTREAM) {
4413 /* Byte stream transport - use write */
4414 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4415
4416 /* Send M_DATA messages */
4417 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4418 (error = nl7c_data(so, uiop)) >= 0) {
4419 /* NL7C consumed the data */
4420 return (error);
4421 }
4422 /*
4423 * If there is no SO_DONTROUTE to turn off,
4424 * sti_direct is on, and there is no flow
4425 * control, we can take the fast path.
4426 */
4427 if (!dontroute && sti->sti_direct != 0 &&
4428 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4429 return (sostream_direct(so, uiop,
4430 NULL, cr));
4431 }
4432 error = strwrite(SOTOV(so), uiop, cr);
4433 goto done;
4434 }
4435 prim = T_DATA_REQ;
4436 sflag = 0;
4437 }
4438 /*
4439 * If there is no SO_DONTROUTE to turn off return immediately
4440 * from sosend_svc. This can allow tail-call optimizations.
4441 */
4442 if (!dontroute)
4443 return (sosend_svc(so, uiop, prim,
4444 !(flags & MSG_EOR), sflag));
4445 error = sosend_svc(so, uiop, prim,
4446 !(flags & MSG_EOR), sflag);
4447 }
4448 ASSERT(dontroute);
4449 done:
4450 if (dontroute) {
4451 uint32_t val;
4452
4453 val = 0;
4454 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4455 &val, (t_uscalar_t)sizeof (val), cr);
4456 }
4457 return (error);
4458 }
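
/*
 * Illustrative userland view (not part of this file): when a caller uses
 * MSG_DONTROUTE on a socket that does not have SO_DONTROUTE set,
 * sotpi_sendmsg() above briefly enables and then clears SO_DONTROUTE
 * around the send.  A minimal hedged sketch, with hypothetical names
 * "s", "buf" and "dst":
 *
 *    (void) sendto(s, buf, sizeof (buf), MSG_DONTROUTE,
 *        (struct sockaddr *)&dst, sizeof (dst));
 *
 * behaves, on this code path, roughly like:
 *
 *    int on = 1, off = 0;
 *
 *    (void) setsockopt(s, SOL_SOCKET, SO_DONTROUTE, &on, sizeof (on));
 *    (void) sendto(s, buf, sizeof (buf), 0,
 *        (struct sockaddr *)&dst, sizeof (dst));
 *    (void) setsockopt(s, SOL_SOCKET, SO_DONTROUTE, &off, sizeof (off));
 */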
4459
4460 /*
4461 * kstrwritemp() has very similar semantics as that of strwrite().
4462 * The main difference is it obtains mblks from the caller and also
4463 * does not do any copy as done in strwrite() from user buffers to
4464 * kernel buffers.
4465 *
4466 * Currently, this routine is used by sendfile to send data allocated
4467 * within the kernel without any copying. This interface does not use the
4468 * synchronous stream interface as synch. stream interface implies
4469 * copying.
4470 */
4471 int
4472 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4473 {
4474 struct stdata *stp;
4475 struct queue *wqp;
4476 mblk_t *newmp;
4477 char waitflag;
4478 int tempmode;
4479 int error = 0;
4480 int done = 0;
4481 struct sonode *so;
4482 boolean_t direct;
4483
4484 ASSERT(vp->v_stream);
4485 stp = vp->v_stream;
4486
4487 so = VTOSO(vp);
4488 direct = _SOTOTPI(so)->sti_direct;
4489
4490 /*
4491 * This is the sockfs direct fast path. canputnext() need
4492 * not be accurate so we don't grab the sd_lock here. If
4493 * we get flow-controlled, we grab sd_lock just before the
4494 * do..while loop below to emulate what strwrite() does.
4495 */
4496 wqp = stp->sd_wrq;
4497 if (canputnext(wqp) && direct &&
4498 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4499 return (sostream_direct(so, NULL, mp, CRED()));
4500 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4501 /* Fast check of flags before acquiring the lock */
4502 mutex_enter(&stp->sd_lock);
4503 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4504 mutex_exit(&stp->sd_lock);
4505 if (error != 0) {
4506 if (!(stp->sd_flag & STPLEX) &&
4507 (stp->sd_wput_opt & SW_SIGPIPE)) {
4508 error = EPIPE;
4509 }
4510 return (error);
4511 }
4512 }
4513
4514 waitflag = WRITEWAIT;
4515 if (stp->sd_flag & OLDNDELAY)
4516 tempmode = fmode & ~FNDELAY;
4517 else
4518 tempmode = fmode;
4519
4520 mutex_enter(&stp->sd_lock);
4521 do {
4522 if (canputnext(wqp)) {
4523 mutex_exit(&stp->sd_lock);
4524 if (stp->sd_wputdatafunc != NULL) {
4525 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4526 NULL, NULL, NULL);
4527 if (newmp == NULL) {
4528 /* The caller will free mp */
4529 return (ECOMM);
4530 }
4531 mp = newmp;
4532 }
4533 putnext(wqp, mp);
4534 return (0);
4535 }
4536 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4537 &done);
4538 } while (error == 0 && !done);
4539
4540 mutex_exit(&stp->sd_lock);
4541 /*
4542 * EAGAIN tells the application to try again. ENOMEM
4543 * is returned only if the memory allocation size
4544 * exceeds the physical limits of the system. ENOMEM
4545 * can't be true here.
4546 */
4547 if (error == ENOMEM)
4548 error = EAGAIN;
4549 return (error);
4550 }
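
/*
 * Illustrative userland path into kstrwritemp() (not part of this file):
 * sendfile(3EXT) on a connected TCP socket hands kernel-allocated mblks
 * to this routine, avoiding a separate user-to-kernel copy.  A minimal
 * hedged sketch; "sock_fd", "file_fd" and "file_size" are hypothetical:
 *
 *    off_t off = 0;
 *    ssize_t n;
 *
 *    n = sendfile(sock_fd, file_fd, &off, file_size);
 *    if (n == -1)
 *        perror("sendfile");
 */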
4551
4552 /* ARGSUSED */
4553 static int
4554 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4555 struct cred *cr, mblk_t **mpp)
4556 {
4557 int error;
4558
4559 if (so->so_family != AF_INET && so->so_family != AF_INET6)
4560 return (EAFNOSUPPORT);
4561
4562 if (so->so_state & SS_CANTSENDMORE)
4563 return (EPIPE);
4564
4565 if (so->so_type != SOCK_STREAM)
4566 return (EOPNOTSUPP);
4567
4568 if ((so->so_state & SS_ISCONNECTED) == 0)
4569 return (ENOTCONN);
4570
4571 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4572 if (error == 0)
4573 *mpp = NULL;
4574 return (error);
4575 }
4576
4577 /*
4578 * Sending data on a datagram socket.
4579 * Assumes caller has verified that SS_ISBOUND etc. are set.
4580 */
4581 /* ARGSUSED */
4582 static int
4583 sodgram_direct(struct sonode *so, struct sockaddr *name,
4584 socklen_t namelen, struct uio *uiop, int flags)
4585 {
4586 struct T_unitdata_req tudr;
4587 mblk_t *mp = NULL;
4588 int error = 0;
4589 void *addr;
4590 socklen_t addrlen;
4591 ssize_t len;
4592 struct stdata *stp = SOTOV(so)->v_stream;
4593 int so_state;
4594 queue_t *udp_wq;
4595 boolean_t connected;
4596 mblk_t *mpdata = NULL;
4597 sotpi_info_t *sti = SOTOTPI(so);
4598 uint32_t auditing = AU_AUDITING();
4599
4600 ASSERT(name != NULL && namelen != 0);
4601 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4602 ASSERT(!(so->so_mode & SM_EXDATA));
4603 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4604 ASSERT(SOTOV(so)->v_type == VSOCK);
4605
4606 /* Caller checked for proper length */
4607 len = uiop->uio_resid;
4608 ASSERT(len <= sti->sti_tidu_size);
4609
4610 /* Length and family checks have been done by caller */
4611 ASSERT(name->sa_family == so->so_family);
4612 ASSERT(so->so_family == AF_INET ||
4613 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4614 ASSERT(so->so_family == AF_INET6 ||
4615 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4616
4617 addr = name;
4618 addrlen = namelen;
4619
4620 if (stp->sd_sidp != NULL &&
4621 (error = straccess(stp, JCWRITE)) != 0)
4622 goto done;
4623
4624 so_state = so->so_state;
4625
4626 connected = so_state & SS_ISCONNECTED;
4627 if (!connected) {
4628 tudr.PRIM_type = T_UNITDATA_REQ;
4629 tudr.DEST_length = addrlen;
4630 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4631 tudr.OPT_length = 0;
4632 tudr.OPT_offset = 0;
4633
4634 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4635 _ALLOC_INTR, CRED());
4636 if (mp == NULL) {
4637 /*
4638 * Caught a signal waiting for memory.
4639 * Let send* return EINTR.
4640 */
4641 error = EINTR;
4642 goto done;
4643 }
4644 }
4645
4646 /*
4647 * For UDP we don't break up the copyin into smaller pieces
4648 * as in the TCP case. That means if ENOMEM is returned by
4649 * mcopyinuio() then the uio vector has not been modified at
4650 * all and we fallback to either strwrite() or kstrputmsg()
4651 * below. Note also that we never generate priority messages
4652 * from here.
4653 */
4654 udp_wq = stp->sd_wrq->q_next;
4655 if (canput(udp_wq) &&
4656 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4657 ASSERT(DB_TYPE(mpdata) == M_DATA);
4658 ASSERT(uiop->uio_resid == 0);
4659 if (!connected)
4660 linkb(mp, mpdata);
4661 else
4662 mp = mpdata;
4663 if (auditing)
4664 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4665
4666 udp_wput(udp_wq, mp);
4667 return (0);
4668 }
4669
4670 ASSERT(mpdata == NULL);
4671 if (error != 0 && error != ENOMEM) {
4672 freemsg(mp);
4673 return (error);
4674 }
4675
4676 /*
4677 * For connected, let strwrite() handle the blocking case.
4678 * Otherwise we fall thru and use kstrputmsg().
4679 */
4680 if (connected)
4681 return (strwrite(SOTOV(so), uiop, CRED()));
4682
4683 if (auditing)
4684 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4685
4686 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4687 done:
4688 #ifdef SOCK_DEBUG
4689 if (error != 0) {
4690 eprintsoline(so, error);
4691 }
4692 #endif /* SOCK_DEBUG */
4693 return (error);
4694 }
4695
4696 int
4697 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4698 {
4699 struct stdata *stp = SOTOV(so)->v_stream;
4700 ssize_t iosize, rmax, maxblk;
4701 queue_t *tcp_wq = stp->sd_wrq->q_next;
4702 mblk_t *newmp;
4703 int error = 0, wflag = 0;
4704
4705 ASSERT(so->so_mode & SM_BYTESTREAM);
4706 ASSERT(SOTOV(so)->v_type == VSOCK);
4707
4708 if (stp->sd_sidp != NULL &&
4709 (error = straccess(stp, JCWRITE)) != 0)
4710 return (error);
4711
4712 if (uiop == NULL) {
4713 /*
4714 * kstrwritemp() should have checked sd_flag and
4715 * flow-control before coming here. If we end up
4716 * here it means that we can simply pass down the
4717 * data to tcp.
4718 */
4719 ASSERT(mp != NULL);
4720 if (stp->sd_wputdatafunc != NULL) {
4721 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4722 NULL, NULL, NULL);
4723 if (newmp == NULL) {
4724 /* The caller will free mp */
4725 return (ECOMM);
4726 }
4727 mp = newmp;
4728 }
4729 tcp_wput(tcp_wq, mp);
4730 return (0);
4731 }
4732
4733 /* Fallback to strwrite() to do proper error handling */
4734 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4735 return (strwrite(SOTOV(so), uiop, cr));
4736
4737 rmax = stp->sd_qn_maxpsz;
4738 ASSERT(rmax >= 0 || rmax == INFPSZ);
4739 if (rmax == 0 || uiop->uio_resid <= 0)
4740 return (0);
4741
4742 if (rmax == INFPSZ)
4743 rmax = uiop->uio_resid;
4744
4745 maxblk = stp->sd_maxblk;
4746
4747 for (;;) {
4748 iosize = MIN(uiop->uio_resid, rmax);
4749
4750 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4751 if (mp == NULL) {
4752 /*
4753 * Fallback to strwrite() for ENOMEM; if this
4754 * is our first time in this routine and the uio
4755 * vector has not been modified, we will end up
4756 * calling strwrite() without any flag set.
4757 */
4758 if (error == ENOMEM)
4759 goto slow_send;
4760 else
4761 return (error);
4762 }
4763 ASSERT(uiop->uio_resid >= 0);
4764 /*
4765 * If mp is non-NULL and ENOMEM is set, it means that
4766 * mcopyinuio() was able to break down some of the user
4767 * data into one or more mblks. Send the partial data
4768 * to tcp and let the rest be handled in strwrite().
4769 */
4770 ASSERT(error == 0 || error == ENOMEM);
4771 if (stp->sd_wputdatafunc != NULL) {
4772 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4773 NULL, NULL, NULL);
4774 if (newmp == NULL) {
4775 /* The caller will free mp */
4776 return (ECOMM);
4777 }
4778 mp = newmp;
4779 }
4780 tcp_wput(tcp_wq, mp);
4781
4782 wflag |= NOINTR;
4783
4784 if (uiop->uio_resid == 0) { /* No more data; we're done */
4785 ASSERT(error == 0);
4786 break;
4787 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4788 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4789 slow_send:
4790 /*
4791 * We were able to send down partial data using
4792 * the direct call interface, but are now relying
4793 * on strwrite() to handle the non-fastpath cases.
4794 * If the socket is blocking we will sleep in
4795 * strwaitq() until write is permitted, otherwise,
4796 * we will need to return the amount of bytes
4797 * written so far back to the app. This is the
4798 * reason why we pass NOINTR flag to strwrite()
4799 * for non-blocking socket, because we don't want
4800 * to return EAGAIN when portion of the user data
4801 * has actually been sent down.
4802 */
4803 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4804 }
4805 }
4806 return (0);
4807 }
4808
4809 /*
4810 * Update sti_faddr by asking the transport (unless AF_UNIX).
4811 */
4812 /* ARGSUSED */
4813 int
4814 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4815 boolean_t accept, struct cred *cr)
4816 {
4817 struct strbuf strbuf;
4818 int error = 0, res;
4819 void *addr;
4820 t_uscalar_t addrlen;
4821 k_sigset_t smask;
4822 sotpi_info_t *sti = SOTOTPI(so);
4823
4824 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4825 (void *)so, pr_state(so->so_state, so->so_mode)));
4826
4827 ASSERT(*namelen > 0);
4828 mutex_enter(&so->so_lock);
4829 so_lock_single(so); /* Set SOLOCKED */
4830
4831 if (accept) {
4832 bcopy(sti->sti_faddr_sa, name,
4833 MIN(*namelen, sti->sti_faddr_len));
4834 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4835 goto done;
4836 }
4837
4838 if (!(so->so_state & SS_ISCONNECTED)) {
4839 error = ENOTCONN;
4840 goto done;
4841 }
4842 /* Added this check for X/Open */
4843 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4844 error = EINVAL;
4845 if (xnet_check_print) {
4846 printf("sockfs: X/Open getpeername check => EINVAL\n");
4847 }
4848 goto done;
4849 }
4850
4851 if (sti->sti_faddr_valid) {
4852 bcopy(sti->sti_faddr_sa, name,
4853 MIN(*namelen, sti->sti_faddr_len));
4854 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4855 goto done;
4856 }
4857
4858 #ifdef DEBUG
4859 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4860 pr_addr(so->so_family, sti->sti_faddr_sa,
4861 (t_uscalar_t)sti->sti_faddr_len)));
4862 #endif /* DEBUG */
4863
4864 if (so->so_family == AF_UNIX) {
4865 /* Transport has different name space - return local info */
4866 if (sti->sti_faddr_noxlate)
4867 *namelen = 0;
4868 error = 0;
4869 goto done;
4870 }
4871
4872 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4873
4874 ASSERT(sti->sti_faddr_sa);
4875 /* Allocate local buffer to use with ioctl */
4876 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4877 mutex_exit(&so->so_lock);
4878 addr = kmem_alloc(addrlen, KM_SLEEP);
4879
4880 /*
4881 * Issue TI_GETPEERNAME with signals masked.
4882 * Put the result in sti_faddr_sa so that getpeername works after
4883 * a shutdown(output).
4884 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4885 * back to the socket.
4886 */
4887 strbuf.buf = addr;
4888 strbuf.maxlen = addrlen;
4889 strbuf.len = 0;
4890
4891 sigintr(&smask, 0);
4892 res = 0;
4893 ASSERT(cr);
4894 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4895 0, K_TO_K, cr, &res);
4896 sigunintr(&smask);
4897
4898 mutex_enter(&so->so_lock);
4899 /*
4900 * If there is an error, record it in so_error but don't fail
4901 * the getpeername. Instead fall back on the recorded
4902 * sti->sti_faddr_sa.
4903 */
4904 if (error) {
4905 /*
4906 * Various stream head errors can be returned to the ioctl.
4907 * However, it is impossible to determine which of these
4908 * are really socket level errors that were incorrectly
4909 * consumed by the ioctl. Thus this code silently ignores the
4910 * error - the code explicitly does not reinstate the error
4911 * using soseterror().
4912 * Experiments have shown that at least this set of
4913 * errors is reported and should not be reinstated on the
4914 * socket:
4915 * EINVAL E.g. if an I_LINK was in effect when
4916 * getpeername was called.
4917 * EPIPE The ioctl error semantics prefer the write
4918 * side error over the read side error.
4919 * ENOTCONN The transport just got disconnected but
4920 * sockfs had not yet seen the T_DISCON_IND
4921 * when issuing the ioctl.
4922 */
4923 error = 0;
4924 } else if (res == 0 && strbuf.len > 0 &&
4925 (so->so_state & SS_ISCONNECTED)) {
4926 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4927 sti->sti_faddr_len = (socklen_t)strbuf.len;
4928 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4929 sti->sti_faddr_valid = 1;
4930
4931 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4932 *namelen = sti->sti_faddr_len;
4933 }
4934 kmem_free(addr, addrlen);
4935 #ifdef DEBUG
4936 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4937 pr_addr(so->so_family, sti->sti_faddr_sa,
4938 (t_uscalar_t)sti->sti_faddr_len)));
4939 #endif /* DEBUG */
4940 done:
4941 so_unlock_single(so, SOLOCKED);
4942 mutex_exit(&so->so_lock);
4943 return (error);
4944 }
4945
4946 /*
4947 * Update sti_laddr by asking the transport (unless AF_UNIX).
4948 */
4949 int
4950 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4951 struct cred *cr)
4952 {
4953 struct strbuf strbuf;
4954 int error = 0, res;
4955 void *addr;
4956 t_uscalar_t addrlen;
4957 k_sigset_t smask;
4958 sotpi_info_t *sti = SOTOTPI(so);
4959
4960 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4961 (void *)so, pr_state(so->so_state, so->so_mode)));
4962
4963 ASSERT(*namelen > 0);
4964 mutex_enter(&so->so_lock);
4965 so_lock_single(so); /* Set SOLOCKED */
4966
4967 #ifdef DEBUG
4968
4969 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4970 pr_addr(so->so_family, sti->sti_laddr_sa,
4971 (t_uscalar_t)sti->sti_laddr_len)));
4972 #endif /* DEBUG */
4973 if (sti->sti_laddr_valid) {
4974 bcopy(sti->sti_laddr_sa, name,
4975 MIN(*namelen, sti->sti_laddr_len));
4976 *namelen = sti->sti_laddr_len;
4977 goto done;
4978 }
4979
4980 if (so->so_family == AF_UNIX) {
4981 /* Transport has different name space - return local info */
4982 error = 0;
4983 *namelen = 0;
4984 goto done;
4985 }
4986 if (!(so->so_state & SS_ISBOUND)) {
4987 /* If not bound, then nothing to return. */
4988 error = 0;
4989 goto done;
4990 }
4991
4992 /* Allocate local buffer to use with ioctl */
4993 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4994 mutex_exit(&so->so_lock);
4995 addr = kmem_alloc(addrlen, KM_SLEEP);
4996
4997 /*
4998 * Issue TI_GETMYNAME with signals masked.
4999 * Put the result in sti_laddr_sa so that getsockname works after
5000 * a shutdown(output).
5001 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5002 * back to the socket.
5003 */
5004 strbuf.buf = addr;
5005 strbuf.maxlen = addrlen;
5006 strbuf.len = 0;
5007
5008 sigintr(&smask, 0);
5009 res = 0;
5010 ASSERT(cr);
5011 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5012 0, K_TO_K, cr, &res);
5013 sigunintr(&smask);
5014
5015 mutex_enter(&so->so_lock);
5016 /*
5017 * If there is an error, record it in so_error but don't fail
5018 * the getsockname. Instead fall back on the recorded
5019 * sti->sti_laddr_sa.
5020 */
5021 if (error) {
5022 /*
5023 * Various stream head errors can be returned to the ioctl.
5024 * However, it is impossible to determine which of these
5025 * are really socket level errors that were incorrectly
5026 * consumed by the ioctl. Thus this code silently ignores the
5027 * error - the code explicitly does not reinstate the error
5028 * using soseterror().
5029 * Experiments have shown that at least this set of
5030 * errors is reported and should not be reinstated on the
5031 * socket:
5032 * EINVAL E.g. if an I_LINK was in effect when
5033 * getsockname was called.
5034 * EPIPE The ioctl error semantics prefer the write
5035 * side error over the read side error.
5036 */
5037 error = 0;
5038 } else if (res == 0 && strbuf.len > 0 &&
5039 (so->so_state & SS_ISBOUND)) {
5040 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5041 sti->sti_laddr_len = (socklen_t)strbuf.len;
5042 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5043 sti->sti_laddr_valid = 1;
5044
5045 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5046 *namelen = sti->sti_laddr_len;
5047 }
5048 kmem_free(addr, addrlen);
5049 #ifdef DEBUG
5050 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5051 pr_addr(so->so_family, sti->sti_laddr_sa,
5052 (t_uscalar_t)sti->sti_laddr_len)));
5053 #endif /* DEBUG */
5054 done:
5055 so_unlock_single(so, SOLOCKED);
5056 mutex_exit(&so->so_lock);
5057 return (error);
5058 }
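
/*
 * Illustrative userland use (not part of this file): after binding to
 * port 0, getsockname(3SOCKET) reaches sotpi_getsockname() above, which
 * asks the transport for the port that was actually assigned.  A minimal
 * hedged sketch with a hypothetical descriptor "s":
 *
 *    struct sockaddr_in sin;
 *    socklen_t len = sizeof (sin);
 *
 *    if (getsockname(s, (struct sockaddr *)&sin, &len) == 0)
 *        (void) printf("bound to port %d\n", ntohs(sin.sin_port));
 */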
5059
5060 /*
5061 * Get socket options. Some SOL_SOCKET options are handled directly
5062 * by sockfs while others use the value recorded in the sonode as a
5063 * fallback should the T_SVR4_OPTMGMT_REQ fail.
5064 *
5065 * On return, at most *optlenp bytes are copied to optval.
5066 */
5067 /* ARGSUSED */
5068 int
5069 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5070 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5071 {
5072 struct T_optmgmt_req optmgmt_req;
5073 struct T_optmgmt_ack *optmgmt_ack;
5074 struct opthdr oh;
5075 struct opthdr *opt_res;
5076 mblk_t *mp = NULL;
5077 int error = 0;
5078 void *option = NULL; /* Set if fallback value */
5079 t_uscalar_t maxlen = *optlenp;
5080 t_uscalar_t len;
5081 uint32_t value;
5082 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5083 struct timeval32 tmo_val32;
5084 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5085
5086 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5087 (void *)so, level, option_name, optval, (void *)optlenp,
5088 pr_state(so->so_state, so->so_mode)));
5089
5090 mutex_enter(&so->so_lock);
5091 so_lock_single(so); /* Set SOLOCKED */
5092
5093 /*
5094 * Check for SOL_SOCKET options.
5095 * Certain SOL_SOCKET options are returned directly whereas
5096 * others only provide a default (fallback) value should
5097 * the T_SVR4_OPTMGMT_REQ fail.
5098 */
5099 if (level == SOL_SOCKET) {
5100 /* Check parameters */
5101 switch (option_name) {
5102 case SO_TYPE:
5103 case SO_ERROR:
5104 case SO_DEBUG:
5105 case SO_ACCEPTCONN:
5106 case SO_REUSEADDR:
5107 case SO_KEEPALIVE:
5108 case SO_DONTROUTE:
5109 case SO_BROADCAST:
5110 case SO_USELOOPBACK:
5111 case SO_OOBINLINE:
5112 case SO_SNDBUF:
5113 case SO_RCVBUF:
5114 #ifdef notyet
5115 case SO_SNDLOWAT:
5116 case SO_RCVLOWAT:
5117 #endif /* notyet */
5118 case SO_DOMAIN:
5119 case SO_DGRAM_ERRIND:
5120 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5121 error = EINVAL;
5122 eprintsoline(so, error);
5123 goto done2;
5124 }
5125 break;
5126 case SO_RCVTIMEO:
5127 case SO_SNDTIMEO:
5128 if (get_udatamodel() == DATAMODEL_NONE ||
5129 get_udatamodel() == DATAMODEL_NATIVE) {
5130 if (maxlen < sizeof (struct timeval)) {
5131 error = EINVAL;
5132 eprintsoline(so, error);
5133 goto done2;
5134 }
5135 } else {
5136 if (maxlen < sizeof (struct timeval32)) {
5137 error = EINVAL;
5138 eprintsoline(so, error);
5139 goto done2;
5140 }
5141
5142 }
5143 break;
5144 case SO_LINGER:
5145 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5146 error = EINVAL;
5147 eprintsoline(so, error);
5148 goto done2;
5149 }
5150 break;
5151 case SO_SND_BUFINFO:
5152 if (maxlen < (t_uscalar_t)
5153 sizeof (struct so_snd_bufinfo)) {
5154 error = EINVAL;
5155 eprintsoline(so, error);
5156 goto done2;
5157 }
5158 break;
5159 }
5160
5161 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5162
5163 switch (option_name) {
5164 case SO_TYPE:
5165 value = so->so_type;
5166 option = &value;
5167 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5168
5169 case SO_ERROR:
5170 value = sogeterr(so, B_TRUE);
5171 option = &value;
5172 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5173
5174 case SO_ACCEPTCONN:
5175 if (so->so_state & SS_ACCEPTCONN)
5176 value = SO_ACCEPTCONN;
5177 else
5178 value = 0;
5179 #ifdef DEBUG
5180 if (value) {
5181 dprintso(so, 1,
5182 ("sotpi_getsockopt: 0x%x is set\n",
5183 option_name));
5184 } else {
5185 dprintso(so, 1,
5186 ("sotpi_getsockopt: 0x%x not set\n",
5187 option_name));
5188 }
5189 #endif /* DEBUG */
5190 option = &value;
5191 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5192
5193 case SO_DEBUG:
5194 case SO_REUSEADDR:
5195 case SO_KEEPALIVE:
5196 case SO_DONTROUTE:
5197 case SO_BROADCAST:
5198 case SO_USELOOPBACK:
5199 case SO_OOBINLINE:
5200 case SO_DGRAM_ERRIND:
5201 value = (so->so_options & option_name);
5202 #ifdef DEBUG
5203 if (value) {
5204 dprintso(so, 1,
5205 ("sotpi_getsockopt: 0x%x is set\n",
5206 option_name));
5207 } else {
5208 dprintso(so, 1,
5209 ("sotpi_getsockopt: 0x%x not set\n",
5210 option_name));
5211 }
5212 #endif /* DEBUG */
5213 option = &value;
5214 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5215
5216 /*
5217 * The following options are only returned by sockfs when the
5218 * T_SVR4_OPTMGMT_REQ fails.
5219 */
5220 case SO_LINGER:
5221 option = &so->so_linger;
5222 len = (t_uscalar_t)sizeof (struct linger);
5223 break;
5224 case SO_SNDBUF: {
5225 ssize_t lvalue;
5226
5227 /*
5228 * If the option has not been set then get a default
5229 * value from the read queue. This value is
5230 * returned if the transport fails
5231 * the T_SVR4_OPTMGMT_REQ.
5232 */
5233 lvalue = so->so_sndbuf;
5234 if (lvalue == 0) {
5235 mutex_exit(&so->so_lock);
5236 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5237 QHIWAT, 0, &lvalue);
5238 mutex_enter(&so->so_lock);
5239 dprintso(so, 1,
5240 ("got SO_SNDBUF %ld from q\n", lvalue));
5241 }
5242 value = (int)lvalue;
5243 option = &value;
5244 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5245 break;
5246 }
5247 case SO_RCVBUF: {
5248 ssize_t lvalue;
5249
5250 /*
5251 * If the option has not been set then get a default
5252 * value from the read queue. This value is
5253 * returned if the transport fails
5254 * the T_SVR4_OPTMGMT_REQ.
5255 *
5256 * XXX If SO_RCVBUF has been set and this is an
5257 * XPG 4.2 application then do not ask the transport
5258 * since the transport might adjust the value and not
5259 * return exactly what was set by the application.
5260 * For non-XPG 4.2 application we return the value
5261 * that the transport is actually using.
5262 */
5263 lvalue = so->so_rcvbuf;
5264 if (lvalue == 0) {
5265 mutex_exit(&so->so_lock);
5266 (void) strqget(RD(strvp2wq(SOTOV(so))),
5267 QHIWAT, 0, &lvalue);
5268 mutex_enter(&so->so_lock);
5269 dprintso(so, 1,
5270 ("got SO_RCVBUF %ld from q\n", lvalue));
5271 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5272 value = (int)lvalue;
5273 option = &value;
5274 goto copyout; /* skip asking transport */
5275 }
5276 value = (int)lvalue;
5277 option = &value;
5278 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5279 break;
5280 }
5281 case SO_DOMAIN:
5282 value = so->so_family;
5283 option = &value;
5284 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5285
5286 #ifdef notyet
5287 /*
5288 * We do not implement the semantics of these options
5289 * thus we shouldn't implement the options either.
5290 */
5291 case SO_SNDLOWAT:
5292 value = so->so_sndlowat;
5293 option = &value;
5294 break;
5295 case SO_RCVLOWAT:
5296 value = so->so_rcvlowat;
5297 option = &value;
5298 break;
5299 #endif /* notyet */
5300 case SO_SNDTIMEO:
5301 case SO_RCVTIMEO: {
5302 clock_t val;
5303
5304 if (option_name == SO_RCVTIMEO)
5305 val = drv_hztousec(so->so_rcvtimeo);
5306 else
5307 val = drv_hztousec(so->so_sndtimeo);
5308 tmo_val.tv_sec = val / (1000 * 1000);
5309 tmo_val.tv_usec = val % (1000 * 1000);
5310 if (get_udatamodel() == DATAMODEL_NONE ||
5311 get_udatamodel() == DATAMODEL_NATIVE) {
5312 option = &tmo_val;
5313 len = sizeof (struct timeval);
5314 } else {
5315 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5316 option = &tmo_val32;
5317 len = sizeof (struct timeval32);
5318 }
5319 break;
5320 }
5321 case SO_SND_BUFINFO: {
5322 snd_bufinfo.sbi_wroff =
5323 (so->so_proto_props).sopp_wroff;
5324 snd_bufinfo.sbi_maxblk =
5325 (so->so_proto_props).sopp_maxblk;
5326 snd_bufinfo.sbi_maxpsz =
5327 (so->so_proto_props).sopp_maxpsz;
5328 snd_bufinfo.sbi_tail =
5329 (so->so_proto_props).sopp_tail;
5330 option = &snd_bufinfo;
5331 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5332 break;
5333 }
5334 }
5335 }
5336
5337 mutex_exit(&so->so_lock);
5338
5339 /* Send request */
5340 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5341 optmgmt_req.MGMT_flags = T_CHECK;
5342 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5343 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5344
5345 oh.level = level;
5346 oh.name = option_name;
5347 oh.len = maxlen;
5348
5349 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5350 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5351 /* Let option management work in the presence of data flow control */
5352 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5353 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5354 mp = NULL;
5355 mutex_enter(&so->so_lock);
5356 if (error) {
5357 eprintsoline(so, error);
5358 goto done2;
5359 }
5360 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5361 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5362 if (error) {
5363 if (option != NULL) {
5364 /* We have a fallback value */
5365 error = 0;
5366 goto copyout;
5367 }
5368 eprintsoline(so, error);
5369 goto done2;
5370 }
5371 ASSERT(mp);
5372 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5373 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5374 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5375 if (opt_res == NULL) {
5376 if (option != NULL) {
5377 /* We have a fallback value */
5378 error = 0;
5379 goto copyout;
5380 }
5381 error = EPROTO;
5382 eprintsoline(so, error);
5383 goto done;
5384 }
5385 option = &opt_res[1];
5386
5387 /* check to ensure that the option is within bounds */
5388 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5389 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5390 if (option != NULL) {
5391 /* We have a fallback value */
5392 error = 0;
5393 goto copyout;
5394 }
5395 error = EPROTO;
5396 eprintsoline(so, error);
5397 goto done;
5398 }
5399
5400 len = opt_res->len;
5401
5402 copyout: {
5403 t_uscalar_t size = MIN(len, maxlen);
5404 bcopy(option, optval, size);
5405 bcopy(&size, optlenp, sizeof (size));
5406 }
5407 done:
5408 freemsg(mp);
5409 done2:
5410 so_unlock_single(so, SOLOCKED);
5411 mutex_exit(&so->so_lock);
5412
5413 return (error);
5414 }
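
/*
 * Illustrative userland use (not part of this file): SO_ERROR is one of
 * the options sotpi_getsockopt() above answers directly from the sonode,
 * e.g. to collect the outcome of a non-blocking connect.  A minimal
 * hedged sketch with a hypothetical descriptor "s":
 *
 *    int err = 0;
 *    socklen_t len = sizeof (err);
 *
 *    if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err != 0)
 *        (void) fprintf(stderr, "connect: %s\n", strerror(err));
 */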
5415
5416 /*
5417 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5418 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5419 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5420 * setsockopt has to work even if the transport does not support the option.
5421 */
5422 /* ARGSUSED */
5423 int
5424 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5425 const void *optval, t_uscalar_t optlen, struct cred *cr)
5426 {
5427 struct T_optmgmt_req optmgmt_req;
5428 struct opthdr oh;
5429 mblk_t *mp;
5430 int error = 0;
5431 boolean_t handled = B_FALSE;
5432
5433 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5434 (void *)so, level, option_name, optval, optlen,
5435 pr_state(so->so_state, so->so_mode)));
5436
5437 /* X/Open requires this check */
5438 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5439 if (xnet_check_print)
5440 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5441 return (EINVAL);
5442 }
5443
5444 mutex_enter(&so->so_lock);
5445 so_lock_single(so); /* Set SOLOCKED */
5446 mutex_exit(&so->so_lock);
5447
5448 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5449 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5450 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5451 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5452
5453 oh.level = level;
5454 oh.name = option_name;
5455 oh.len = optlen;
5456
5457 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5458 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5459 /* Let option management work in the presence of data flow control */
5460 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5461 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5462 mp = NULL;
5463 mutex_enter(&so->so_lock);
5464 if (error) {
5465 eprintsoline(so, error);
5466 goto done2;
5467 }
5468 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5469 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5470 if (error) {
5471 eprintsoline(so, error);
5472 goto done;
5473 }
5474 ASSERT(mp);
5475 /* No need to verify T_optmgmt_ack */
5476 freemsg(mp);
5477 done:
5478 /*
5479 * Check for SOL_SOCKET options and record their values.
5480 * If we know about a SOL_SOCKET parameter and the transport
5481 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5482 * EPROTO) we let the setsockopt succeed.
5483 */
5484 if (level == SOL_SOCKET) {
5485 /* Check parameters */
5486 switch (option_name) {
5487 case SO_DEBUG:
5488 case SO_REUSEADDR:
5489 case SO_KEEPALIVE:
5490 case SO_DONTROUTE:
5491 case SO_BROADCAST:
5492 case SO_USELOOPBACK:
5493 case SO_OOBINLINE:
5494 case SO_SNDBUF:
5495 case SO_RCVBUF:
5496 #ifdef notyet
5497 case SO_SNDLOWAT:
5498 case SO_RCVLOWAT:
5499 #endif /* notyet */
5500 case SO_DGRAM_ERRIND:
5501 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5502 error = EINVAL;
5503 eprintsoline(so, error);
5504 goto done2;
5505 }
5506 ASSERT(optval);
5507 handled = B_TRUE;
5508 break;
5509 case SO_SNDTIMEO:
5510 case SO_RCVTIMEO:
5511 if (get_udatamodel() == DATAMODEL_NONE ||
5512 get_udatamodel() == DATAMODEL_NATIVE) {
5513 if (optlen != sizeof (struct timeval)) {
5514 error = EINVAL;
5515 eprintsoline(so, error);
5516 goto done2;
5517 }
5518 } else {
5519 if (optlen != sizeof (struct timeval32)) {
5520 error = EINVAL;
5521 eprintsoline(so, error);
5522 goto done2;
5523 }
5524 }
5525 ASSERT(optval);
5526 handled = B_TRUE;
5527 break;
5528 case SO_LINGER:
5529 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5530 error = EINVAL;
5531 eprintsoline(so, error);
5532 goto done2;
5533 }
5534 ASSERT(optval);
5535 handled = B_TRUE;
5536 break;
5537 }
5538
5539 #define intvalue (*(int32_t *)optval)
5540
5541 switch (option_name) {
5542 case SO_TYPE:
5543 case SO_ERROR:
5544 case SO_ACCEPTCONN:
5545 /* Can't be set */
5546 error = ENOPROTOOPT;
5547 goto done2;
5548 case SO_LINGER: {
5549 struct linger *l = (struct linger *)optval;
5550
5551 so->so_linger.l_linger = l->l_linger;
5552 if (l->l_onoff) {
5553 so->so_linger.l_onoff = SO_LINGER;
5554 so->so_options |= SO_LINGER;
5555 } else {
5556 so->so_linger.l_onoff = 0;
5557 so->so_options &= ~SO_LINGER;
5558 }
5559 break;
5560 }
5561
5562 case SO_DEBUG:
5563 #ifdef SOCK_TEST
5564 if (intvalue & 2)
5565 sock_test_timelimit = 10 * hz;
5566 else
5567 sock_test_timelimit = 0;
5568
5569 if (intvalue & 4)
5570 do_useracc = 0;
5571 else
5572 do_useracc = 1;
5573 #endif /* SOCK_TEST */
5574 /* FALLTHRU */
5575 case SO_REUSEADDR:
5576 case SO_KEEPALIVE:
5577 case SO_DONTROUTE:
5578 case SO_BROADCAST:
5579 case SO_USELOOPBACK:
5580 case SO_OOBINLINE:
5581 case SO_DGRAM_ERRIND:
5582 if (intvalue != 0) {
5583 dprintso(so, 1,
5584 ("socket_setsockopt: setting 0x%x\n",
5585 option_name));
5586 so->so_options |= option_name;
5587 } else {
5588 dprintso(so, 1,
5589 ("socket_setsockopt: clearing 0x%x\n",
5590 option_name));
5591 so->so_options &= ~option_name;
5592 }
5593 break;
5594 /*
5595 * The following options are only returned by us when the
5596 * transport layer fails.
5597 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5598 * since the transport might adjust the value and not
5599 * return exactly what was set by the application.
5600 */
5601 case SO_SNDBUF:
5602 so->so_sndbuf = intvalue;
5603 break;
5604 case SO_RCVBUF:
5605 so->so_rcvbuf = intvalue;
5606 break;
5607 case SO_RCVPSH:
5608 so->so_rcv_timer_interval = intvalue;
5609 break;
5610 #ifdef notyet
5611 /*
5612 * We do not implement the semantics of these options
5613 * thus we shouldn't implement the options either.
5614 */
5615 case SO_SNDLOWAT:
5616 so->so_sndlowat = intvalue;
5617 break;
5618 case SO_RCVLOWAT:
5619 so->so_rcvlowat = intvalue;
5620 break;
5621 #endif /* notyet */
5622 case SO_SNDTIMEO:
5623 case SO_RCVTIMEO: {
5624 struct timeval tl;
5625 clock_t val;
5626
5627 if (get_udatamodel() == DATAMODEL_NONE ||
5628 get_udatamodel() == DATAMODEL_NATIVE)
5629 bcopy(optval, &tl,
5630     sizeof (struct timeval));
5631 else
5632 TIMEVAL32_TO_TIMEVAL(&tl,
5633 (struct timeval32 *)optval);
5634 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5635 if (option_name == SO_RCVTIMEO)
5636 so->so_rcvtimeo = drv_usectohz(val);
5637 else
5638 so->so_sndtimeo = drv_usectohz(val);
5639 break;
5640 }
5641 }
5642 #undef intvalue
5643
5644 if (error) {
5645 if ((error == ENOPROTOOPT || error == EPROTO ||
5646 error == EINVAL) && handled) {
5647 dprintso(so, 1,
5648 ("setsockopt: ignoring error %d for 0x%x\n",
5649 error, option_name));
5650 error = 0;
5651 }
5652 }
5653 }
5654 done2:
5655 so_unlock_single(so, SOLOCKED);
5656 mutex_exit(&so->so_lock);
5657 return (error);
5658 }
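
/*
 * Illustrative userland use (not part of this file): SO_RCVTIMEO is
 * accepted by sotpi_setsockopt() above as a struct timeval (timeval32
 * for 32-bit callers) and recorded in so_rcvtimeo even when the
 * transport rejects the option.  A minimal hedged sketch with a
 * hypothetical descriptor "s":
 *
 *    struct timeval tv;
 *
 *    tv.tv_sec = 5;
 *    tv.tv_usec = 0;
 *    (void) setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 */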
5659
5660 /*
5661 * sotpi_close() is called when the last open reference goes away.
5662 */
5663 /* ARGSUSED */
5664 int
5665 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5666 {
5667 struct vnode *vp = SOTOV(so);
5668 dev_t dev;
5669 int error = 0;
5670 sotpi_info_t *sti = SOTOTPI(so);
5671
5672 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5673 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5674
5675 dev = sti->sti_dev;
5676
5677 ASSERT(STREAMSTAB(getmajor(dev)));
5678
5679 mutex_enter(&so->so_lock);
5680 so_lock_single(so); /* Set SOLOCKED */
5681
5682 ASSERT(so_verify_oobstate(so));
5683
5684 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5685 sti->sti_nl7c_flags = 0;
5686 nl7c_close(so);
5687 }
5688
5689 if (vp->v_stream != NULL) {
5690 vnode_t *ux_vp;
5691
5692 if (so->so_family == AF_UNIX) {
5693 /* Could avoid this when CANTSENDMORE for !dgram */
5694 so_unix_close(so);
5695 }
5696
5697 mutex_exit(&so->so_lock);
5698 /*
5699 * Disassemble the linkage from the AF_UNIX underlying file
5700 * system vnode to this socket (by atomically clearing
5701 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5702 * and frees the stream head.
5703 */
5704 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5705 ASSERT(ux_vp->v_stream);
5706 sti->sti_ux_bound_vp = NULL;
5707 vn_rele_stream(ux_vp);
5708 }
5709 error = strclose(vp, flag, cr);
5710 vp->v_stream = NULL;
5711 mutex_enter(&so->so_lock);
5712 }
5713
5714 /*
5715 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5716 */
5717 so_flush_discon_ind(so);
5718
5719 so_unlock_single(so, SOLOCKED);
5720 mutex_exit(&so->so_lock);
5721
5722 /*
5723 * Needed for STREAMs.
5724 * Decrement the device driver's reference count for streams
5725 * opened via the clone dip. The driver was held in clone_open().
5726 * The absence of clone_close() forces this asymmetry.
5727 */
5728 if (so->so_flag & SOCLONE)
5729 ddi_rele_driver(getmajor(dev));
5730
5731 return (error);
5732 }
5733
5734 static int
5735 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5736 struct cred *cr, int32_t *rvalp)
5737 {
5738 struct vnode *vp = SOTOV(so);
5739 sotpi_info_t *sti = SOTOTPI(so);
5740 int error = 0;
5741
5742 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5743 cmd, arg, pr_state(so->so_state, so->so_mode)));
5744
5745 switch (cmd) {
5746 case SIOCSQPTR:
5747 /*
5748 * SIOCSQPTR is valid only when a helper stream is created
5749 * by the protocol.
5750 */
5751 case _I_INSERT:
5752 case _I_REMOVE:
5753 /*
5754 * Since there's no compelling reason to support these ioctls
5755 * on sockets, and doing so would increase the complexity
5756 * markedly, prevent it.
5757 */
5758 return (EOPNOTSUPP);
5759
5760 case I_FIND:
5761 case I_LIST:
5762 case I_LOOK:
5763 case I_POP:
5764 case I_PUSH:
5765 /*
5766 * To prevent races and inconsistencies between the actual
5767 * state of the stream and the state according to the sonode,
5768 * we serialize all operations which modify or operate on the
5769 * list of modules on the socket's stream.
5770 */
5771 mutex_enter(&sti->sti_plumb_lock);
5772 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5773 mutex_exit(&sti->sti_plumb_lock);
5774 return (error);
5775
5776 default:
5777 if (so->so_version != SOV_STREAM)
5778 break;
5779
5780 /*
5781 * The imaginary "sockmod" has been popped; act as a stream.
5782 */
5783 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5784 }
5785
5786 ASSERT(so->so_version != SOV_STREAM);
5787
5788 /*
5789 * Process socket-specific ioctls.
5790 */
5791 switch (cmd) {
5792 case FIONBIO: {
5793 int32_t value;
5794
5795 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5796 (mode & (int)FKIOCTL)))
5797 return (EFAULT);
5798
5799 mutex_enter(&so->so_lock);
5800 if (value) {
5801 so->so_state |= SS_NDELAY;
5802 } else {
5803 so->so_state &= ~SS_NDELAY;
5804 }
5805 mutex_exit(&so->so_lock);
5806 return (0);
5807 }
5808
5809 case FIOASYNC: {
5810 int32_t value;
5811
5812 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5813 (mode & (int)FKIOCTL)))
5814 return (EFAULT);
5815
5816 mutex_enter(&so->so_lock);
5817 /*
5818 * SS_ASYNC flag not already set correctly?
5819 * (!value != !(so->so_state & SS_ASYNC))
5820 * but some engineers find that too hard to read.
5821 */
5822 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5823 value != 0 && (so->so_state & SS_ASYNC) == 0)
5824 error = so_flip_async(so, vp, mode, cr);
5825 mutex_exit(&so->so_lock);
5826 return (error);
5827 }
5828
5829 case SIOCSPGRP:
5830 case FIOSETOWN: {
5831 pid_t pgrp;
5832
5833 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5834 (mode & (int)FKIOCTL)))
5835 return (EFAULT);
5836
5837 mutex_enter(&so->so_lock);
5838 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5839 /* Any change? */
5840 if (pgrp != so->so_pgrp)
5841 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5842 mutex_exit(&so->so_lock);
5843 return (error);
5844 }
5845 case SIOCGPGRP:
5846 case FIOGETOWN:
5847 if (so_copyout(&so->so_pgrp, (void *)arg,
5848 sizeof (pid_t), (mode & (int)FKIOCTL)))
5849 return (EFAULT);
5850 return (0);
5851
5852 case SIOCATMARK: {
5853 int retval;
5854 uint_t so_state;
5855
5856 /*
5857 * strwaitmark has a finite timeout after which it
5858 * returns -1 if the mark state is undetermined.
5859 * In order to avoid any race between the mark state
5860 * in sockfs and the mark state in the stream head this
5861 * routine loops until the mark state can be determined
5862 * (or the urgent data indication has been removed by some
5863 * other thread).
5864 */
5865 do {
5866 mutex_enter(&so->so_lock);
5867 so_state = so->so_state;
5868 mutex_exit(&so->so_lock);
5869 if (so_state & SS_RCVATMARK) {
5870 retval = 1;
5871 } else if (!(so_state & SS_OOBPEND)) {
5872 /*
5873 * No SIGURG has been generated -- there is no
5874 * pending or present urgent data. Thus can't
5875 * possibly be at the mark.
5876 */
5877 retval = 0;
5878 } else {
5879 /*
5880 * Have the stream head wait until there is
5881 * either some messages on the read queue, or
5882 * STRATMARK or STRNOTATMARK gets set. The
5883 * STRNOTATMARK flag is used so that the
5884 * transport can send up a MSGNOTMARKNEXT
5885 * M_DATA to indicate that it is not
5886 * at the mark and additional data is not about
5887 * to be sent upstream.
5888 *
5889 * If the mark state is undetermined this will
5890 * return -1 and we will loop rechecking the
5891 * socket state.
5892 */
5893 retval = strwaitmark(vp);
5894 }
5895 } while (retval == -1);
5896
5897 if (so_copyout(&retval, (void *)arg, sizeof (int),
5898 (mode & (int)FKIOCTL)))
5899 return (EFAULT);
5900 return (0);
5901 }
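
/*
 * Illustrative user-level sketch (an assumption, not taken from this
 * file): the usual way an application consumes the SIOCATMARK answer
 * computed above -- drain normal data up to the mark, then pick up the
 * out-of-band byte.
 *
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>	// SIOCATMARK (location may vary)
 *	#include <unistd.h>
 *
 *	static ssize_t
 *	read_to_mark(int fd, char *oobbyte)
 *	{
 *		char buf[512];
 *		int atmark = 0;
 *
 *		for (;;) {
 *			if (ioctl(fd, SIOCATMARK, &atmark) == -1)
 *				return (-1);
 *			if (atmark)
 *				break;		// next byte is the mark
 *			if (read(fd, buf, sizeof (buf)) <= 0)
 *				return (-1);	// EOF or error
 *		}
 *		return (recv(fd, oobbyte, 1, MSG_OOB));
 *	}
 */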
5902
5903 case I_FDINSERT:
5904 case I_SENDFD:
5905 case I_RECVFD:
5906 case I_ATMARK:
5907 case _SIOCSOCKFALLBACK:
5908 /*
5909 * These ioctls do not apply to sockets. I_FDINSERT can be
5910 * used to send M_PROTO messages without modifying the socket
5911 * state. I_SENDFD/RECVFD should not be used for socket file
5912 * descriptor passing since they assume a twisted stream.
5913 * SIOCATMARK must be used instead of I_ATMARK.
5914 *
5915 * _SIOCSOCKFALLBACK from an application should never be
5916 * processed. It is only generated by socktpi_open() or
5917 * in response to I_POP or I_PUSH.
5918 */
5919 #ifdef DEBUG
5920 zcmn_err(getzoneid(), CE_WARN,
5921 "Unsupported STREAMS ioctl 0x%x on socket. "
5922 "Pid = %d\n", cmd, curproc->p_pid);
5923 #endif /* DEBUG */
5924 return (EOPNOTSUPP);
5925
5926 case _I_GETPEERCRED:
5927 if ((mode & FKIOCTL) == 0)
5928 return (EINVAL);
5929
5930 mutex_enter(&so->so_lock);
5931 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5932 error = ENOTSUP;
5933 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
5934 error = ENOTCONN;
5935 } else if (so->so_peercred != NULL) {
5936 k_peercred_t *kp = (k_peercred_t *)arg;
5937 kp->pc_cr = so->so_peercred;
5938 kp->pc_cpid = so->so_cpid;
5939 crhold(so->so_peercred);
5940 } else {
5941 error = EINVAL;
5942 }
5943 mutex_exit(&so->so_lock);
5944 return (error);
5945
5946 default:
5947 /*
5948 * Do the higher-order bits of the ioctl cmd indicate
5949 * that it is an I_* streams ioctl?
5950 */
5951 if ((cmd & 0xffffff00U) == STR &&
5952 so->so_version == SOV_SOCKBSD) {
5953 #ifdef DEBUG
5954 zcmn_err(getzoneid(), CE_WARN,
5955 "Unsupported STREAMS ioctl 0x%x on socket. "
5956 "Pid = %d\n", cmd, curproc->p_pid);
5957 #endif /* DEBUG */
5958 return (EOPNOTSUPP);
5959 }
5960 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5961 }
5962 }
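
/*
 * Illustrative user-level sketch (an assumption, not part of this file):
 * the FIONBIO, FIOSETOWN and FIOASYNC cases above are typically driven
 * like this to get non-blocking I/O with SIGPOLL/SIGIO delivery.
 *
 *	#include <sys/types.h>
 *	#include <sys/filio.h>	// FIONBIO/FIOASYNC/FIOSETOWN (may vary)
 *	#include <unistd.h>
 *
 *	static int
 *	enable_async_io(int fd)
 *	{
 *		int on = 1;
 *		pid_t me = getpid();
 *
 *		if (ioctl(fd, FIONBIO, &on) == -1)	// sets SS_NDELAY
 *			return (-1);
 *		if (ioctl(fd, FIOSETOWN, &me) == -1)	// signal recipient
 *			return (-1);
 *		return (ioctl(fd, FIOASYNC, &on));	// sets SS_ASYNC
 *	}
 */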
5963
5964 /*
5965 * Handle plumbing-related ioctls.
5966 */
5967 static int
5968 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5969 struct cred *cr, int32_t *rvalp)
5970 {
5971 static const char sockmod_name[] = "sockmod";
5972 struct sonode *so = VTOSO(vp);
5973 char mname[FMNAMESZ + 1];
5974 int error;
5975 sotpi_info_t *sti = SOTOTPI(so);
5976
5977 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5978
5979 if (so->so_version == SOV_SOCKBSD)
5980 return (EOPNOTSUPP);
5981
5982 if (so->so_version == SOV_STREAM) {
5983 /*
5984 * The imaginary "sockmod" has been popped - act as a stream.
5985 * If this is a push of sockmod then change back to a socket.
5986 */
5987 if (cmd == I_PUSH) {
5988 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5989 (void *)arg, mname, sizeof (mname), NULL);
5990
5991 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5992 dprintso(so, 0, ("socktpi_ioctl: going to "
5993 "socket version\n"));
5994 so_stream2sock(so);
5995 return (0);
5996 }
5997 }
5998 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5999 }
6000
6001 switch (cmd) {
6002 case I_PUSH:
6003 if (sti->sti_direct) {
6004 mutex_enter(&so->so_lock);
6005 so_lock_single(so);
6006 mutex_exit(&so->so_lock);
6007
6008 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6009 cr, rvalp);
6010
6011 mutex_enter(&so->so_lock);
6012 if (error == 0)
6013 sti->sti_direct = 0;
6014 so_unlock_single(so, SOLOCKED);
6015 mutex_exit(&so->so_lock);
6016
6017 if (error != 0)
6018 return (error);
6019 }
6020
6021 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6022 if (error == 0)
6023 sti->sti_pushcnt++;
6024 return (error);
6025
6026 case I_POP:
6027 if (sti->sti_pushcnt == 0) {
6028 /* Emulate sockmod being popped */
6029 dprintso(so, 0,
6030 ("socktpi_ioctl: going to STREAMS version\n"));
6031 return (so_sock2stream(so));
6032 }
6033
6034 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6035 if (error == 0)
6036 sti->sti_pushcnt--;
6037 return (error);
6038
6039 case I_LIST: {
6040 struct str_mlist *kmlistp, *umlistp;
6041 struct str_list kstrlist;
6042 ssize_t kstrlistsize;
6043 int i, nmods;
6044
6045 STRUCT_DECL(str_list, ustrlist);
6046 STRUCT_INIT(ustrlist, mode);
6047
6048 if (arg == NULL) {
6049 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6050 if (error == 0)
6051 (*rvalp)++; /* Add one for sockmod */
6052 return (error);
6053 }
6054
6055 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6056 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6057 if (error != 0)
6058 return (error);
6059
6060 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6061 if (nmods <= 0)
6062 return (EINVAL);
6063 /*
6064 * Ceiling nmods at nstrpush to prevent someone from
6065 * maliciously consuming lots of kernel memory.
6066 */
6067 nmods = MIN(nmods, nstrpush);
6068
6069 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6070 kstrlist.sl_nmods = nmods;
6071 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6072
6073 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6074 cr, rvalp);
6075 if (error != 0)
6076 goto done;
6077
6078 /*
6079 * Considering the module list as a 0-based array of sl_nmods
6080 * modules, sockmod should conceptually exist at slot
6081 * sti_pushcnt. Insert sockmod at this location by sliding all
6082 * of the module names after sti_pushcnt over by one. We know
6083 * that there will be room to do this since we allocated
6084 * sl_modlist with an additional slot.
6085 */
6086 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6087 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6088
6089 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6090 kstrlist.sl_nmods++;
6091
6092 /*
6093 * Copy all of the entries out to ustrlist.
6094 */
6095 kmlistp = kstrlist.sl_modlist;
6096 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6097 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6098 error = so_copyout(kmlistp++, umlistp++,
6099 sizeof (struct str_mlist), mode & FKIOCTL);
6100 if (error != 0)
6101 goto done;
6102 }
6103
6104 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6105 mode & FKIOCTL);
6106 if (error == 0)
6107 *rvalp = 0;
6108 done:
6109 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6110 return (error);
6111 }
6112 case I_LOOK:
6113 if (sti->sti_pushcnt == 0) {
6114 return (so_copyout(sockmod_name, (void *)arg,
6115 sizeof (sockmod_name), mode & FKIOCTL));
6116 }
6117 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6118
6119 case I_FIND:
6120 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6121 if (error && error != EINVAL)
6122 return (error);
6123
6124 /* if not found and string was sockmod return 1 */
6125 if (*rvalp == 0 || error == EINVAL) {
6126 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6127 (void *)arg, mname, sizeof (mname), NULL);
6128 if (error == ENAMETOOLONG)
6129 error = EINVAL;
6130
6131 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6132 *rvalp = 1;
6133 }
6134 return (error);
6135
6136 default:
6137 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6138 break;
6139 }
6140
6141 return (0);
6142 }
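
/*
 * Illustrative user-level sketch (an assumption): the plumbing handling
 * above makes a socket look like a stream with an imaginary "sockmod"
 * module on top.  A STREAMS-aware application therefore sees:
 *
 *	#include <stropts.h>
 *
 *	static int
 *	show_top_module(int fd)
 *	{
 *		char name[32];		// >= FMNAMESZ + 1 bytes
 *
 *		// Reports "sockmod" when no module has been pushed.
 *		if (ioctl(fd, I_LOOK, name) == -1)
 *			return (-1);
 *		// Popping "sockmod" converts the socket to SOV_STREAM;
 *		// pushing it back restores socket semantics.
 *		return (0);
 *	}
 */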
6143
6144 /*
6145 * Wrapper around the streams poll routine that implements socket poll
6146 * semantics.
6147 * Sockfs never calls pollwakeup itself - the stream head takes care
6148 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6149 * stream head there can never be a deadlock due to holding so_lock across
6150 * pollwakeup and acquiring so_lock in this routine.
6151 *
6152 * However, since the performance of VOP_POLL is critical we avoid
6153 * acquiring so_lock here. This is based on two assumptions:
6154 * - The poll implementation holds locks to serialize the VOP_POLL call
6155 * and a pollwakeup for the same pollhead. This ensures that should
6156 * e.g. so_state change during a socktpi_poll call the pollwakeup
6157 * (which strsock_* and strrput conspire to issue) is issued after
6158 * the state change. Thus the pollwakeup will block until VOP_POLL has
6159 * returned and then wake up poll and have it call VOP_POLL again.
6160 * - The reading of so_state without holding so_lock does not result in
6161 * stale data that is older than the latest state change that has dropped
6162 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6163 * memory barrier to force the data into the coherency domain.
6164 */
6165 static int
6166 sotpi_poll(
6167 struct sonode *so,
6168 short events,
6169 int anyyet,
6170 short *reventsp,
6171 struct pollhead **phpp)
6172 {
6173 short origevents = events;
6174 struct vnode *vp = SOTOV(so);
6175 int error;
6176 int so_state = so->so_state; /* snapshot */
6177 sotpi_info_t *sti = SOTOTPI(so);
6178
6179 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6180 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6181
6182 ASSERT(vp->v_type == VSOCK);
6183 ASSERT(vp->v_stream != NULL);
6184
6185 if (so->so_version == SOV_STREAM) {
6186 /* The imaginary "sockmod" has been popped - act as a stream */
6187 return (strpoll(vp->v_stream, events, anyyet,
6188 reventsp, phpp));
6189 }
6190
6191 if (!(so_state & SS_ISCONNECTED) &&
6192 (so->so_mode & SM_CONNREQUIRED)) {
6193 /* Not connected yet - turn off write side events */
6194 events &= ~(POLLOUT|POLLWRBAND);
6195 }
6196 /*
6197 * Check for errors without calling strpoll if the caller wants them.
6198 * In sockets the errors are represented as input/output events
6199 * and there is no need to ask the stream head for this information.
6200 */
6201 if (so->so_error != 0 &&
6202 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6203 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6204 return (0);
6205 }
6206 /*
6207 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6208 * These messages with only an M_PROTO/M_PCPROTO part and no M_DATA
6209 * will not trigger a POLLIN event with POLLRDDATA set.
6210 * The handling of urgent data (causing POLLRDBAND) is done by
6211 * inspecting SS_OOBPEND below.
6212 */
6213 events |= POLLRDDATA;
6214
6215 /*
6216 * After shutdown(output) a stream head write error is set.
6217 * However, we should not return output events.
6218 */
6219 events |= POLLNOERR;
6220 error = strpoll(vp->v_stream, events, anyyet,
6221 reventsp, phpp);
6222 if (error)
6223 return (error);
6224
6225 ASSERT(!(*reventsp & POLLERR));
6226
6227 /*
6228 * Notes on T_CONN_IND handling for sockets.
6229 *
6230 * If strpoll() returned without events, SR_POLLIN is guaranteed
6231 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6232 *
6233 * Since the so_lock is not held, soqueueconnind() may have run
6234 * and a T_CONN_IND may be waiting. We now check for any queued
6235 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6236 * to ensure poll returns.
6237 *
6238 * However:
6239 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6240 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6241 * the following actions will occur; taken together they ensure the
6242 * syscall will return.
6243 *
6244 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6245 * the accept() was run on a non-blocking socket, sowaitconnind()
6246 * may have already returned EWOULDBLOCK and so may not be waiting to
6247 * process the message. Additionally socktpi_poll() has probably
6248 * proceeded past the sti_conn_ind_head check below.
6249 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6250 * this thread, however that could occur before poll_common()
6251 * has entered cv_wait.
6252 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6253 *
6254 * Before proceeding to cv_wait() in poll_common() for an event,
6255 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6256 * and if set, re-calls strpoll() to ensure the late arriving
6257 * T_CONN_IND is recognized, and pollsys() returns.
6258 */
6259
6260 if (sti->sti_conn_ind_head != NULL)
6261 *reventsp |= (POLLIN|POLLRDNORM) & events;
6262
6263 if (so->so_state & SS_OOBPEND)
6264 *reventsp |= POLLRDBAND & events;
6265
6266 if (sti->sti_nl7c_rcv_mp != NULL) {
6267 *reventsp |= (POLLIN|POLLRDNORM) & events;
6268 }
6269 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6270 ((POLLIN|POLLRDNORM) & *reventsp)) {
6271 sti->sti_nl7c_flags |= NL7C_POLLIN;
6272 }
6273
6274 return (0);
6275 }
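
/*
 * Illustrative user-level sketch (an assumption): the T_CONN_IND
 * handling described above is what lets poll() on a listening socket
 * report POLLIN when a connection request has been queued.
 *
 *	#include <poll.h>
 *	#include <sys/socket.h>
 *
 *	static int
 *	wait_and_accept(int lfd)
 *	{
 *		struct pollfd pfd;
 *
 *		pfd.fd = lfd;
 *		pfd.events = POLLIN;
 *		pfd.revents = 0;
 *		if (poll(&pfd, 1, -1) != 1 || !(pfd.revents & POLLIN))
 *			return (-1);
 *		return (accept(lfd, NULL, NULL));
 *	}
 */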
6276
6277 /*ARGSUSED*/
6278 static int
6279 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6280 {
6281 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6282 int error = 0;
6283
6284 error = sonode_constructor(buf, cdrarg, kmflags);
6285 if (error != 0)
6286 return (error);
6287
6288 error = i_sotpi_info_constructor(&st->st_info);
6289 if (error != 0)
6290 sonode_destructor(buf, cdrarg);
6291
6292 st->st_sonode.so_priv = &st->st_info;
6293
6294 return (error);
6295 }
6296
6297 /*ARGSUSED1*/
6298 static void
6299 socktpi_destructor(void *buf, void *cdrarg)
6300 {
6301 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6302
6303 ASSERT(st->st_sonode.so_priv == &st->st_info);
6304 st->st_sonode.so_priv = NULL;
6305
6306 i_sotpi_info_destructor(&st->st_info);
6307 sonode_destructor(buf, cdrarg);
6308 }
6309
6310 static int
6311 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6312 {
6313 int retval;
6314
6315 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6316 struct sonode *so = (struct sonode *)buf;
6317 sotpi_info_t *sti = SOTOTPI(so);
6318
6319 mutex_enter(&socklist.sl_lock);
6320
6321 sti->sti_next_so = socklist.sl_list;
6322 sti->sti_prev_so = NULL;
6323 if (sti->sti_next_so != NULL)
6324 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6325 socklist.sl_list = so;
6326
6327 mutex_exit(&socklist.sl_lock);
6328
6329 }
6330 return (retval);
6331 }
6332
6333 static void
6334 socktpi_unix_destructor(void *buf, void *cdrarg)
6335 {
6336 struct sonode *so = (struct sonode *)buf;
6337 sotpi_info_t *sti = SOTOTPI(so);
6338
6339 mutex_enter(&socklist.sl_lock);
6340
6341 if (sti->sti_next_so != NULL)
6342 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6343 if (sti->sti_prev_so != NULL)
6344 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6345 else
6346 socklist.sl_list = sti->sti_next_so;
6347
6348 mutex_exit(&socklist.sl_lock);
6349
6350 socktpi_destructor(buf, cdrarg);
6351 }
6352
6353 int
6354 socktpi_init(void)
6355 {
6356 /*
6357 * Create sonode caches. We create a special one for AF_UNIX so
6358 * that we can track them for netstat(1m).
6359 */
6360 socktpi_cache = kmem_cache_create("socktpi_cache",
6361 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6362 socktpi_destructor, NULL, NULL, NULL, 0);
6363
6364 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6365 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6366 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6367
6368 return (0);
6369 }
6370
6371 /*
6372 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6373 *
6374 * Caller must still update state and mode using sotpi_update_state().
6375 */
6376 int
6377 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6378 boolean_t *direct, queue_t **qp, struct cred *cr)
6379 {
6380 sotpi_info_t *sti;
6381 struct sockparams *origsp = so->so_sockparams;
6382 sock_lower_handle_t handle = so->so_proto_handle;
6383 struct stdata *stp;
6384 struct vnode *vp;
6385 queue_t *q;
6386 int error = 0;
6387
6388 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6389 SS_FALLBACK_PENDING);
6390 ASSERT(SOCK_IS_NONSTR(so));
6391
6392 *qp = NULL;
6393 *direct = B_FALSE;
6394 so->so_sockparams = newsp;
6395 /*
6396 * Allocate and initialize fields required by TPI.
6397 */
6398 (void) sotpi_info_create(so, KM_SLEEP);
6399 sotpi_info_init(so);
6400
6401 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6402 sotpi_info_fini(so);
6403 sotpi_info_destroy(so);
6404 return (error);
6405 }
6406 ASSERT(handle == so->so_proto_handle);
6407 sti = SOTOTPI(so);
6408 if (sti->sti_direct != 0)
6409 *direct = B_TRUE;
6410
6411 /*
6412 * Keep the original sp around so we can properly dispose of the
6413 * sonode when the socket is being closed.
6414 */
6415 sti->sti_orig_sp = origsp;
6416
6417 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6418 so_alloc_addr(so, so->so_max_addr_len);
6419
6420 /*
6421 * If the application has done a SIOCSPGRP, make sure the
6422 * STREAM head is aware. This needs to take place before
6423 * the protocol starts sending up messages. Otherwise we
6424 * might fail to generate SIGPOLL.
6425 *
6426 * It is possible that the application will receive duplicate
6427 * signals if some were already generated for either data or
6428 * connection indications.
6429 */
6430 if (so->so_pgrp != 0) {
6431 if (so_set_events(so, so->so_vnode, cr) != 0)
6432 so->so_pgrp = 0;
6433 }
6434
6435 /*
6436 * Determine which queue to use.
6437 */
6438 vp = SOTOV(so);
6439 stp = vp->v_stream;
6440 ASSERT(stp != NULL);
6441 q = stp->sd_wrq->q_next;
6442
6443 /*
6444 * Skip any modules that may have been auto pushed when the device
6445 * was opened
6446 */
6447 while (q->q_next != NULL)
6448 q = q->q_next;
6449 *qp = _RD(q);
6450
6451 /* This is now a STREAMS socket */
6452 so->so_not_str = B_FALSE;
6453
6454 return (error);
6455 }
6456
6457 /*
6458 * Revert a TPI sonode. It is only allowed to revert the sonode during
6459 * the fallback process.
6460 */
6461 void
6462 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6463 {
6464 vnode_t *vp = SOTOV(so);
6465
6466 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6467 SS_FALLBACK_PENDING);
6468 ASSERT(!SOCK_IS_NONSTR(so));
6469 ASSERT(vp->v_stream != NULL);
6470
6471 strclean(vp);
6472 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6473
6474 /*
6475 * Restore the original sockparams. The caller is responsible for
6476 * dropping the ref to the new sp.
6477 */
6478 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6479
6480 sotpi_info_fini(so);
6481 sotpi_info_destroy(so);
6482
6483 /* This is no longer a STREAMS socket */
6484 so->so_not_str = B_TRUE;
6485 }
6486
6487 void
6488 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6489 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6490 socklen_t faddrlen, short opts)
6491 {
6492 sotpi_info_t *sti = SOTOTPI(so);
6493
6494 so_proc_tcapability_ack(so, tcap);
6495
6496 so->so_options |= opts;
6497
6498 /*
6499 * Determine whether the foreign and local address are valid
6500 */
6501 if (laddrlen != 0) {
6502 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6503 sti->sti_laddr_len = laddrlen;
6504 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6505 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6506 }
6507
6508 if (faddrlen != 0) {
6509 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6510 sti->sti_faddr_len = faddrlen;
6511 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6512 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6513 }
6514
6515 }
6516
6517 /*
6518 * Allocate enough space to cache the local and foreign addresses.
6519 */
6520 void
6521 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6522 {
6523 sotpi_info_t *sti = SOTOTPI(so);
6524
6525 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6526 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6527 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6528 P2ROUNDUP(maxlen, KMEM_ALIGN);
6529 so->so_max_addr_len = sti->sti_laddr_maxlen;
6530 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6531 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6532 + sti->sti_laddr_maxlen);
6533
6534 if (so->so_family == AF_UNIX) {
6535 /*
6536 * Initialize AF_UNIX related fields.
6537 */
6538 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6539 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6540 }
6541 }
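
/*
 * Layout sketch (illustration only): so_alloc_addr() makes a single
 * allocation of 2 * P2ROUNDUP(maxlen, KMEM_ALIGN) bytes and carves it
 * into the two cached addresses:
 *
 *	sti_laddr_sa -> +----------------------------------+
 *	                | local address (sti_laddr_maxlen) |
 *	sti_faddr_sa -> +----------------------------------+
 *	                | foreign addr (sti_faddr_maxlen)  |
 *	                +----------------------------------+
 *
 * sotpi_info_fini() relies on this: it asserts that sti_faddr_sa sits
 * exactly sti_laddr_maxlen bytes past sti_laddr_sa and frees both halves
 * with one kmem_free() of twice the rounded length.
 */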
6542
6543
6544 sotpi_info_t *
6545 sotpi_sototpi(struct sonode *so)
6546 {
6547 sotpi_info_t *sti;
6548
6549 ASSERT(so != NULL);
6550
6551 sti = (sotpi_info_t *)so->so_priv;
6552
6553 ASSERT(sti != NULL);
6554 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6555
6556 return (sti);
6557 }
6558
6559 static int
6560 i_sotpi_info_constructor(sotpi_info_t *sti)
6561 {
6562 sti->sti_magic = SOTPI_INFO_MAGIC;
6563 sti->sti_ack_mp = NULL;
6564 sti->sti_discon_ind_mp = NULL;
6565 sti->sti_ux_bound_vp = NULL;
6566 sti->sti_unbind_mp = NULL;
6567
6568 sti->sti_conn_ind_head = NULL;
6569 sti->sti_conn_ind_tail = NULL;
6570
6571 sti->sti_laddr_sa = NULL;
6572 sti->sti_faddr_sa = NULL;
6573
6574 sti->sti_nl7c_flags = 0;
6575 sti->sti_nl7c_uri = NULL;
6576 sti->sti_nl7c_rcv_mp = NULL;
6577
6578 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6579 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6580
6581 return (0);
6582 }
6583
6584 static void
6585 i_sotpi_info_destructor(sotpi_info_t *sti)
6586 {
6587 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6588 ASSERT(sti->sti_ack_mp == NULL);
6589 ASSERT(sti->sti_discon_ind_mp == NULL);
6590 ASSERT(sti->sti_ux_bound_vp == NULL);
6591 ASSERT(sti->sti_unbind_mp == NULL);
6592
6593 ASSERT(sti->sti_conn_ind_head == NULL);
6594 ASSERT(sti->sti_conn_ind_tail == NULL);
6595
6596 ASSERT(sti->sti_laddr_sa == NULL);
6597 ASSERT(sti->sti_faddr_sa == NULL);
6598
6599 ASSERT(sti->sti_nl7c_flags == 0);
6600 ASSERT(sti->sti_nl7c_uri == NULL);
6601 ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6602
6603 mutex_destroy(&sti->sti_plumb_lock);
6604 cv_destroy(&sti->sti_ack_cv);
6605 }
6606
6607 /*
6608 * Creates and attaches TPI information to the given sonode
6609 */
6610 static boolean_t
6611 sotpi_info_create(struct sonode *so, int kmflags)
6612 {
6613 sotpi_info_t *sti;
6614
6615 ASSERT(so->so_priv == NULL);
6616
6617 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6618 return (B_FALSE);
6619
6620 if (i_sotpi_info_constructor(sti) != 0) {
6621 kmem_free(sti, sizeof (*sti));
6622 return (B_FALSE);
6623 }
6624
6625 so->so_priv = (void *)sti;
6626 return (B_TRUE);
6627 }
6628
6629 /*
6630 * Initializes the TPI information.
6631 */
6632 static void
6633 sotpi_info_init(struct sonode *so)
6634 {
6635 struct vnode *vp = SOTOV(so);
6636 sotpi_info_t *sti = SOTOTPI(so);
6637 time_t now;
6638
6639 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6640 vp->v_rdev = sti->sti_dev;
6641
6642 sti->sti_orig_sp = NULL;
6643
6644 sti->sti_pushcnt = 0;
6645
6646 now = gethrestime_sec();
6647 sti->sti_atime = now;
6648 sti->sti_mtime = now;
6649 sti->sti_ctime = now;
6650
6651 sti->sti_eaddr_mp = NULL;
6652 sti->sti_delayed_error = 0;
6653
6654 sti->sti_provinfo = NULL;
6655
6656 sti->sti_oobcnt = 0;
6657 sti->sti_oobsigcnt = 0;
6658
6659 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6660
6661 sti->sti_laddr_sa = 0;
6662 sti->sti_faddr_sa = 0;
6663 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6664 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6665
6666 sti->sti_laddr_valid = 0;
6667 sti->sti_faddr_valid = 0;
6668 sti->sti_faddr_noxlate = 0;
6669
6670 sti->sti_direct = 0;
6671
6672 ASSERT(sti->sti_ack_mp == NULL);
6673 ASSERT(sti->sti_ux_bound_vp == NULL);
6674 ASSERT(sti->sti_unbind_mp == NULL);
6675
6676 ASSERT(sti->sti_conn_ind_head == NULL);
6677 ASSERT(sti->sti_conn_ind_tail == NULL);
6678 }
6679
6680 /*
6681 * Given a sonode, grab the TPI info and free any data.
6682 */
6683 static void
6684 sotpi_info_fini(struct sonode *so)
6685 {
6686 sotpi_info_t *sti = SOTOTPI(so);
6687 mblk_t *mp;
6688
6689 ASSERT(sti->sti_discon_ind_mp == NULL);
6690
6691 if ((mp = sti->sti_conn_ind_head) != NULL) {
6692 mblk_t *mp1;
6693
6694 while (mp) {
6695 mp1 = mp->b_next;
6696 mp->b_next = NULL;
6697 freemsg(mp);
6698 mp = mp1;
6699 }
6700 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6701 }
6702
6703 /*
6704 * Protect sti_[lf]addr_sa so that sockfs_snapshot() can safely
6705 * dereference them. It also uses so_count as a validity test.
6706 */
6707 mutex_enter(&so->so_lock);
6708
6709 if (sti->sti_laddr_sa) {
6710 ASSERT((caddr_t)sti->sti_faddr_sa ==
6711 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6712 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6713 sti->sti_laddr_valid = 0;
6714 sti->sti_faddr_valid = 0;
6715 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6716 sti->sti_laddr_sa = NULL;
6717 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6718 sti->sti_faddr_sa = NULL;
6719 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6720 }
6721
6722 mutex_exit(&so->so_lock);
6723
6724 if ((mp = sti->sti_eaddr_mp) != NULL) {
6725 freemsg(mp);
6726 sti->sti_eaddr_mp = NULL;
6727 sti->sti_delayed_error = 0;
6728 }
6729
6730 if ((mp = sti->sti_ack_mp) != NULL) {
6731 freemsg(mp);
6732 sti->sti_ack_mp = NULL;
6733 }
6734
6735 if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6736 sti->sti_nl7c_rcv_mp = NULL;
6737 freemsg(mp);
6738 }
6739 sti->sti_nl7c_rcv_rval = 0;
6740 if (sti->sti_nl7c_uri != NULL) {
6741 nl7c_urifree(so);
6742 /* urifree() cleared nl7c_uri */
6743 }
6744 if (sti->sti_nl7c_flags) {
6745 sti->sti_nl7c_flags = 0;
6746 }
6747
6748 ASSERT(sti->sti_ux_bound_vp == NULL);
6749 if ((mp = sti->sti_unbind_mp) != NULL) {
6750 freemsg(mp);
6751 sti->sti_unbind_mp = NULL;
6752 }
6753 }
6754
6755 /*
6756 * Destroys the TPI information attached to a sonode.
6757 */
6758 static void
6759 sotpi_info_destroy(struct sonode *so)
6760 {
6761 sotpi_info_t *sti = SOTOTPI(so);
6762
6763 i_sotpi_info_destructor(sti);
6764 kmem_free(sti, sizeof (*sti));
6765
6766 so->so_priv = NULL;
6767 }
6768
6769 /*
6770 * Create the global sotpi socket module entry. It will never be freed.
6771 */
6772 smod_info_t *
6773 sotpi_smod_create(void)
6774 {
6775 smod_info_t *smodp;
6776
6777 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6778 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6779 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6780 /*
6781 * Initialize the smod_refcnt to 1 so it will never be freed.
6782 */
6783 smodp->smod_refcnt = 1;
6784 smodp->smod_uc_version = SOCK_UC_VERSION;
6785 smodp->smod_dc_version = SOCK_DC_VERSION;
6786 smodp->smod_sock_create_func = &sotpi_create;
6787 smodp->smod_sock_destroy_func = &sotpi_destroy;
6788 return (smodp);
6789 }
6790