/*	$NetBSD: uipc_usrreq.c,v 1.114 2008/04/28 20:24:05 martin Exp $	*/

/*-
 * Copyright (c) 1998, 2000, 2004, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.9 (Berkeley) 5/14/95
 */

/*
 * Copyright (c) 1997 Christopher G. Demetriou.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.9 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.114 2008/04/28 20:24:05 martin Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/atomic.h>

/*
 * Unix communications domain.
 *
 * TODO:
 *	SEQPACKET, RDM
 *	rethink name space problems
 *	need a proper out-of-band
 *
 * Notes on locking:
 *
 * The generic rules noted in uipc_socket2.c apply.  In addition:
 *
 * o We have a global lock, uipc_lock.
 *
 * o All datagram sockets are locked by uipc_lock.
 *
 * o For stream socketpairs, the two endpoints are created sharing the same
 *   independent lock.  Sockets presented to PRU_CONNECT2 must already have
 *   matching locks.
 *
 * o Stream sockets created via socket() start life with their own
 *   independent lock.
 *
 * o Stream connections to a named endpoint are slightly more complicated.
 *   Sockets that have called listen() have their lock pointer mutated to
 *   the global uipc_lock.  When establishing a connection, the connecting
 *   socket also has its lock mutated to uipc_lock, which matches the head
 *   (listening socket).  We create a new socket for accept() to return, and
 *   that also shares the head's lock.  Until the connection is completely
 *   done on both ends, all three sockets are locked by uipc_lock.  Once the
 *   connection is complete, the association with the head's lock is broken.
 *   The connecting socket and the socket returned from accept() have their
 *   lock pointers mutated away from uipc_lock, and back to the connecting
 *   socket's original, independent lock.  The head continues to be locked
 *   by uipc_lock.
 *
 * o If uipc_lock is determined to be a significant source of contention,
 *   it could easily be hashed out.  It is difficult to simply make it an
 *   independent lock because of visibility / garbage collection issues:
 *   if a socket has been associated with a lock at any point, that lock
 *   must remain valid until the socket is no longer visible in the system.
 *   The lock must not be freed or otherwise destroyed until any sockets
 *   that had referenced it have also been destroyed.
 */
const struct sockaddr_un sun_noname = {
	.sun_len = sizeof(sun_noname),
	.sun_family = AF_LOCAL,
};
ino_t	unp_ino;			/* prototype for fake inode numbers */

struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
static kmutex_t *uipc_lock;

/*
 * Initialize Unix protocols.
 */
void
uipc_init(void)
{

	uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}

/*
 * A connection succeeded: disassociate both endpoints from the head's
 * lock, and make them share their own lock.  There is a race here: for
 * a very brief time one endpoint will be locked by a different lock
 * than the other end.  However, since the current thread holds the old
 * lock (the listening socket's lock, the head) access can still only be
 * made to one side of the connection.
 */
static void
unp_setpeerlocks(struct socket *so, struct socket *so2)
{
	struct unpcb *unp;
	kmutex_t *lock;

	KASSERT(solocked2(so, so2));

	/*
	 * Bail out if either end of the socket is not yet fully
	 * connected or accepted.  We only break the lock association
	 * with the head when the pair of sockets stand completely
	 * on their own.
	 */
	if (so->so_head != NULL || so2->so_head != NULL)
		return;

	/*
	 * Drop references to old lock.  A third reference (from the
	 * queue head) must be held as we still hold its lock.  Bonus:
	 * we don't need to worry about garbage collecting the lock.
	 */
	lock = so->so_lock;
	KASSERT(lock == uipc_lock);
	mutex_obj_free(lock);
	mutex_obj_free(lock);

	/*
	 * Grab stream lock from the initiator and share between the two
	 * endpoints.  Issue memory barrier to ensure all modifications
	 * become globally visible before the lock change.  so2 is
	 * assumed not to have a stream lock, because it was created
	 * purely for the server side to accept this connection and
	 * started out life using the domain-wide lock.
	 */
	unp = sotounpcb(so);
	KASSERT(unp->unp_streamlock != NULL);
	KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
	lock = unp->unp_streamlock;
	unp->unp_streamlock = NULL;
	mutex_obj_hold(lock);
	membar_exit();
	so->so_lock = lock;
	so2->so_lock = lock;
}

/*
 * Reset a socket's lock back to the domain-wide lock.
 */
static void
unp_resetlock(struct socket *so)
{
	kmutex_t *olock, *nlock;
	struct unpcb *unp;

	KASSERT(solocked(so));

	olock = so->so_lock;
	nlock = uipc_lock;
	if (olock == nlock)
		return;
	unp = sotounpcb(so);
	KASSERT(unp->unp_streamlock == NULL);
	unp->unp_streamlock = olock;
	mutex_obj_hold(nlock);
	mutex_enter(nlock);
	so->so_lock = nlock;
	mutex_exit(olock);
}

static void
unp_free(struct unpcb *unp)
{

	if (unp->unp_addr)
		free(unp->unp_addr, M_SONAME);
	if (unp->unp_streamlock != NULL)
		mutex_obj_free(unp->unp_streamlock);
	free(unp, M_PCB);
}

int
unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
	struct lwp *l)
{
	struct socket *so2;
	const struct sockaddr_un *sun;

	so2 = unp->unp_conn->unp_socket;

	KASSERT(solocked(so2));

	if (unp->unp_addr)
		sun = unp->unp_addr;
	else
		sun = &sun_noname;
	if (unp->unp_conn->unp_flags & UNP_WANTCRED)
		control = unp_addsockcred(l, control);
	if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
	    control) == 0) {
		so2->so_rcv.sb_overflowed++;
		sounlock(so2);
		unp_dispose(control);
		m_freem(control);
		m_freem(m);
		solock(so2);
		return (ENOBUFS);
	} else {
		sorwakeup(so2);
		return (0);
	}
}

void
unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr)
{
	const struct sockaddr_un *sun;
	struct unpcb *unp;
	bool ext;

	unp = sotounpcb(so);
	ext = false;

	for (;;) {
		sun = NULL;
		if (peeraddr) {
			if (unp->unp_conn && unp->unp_conn->unp_addr)
				sun = unp->unp_conn->unp_addr;
		} else {
			if (unp->unp_addr)
				sun = unp->unp_addr;
		}
		if (sun == NULL)
			sun = &sun_noname;
		nam->m_len = sun->sun_len;
		if (nam->m_len > MLEN && !ext) {
			sounlock(so);
			MEXTMALLOC(nam, MAXPATHLEN * 2, M_WAITOK);
			solock(so);
			ext = true;
		} else {
			KASSERT(nam->m_len <= MAXPATHLEN * 2);
			memcpy(mtod(nam, void *), sun, (size_t)nam->m_len);
			break;
		}
	}
}

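/*
 * unp_setaddr() above services both PRU_SOCKADDR and PRU_PEERADDR.
 * A minimal userland sketch of how those requests are reached
 * (illustrative only, not part of this kernel source); an unbound
 * or unconnected endpoint is reported as sun_noname, i.e. with an
 * empty path:
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_names(int s)
 *	{
 *		struct sockaddr_un sun;
 *		socklen_t len;
 *
 *		len = sizeof(sun);
 *		if (getsockname(s, (struct sockaddr *)&sun, &len) == 0)
 *			printf("local:  %s\n", sun.sun_path);
 *		len = sizeof(sun);
 *		if (getpeername(s, (struct sockaddr *)&sun, &len) == 0)
 *			printf("remote: %s\n", sun.sun_path);
 *	}
 */
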
/*ARGSUSED*/
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
	struct mbuf *control, struct lwp *l)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	struct proc *p;
	u_int newhiwat;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);

#ifdef DIAGNOSTIC
	if (req != PRU_SEND && req != PRU_SENDOOB && control)
		panic("uipc_usrreq: unexpected control mbuf");
#endif
	p = l ? l->l_proc : NULL;
	if (req != PRU_ATTACH) {
		if (unp == 0) {
			error = EINVAL;
			goto release;
		}
		KASSERT(solocked(so));
	}

	switch (req) {

	case PRU_ATTACH:
		if (unp != 0) {
			error = EISCONN;
			break;
		}
		error = unp_attach(so);
		break;

	case PRU_DETACH:
		unp_detach(unp);
		break;

	case PRU_BIND:
		KASSERT(l != NULL);
		error = unp_bind(so, nam, l);
		break;

	case PRU_LISTEN:
		/*
		 * If the socket can accept a connection, it must be
		 * locked by uipc_lock.
		 */
		unp_resetlock(so);
		if (unp->unp_vnode == 0)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		KASSERT(l != NULL);
		error = unp_connect(so, nam, l);
		break;

	case PRU_CONNECT2:
		error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		KASSERT(so->so_lock == uipc_lock);
		/*
		 * Mark the initiating STREAM socket as connected *ONLY*
		 * after it's been accepted.  This prevents a client from
		 * overrunning a server and receiving ECONNREFUSED.
		 */
		if (unp->unp_conn == NULL)
			break;
		so2 = unp->unp_conn->unp_socket;
		if (so2->so_state & SS_ISCONNECTING) {
			KASSERT(solocked2(so, so->so_head));
			KASSERT(solocked2(so2, so->so_head));
			soisconnected(so2);
		}
		/*
		 * If the connection is fully established, break the
		 * association with uipc_lock and give the connected
		 * pair a separate lock to share.
		 */
		unp_setpeerlocks(so2, so);
		/*
		 * Only now return peer's address, as we may need to
		 * block in order to allocate memory.
		 *
		 * XXX Minor race: connection can be broken while
		 * lock is dropped in unp_setaddr().  We will return
		 * error == 0 and sun_noname as the peer address.
		 */
		unp_setaddr(so, nam, true);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
#define	rcv (&so->so_rcv)
#define	snd (&so2->so_snd)
			if (unp->unp_conn == 0)
				break;
			so2 = unp->unp_conn->unp_socket;
			KASSERT(solocked2(so, so2));
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
			unp->unp_mbcnt = rcv->sb_mbcnt;
			newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
			(void)chgsbsize(so2->so_uidinfo,
			    &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
			unp->unp_cc = rcv->sb_cc;
			sowwakeup(so2);
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/*
		 * Note: unp_internalize() rejects any control message
		 * other than SCM_RIGHTS, and only allows one.  This
		 * has the side-effect of preventing a caller from
		 * forging SCM_CREDS.
		 */
		if (control) {
			sounlock(so);
			error = unp_internalize(&control);
			solock(so);
			if (error != 0) {
				m_freem(control);
				m_freem(m);
				break;
			}
		}
		switch (so->so_type) {

		case SOCK_DGRAM: {
			KASSERT(so->so_lock == uipc_lock);
			if (nam) {
				if ((so->so_state & SS_ISCONNECTED) != 0)
					error = EISCONN;
				else {
					/*
					 * Note: once connected, the
					 * socket's lock must not be
					 * dropped until we have sent
					 * the message and disconnected.
					 * This is necessary to prevent
					 * intervening control ops, like
					 * another connection.
					 */
					error = unp_connect(so, nam, l);
				}
			} else {
				if ((so->so_state & SS_ISCONNECTED) == 0)
					error = ENOTCONN;
			}
			if (error) {
				sounlock(so);
				unp_dispose(control);
				m_freem(control);
				m_freem(m);
				solock(so);
				break;
			}
			KASSERT(p != NULL);
			error = unp_output(m, control, unp, l);
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
#define	rcv (&so2->so_rcv)
#define	snd (&so->so_snd)
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			KASSERT(solocked2(so, so2));
			if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
				/*
				 * Credentials are passed only once on
				 * SOCK_STREAM.
				 */
				unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
				control = unp_addsockcred(l, control);
			}
			/*
			 * Send to paired receive port, and then reduce
			 * send buffer hiwater marks to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(rcv, m, control) != 0)
					control = NULL;
			} else
				sbappend(rcv, m);
			snd->sb_mbmax -=
			    rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
			unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
			newhiwat = snd->sb_hiwat -
			    (rcv->sb_cc - unp->unp_conn->unp_cc);
			(void)chgsbsize(so->so_uidinfo,
			    &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
			unp->unp_conn->unp_cc = rcv->sb_cc;
			sorwakeup(so2);
#undef snd
#undef rcv
			if (control != NULL) {
				sounlock(so);
				unp_dispose(control);
				m_freem(control);
				solock(so);
			}
			break;

		default:
			panic("uipc 4");
		}
		break;

	case PRU_ABORT:
		(void)unp_drop(unp, ECONNABORTED);

		KASSERT(so->so_head == NULL);
#ifdef DIAGNOSTIC
		if (so->so_pcb == 0)
			panic("uipc 5: drop killed pcb");
#endif
		unp_detach(unp);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
			so2 = unp->unp_conn->unp_socket;
			KASSERT(solocked2(so, so2));
			((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
		}
		((struct stat *) m)->st_dev = NODEV;
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		((struct stat *) m)->st_atimespec =
		    ((struct stat *) m)->st_mtimespec =
		    ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
		((struct stat *) m)->st_ino = unp->unp_ino;
		return (0);

	case PRU_RCVOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SENDOOB:
		m_freem(control);
		m_freem(m);
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		unp_setaddr(so, nam, false);
		break;

	case PRU_PEERADDR:
		unp_setaddr(so, nam, true);
		break;

	default:
		panic("uipc_usrreq");
	}

release:
	return (error);
}

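/*
 * The PRU_SEND/SOCK_DGRAM case above implements sendto() on an
 * unconnected datagram socket as a transient connect, send and
 * disconnect, holding the socket lock across all three steps.  A
 * userland sketch of that case (illustrative only; the path name
 * is supplied by the caller):
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int
 *	dgram_send_once(const char *path, const void *buf, size_t len)
 *	{
 *		struct sockaddr_un sun;
 *		ssize_t n;
 *		int s;
 *
 *		if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) == -1)
 *			return -1;
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_LOCAL;
 *		strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
 *		sun.sun_len = SUN_LEN(&sun);
 *		n = sendto(s, buf, len, 0, (struct sockaddr *)&sun,
 *		    SUN_LEN(&sun));
 *		close(s);
 *		return n == -1 ? -1 : 0;
 *	}
 */
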
/*
 * Unix domain socket option processing.
 */
int
uipc_ctloutput(int op, struct socket *so, int level, int optname,
	struct mbuf **mp)
{
	struct unpcb *unp = sotounpcb(so);
	struct mbuf *m = *mp;
	int optval = 0, error = 0;

	KASSERT(solocked(so));

	if (level != 0) {
		error = ENOPROTOOPT;
		if (op == PRCO_SETOPT && m)
			(void) m_free(m);
	} else switch (op) {

	case PRCO_SETOPT:
		switch (optname) {
		case LOCAL_CREDS:
		case LOCAL_CONNWAIT:
			if (m == NULL || m->m_len != sizeof(int))
				error = EINVAL;
			else {
				optval = *mtod(m, int *);
				switch (optname) {
#define	OPTSET(bit) \
	if (optval) \
		unp->unp_flags |= (bit); \
	else \
		unp->unp_flags &= ~(bit);

				case LOCAL_CREDS:
					OPTSET(UNP_WANTCRED);
					break;
				case LOCAL_CONNWAIT:
					OPTSET(UNP_CONNWAIT);
					break;
				}
			}
			break;
#undef OPTSET

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (m)
			(void) m_free(m);
		break;

	case PRCO_GETOPT:
		sounlock(so);
		switch (optname) {
		case LOCAL_PEEREID:
			if (unp->unp_flags & UNP_EIDSVALID) {
				*mp = m = m_get(M_WAIT, MT_SOOPTS);
				m->m_len = sizeof(struct unpcbid);
				*mtod(m, struct unpcbid *) = unp->unp_connid;
			} else {
				error = EINVAL;
			}
			break;
		case LOCAL_CREDS:
			*mp = m = m_get(M_WAIT, MT_SOOPTS);
			m->m_len = sizeof(int);

#define	OPTBIT(bit)	(unp->unp_flags & (bit) ? 1 : 0)

			optval = OPTBIT(UNP_WANTCRED);
			*mtod(m, int *) = optval;
			break;
#undef OPTBIT

		default:
			error = ENOPROTOOPT;
			break;
		}
		solock(so);
		break;
	}
	return (error);
}

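/*
 * A userland sketch of the options handled above (illustrative
 * only): enable credential passing with LOCAL_CREDS, then fetch the
 * peer's cached identity with LOCAL_PEEREID, which returns the
 * struct unpcbid recorded at bind/connect time.  Both options live
 * at level 0, as checked at the top of uipc_ctloutput().
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int
 *	peer_ids(int s, pid_t *pid, uid_t *euid, gid_t *egid)
 *	{
 *		struct unpcbid id;
 *		socklen_t len = sizeof(id);
 *		int on = 1;
 *
 *		if (setsockopt(s, 0, LOCAL_CREDS, &on, sizeof(on)) == -1)
 *			return -1;
 *		if (getsockopt(s, 0, LOCAL_PEEREID, &id, &len) == -1)
 *			return -1;
 *		*pid = id.unp_pid;
 *		*euid = id.unp_euid;
 *		*egid = id.unp_egid;
 *		return 0;
 *	}
 */
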
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	4096
u_long	unpst_sendspace = PIPSIZ;
u_long	unpst_recvspace = PIPSIZ;
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;

u_int	unp_rights;			/* file descriptors in flight */

int
unp_attach(struct socket *so)
{
	struct unpcb *unp;
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
		if (so->so_lock == NULL) {
			/*
			 * XXX Assuming that no socket locks are held,
			 * as this call may sleep.
			 */
			so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
			solock(so);
		}
		if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			if (error != 0)
				return (error);
		}
		break;

	case SOCK_DGRAM:
		if (so->so_lock == NULL) {
			mutex_obj_hold(uipc_lock);
			so->so_lock = uipc_lock;
			solock(so);
		}
		if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
			if (error != 0)
				return (error);
		}
		break;

	default:
		panic("unp_attach");
	}
	KASSERT(solocked(so));
	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
	if (unp == NULL)
		return (ENOBUFS);
	memset((void *)unp, 0, sizeof(*unp));
	unp->unp_socket = so;
	so->so_pcb = unp;
	nanotime(&unp->unp_ctime);
	return (0);
}

void
unp_detach(struct unpcb *unp)
{
	struct socket *so;
	vnode_t *vp;

	so = unp->unp_socket;

 retry:
	if ((vp = unp->unp_vnode) != NULL) {
		sounlock(so);
		/* Acquire v_interlock to protect against unp_connect(). */
		/* XXXAD racy */
		mutex_enter(&vp->v_interlock);
		vp->v_socket = NULL;
		vrelel(vp, 0);
		solock(so);
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	while (unp->unp_refs) {
		KASSERT(solocked2(so, unp->unp_refs->unp_socket));
		if (unp_drop(unp->unp_refs, ECONNRESET)) {
			solock(so);
			goto retry;
		}
	}
	soisdisconnected(so);
	so->so_pcb = NULL;
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(so);
		unp_free(unp);
		sounlock(so);
		unp_gc();
		solock(so);
	} else
		unp_free(unp);
}

int
unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	struct sockaddr_un *sun;
	struct unpcb *unp;
	vnode_t *vp;
	struct vattr vattr;
	size_t addrlen;
	int error;
	struct nameidata nd;
	proc_t *p;

	unp = sotounpcb(so);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((unp->unp_flags & UNP_BUSY) != 0) {
		/*
		 * EALREADY may not be strictly accurate, but since this
		 * is a major application error it's hardly a big deal.
		 */
		return (EALREADY);
	}
	unp->unp_flags |= UNP_BUSY;
	sounlock(so);

	/*
	 * Allocate the new sockaddr.  We have to allocate one
	 * extra byte so that we can ensure that the pathname
	 * is nul-terminated.
	 */
	p = l->l_proc;
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (void *)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, UIO_SYSSPACE,
	    sun->sun_path);

	/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0)
		goto bad;
	vp = nd.ni_vp;
	if (vp != NULL) {
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		error = EADDRINUSE;
		goto bad;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	if (error)
		goto bad;
	vp = nd.ni_vp;
	solock(so);
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addrlen = addrlen;
	unp->unp_addr = sun;
	unp->unp_connid.unp_pid = p->p_pid;
	unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
	unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
	unp->unp_flags |= UNP_EIDSBIND;
	VOP_UNLOCK(vp, 0);
	unp->unp_flags &= ~UNP_BUSY;
	return (0);

 bad:
	free(sun, M_SONAME);
	solock(so);
	unp->unp_flags &= ~UNP_BUSY;
	return (error);
}

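/*
 * Since unp_bind() above creates the socket file with VOP_CREATE()
 * and refuses an existing path with EADDRINUSE, userland servers
 * conventionally unlink any stale socket file before binding.  A
 * sketch (illustrative only; the caller supplies the path):
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int
 *	bind_local(int s, const char *path)
 *	{
 *		struct sockaddr_un sun;
 *
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_LOCAL;
 *		strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
 *		sun.sun_len = SUN_LEN(&sun);
 *		(void)unlink(path);
 *		return bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun));
 *	}
 */
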
int
unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	struct sockaddr_un *sun;
	vnode_t *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	size_t addrlen;
	int error;
	struct nameidata nd;

	unp = sotounpcb(so);
	if ((unp->unp_flags & UNP_BUSY) != 0) {
		/*
		 * EALREADY may not be strictly accurate, but since this
		 * is a major application error it's hardly a big deal.
		 */
		return (EALREADY);
	}
	unp->unp_flags |= UNP_BUSY;
	sounlock(so);

	/*
	 * Allocate a temporary sockaddr.  We have to allocate one extra
	 * byte so that we can ensure that the pathname is nul-terminated.
	 * When we establish the connection, we copy the other PCB's
	 * sockaddr to our own.
	 */
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (void *)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_SYSSPACE,
	    sun->sun_path);

	if ((error = namei(&nd)) != 0)
		goto bad2;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
		goto bad;
	/* Acquire v_interlock to protect against unp_detach(). */
	mutex_enter(&vp->v_interlock);
	so2 = vp->v_socket;
	if (so2 == NULL) {
		mutex_exit(&vp->v_interlock);
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		mutex_exit(&vp->v_interlock);
		error = EPROTOTYPE;
		goto bad;
	}
	solock(so);
	unp_resetlock(so);
	mutex_exit(&vp->v_interlock);
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
		/*
		 * This may seem somewhat fragile but is OK: if we can
		 * see SO_ACCEPTCONN set on the endpoint, then it must
		 * be locked by the domain-wide uipc_lock.
		 */
		KASSERT((so->so_options & SO_ACCEPTCONN) == 0 ||
		    so2->so_lock == uipc_lock);
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			sounlock(so);
			goto bad;
		}
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr) {
			unp3->unp_addr = malloc(unp2->unp_addrlen,
			    M_SONAME, M_WAITOK);
			memcpy(unp3->unp_addr, unp2->unp_addr,
			    unp2->unp_addrlen);
			unp3->unp_addrlen = unp2->unp_addrlen;
		}
		unp3->unp_flags = unp2->unp_flags;
		unp3->unp_connid.unp_pid = l->l_proc->p_pid;
		unp3->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
		unp3->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
		unp3->unp_flags |= UNP_EIDSVALID;
		if (unp2->unp_flags & UNP_EIDSBIND) {
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_EIDSVALID;
		}
		so2 = so3;
	}
	error = unp_connect2(so, so2, PRU_CONNECT);
	sounlock(so);
 bad:
	vput(vp);
 bad2:
	free(sun, M_SONAME);
	solock(so);
	unp->unp_flags &= ~UNP_BUSY;
	return (error);
}

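/*
 * The client-side view of unp_connect() (illustrative only): the
 * path given to connect() is resolved through namei() above, the
 * caller needs write access to the socket file, and for streams a
 * fresh server-side socket is cloned from the listener with
 * sonewconn() before unp_connect2() links the pair:
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <string.h>
 *
 *	static int
 *	connect_local(int s, const char *path)
 *	{
 *		struct sockaddr_un sun;
 *
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_LOCAL;
 *		strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
 *		sun.sun_len = SUN_LEN(&sun);
 *		return connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun));
 *	}
 */
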
int
unp_connect2(struct socket *so, struct socket *so2, int req)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);

	/*
	 * All three sockets involved must be locked by same lock:
	 *
	 * local endpoint (so)
	 * remote endpoint (so2)
	 * queue head (so->so_head, only if PR_CONNREQUIRED)
	 */
	KASSERT(solocked2(so, so2));
	if (so->so_head != NULL) {
		KASSERT(so->so_lock == uipc_lock);
		KASSERT(solocked2(so, so->so_head));
	}

	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {

	case SOCK_DGRAM:
		unp->unp_nextref = unp2->unp_refs;
		unp2->unp_refs = unp;
		soisconnected(so);
		break;

	case SOCK_STREAM:
		unp2->unp_conn = unp;
		if (req == PRU_CONNECT &&
		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
			soisconnecting(so);
		else
			soisconnected(so);
		soisconnected(so2);
		/*
		 * If the connection is fully established, break the
		 * association with uipc_lock and give the connected
		 * pair a separate lock to share.  For CONNECT2, we
		 * require that the locks already match (the sockets
		 * are created that way).
		 */
		if (req == PRU_CONNECT)
			unp_setpeerlocks(so, so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}

void
unp_disconnect(struct unpcb *unp)
{
	struct unpcb *unp2 = unp->unp_conn;
	struct socket *so;

	if (unp2 == 0)
		return;
	unp->unp_conn = 0;
	so = unp->unp_socket;
	switch (so->so_type) {
	case SOCK_DGRAM:
		if (unp2->unp_refs == unp)
			unp2->unp_refs = unp->unp_nextref;
		else {
			unp2 = unp2->unp_refs;
			for (;;) {
				KASSERT(solocked2(so, unp2->unp_socket));
				if (unp2 == 0)
					panic("unp_disconnect");
				if (unp2->unp_nextref == unp)
					break;
				unp2 = unp2->unp_nextref;
			}
			unp2->unp_nextref = unp->unp_nextref;
		}
		unp->unp_nextref = 0;
		so->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
		KASSERT(solocked2(so, unp2->unp_socket));
		soisdisconnected(so);
		unp2->unp_conn = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}

#ifdef notdef
unp_abort(struct unpcb *unp)
{
	unp_detach(unp);
}
#endif

void
unp_shutdown(struct unpcb *unp)
{
	struct socket *so;

	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
	    (so = unp->unp_conn->unp_socket))
		socantrcvmore(so);
}

bool
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	KASSERT(solocked(so));

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/* sofree() drops the socket lock */
		sofree(so);
		unp_free(unp);
		return true;
	}
	return false;
}

#ifdef notdef
unp_drain(void)
{

}
#endif

int
unp_externalize(struct mbuf *rights, struct lwp *l)
{
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct proc *p = l->l_proc;
	int i, *fdp;
	file_t **rp;
	file_t *fp;
	int nfds, error = 0;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(file_t *);
	rp = (file_t **)CMSG_DATA(cm);

	fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
	rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);

	/* Make sure the recipient is allowed to see the descriptors. */
	if (p->p_cwdi->cwdi_rdir != NULL) {
		rp = (file_t **)CMSG_DATA(cm);
		for (i = 0; i < nfds; i++) {
			fp = *rp++;
			/*
			 * If we are in a chroot'ed directory, and
			 * someone wants to pass us a directory, make
			 * sure it's inside the subtree we're allowed
			 * to access.
			 */
			if (fp->f_type == DTYPE_VNODE) {
				vnode_t *vp = (vnode_t *)fp->f_data;
				if ((vp->v_type == VDIR) &&
				    !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
					error = EPERM;
					break;
				}
			}
		}
	}

 restart:
	rp = (file_t **)CMSG_DATA(cm);
	if (error != 0) {
		for (i = 0; i < nfds; i++) {
			fp = *rp;
			/*
			 * Zero the pointer before calling unp_discard,
			 * since it may end up in unp_gc().
			 */
			*rp++ = 0;
			unp_discard(fp);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	for (i = 0; i < nfds; i++) {
		fp = *rp++;
		if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--) {
				fd_abort(p, NULL, fdp[i]);
			}
			if (error == ENOSPC) {
				fd_tryexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			goto restart;
		}
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (file_t **)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fp = *rp++;
		atomic_dec_uint(&unp_rights);
		fd_affix(p, fp, fdp[i]);
		mutex_enter(&fp->f_lock);
		fp->f_msgcount--;
		mutex_exit(&fp->f_lock);
		/*
		 * Note that fd_affix() adds a reference to the file.
		 * The file may already have been closed by another
		 * LWP in the process, so we must drop the reference
		 * added by unp_internalize() with closef().
		 */
		closef(fp);
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large file_t pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_SPACE(nfds * sizeof(int));
 out:
	rw_exit(&p->p_cwdi->cwdi_lock);
	free(fdp, M_TEMP);
	return (error);
}

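/*
 * unp_internalize() below converts descriptor numbers in an
 * SCM_RIGHTS message to file_t pointers on send; unp_externalize()
 * above converts them back to descriptor numbers in the receiving
 * process.  The userland side that drives this (illustrative only;
 * standard CMSG usage, one descriptor per message):
 *
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <string.h>
 *
 *	static int
 *	send_fd(int s, int fd)
 *	{
 *		union {
 *			struct cmsghdr hdr;
 *			char buf[CMSG_SPACE(sizeof(int))];
 *		} cmsgbuf;
 *		struct msghdr msg;
 *		struct cmsghdr *cm;
 *		struct iovec iov;
 *		char c = 0;
 *
 *		memset(&msg, 0, sizeof(msg));
 *		iov.iov_base = &c;
 *		iov.iov_len = sizeof(c);
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *		msg.msg_control = cmsgbuf.buf;
 *		msg.msg_controllen = sizeof(cmsgbuf.buf);
 *		cm = CMSG_FIRSTHDR(&msg);
 *		cm->cmsg_len = CMSG_LEN(sizeof(int));
 *		cm->cmsg_level = SOL_SOCKET;
 *		cm->cmsg_type = SCM_RIGHTS;
 *		memcpy(CMSG_DATA(cm), &fd, sizeof(int));
 *		return sendmsg(s, &msg, 0) == -1 ? -1 : 0;
 *	}
 */
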
int
unp_internalize(struct mbuf **controlp)
{
	struct filedesc *fdescp = curlwp->l_fd;
	struct mbuf *control = *controlp;
	struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
	file_t **rp, **files;
	file_t *fp;
	int i, fd, *fdp;
	int nfds, error;

	error = 0;
	newcm = NULL;

	/* Sanity check the control message header. */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len != control->m_len)
		return (EINVAL);

	/*
	 * Verify that the file descriptors are valid, and acquire
	 * a reference to each.
	 */
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
	fdp = (int *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fd = *fdp++;
		if ((fp = fd_getfile(fd)) == NULL) {
			/*
			 * Release only the descriptors we successfully
			 * referenced; fd_getfile() took no reference
			 * for this one.
			 */
			nfds = i;
			error = EBADF;
			goto out;
		}
	}

	/* Allocate new space and copy header into it. */
	newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
	if (newcm == NULL) {
		error = E2BIG;
		goto out;
	}
	memcpy(newcm, cm, sizeof(struct cmsghdr));
	files = (file_t **)CMSG_DATA(newcm);

	/*
	 * Transform the file descriptors into file_t pointers, in
	 * reverse order so that if pointers are bigger than ints, the
	 * int isn't overwritten before we read it.  No need to lock,
	 * as we have already validated the descriptors with
	 * fd_getfile().
	 */
	fdp = (int *)CMSG_DATA(cm) + nfds;
	rp = files + nfds;
	for (i = 0; i < nfds; i++) {
		fp = fdescp->fd_ofiles[*--fdp]->ff_file;
		KASSERT(fp != NULL);
		mutex_enter(&fp->f_lock);
		*--rp = fp;
		fp->f_count++;
		fp->f_msgcount++;
		mutex_exit(&fp->f_lock);
		atomic_inc_uint(&unp_rights);
	}

 out:
	/* Release descriptor references. */
	fdp = (int *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fd_putfile(*fdp++);
	}

	if (error == 0) {
		if (control->m_flags & M_EXT) {
			m_freem(control);
			*controlp = control = m_get(M_WAIT, MT_CONTROL);
		}
		MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
		    M_MBUF, NULL, NULL);
		cm = newcm;
		/*
		 * Adjust message & mbuf to note amount of space
		 * actually used.
		 */
		cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
		control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
	}

	return error;
}

struct mbuf *
unp_addsockcred(struct lwp *l, struct mbuf *control)
{
	struct cmsghdr *cmp;
	struct sockcred *sc;
	struct mbuf *m, *n;
	int len, space, i;

	len = CMSG_LEN(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
	space = CMSG_SPACE(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));

	m = m_get(M_WAIT, MT_CONTROL);
	if (space > MLEN) {
		if (space > MCLBYTES)
			MEXTMALLOC(m, space, M_WAITOK);
		else
			m_clget(m, M_WAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (control);
		}
	}

	m->m_len = space;
	m->m_next = NULL;
	cmp = mtod(m, struct cmsghdr *);
	sc = (struct sockcred *)CMSG_DATA(cmp);
	cmp->cmsg_len = len;
	cmp->cmsg_level = SOL_SOCKET;
	cmp->cmsg_type = SCM_CREDS;
	sc->sc_uid = kauth_cred_getuid(l->l_cred);
	sc->sc_euid = kauth_cred_geteuid(l->l_cred);
	sc->sc_gid = kauth_cred_getgid(l->l_cred);
	sc->sc_egid = kauth_cred_getegid(l->l_cred);
	sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
	for (i = 0; i < sc->sc_ngroups; i++)
		sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);

	/*
	 * If a control message already exists, append us to the end.
	 */
	if (control != NULL) {
		for (n = control; n->m_next != NULL; n = n->m_next)
			;
		n->m_next = m;
	} else
		control = m;

	return (control);
}

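/*
 * The SCM_CREDS message built by unp_addsockcred() above is read on
 * the receiving side with the usual CMSG traversal.  A sketch
 * (illustrative only), assuming msg was filled in by recvmsg() on a
 * socket with LOCAL_CREDS enabled:
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_creds(struct msghdr *msg)
 *	{
 *		struct cmsghdr *cm;
 *		struct sockcred *sc;
 *
 *		for (cm = CMSG_FIRSTHDR(msg); cm != NULL;
 *		    cm = CMSG_NXTHDR(msg, cm)) {
 *			if (cm->cmsg_level != SOL_SOCKET ||
 *			    cm->cmsg_type != SCM_CREDS)
 *				continue;
 *			sc = (struct sockcred *)CMSG_DATA(cm);
 *			printf("euid %lu egid %lu\n",
 *			    (unsigned long)sc->sc_euid,
 *			    (unsigned long)sc->sc_egid);
 *		}
 *	}
 */
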
int	unp_defer, unp_gcing;
extern	struct domain unixdomain;

/*
 * Comment added long after the fact explaining what's going on here.
 * Do a mark-sweep GC of file descriptors on the system, to free up
 * any which are caught in flight to an about-to-be-closed socket.
 *
 * Traditional mark-sweep gc's start at the "root", and mark
 * everything reachable from the root (which, in our case would be the
 * process table).  The mark bits are cleared during the sweep.
 *
 * XXX For some inexplicable reason (perhaps because the file
 * descriptor tables used to live in the u area which could be swapped
 * out and thus hard to reach), we do multiple scans over the set of
 * descriptors, using *two* mark bits per object (DEFER and MARK).
 * Whenever we find a descriptor which references other descriptors,
 * the ones it references are marked with both bits, and we iterate
 * over the whole file table until there are no more DEFER bits set.
 * We also make an extra pass *before* the GC to clear the mark bits,
 * which could have been cleared at almost no cost during the previous
 * sweep.
 */
void
unp_gc(void)
{
	file_t *fp, *nextfp;
	struct socket *so, *so1;
	file_t **extra_ref, **fpp;
	int nunref, nslots, i;

	if (atomic_swap_uint(&unp_gcing, 1) == 1)
		return;

 restart:
	nslots = nfiles * 2;
	extra_ref = kmem_alloc(nslots * sizeof(file_t *), KM_SLEEP);

	mutex_enter(&filelist_lock);
	unp_defer = 0;

	/* Clear mark bits */
	LIST_FOREACH(fp, &filehead, f_list) {
		atomic_and_uint(&fp->f_flag, ~(FMARK|FDEFER));
	}

	/*
	 * Iterate over the set of descriptors, marking ones believed
	 * (based on refcount) to be referenced from a process, and
	 * marking for rescan descriptors which are queued on a socket.
	 */
	do {
		LIST_FOREACH(fp, &filehead, f_list) {
			mutex_enter(&fp->f_lock);
			if (fp->f_flag & FDEFER) {
				atomic_and_uint(&fp->f_flag, ~FDEFER);
				unp_defer--;
				KASSERT(fp->f_count != 0);
			} else {
				if (fp->f_count == 0 ||
				    (fp->f_flag & FMARK) ||
				    fp->f_count == fp->f_msgcount) {
					mutex_exit(&fp->f_lock);
					continue;
				}
			}
			atomic_or_uint(&fp->f_flag, FMARK);

			if (fp->f_type != DTYPE_SOCKET ||
			    (so = fp->f_data) == NULL ||
			    so->so_proto->pr_domain != &unixdomain ||
			    (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
				mutex_exit(&fp->f_lock);
				continue;
			}
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				mutex_exit(&fp->f_lock);
				mutex_exit(&filelist_lock);
				kmem_free(extra_ref,
				    nslots * sizeof(file_t *));
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			mutex_exit(&fp->f_lock);

			/*
			 * XXX Locking a socket with filelist_lock held
			 * is ugly.  filelist_lock can be taken by the
			 * pagedaemon when reclaiming items from file_cache.
			 * Socket activity could delay the pagedaemon.
			 */
			solock(so);
			unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
			/*
			 * Mark descriptors referenced from sockets queued
			 * on the accept queue as well.
			 */
			if (so->so_options & SO_ACCEPTCONN) {
				TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
				}
				TAILQ_FOREACH(so1, &so->so_q, so_qe) {
					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
				}
			}
			sounlock(so);
		}
	} while (unp_defer);

	/*
	 * Sweep pass.  Find unmarked descriptors, and free them.
	 *
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code unp_discards,
	 * we end up doing a (full) closef on the descriptor.
	 * A closef on A results in the following chain.  Closef calls
	 * soo_close, which calls soclose.  Soclose calls first (through
	 * the switch uipc_usrreq) unp_detach, which re-invokes unp_gc.
	 * Unp_gc simply returns because the previous instance had set
	 * unp_gcing, and we return all the way back to soclose, which
	 * marks the socket with SS_NOFDREF, and then calls sofree.
	 * Sofree calls sorflush to free up the rights that are queued in
	 * messages on the socket A, i.e., the reference on B.  The
	 * sorflush calls via the dom_dispose switch unp_dispose, which
	 * unp_scans with unp_discard.  This second instance of unp_discard
	 * just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, if the inaccessible descriptor is a
	 * socket, we call sorflush in case it is a Unix domain
	 * socket.  After we destroy all the rights carried in
	 * messages, we do a last closef to get rid of our extra
	 * reference.  This is the last close, and the unp_detach etc
	 * will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
	if (nslots < nfiles) {
		mutex_exit(&filelist_lock);
		kmem_free(extra_ref, nslots * sizeof(file_t *));
		goto restart;
	}
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
	    fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		mutex_enter(&fp->f_lock);
		if (fp->f_count != 0 &&
		    fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			fp->f_count++;
		}
		mutex_exit(&fp->f_lock);
	}
	mutex_exit(&filelist_lock);

	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		fp = *fpp;
		if (fp->f_type == DTYPE_SOCKET) {
			so = fp->f_data;
			solock(so);
			sorflush(fp->f_data);
			sounlock(so);
		}
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		closef(*fpp);
	}
	kmem_free(extra_ref, nslots * sizeof(file_t *));
	atomic_swap_uint(&unp_gcing, 0);
}

void
unp_dispose(struct mbuf *m)
{

	if (m)
		unp_scan(m, unp_discard, 1);
}

void
unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
{
	struct mbuf *m;
	file_t **rp;
	struct cmsghdr *cm;
	int i;
	int qfds;

	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type == MT_CONTROL &&
			    m->m_len >= sizeof(*cm)) {
				cm = mtod(m, struct cmsghdr *);
				if (cm->cmsg_level != SOL_SOCKET ||
				    cm->cmsg_type != SCM_RIGHTS)
					continue;
				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
				    / sizeof(file_t *);
				rp = (file_t **)CMSG_DATA(cm);
				for (i = 0; i < qfds; i++) {
					file_t *fp = *rp;
					if (discard)
						*rp = 0;
					(*op)(fp);
					rp++;
				}
				break;		/* XXX, but saves time */
			}
		}
		m0 = m0->m_nextpkt;
	}
}

void
unp_mark(file_t *fp)
{

	if (fp == NULL)
		return;

	/* If we're already deferred, don't screw up the defer count */
	mutex_enter(&fp->f_lock);
	if (fp->f_flag & (FMARK | FDEFER)) {
		mutex_exit(&fp->f_lock);
		return;
	}

	/*
	 * Minimize the number of deferrals...  Sockets are the only
	 * type of descriptor which can hold references to another
	 * descriptor, so just mark other descriptors, and defer
	 * unmarked sockets for the next pass.
	 */
	if (fp->f_type == DTYPE_SOCKET) {
		unp_defer++;
		KASSERT(fp->f_count != 0);
		atomic_or_uint(&fp->f_flag, FDEFER);
	} else {
		atomic_or_uint(&fp->f_flag, FMARK);
	}
	mutex_exit(&fp->f_lock);
	return;
}

void
unp_discard(file_t *fp)
{

	if (fp == NULL)
		return;

	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	fp->f_msgcount--;
	mutex_exit(&fp->f_lock);
	atomic_dec_uint(&unp_rights);
	(void)closef(fp);
}