/*	$NetBSD: uipc_usrreq.c,v 1.45 1999/06/17 23:17:45 thorpej Exp $	*/

/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1997 Christopher G. Demetriou.  All rights reserved.
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.9 (Berkeley) 5/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>

/*
 * Unix communications domain.
 *
 * TODO:
 *	SEQPACKET, RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
struct sockaddr_un sun_noname = { sizeof(sun_noname), AF_LOCAL };
ino_t	unp_ino;			/* prototype for fake inode numbers */

struct mbuf *unp_addsockcred __P((struct proc *, struct mbuf *));

int
unp_output(m, control, unp, p)
	struct mbuf *m, *control;
	struct unpcb *unp;
	struct proc *p;
{
	struct socket *so2;
	struct sockaddr_un *sun;

	so2 = unp->unp_conn->unp_socket;
	if (unp->unp_addr)
		sun = unp->unp_addr;
	else
		sun = &sun_noname;
	if (unp->unp_conn->unp_flags & UNP_WANTCRED)
		control = unp_addsockcred(p, control);
	if (sbappendaddr(&so2->so_rcv, (struct sockaddr *)sun, m,
	    control) == 0) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	} else {
		sorwakeup(so2);
		return (0);
	}
}

void
unp_setsockaddr(unp, nam)
	register struct unpcb *unp;
	struct mbuf *nam;
{
	struct sockaddr_un *sun;

	if (unp->unp_addr)
		sun = unp->unp_addr;
	else
		sun = &sun_noname;
	nam->m_len = sun->sun_len;
	if (nam->m_len > MLEN)
		MEXTMALLOC(nam, nam->m_len, M_WAITOK);
	memcpy(mtod(nam, caddr_t), sun, (size_t)nam->m_len);
}

void
unp_setpeeraddr(unp, nam)
	register struct unpcb *unp;
	struct mbuf *nam;
{
	struct sockaddr_un *sun;

	if (unp->unp_conn && unp->unp_conn->unp_addr)
		sun = unp->unp_conn->unp_addr;
	else
		sun = &sun_noname;
	nam->m_len = sun->sun_len;
	if (nam->m_len > MLEN)
		MEXTMALLOC(nam, nam->m_len, M_WAITOK);
	memcpy(mtod(nam, caddr_t), sun, (size_t)nam->m_len);
}

/*ARGSUSED*/
int
uipc_usrreq(so, req, m, nam, control, p)
	struct socket *so;
	int req;
	struct mbuf *m, *nam, *control;
	struct proc *p;
{
	struct unpcb *unp = sotounpcb(so);
	register struct socket *so2;
	register int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);

#ifdef DIAGNOSTIC
	if (req != PRU_SEND && req != PRU_SENDOOB && control)
		panic("uipc_usrreq: unexpected control mbuf");
#endif
	if (unp == 0 && req != PRU_ATTACH) {
		error = EINVAL;
		goto release;
	}

	switch (req) {

	case PRU_ATTACH:
		if (unp != 0) {
			error = EISCONN;
			break;
		}
		error = unp_attach(so);
		break;

	case PRU_DETACH:
		unp_detach(unp);
		break;

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		if (unp->unp_vnode == 0)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		error = unp_connect2(so, (struct socket *)nam);
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		unp_setpeeraddr(unp, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
#define	rcv (&so->so_rcv)
#define	snd (&so2->so_snd)
			if (unp->unp_conn == 0)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
			unp->unp_mbcnt = rcv->sb_mbcnt;
			snd->sb_hiwat += unp->unp_cc - rcv->sb_cc;
			unp->unp_cc = rcv->sb_cc;
			sowwakeup(so2);
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/*
		 * Note: unp_internalize() rejects any control message
		 * other than SCM_RIGHTS, and only allows one.  This
		 * has the side-effect of preventing a caller from
		 * forging SCM_CREDS.
		 */
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			if (nam) {
				if ((so->so_state & SS_ISCONNECTED) != 0) {
					error = EISCONN;
					goto die;
				}
				error = unp_connect(so, nam, p);
				if (error) {
				die:
					m_freem(control);
					m_freem(m);
					break;
				}
			} else {
				if ((so->so_state & SS_ISCONNECTED) == 0) {
					error = ENOTCONN;
					goto die;
				}
			}
			error = unp_output(m, control, unp, p);
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
#define	rcv (&so2->so_rcv)
#define	snd (&so->so_snd)
			if (unp->unp_conn == 0)
				panic("uipc 3");
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
				/*
				 * Credentials are passed only once on
				 * SOCK_STREAM.
				 */
				unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
				control = unp_addsockcred(p, control);
			}
			/*
			 * Send to paired receive port, and then reduce
			 * send buffer hiwater marks to maintain backpressure.
			 * Wake up readers.
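			 *
			 * The accounting below charges the data newly queued
			 * on the peer's receive buffer against our own send
			 * buffer: sb_hiwat and sb_mbmax shrink by however much
			 * rcv grew since the last send, and the cached
			 * unp_cc/unp_mbcnt values remember the peer's buffer
			 * state so that PRU_RCVD above can credit the space
			 * back once the reader drains it.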
			 */
			if (control) {
				if (sbappendcontrol(rcv, m, control) == 0)
					m_freem(control);
			} else
				sbappend(rcv, m);
			snd->sb_mbmax -=
			    rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
			unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
			snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc;
			unp->unp_conn->unp_cc = rcv->sb_cc;
			sorwakeup(so2);
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 4");
		}
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);

#ifdef DIAGNOSTIC
		if (so->so_pcb == 0)
			panic("uipc 5: drop killed pcb");
#endif
		unp_detach(unp);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
			so2 = unp->unp_conn->unp_socket;
			((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
		}
		((struct stat *) m)->st_dev = NODEV;
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		((struct stat *) m)->st_atimespec =
		    ((struct stat *) m)->st_mtimespec =
		    ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
		((struct stat *) m)->st_ino = unp->unp_ino;
		return (0);

	case PRU_RCVOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SENDOOB:
		m_freem(control);
		m_freem(m);
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		unp_setsockaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		unp_setpeeraddr(unp, nam);
		break;

	default:
		panic("piusrreq");
	}

release:
	return (error);
}

/*
 * Unix domain socket option processing.
 */
int
uipc_ctloutput(op, so, level, optname, mp)
	int op;
	struct socket *so;
	int level, optname;
	struct mbuf **mp;
{
	struct unpcb *unp = sotounpcb(so);
	struct mbuf *m = *mp;
	int optval = 0, error = 0;

	if (level != 0) {
		error = EINVAL;
		if (op == PRCO_SETOPT && m)
			(void) m_free(m);
	} else switch (op) {

	case PRCO_SETOPT:
		switch (optname) {
		case LOCAL_CREDS:
			if (m == NULL || m->m_len != sizeof(int))
				error = EINVAL;
			else {
				optval = *mtod(m, int *);
				switch (optname) {
#define	OPTSET(bit) \
	if (optval) \
		unp->unp_flags |= (bit); \
	else \
		unp->unp_flags &= ~(bit);

				case LOCAL_CREDS:
					OPTSET(UNP_WANTCRED);
					break;
				}
			}
			break;
#undef OPTSET

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (m)
			(void) m_free(m);
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case LOCAL_CREDS:
			*mp = m = m_get(M_WAIT, MT_SOOPTS);
			m->m_len = sizeof(int);
			switch (optname) {

#define	OPTBIT(bit)	(unp->unp_flags & (bit) ? 1 : 0)

			case LOCAL_CREDS:
				optval = OPTBIT(UNP_WANTCRED);
				break;
			}
			*mtod(m, int *) = optval;
			break;
#undef OPTBIT

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
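 * (With the defaults below, the 4KB of datagram receive space holds
 * roughly one full-sized 2KB datagram plus its sockaddr and any control
 * data at a time.)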
 */
#define	PIPSIZ	4096
u_long	unpst_sendspace = PIPSIZ;
u_long	unpst_recvspace = PIPSIZ;
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;

int	unp_rights;			/* file descriptors in flight */

int
unp_attach(so)
	struct socket *so;
{
	register struct unpcb *unp;
	struct timeval tv;
	int error;

	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {

		case SOCK_STREAM:
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			break;

		case SOCK_DGRAM:
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
			break;

		default:
			panic("unp_attach");
		}
		if (error)
			return (error);
	}
	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
	if (unp == NULL)
		return (ENOBUFS);
	memset((caddr_t)unp, 0, sizeof(*unp));
	unp->unp_socket = so;
	so->so_pcb = unp;
	microtime(&tv);
	TIMEVAL_TO_TIMESPEC(&tv, &unp->unp_ctime);
	return (0);
}

void
unp_detach(unp)
	register struct unpcb *unp;
{

	if (unp->unp_vnode) {
		unp->unp_vnode->v_socket = 0;
		vrele(unp->unp_vnode);
		unp->unp_vnode = 0;
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	while (unp->unp_refs)
		unp_drop(unp->unp_refs, ECONNRESET);
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = 0;
	if (unp->unp_addr)
		free(unp->unp_addr, M_SONAME);
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(unp->unp_socket);
		free(unp, M_PCB);
		unp_gc();
	} else
		free(unp, M_PCB);
}

int
unp_bind(unp, nam, p)
	struct unpcb *unp;
	struct mbuf *nam;
	struct proc *p;
{
	struct sockaddr_un *sun;
	register struct vnode *vp;
	struct vattr vattr;
	size_t addrlen;
	int error;
	struct nameidata nd;

	if (unp->unp_vnode != 0)
		return (EINVAL);

	/*
	 * Allocate the new sockaddr.  We have to allocate one
	 * extra byte so that we can ensure that the pathname
	 * is nul-terminated.
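	 * (Nothing guarantees that the sun_path handed to us by the
	 * caller is nul-terminated, and namei() below needs a proper
	 * C string.)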
	 */
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (caddr_t)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    sun->sun_path, p);

/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0)
		goto bad;
	vp = nd.ni_vp;
	if (vp != NULL) {
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		error = EADDRINUSE;
		goto bad;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS;
	VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	if (error)
		goto bad;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addrlen = addrlen;
	unp->unp_addr = sun;
	VOP_UNLOCK(vp, 0);
	return (0);

 bad:
	free(sun, M_SONAME);
	return (error);
}

int
unp_connect(so, nam, p)
	struct socket *so;
	struct mbuf *nam;
	struct proc *p;
{
	register struct sockaddr_un *sun;
	register struct vnode *vp;
	register struct socket *so2, *so3;
	struct unpcb *unp2, *unp3;
	size_t addrlen;
	int error;
	struct nameidata nd;

	/*
	 * Allocate a temporary sockaddr.  We have to allocate one extra
	 * byte so that we can ensure that the pathname is nul-terminated.
	 * When we establish the connection, we copy the other PCB's
	 * sockaddr to our own.
	 */
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (caddr_t)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, sun->sun_path, p);

	if ((error = namei(&nd)) != 0)
		goto bad2;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto bad;
	so2 = vp->v_socket;
	if (so2 == 0) {
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto bad;
		}
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr) {
			unp3->unp_addr = malloc(unp2->unp_addrlen,
			    M_SONAME, M_WAITOK);
			memcpy(unp3->unp_addr, unp2->unp_addr,
			    unp2->unp_addrlen);
			unp3->unp_addrlen = unp2->unp_addrlen;
		}
		unp3->unp_flags = unp2->unp_flags;
		so2 = so3;
	}
	error = unp_connect2(so, so2);
 bad:
	vput(vp);
 bad2:
	free(sun, M_SONAME);
	return (error);
}

int
unp_connect2(so, so2)
	register struct socket *so;
	register struct socket *so2;
{
	register struct unpcb *unp = sotounpcb(so);
	register struct unpcb *unp2;

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {

	case SOCK_DGRAM:
		unp->unp_nextref = unp2->unp_refs;
		unp2->unp_refs = unp;
		soisconnected(so);
		break;

	case SOCK_STREAM:
		unp2->unp_conn = unp;
		soisconnected(so);
		soisconnected(so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}

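/*
 * Illustrative only, not compiled into the kernel: a minimal user-level
 * sketch of the path driven by unp_connect() and unp_connect2() above.
 * The pathname is made up for the example; the client side needs
 * <sys/socket.h>, <sys/un.h>, <string.h> and <err.h>.
 *
 *	struct sockaddr_un sun;
 *	int s;
 *
 *	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1)
 *		err(1, "socket");
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_LOCAL;
 *	strncpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path) - 1);
 *	sun.sun_len = SUN_LEN(&sun);
 *	if (connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1)
 *		err(1, "connect");
 *
 * The connect(2) arrives in uipc_usrreq() as PRU_CONNECT; unp_connect()
 * looks the path up with namei(), checks that the vnode is a VSOCK and
 * writable, spawns the server-side socket with sonewconn(), and
 * unp_connect2() links the two PCBs.  socketpair(2) reaches
 * unp_connect2() directly via PRU_CONNECT2.
 */
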
void
unp_disconnect(unp)
	struct unpcb *unp;
{
	register struct unpcb *unp2 = unp->unp_conn;

	if (unp2 == 0)
		return;
	unp->unp_conn = 0;
	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		if (unp2->unp_refs == unp)
			unp2->unp_refs = unp->unp_nextref;
		else {
			unp2 = unp2->unp_refs;
			for (;;) {
				if (unp2 == 0)
					panic("unp_disconnect");
				if (unp2->unp_nextref == unp)
					break;
				unp2 = unp2->unp_nextref;
			}
			unp2->unp_nextref = unp->unp_nextref;
		}
		unp->unp_nextref = 0;
		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}

#ifdef notdef
unp_abort(unp)
	struct unpcb *unp;
{

	unp_detach(unp);
}
#endif

void
unp_shutdown(unp)
	struct unpcb *unp;
{
	struct socket *so;

	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
	    (so = unp->unp_conn->unp_socket))
		socantrcvmore(so);
}

void
unp_drop(unp, errno)
	struct unpcb *unp;
	int errno;
{
	struct socket *so = unp->unp_socket;

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = 0;
		sofree(so);
		if (unp->unp_addr)
			free(unp->unp_addr, M_SONAME);
		free(unp, M_PCB);
	}
}

#ifdef notdef
unp_drain()
{

}
#endif

int
unp_externalize(rights)
	struct mbuf *rights;
{
	struct proc *p = curproc;		/* XXX */
	register struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	register int i, *fdp = (int *)(cm + 1);
	register struct file **rp;
	register struct file *fp;
	int nfds = (cm->cmsg_len - ALIGN(sizeof(*cm))) / sizeof(struct file *);
	int f, error = 0;

	/* Make sure the recipient should be able to see the descriptors.. */
	if (p->p_cwdi->cwdi_rdir != NULL) {
		rp = (struct file **)ALIGN(cm + 1);
		for (i = 0; i < nfds; i++) {
			fp = *rp++;
			/*
			 * If we are in a chroot'ed directory, and
			 * someone wants to pass us a directory, make
			 * sure it's inside the subtree we're allowed
			 * to access.
			 */
			if (fp->f_type == DTYPE_VNODE) {
				struct vnode *vp = (struct vnode *)fp->f_data;
				if ((vp->v_type == VDIR) &&
				    !vn_isunder(vp, p->p_cwdi->cwdi_rdir, p)) {
					error = EPERM;
					break;
				}
			}
		}
	}
	rp = (struct file **)ALIGN(cm + 1);

	/* Make sure that the recipient has space */
	if (error || (!fdavail(p, nfds))) {
		for (i = 0; i < nfds; i++) {
			fp = *rp;
			/*
			 * zero the pointer before calling unp_discard,
			 * since it may end up in unp_gc()..
			 */
			*rp++ = 0;
			unp_discard(fp);
		}
		return (error ? error : EMSGSIZE);
	}

	/*
	 * Add the files to the recipient's open file table, converting them
	 * to integer file descriptors as we go.  Done in forward order
	 * because an integer will always come in the same place or before
	 * its corresponding struct file pointer.
	 */
	for (i = 0; i < nfds; i++) {
		fp = *rp++;
		fp->f_msgcount--;
		unp_rights--;

		if (fdalloc(p, 0, &f))
			panic("unp_externalize");
		p->p_fd->fd_ofiles[f] = fp;
		*fdp++ = f;
	}

	/*
	 * Adjust length, in case of transition from large struct file
	 * pointers to ints.
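	 * Plain sizeof(*cm), rather than ALIGN(sizeof(*cm)), is the right
	 * size here because the ints are packed directly after the header;
	 * see the note on SCM_RIGHTS alignment in unp_internalize() below.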
	 */
	cm->cmsg_len = sizeof(*cm) + (nfds * sizeof(int));
	rights->m_len = cm->cmsg_len;
	return (0);
}

int
unp_internalize(control, p)
	struct mbuf *control;
	struct proc *p;
{
	struct filedesc *fdescp = p->p_fd;
	register struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	register struct file **rp;
	register struct file *fp;
	register int i, fd, *fdp;
	int nfds;
	u_int neededspace;

	/*
	 * A VERY IMPORTANT NOTE ON THE USE OF sizeof(*cm) AS IT RELATES
	 * TO SCM_RIGHTS MESSAGES!
	 *
	 * SCM_RIGHTS messages are an array of ints, which have 4-byte
	 * alignment.  A cmsghdr is a 12-byte long structure, so the
	 * ints can be packed directly after the cmsghdr.  When they
	 * are converted to file *s, however, we must ALIGN() the
	 * size of the cmsghdr, since pointers may be larger than ints,
	 * and thus have more strict alignment requirements.
	 */

	/* Sanity check the control message header */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len != control->m_len)
		return (EINVAL);

	/* Verify that the file descriptors are valid */
	nfds = (cm->cmsg_len - sizeof(*cm)) / sizeof(int);
	fdp = (int *)(cm + 1);
	for (i = 0; i < nfds; i++) {
		fd = *fdp++;
		if ((unsigned)fd >= fdescp->fd_nfiles ||
		    fdescp->fd_ofiles[fd] == NULL ||
		    (fdescp->fd_ofiles[fd]->f_iflags & FIF_WANTCLOSE) != 0)
			return (EBADF);
	}

	/* Make sure we have room for the struct file pointers */
 morespace:
	neededspace = (ALIGN(sizeof(*cm)) + nfds * sizeof(struct file *)) -
	    control->m_len;
	if (neededspace > M_TRAILINGSPACE(control)) {

		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT)
			return (E2BIG);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0)
			return (ENOBUFS);	/* allocation failed */

		/* copy the data to the cluster */
		memcpy(mtod(control, char *), cm, cm->cmsg_len);
		cm = mtod(control, struct cmsghdr *);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len += neededspace;
	control->m_len = cm->cmsg_len;

	/*
	 * Transform the file descriptors into struct file pointers, in
	 * reverse order so that if pointers are bigger than ints, the
	 * int won't get overwritten until we're done.
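	 *
	 * For example, with 4-byte ints, a 12-byte cmsghdr and 8-byte
	 * pointers, storing file *[k] can only overwrite fd slots at
	 * index k+1 and above, which have already been consumed by the
	 * time we get there.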
	 */
	fdp = ((int *)(cm + 1)) + nfds - 1;
	rp = ((struct file **)ALIGN(cm + 1)) + nfds - 1;
	for (i = 0; i < nfds; i++) {
		fp = fdescp->fd_ofiles[*fdp--];
		FILE_USE(fp);
		*rp-- = fp;
		fp->f_count++;
		fp->f_msgcount++;
		FILE_UNUSE(fp, NULL);
		unp_rights++;
	}
	return (0);
}

struct mbuf *
unp_addsockcred(p, control)
	struct proc *p;
	struct mbuf *control;
{
	struct cmsghdr *cmp;
	struct sockcred *sc;
	struct mbuf *m, *n;
	int len, i;

	len = sizeof(struct cmsghdr) + SOCKCREDSIZE(p->p_ucred->cr_ngroups);

	m = m_get(M_WAIT, MT_CONTROL);
	if (len > MLEN) {
		if (len > MCLBYTES)
			MEXTMALLOC(m, len, M_WAITOK);
		else
			MCLGET(m, M_WAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (control);
		}
	}

	m->m_len = len;
	m->m_next = NULL;
	cmp = mtod(m, struct cmsghdr *);
	sc = (struct sockcred *)CMSG_DATA(cmp);
	cmp->cmsg_len = len;
	cmp->cmsg_level = SOL_SOCKET;
	cmp->cmsg_type = SCM_CREDS;
	sc->sc_uid = p->p_cred->p_ruid;
	sc->sc_euid = p->p_ucred->cr_uid;
	sc->sc_gid = p->p_cred->p_rgid;
	sc->sc_egid = p->p_ucred->cr_gid;
	sc->sc_ngroups = p->p_ucred->cr_ngroups;
	for (i = 0; i < sc->sc_ngroups; i++)
		sc->sc_groups[i] = p->p_ucred->cr_groups[i];

	/*
	 * If a control message already exists, append us to the end.
	 */
	if (control != NULL) {
		for (n = control; n->m_next != NULL; n = n->m_next)
			;
		n->m_next = m;
	} else
		control = m;

	return (control);
}

int	unp_defer, unp_gcing;
extern	struct domain unixdomain;

/*
 * Comment added long after the fact explaining what's going on here.
 * Do a mark-sweep GC of file descriptors on the system, to free up
 * any which are caught in flight to an about-to-be-closed socket.
 *
 * Traditional mark-sweep gc's start at the "root", and mark
 * everything reachable from the root (which, in our case would be the
 * process table).  The mark bits are cleared during the sweep.
 *
 * XXX For some inexplicable reason (perhaps because the file
 * descriptor tables used to live in the u area which could be swapped
 * out and thus hard to reach), we do multiple scans over the set of
 * descriptors, using *two* mark bits per object (DEFER and MARK).
 * Whenever we find a descriptor which references other descriptors,
 * the ones it references are marked with both bits, and we iterate
 * over the whole file table until there are no more DEFER bits set.
 * We also make an extra pass *before* the GC to clear the mark bits,
 * which could have been cleared at almost no cost during the previous
 * sweep.
 *
 * XXX MP: this needs to run with locks such that no other thread of
 * control can create or destroy references to file descriptors.  it
 * may be necessary to defer the GC until later (when the locking
 * situation is more hospitable); it may be necessary to push this
 * into a separate thread.
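 *
 * The sweep below then frees exactly those descriptors whose every
 * remaining reference comes from a message still in flight
 * (f_count == f_msgcount) and which were never marked reachable.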
 */
void
unp_gc()
{
	register struct file *fp, *nextfp;
	register struct socket *so, *so1;
	struct file **extra_ref, **fpp;
	int nunref, i;

	if (unp_gcing)
		return;
	unp_gcing = 1;
	unp_defer = 0;

	/* Clear mark bits */
	for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next)
		fp->f_flag &= ~(FMARK|FDEFER);

	/*
	 * Iterate over the set of descriptors, marking ones believed
	 * (based on refcount) to be referenced from a process, and
	 * marking for rescan descriptors which are queued on a socket.
	 */
	do {
		for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) {
			if (fp->f_flag & FDEFER) {
				fp->f_flag &= ~FDEFER;
				unp_defer--;
#ifdef DIAGNOSTIC
				if (fp->f_count == 0)
					panic("unp_gc: deferred unreferenced socket");
#endif
			} else {
				if (fp->f_count == 0)
					continue;
				if (fp->f_flag & FMARK)
					continue;
				if (fp->f_count == fp->f_msgcount)
					continue;
			}
			fp->f_flag |= FMARK;

			if (fp->f_type != DTYPE_SOCKET ||
			    (so = (struct socket *)fp->f_data) == 0)
				continue;
			if (so->so_proto->pr_domain != &unixdomain ||
			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
				continue;
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
			/*
			 * Mark descriptors referenced from sockets queued
			 * on the accept queue as well.
			 */
			if (so->so_options & SO_ACCEPTCONN) {
				for (so1 = so->so_q0.tqh_first;
				    so1 != 0;
				    so1 = so1->so_qe.tqe_next) {
					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
				}
				for (so1 = so->so_q.tqh_first;
				    so1 != 0;
				    so1 = so1->so_qe.tqe_next) {
					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
				}
			}

		}
	} while (unp_defer);
	/*
	 * Sweep pass.  Find unmarked descriptors, and free them.
	 *
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code unp_discards,
	 * we end up doing a (full) closef on the descriptor.  A closef on A
	 * results in the following chain.  Closef calls soo_close, which
	 * calls soclose.  Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree.
	 * Sofree calls sorflush to free up the rights that are queued in
	 * messages on the socket A, i.e., the reference on B.  The sorflush
	 * calls via the dom_dispose switch unp_dispose, which unp_scans
	 * with unp_discard.  This second instance of unp_discard just calls
	 * closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, if the inaccessible descriptor is a
	 * socket, we call sorflush in case it is a Unix domain
	 * socket.  After we destroy all the rights carried in
	 * messages, we do a last closef to get rid of our extra
	 * reference.  This is the last close, and the unp_detach etc
	 * will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
	extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
	for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0;
	    fp = nextfp) {
		nextfp = fp->f_list.le_next;
		if (fp->f_count == 0)
			continue;
		if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			fp->f_count++;
		}
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		fp = *fpp;
		FILE_USE(fp);
		if (fp->f_type == DTYPE_SOCKET)
			sorflush((struct socket *)fp->f_data);
		FILE_UNUSE(fp, NULL);
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		fp = *fpp;
		FILE_USE(fp);
		(void) closef(fp, (struct proc *)0);
	}
	free((caddr_t)extra_ref, M_FILE);
	unp_gcing = 0;
}

void
unp_dispose(m)
	struct mbuf *m;
{

	if (m)
		unp_scan(m, unp_discard, 1);
}

void
unp_scan(m0, op, discard)
	register struct mbuf *m0;
	void (*op) __P((struct file *));
	int discard;
{
	register struct mbuf *m;
	register struct file **rp;
	register struct cmsghdr *cm;
	register int i;
	int qfds;

	while (m0) {
		for (m = m0; m; m = m->m_next)
			if (m->m_type == MT_CONTROL &&
			    m->m_len >= sizeof(*cm)) {
				cm = mtod(m, struct cmsghdr *);
				if (cm->cmsg_level != SOL_SOCKET ||
				    cm->cmsg_type != SCM_RIGHTS)
					continue;
				qfds = (cm->cmsg_len - ALIGN(sizeof(*cm)))
				    / sizeof(struct file *);
				rp = (struct file **)ALIGN(cm + 1);
				for (i = 0; i < qfds; i++) {
					struct file *fp = *rp;
					if (discard)
						*rp = 0;
					(*op)(fp);
					rp++;
				}
				break;		/* XXX, but saves time */
			}
		m0 = m0->m_act;
	}
}

void
unp_mark(fp)
	struct file *fp;
{
	if (fp == NULL)
		return;

	if (fp->f_flag & FMARK)
		return;

	/* If we're already deferred, don't screw up the defer count */
	if (fp->f_flag & FDEFER)
		return;

	/*
	 * Minimize the number of deferrals...  Sockets are the only
	 * type of descriptor which can hold references to another
	 * descriptor, so just mark other descriptors, and defer
	 * unmarked sockets for the next pass.
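	 *
	 * Setting FDEFER keeps unp_defer non-zero, which forces another
	 * pass of the do/while loop in unp_gc() so that anything reachable
	 * only through this socket's receive buffer gets marked too.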
	 */
	if (fp->f_type == DTYPE_SOCKET) {
		unp_defer++;
		if (fp->f_count == 0)
			panic("unp_mark: queued unref");
		fp->f_flag |= FDEFER;
	} else {
		fp->f_flag |= FMARK;
	}
	return;
}

void
unp_discard(fp)
	struct file *fp;
{
	if (fp == NULL)
		return;
	FILE_USE(fp);
	fp->f_msgcount--;
	unp_rights--;
	(void) closef(fp, (struct proc *)0);
}
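
/*
 * Illustrative only, not compiled into the kernel: a minimal user-level
 * sketch of descriptor passing, the case that unp_internalize() and
 * unp_externalize() above implement.  The socket s is assumed to be a
 * connected AF_LOCAL socket (e.g. from the sketch after unp_connect2()),
 * the descriptor being passed and the one-byte payload are made up for
 * the example, and the CMSG_* macros are the modern <sys/socket.h> forms,
 * which may postdate this revision of the headers.  Needs <sys/socket.h>,
 * <sys/uio.h>, <string.h> and <err.h>.
 *
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsgbuf;
 *	struct msghdr msg;
 *	struct cmsghdr *cmsg;
 *	struct iovec iov;
 *	char dummy = '\0';
 *	int fd_to_pass = 0;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	iov.iov_base = &dummy;
 *	iov.iov_len = 1;
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsgbuf.buf;
 *	msg.msg_controllen = sizeof(cmsgbuf.buf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	if (sendmsg(s, &msg, 0) == -1)
 *		err(1, "sendmsg");
 *
 * The sendmsg(2) arrives in uipc_usrreq() as PRU_SEND with the SCM_RIGHTS
 * mbuf as "control"; unp_internalize() swaps the ints for struct file
 * pointers and bumps f_count/f_msgcount, and unp_externalize() reverses
 * the transformation when the peer calls recvmsg(2).  Enabling the
 * LOCAL_CREDS option (see uipc_ctloutput() above) makes the next message
 * received also carry an SCM_CREDS control built by unp_addsockcred().
 */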