/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/mplock2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE
};

struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};

struct poll_kevent_copyin_args {
	struct lwp	*lwp;
	struct pollfd	*fds;
	int		nfds;
	int		pfds;
	int		error;
};

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
		       int *res);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);
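
/*
 * Overview: the read/write syscalls below are thin wrappers that build
 * an iovec/uio pair and hand it to kern_preadv()/kern_pwritev().  The
 * select(), pselect() and poll() syscalls are emulated on top of the
 * per-lwp kqueue: the copyin callbacks translate fd sets or pollfd
 * arrays into kevents, and the copyout callbacks translate returned
 * kevents back into the caller's representation.
 */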

/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Positioned read (pread) system call.
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}
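
/*
 * Note that kern_preadv() above and kern_pwritev() below reject
 * positioned I/O (O_FOFFSET) on any descriptor that is not a vnode
 * with ESPIPE, matching the POSIX behavior of pread()/pwrite() on
 * pipes, FIFOs and sockets.
 */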

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	int error;
	size_t len;
#ifdef KTRACE
	struct thread *td = curthread;
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call.
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	return(error);
}

/*
 * Positioned write (pwrite) system call.
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;
	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Gather write system call.
 *
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Gather positioned write system call.
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}

	fdrop(fp);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/*
		 * If the write failed with EPIPE, raise SIGPIPE here so
		 * the process sees the conventional broken-pipe signal.
		 */
		if (error == EPIPE) {
			get_mplock();
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
			rel_mplock();
		}
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call.
 *
 * MPALMOSTSAFE
 */
int
sys_ioctl(struct ioctl_args *uap)
{
	int error;

	get_mplock();
	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg);
	rel_mplock();
	return (error);
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};
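
/*
 * A cmd_ranges array is terminated by a sentinel entry whose start and
 * maptocmd fields are zero and whose mapfunc/wrapfunc pointers are
 * NULL; the scan loops in mapped_ioctl() below depend on that
 * convention.
 */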

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0 &&
		     iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct.  If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl.  Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}
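
/*
 * Minimal registration sketch (hypothetical names; the exact struct
 * layouts live in <sys/mapped_ioctl.h>): an emulation layer describes
 * its command ranges and registers them once, e.g. at module load:
 *
 *	static struct ioctl_map_range foo_ranges[] = {
 *		{ .start = FOO_GETX, .end = FOO_GETX,
 *		  .maptocmd = NATIVE_GETX, .maptoend = NATIVE_GETX },
 *		{ 0 }			(all-zero sentinel)
 *	};
 *	static struct ioctl_map_handler foo_handler = {
 *		.map = &foo_ioctl_map,
 *		.subsys = "foo",
 *		.cmd_ranges = foo_ranges,
 *	};
 *	mapped_ioctl_register_handler(&foo_handler);
 */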

int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}

int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do the real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	return (error);
}

/*
 * Pselect system call.
 *
 * MPALMOSTSAFE
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		get_mplock();
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
	} else {
		get_mplock();
	}

	/*
	 * Do the real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() is responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}
	rel_mplock();

	return (error);
}
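
/*
 * Copyin callback for the select() emulation: walk the read, write and
 * except sets in turn (tracked by active_set) and emit an
 * EV_ADD|EV_ENABLE kevent for every descriptor bit that is set, tagging
 * each with the lwp's current kqueue serial number.  Bits are cleared
 * as they are registered so the same sets can be reused to return the
 * ready descriptors.
 */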
static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = 0;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = 0;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OOB;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return(0);
			/* NOT REACHED */
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0,
				       (void *)skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		skap->active_set++;
		skap->proc_fds = 0;
	}

	return (0);
}
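
/*
 * Copyout callback for the select() emulation: translate returned
 * kevents back into bits in the read/write/except sets.  Events whose
 * udata does not match the lwp's current kqueue serial number are stale
 * leftovers from an earlier select() and are deleted from the kqueue
 * instead of being reported.
 */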
static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (kevp[0].flags & EV_ERROR) {
		skap->error = kevp[0].data;
		return (0);
	}

	for (i = 0; i < count; ++i) {
		if ((u_int)kevp[i].udata != skap->lwp->lwp_kqueue_serial) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev);
			continue;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}

/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}

/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to a validated
 * kernel-side timeout value, or be NULL for an infinite timeout.  res
 * must point to the syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited; passing 0x7FFFFFFF accomplishes
	 *	 this.
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * Cumulative error from individual events (e.g. EBADF).
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

	/* Invalidate any kevents left registered by this call */
	kap->lwp->lwp_kqueue_serial++;

	return (error);
}
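
/*
 * Sizing note for the select path above: each set copied in/out is
 * howmany(nd, __NFDBITS) * sizeof(__fd_mask) bytes.  For example,
 * assuming 64-bit fd_mask words, nd = 100 gives howmany(100, 64) = 2
 * words, i.e. 16 bytes per set.
 */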

/*
 * Poll system call.
 *
 * MPSAFE
 */
int
sys_poll(struct poll_args *uap)
{
	struct timespec ts, *tsp;
	int error;

	if (uap->timeout != INFTIM) {
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = dopoll(uap->nfds, uap->fds, tsp, &uap->sysmsg_result);

	return (error);
}

/*
 * Copyin callback for the poll() emulation: convert each pollfd's event
 * mask into up to three kevents (read, write, except), tagging each
 * with the index of the pollfd it was generated from.
 */
static int
poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent *kev;
	int kev_count;

	pkap = (struct poll_kevent_copyin_args *)arg;

	while (pkap->pfds < pkap->nfds) {
		pfd = &pkap->fds[pkap->pfds];

		/* Clear return events */
		pfd->revents = 0;

		/* Do not check if fd is equal to -1 */
		if (pfd->fd == -1) {
			++pkap->pfds;
			continue;
		}

		kev_count = 0;
		if (pfd->events & (POLLIN | POLLRDNORM))
			kev_count++;
		if (pfd->events & (POLLOUT | POLLWRNORM))
			kev_count++;
		if (pfd->events & (POLLPRI | POLLRDBAND))
			kev_count++;

		if (*events + kev_count > maxevents)
			return (0);

		kev = &kevp[*events];
		if (pfd->events & (POLLIN | POLLRDNORM))
			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       0, 0, (void *)pkap->pfds);
		if (pfd->events & (POLLOUT | POLLWRNORM))
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       0, 0, (void *)pkap->pfds);
		if (pfd->events & (POLLPRI | POLLRDBAND))
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OOB, 0, (void *)pkap->pfds);

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}

/*
 * Copyout callback for the poll() emulation: map returned kevents back
 * onto the revents of the pollfd they were registered for.  Kevents
 * that no longer correspond to a pollfd entry are deleted from the
 * kqueue.
 */
static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int i;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		if ((int)kevp[i].udata < pkap->nfds) {
			pfd = &pkap->fds[(int)kevp[i].udata];
			if (kevp[i].ident == pfd->fd) {
				if (kevp[i].flags & EV_ERROR) {
					/* Bad file descriptor */
					if (kevp[i].data == EBADF)
						pfd->revents |= POLLNVAL;
					else
						pfd->revents |= POLLERR;

					++*res;
					continue;
				}

				if (kevp[i].flags & EV_EOF) {
					pfd->revents |= POLLHUP;
					++*res;
					continue;
				}

				switch (kevp[i].filter) {
				case EVFILT_READ:
					pfd->revents |= (POLLIN | POLLRDNORM);
					break;
				case EVFILT_WRITE:
					pfd->revents |= (POLLOUT | POLLWRNORM);
					break;
				case EVFILT_EXCEPT:
					pfd->revents |= (POLLPRI | POLLRDBAND);
					break;
				}

				++*res;
				continue;
			}
		}

		/* Remove descriptor not in pollfd set from kq */
		kev = kevp[i];
		kev.flags = EV_DISABLE|EV_DELETE;
		kqueue_register(&pkap->lwp->lwp_kqueue, &kev);
	}

	return (0);
}
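
/*
 * Common code for sys_poll().  fds is a userland pointer; ts must point
 * to a validated kernel-side timeout value, or be NULL for an infinite
 * timeout; res must point to the syscall return value.
 */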
static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);
	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, ka.nfds, res, &ka,
				    poll_copyin, poll_copyout, ts);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

	return (error);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation...  OpenBSD uses select ops.
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	/*
	 * This is a temporary hack until the code can be rewritten.
	 * Check LWP_SELECT before assuming we can setrunnable().
	 * Otherwise we might catch the lwp before it actually goes to
	 * sleep.
	 */
	crit_enter();
	if (lp->lwp_flag & LWP_SELECT) {
		lp->lwp_flag &= ~LWP_SELECT;
	} else if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	}
	crit_exit();

	kqueue_wakeup(&lp->lwp_kqueue);
}