/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/mplock2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE
};

struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};

struct poll_kevent_copyin_args {
	struct lwp	*lwp;
	struct pollfd	*fds;
	int		nfds;
	int		pfds;
	int		error;
};

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
		       int *res);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);
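
/*
 * Overview sketch (editor's illustration, not original code): the select()
 * and poll() emulation later in this file converts each requested descriptor
 * into kqueue events.  Roughly, a bit in the read set becomes an EVFILT_READ
 * kevent, a write bit becomes EVFILT_WRITE, and an exception bit becomes
 * EVFILT_EXCEPT with NOTE_OOB, e.g.:
 *
 *	EV_SET(kev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0,
 *	       (void *)lwp->lwp_kqueue_serial);
 *
 * The lwp_kqueue_serial value stored in udata lets the copyout routines
 * recognize and discard events left over from earlier calls.
 */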

/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Positioned (Pread) read system call
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}
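
/*
 * Editor's illustration (not original code): a userland caller might drive
 * the scatter and positioned variants above roughly like this, assuming the
 * usual readv(2) interface and an extpread(fd, buf, nbyte, flags, offset)
 * prototype:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n;
 *
 *	n = readv(fd, iov, 2);                  (gather into two buffers)
 *	n = extpread(fd, buf, len, 0, 4096);    (positioned read at offset 4096)
 *	n = extpread(fd, buf, len, 0, -1);      (offset -1: current position)
 *
 * Passing an offset of -1 is what keeps O_FOFFSET clear in the kernel path
 * above, so the read uses (and advances) the file's current offset.
 */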

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	int error;
	size_t len;
#ifdef KTRACE
	struct thread *td = curthread;
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	return(error);
}

/*
 * Pwrite system call
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;
	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return(error);
}

/*
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Gather positioned write system call
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;
	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}

	fdrop(fp);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE) {
			get_mplock();
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
			rel_mplock();
		}
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call
 *
 * MPALMOSTSAFE
 */
int
sys_ioctl(struct ioctl_args *uap)
{
	int error;

	get_mplock();
	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg);
	rel_mplock();
	return (error);
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};
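
/*
 * Editor's illustration (not original code): an emulation layer registers a
 * table of ioctl_map_range entries describing how its foreign ioctl commands
 * map onto native ones.  A hypothetical table, using the field names that
 * mapped_ioctl() below references, might look like:
 *
 *	static struct ioctl_map_range hypo_ranges[] = {
 *		{ .start = 0x6401, .end = 0x6401,
 *		  .maptocmd = FIONBIO, .maptoend = FIONBIO },
 *		{ .start = 0x6500, .end = 0x65ff,
 *		  .maptocmd = 0x9500, .maptoend = 0x95ff,
 *		  .mapfunc = hypo_mapfunc },
 *		{ 0 }
 *	};
 *
 * The first entry is a one-to-one mapping; the second maps a whole range to
 * a different range and therefore needs a mapfunc, exactly as described in
 * the comments inside mapped_ioctl().  The all-zero terminator is what the
 * scan loop there uses to detect the end of the table.  hypo_ranges and
 * hypo_mapfunc are invented names for the sketch only.
 */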

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0 &&
		     iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
				map->sys, fd, maskcmd,
				(int)((maskcmd >> 8) & 0xff),
				(int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
					map->sys, fd, maskcmd,
					(int)((maskcmd >> 8) & 0xff),
					(int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}
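
	/*
	 * Editor's note (illustration, not original code): the checks that
	 * follow rely on the standard BSD ioctl command encoding, where the
	 * command word itself carries the parameter size and copy direction.
	 * For example, from the usual <sys/filio.h> definitions:
	 *
	 *	FIONBIO  == _IOW('f', 126, int)   IOC_IN,  IOCPARM_LEN == sizeof(int)
	 *	FIONREAD == _IOR('f', 127, int)   IOC_OUT, IOCPARM_LEN == sizeof(int)
	 *
	 * IOC_IN means the argument is copied in from uspc_data before the
	 * call, IOC_OUT means the (zeroed) buffer is copied back out
	 * afterwards, and IOC_VOID means uspc_data itself is passed by value.
	 */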

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is a override function,
		 * call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}

int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}

int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
static int	nseldebug;
SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");

/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	return (error);
}
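
/*
 * Editor's illustration (not original code): the point of pselect() below is
 * that the temporary signal mask is installed and removed atomically with
 * respect to the wait.  A typical userland pattern, sketched here, blocks a
 * signal during normal processing and lets it through only while sleeping:
 *
 *	sigset_t blocked, waitmask;
 *
 *	sigemptyset(&blocked);
 *	sigaddset(&blocked, SIGCHLD);
 *	sigprocmask(SIG_BLOCK, &blocked, &waitmask);   (waitmask = old mask)
 *	...
 *	n = pselect(maxfd + 1, &rset, NULL, NULL, NULL, &waitmask);
 *
 * If the call is interrupted, the kernel code below defers restoring the
 * old mask (LWP_OLDMASK) so the handler runs with the temporary mask still
 * in place.
 */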

/*
 * Pselect system call.
 *
 * MPALMOSTSAFE
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		get_mplock();
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
	} else {
		get_mplock();
	}

	/*
	 * Do real job.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}
	rel_mplock();

	return (error);
}

static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = 0;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = 0;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OOB;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return(0);
			/* NOT REACHED */
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0,
				       (void *)skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		skap->active_set++;
		skap->proc_fds = 0;
	}

	return (0);
}

static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (kevp[0].flags & EV_ERROR) {
		skap->error = kevp[0].data;
		return (0);
	}

	for (i = 0; i < count; ++i) {
		if ((u_int)kevp[i].udata != skap->lwp->lwp_kqueue_serial) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev);
			continue;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}

/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}

/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited; passing 0x7FFFFFFF accomplishes this.
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * Cumulative error from individual events (EBADFD?)
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

	kap->lwp->lwp_kqueue_serial++;

	return (error);
}
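
/*
 * Editor's worked example (not original code): the byte count computed in
 * doselect() scales with the highest descriptor number requested, not with
 * the fixed size of fd_set.  Assuming a 64-bit __fd_mask (so __NFDBITS == 64):
 *
 *	nd = 10   ->  howmany(10, 64)  = 1 word   ->  bytes = 8
 *	nd = 100  ->  howmany(100, 64) = 2 words  ->  bytes = 16
 *
 * getbits() copies only that many bytes in, using the small on-stack kfd_set
 * when the copy is smaller than the two-word kfd_set and falling back to a
 * kmalloc(M_SELECT) buffer otherwise.
 */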

/*
 * Poll system call.
 *
 * MPSAFE
 */
int
sys_poll(struct poll_args *uap)
{
	struct timespec ts, *tsp;
	int error;

	if (uap->timeout != INFTIM) {
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = dopoll(uap->nfds, uap->fds, tsp, &uap->sysmsg_result);

	return (error);
}

static int
poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent *kev;
	int kev_count;

	pkap = (struct poll_kevent_copyin_args *)arg;

	while (pkap->pfds < pkap->nfds) {
		pfd = &pkap->fds[pkap->pfds];

		/* Clear return events */
		pfd->revents = 0;

		/* Do not check if fd is equal to -1 */
		if (pfd->fd == -1) {
			++pkap->pfds;
			continue;
		}

		kev_count = 0;
		if (pfd->events & (POLLIN | POLLRDNORM))
			kev_count++;
		if (pfd->events & (POLLOUT | POLLWRNORM))
			kev_count++;
		if (pfd->events & (POLLPRI | POLLRDBAND))
			kev_count++;

		if (*events + kev_count > maxevents)
			return (0);

		/*
		 * NOTE: A combined serial number and poll array index is
		 *	 stored in kev->udata.
		 */
		kev = &kevp[*events];
		if (pfd->events & (POLLIN | POLLRDNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       0, 0, (void *)(pkap->lwp->lwp_kqueue_serial +
			       pkap->pfds));
		}
		if (pfd->events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       0, 0, (void *)(pkap->lwp->lwp_kqueue_serial +
			       pkap->pfds));
		}
		if (pfd->events & (POLLPRI | POLLRDBAND)) {
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OOB, 0,
			       (void *)(pkap->lwp->lwp_kqueue_serial +
			       pkap->pfds));
		}

		if (nseldebug) {
			kprintf("poll index %d fd %d events %08x\n",
				pkap->pfds, pfd->fd, pfd->events);
		}

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}
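
/*
 * Editor's worked example (not original code): poll_copyin() tags each
 * kevent with lwp_kqueue_serial plus the pollfd array index, and
 * poll_copyout() below reverses that to find the entry.  With a serial of
 * 1000 and array entry 5, udata is stored as 1005; on the way out
 * pi = 1005 - 1000 = 5, which indexes fds[].  A stale event left over from
 * an earlier call carries an older serial, so the subtraction yields an
 * index >= nfds and the event is deleted instead of being reported.
 * dopoll() advances lwp_kqueue_serial by nfds after each call to keep the
 * windows from overlapping.
 */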

static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int i;
	u_int pi;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Extract the poll array index and delete spurious events.
		 * We can easily tell if the serial number is incorrect
		 * by checking whether the extracted index is out of range.
		 */
		pi = (u_int)kevp[i].udata - (u_int)pkap->lwp->lwp_kqueue_serial;

		if (pi >= pkap->nfds) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&pkap->lwp->lwp_kqueue, &kev);
			if (nseldebug)
				kprintf("poll index %d out of range\n", pi);
			continue;
		}
		pfd = &pkap->fds[pi];
		if (kevp[i].ident == pfd->fd) {
			if (kevp[i].flags & EV_ERROR) {
				switch(kevp[i].data) {
				case EOPNOTSUPP:
					/*
					 * Operation not supported.  Poll
					 * does not return an error for
					 * POLLPRI (OOB/urgent data) when
					 * it is not supported by the device.
					 */
					if (kevp[i].filter != EVFILT_EXCEPT) {
						pfd->revents |= POLLERR;
						++*res;
					}
					break;
				case EBADF:
					/* Bad file descriptor */
					pfd->revents |= POLLNVAL;
					++*res;
					break;
				default:
					pfd->revents |= POLLERR;
					++*res;
					break;
				}
				if (nseldebug)
					kprintf("poll index %d fd %d filter %d error %d\n",
						pi, pfd->fd,
						kevp[i].filter, kevp[i].data);
				continue;
			}

			if (kevp[i].flags & EV_EOF) {
				pfd->revents |= POLLHUP;
				++*res;
				continue;
			}

			switch (kevp[i].filter) {
			case EVFILT_READ:
				pfd->revents |= (POLLIN | POLLRDNORM);
				break;
			case EVFILT_WRITE:
				pfd->revents |= (POLLOUT | POLLWRNORM);
				break;
			case EVFILT_EXCEPT:
				pfd->revents |= (POLLPRI | POLLRDBAND);
				break;
			}

			if (nseldebug) {
				kprintf("poll index %d fd %d revents %08x\n",
					pi, pfd->fd, pfd->revents);
			}

			++*res;
			continue;
		} else {
			if (nseldebug)
				kprintf("poll index %d mismatch %d/%d\n",
					pi, kevp[i].ident, pfd->fd);
		}
	}

	return (0);
}

static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);
	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, ka.nfds, res, &ka,
				    poll_copyin, poll_copyout, ts);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

	ka.lwp->lwp_kqueue_serial += nfds;

	return (error);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}
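
/*
 * Editor's illustration (not original code): selrecord() and selwakeup() are
 * the two halves of the classic driver-side select protocol.  A hypothetical
 * character driver would use them roughly like this:
 *
 *	static int
 *	hypo_poll(cdev_t dev, int events)      (driver poll/select entry)
 *	{
 *		int revents = 0;
 *
 *		if (data_available(dev))
 *			revents |= events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(curthread, &sc->hypo_rsel);
 *		return (revents);
 *	}
 *
 * and later, from its interrupt handler, call selwakeup(&sc->hypo_rsel) when
 * data arrives.  The hypo_ names, data_available(), and the sc softc are
 * invented for this sketch; only selrecord(), selwakeup() and struct selinfo
 * are real interfaces.
 */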

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	/*
	 * This is a temporary hack until the code can be rewritten.
	 * Check LWP_SELECT before assuming we can setrunnable().
	 * Otherwise we might catch the lwp before it actually goes to
	 * sleep.
	 */
	crit_enter();
	if (lp->lwp_flag & LWP_SELECT) {
		lp->lwp_flag &= ~LWP_SELECT;
	} else if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	}
	crit_exit();

	kqueue_wakeup(&lp->lwp_kqueue);
}