/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/malloc.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/socketops.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static struct krate krate_poll = { .freq = 1 };

typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE
};

struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};

struct poll_kevent_copyin_args {
	struct lwp	*lwp;
	struct pollfd	*fds;
	int		nfds;
	int		pfds;
	int		error;
};

static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
		       int *res, int flags);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);

/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct sysmsg *sysmsg, const struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);
	return (error);
}
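/*
 * Illustrative note (not from the original sources): the (ssize_t) cast
 * above rejects byte counts with the sign bit set.  For example, on a
 * 64-bit platform a request of nbyte = 0x8000000000000000 becomes a
 * negative ssize_t and fails with EINVAL instead of being passed down
 * as an enormous unsigned length.
 */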
/*
 * Positioned (pread) read system call.
 *
 * MPSAFE
 */
int
sys_extpread(struct sysmsg *sysmsg, const struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
	return (error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct sysmsg *sysmsg, const struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct sysmsg *sysmsg, const struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	fp = holdfp(td, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	dropfp(td, fd, fp);

	return (error);
}
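/*
 * Illustrative sketch (not part of the original file): kern_preadv()
 * shows the canonical holdfp()/dropfp() bracket used throughout this
 * file -- hold the struct file against the descriptor table for the
 * duration of the operation, then release it.  A hypothetical helper
 * that only needed the file's type could follow the same shape:
 */
#if 0
static int
example_fd_is_vnode(int fd)			/* hypothetical helper */
{
	struct thread *td = curthread;
	struct file *fp;
	int result;

	fp = holdfp(td, fd, FREAD);		/* ref the file, check FREAD */
	if (fp == NULL)
		return (0);			/* EBADF case */
	result = (fp->f_type == DTYPE_VNODE);
	dropfp(td, fd, fp);			/* release the ref */
	return (result);
}
#endif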
/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	int error;
	size_t len;
#ifdef KTRACE
	struct thread *td = curthread;
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}

/*
 * Write system call.
 *
 * MPSAFE
 */
int
sys_write(struct sysmsg *sysmsg, const struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);

	return (error);
}

/*
 * Pwrite system call.
 *
 * MPSAFE
 */
int
sys_extpwrite(struct sysmsg *sysmsg, const struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;
	error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
	return (error);
}

/*
 * Gather write system call.
 *
 * MPSAFE
 */
int
sys_writev(struct sysmsg *sysmsg, const struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
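/*
 * Illustrative note (not from the original sources): the pattern above
 * assumes iovec_copyin() fills the caller's small on-stack aiov[] when
 * the count fits in UIO_SMALLIOV and otherwise allocates a larger
 * array (M_IOV), returning it through 'iov', while iovec_free()
 * releases the allocation only when 'iov' no longer points at the
 * stack array.  That is why every user here passes both 'iov' and
 * 'aiov' to iovec_free() on every path after a successful copyin.
 */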
/*
 * Gather positioned write system call.
 *
 * MPSAFE
 */
int
sys_extpwritev(struct sysmsg *sysmsg, const struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	fp = holdfp(td, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}
	dropfp(td, fd, fp);

	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}
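/*
 * Illustrative example (not from the original sources): writing to a
 * pipe whose read end has been closed makes fo_write() return EPIPE,
 * and the code above raises SIGPIPE on the writing lwp.  For a socket
 * the EPIPE is returned unchanged and no signal is sent here; the
 * socket layer decides whether to signal, since only it can see
 * per-call semantics such as MSG_NOSIGNAL.
 */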
/*
 * Ioctl system call.
 *
 * MPSAFE
 */
int
sys_ioctl(struct sysmsg *sysmsg, const struct ioctl_args *uap)
{
	int error;

	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, sysmsg);
	return (error);
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 *
 * MPSAFE
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;
	memp = NULL;

	fp = holdfp(td, fd, FREAD|FWRITE);
	if (fp == NULL)
		return (EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		lwkt_gettoken(&mioctl_token);
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}
		lwkt_reltoken(&mioctl_token);

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0 &&
		     iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			krateprintf(&krate_poll,
				    "%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) "
				    "not implemented\n",
				    map->sys, fd, maskcmd,
				    (int)((maskcmd >> 8) & 0xff),
				    (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one-to-one mapping, maptocmd should be
		 * correct.  If it's a ranged one-to-one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl.  Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				krateprintf(&krate_poll,
					    "%s: Invalid mapping for fd=%d, "
					    "cmd=%#lx ('%c',%d)\n",
					    map->sys, fd, maskcmd,
					    (int)((maskcmd >> 8) & 0xff),
					    (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}
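	/*
	 * Illustrative note (not from the original sources): the BSD
	 * ioctl command word encodes its parameter size in the high
	 * order bits.  For example, FIONBIO is defined as
	 * _IOW('f', 126, int), so IOCPARM_LEN(FIONBIO) == sizeof(int)
	 * and the copyin/copyout below would move exactly four bytes
	 * on typical platforms.
	 */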
	if ((com & IOC_VOID) == 0 && size > sizeof(ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = ubuf.stkbuf;
	}
	if (com & IOC_VOID) {
		*(caddr_t *)data = uspc_data;
	} else if (com & IOC_IN) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error)
				goto done;
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of routing the call directly.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
done:
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
	dropfp(td, fd, fp);

	return (error);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
		     M_WAITOK | M_ZERO);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	lwkt_gettoken(&mioctl_token);
	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
	lwkt_reltoken(&mioctl_token);

	return (0);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;
	int error = EINVAL;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	lwkt_gettoken(&mioctl_token);
	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges == he->cmd_ranges) {
			LIST_REMOVE(ne, entries);
			kfree(ne, M_IOCTLMAP);
			error = 0;
			break;
		}
	}
	lwkt_reltoken(&mioctl_token);
	return (error);
}
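/*
 * Illustrative sketch (not part of the original file): how an emulation
 * layer might describe and register an ioctl translation, assuming the
 * ioctl_map_range/ioctl_map_handler field names used above.  The range
 * 0x6500-0x65ff maps onto 0x9500-0x95ff via a mapping function, and an
 * all-zero entry terminates the table (that is the sentinel the search
 * loop in mapped_ioctl() tests for).
 */
#if 0
static struct ioctl_map_range example_ranges[] = {
	{ .start = 0x6500, .end = 0x65ff,
	  .maptocmd = 0x9500, .maptoend = 0x95ff,
	  .wrapfunc = NULL, .mapfunc = example_mapfunc },	/* hypothetical */
	{ 0 }							/* terminator */
};

static struct ioctl_map_handler example_handler = {
	.map = &example_ioctl_map,				/* hypothetical */
	.subsys = "example",
	.cmd_ranges = example_ranges,
};

/* ... mapped_ioctl_register_handler(&example_handler); ... */
#endif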
static int nseldebug;
SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");

/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct sysmsg *sysmsg, const struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof(ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do the real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &sysmsg->sysmsg_result);

	return (error);
}

/*
 * Pselect system call.
 */
int
sys_pselect(struct sysmsg *sysmsg, const struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof(kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	/*
	 * Do the real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &sysmsg->sysmsg_result);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* doselect() is responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask
			 * immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}
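/*
 * Illustrative example (not from the original sources): the temporary
 * sigmask installed above is what lets a program sleep in pselect()
 * with, say, SIGUSR1 unblocked only for the duration of the wait.  If
 * SIGUSR1 arrives during the sleep, pselect() returns EINTR and the
 * old (blocking) mask is deliberately not restored until after the
 * handler has run (LWP_OLDMASK); restoring it immediately would block
 * the very signal that woke us before its handler could execute.
 */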
static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OLDAPI | NOTE_OOB;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return (0);
			/* NOT REACHED */
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0,
				       (void *)(uintptr_t)
					skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;

				if (nseldebug) {
					kprintf("select fd %d filter %d "
						"serial %ju\n", fd, filter,
						(uintmax_t)
						skap->lwp->lwp_kqueue_serial);
				}
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		skap->active_set++;
		skap->proc_fds = 0;
	}

	return (0);
}

static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i;
	int n;

	skap = (struct select_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Filter out and delete spurious events
		 */
		if ((uint64_t)(uintptr_t)kevp[i].udata !=
		    skap->lwp->lwp_kqueue_serial)
		{
			panic("select_copyout: unexpected udata");
deregister:
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			n = 1;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev, &n, 0);
			if (nseldebug) {
				kprintf("select fd %ju mismatched serial %ju\n",
					(uintmax_t)kevp[i].ident,
					(uintmax_t)skap->lwp->lwp_kqueue_serial);
			}
			continue;
		}

		/*
		 * Handle errors
		 */
		if (kevp[i].flags & EV_ERROR) {
			int error = kevp[i].data;

			switch (error) {
			case EBADF:
				/*
				 * A bad file descriptor is considered a
				 * fatal error for select, bail out.
				 */
				skap->error = error;
				*res = -1;
				return (error);

			default:
				/*
				 * Select silently swallows any unknown errors
				 * for descriptors in the read or write sets.
				 *
				 * ALWAYS filter out EOPNOTSUPP errors from
				 * filters (at least until all filters support
				 * EVFILT_EXCEPT).
				 *
				 * We also filter out ENODEV since dev_dkqfilter
				 * returns ENODEV if EOPNOTSUPP is returned in an
				 * inner call.
				 *
				 * XXX: fix this
				 */
				if (kevp[i].filter != EVFILT_READ &&
				    kevp[i].filter != EVFILT_WRITE &&
				    error != EOPNOTSUPP &&
				    error != ENODEV) {
					skap->error = error;
					*res = -1;
					return (error);
				}
				break;
			}

			/*
			 * We must deregister any unsupported select events
			 * to avoid a live-lock.
			 */
			if (nseldebug) {
				kprintf("select fd %ju filter %d error %d\n",
					(uintmax_t)kevp[i].ident,
					kevp[i].filter, error);
			}
			goto deregister;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}

/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}
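/*
 * Illustrative arithmetic (not from the original sources), assuming a
 * 64-bit fd_mask: kfd_set holds two masks, so sizeof(kfd_set) == 16
 * bytes (128 descriptors).  doselect() computes
 * bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask), so a select over
 * nd = 64 descriptors needs 8 bytes and uses the caller's stack
 * kfd_set, while nd = 256 needs 32 bytes and falls back to kmalloc().
 */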
/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}

static int
dotimeout_only(struct timespec *ts)
{
	return (nanosleep1(ts, NULL));
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to a validated
 * kernel-side timeout value, or be NULL for an infinite timeout.  res
 * must point to the syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd == 0 && ts)
		return (dotimeout_only(ts));

	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	/* kap->read_set = NULL; not needed */
	kap->write_set = NULL;
	kap->except_set = NULL;

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited; passing 0x7FFFFFFF here
	 *	 accomplishes this.
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts,
			    KEVENT_AUTO_STALE);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * An error from an individual event that should be passed
	 * back to userland (EBADF)
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

	kap->lwp->lwp_kqueue_serial += kap->num_fds;

	return (error);
}
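/*
 * Illustrative example (not from the original sources): suppose
 * lwp_kqueue_serial is 100 and a select() covers 3 descriptors.  Every
 * event registered by that call carries udata == 100, and the serial
 * is bumped to 103 on the way out.  If a later scan returns an event
 * still tagged 100, the copyout routines can tell it is left over from
 * an earlier call and must be deregistered rather than reported.
 */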
/*
 * Poll system call.
 *
 * MPSAFE
 */
int
sys_poll(struct sysmsg *sysmsg, const struct poll_args *uap)
{
	struct timespec ts, *tsp;
	int error;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = dopoll(uap->nfds, uap->fds, tsp, &sysmsg->sysmsg_result, 0);

	return (error);
}
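/*
 * Illustrative arithmetic (not from the original sources): a poll()
 * timeout of 2500 ms becomes ts.tv_sec = 2500 / 1000 = 2 and
 * ts.tv_nsec = (2500 % 1000) * 1000 * 1000 = 500000000, i.e. 2.5
 * seconds, while INFTIM (-1) selects a NULL timespec and an infinite
 * wait.
 */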
/*
 * Ppoll system call.
 *
 * MPSAFE
 */
int
sys_ppoll(struct sysmsg *sysmsg, const struct ppoll_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof(kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	error = dopoll(uap->nfds, uap->fds, ktsp, &sysmsg->sysmsg_result,
		       ktsp != NULL ? KEVENT_TIMEOUT_PRECISE : 0);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* dopoll() is responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask
			 * immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}

static int
poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent *kev;
	int kev_count;

	pkap = (struct poll_kevent_copyin_args *)arg;

	while (pkap->pfds < pkap->nfds) {
		pfd = &pkap->fds[pkap->pfds];

		/* Clear return events */
		pfd->revents = 0;

		/* Do not check if fd is equal to -1 */
		if (pfd->fd == -1) {
			++pkap->pfds;
			continue;
		}

		/*
		 * NOTE: pfd->events == 0 implies POLLHUP in BSDs.  Used
		 *	 by at least sshd and X11 udev support.
		 */
		kev_count = 0;
		if (pfd->events == 0)
			kev_count++;
		if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM))
			kev_count++;
		if (pfd->events & (POLLOUT | POLLWRNORM))
			kev_count++;
		if (pfd->events & (POLLPRI | POLLRDBAND))
			kev_count++;

		if (*events + kev_count > maxevents)
			return (0);

		/*
		 * NOTE: A combined serial number and poll array index is
		 *	 stored in kev->udata.
		 *
		 * NOTE: Events will be registered with KEVENT_UNIQUE_NOTES
		 *	 set, using kev->data for the uniquifier.  kev->data
		 *	 is implied in the actual registration.
		 */
		kev = &kevp[*events];

		/*
		 * Implied POLLHUP
		 */
		if (pfd->events == 0) {
			int notes = NOTE_OLDAPI | NOTE_HUPONLY;

			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       notes, pkap->pfds, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal read events
		 */
		if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM)) {
			int notes = NOTE_OLDAPI;
			if ((pfd->events & (POLLIN | POLLRDNORM)) == 0)
				notes |= NOTE_HUPONLY;

			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       notes, pkap->pfds, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal write events
		 */
		if (pfd->events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, pkap->pfds, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal exceptional events
		 */
		if (pfd->events & (POLLPRI | POLLRDBAND)) {
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI | NOTE_OOB, pkap->pfds,
			       (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		if (nseldebug) {
			kprintf("poll index %d/%d fd %d events %08x "
				"serial %ju\n", pkap->pfds, pkap->nfds - 1,
				pfd->fd, pfd->events,
				(uintmax_t)pkap->lwp->lwp_kqueue_serial);
		}

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}
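/*
 * Illustrative example (not from the original sources): with
 * lwp_kqueue_serial == 1000, the pollfd at array index 2 registers its
 * events with udata == 1002.  poll_copyout() recovers the array index
 * as pi = udata - serial = 2; any pi >= nfds exposes a stale event
 * left over from a previous poll() invocation.
 */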
static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int count_res;
	int i;
	int n;
	uint64_t pi;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Extract the poll array index and delete spurious events.
		 * We can easily tell if the serial number is incorrect
		 * by checking whether the extracted index is out of range.
		 */
		pi = (uint64_t)(uintptr_t)kevp[i].udata -
		     pkap->lwp->lwp_kqueue_serial;
		if (pi >= pkap->nfds) {
			panic("poll_copyout: unexpected udata");
deregister:
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kev.data = pi;	/* uniquifier */
			n = 1;
			kqueue_register(&pkap->lwp->lwp_kqueue, &kev, &n,
					KEVENT_UNIQUE_NOTES);
			if (nseldebug) {
				kprintf("poll index %ju out of range against "
					"serial %ju\n", (uintmax_t)pi,
					(uintmax_t)pkap->lwp->lwp_kqueue_serial);
			}
			continue;
		}

		/*
		 * Locate the pollfd and process events
		 */
		pfd = &pkap->fds[pi];
		if (kevp[i].ident == pfd->fd) {
			/*
			 * A single descriptor may generate an error against
			 * more than one filter, make sure to set the
			 * appropriate flags but do not increment (*res)
			 * more than once.
			 */
			count_res = (pfd->revents == 0);
			if (kevp[i].flags & EV_ERROR) {
				switch (kevp[i].data) {
				case EBADF:
				case POLLNVAL:
					/* Bad file descriptor */
					if (count_res)
						++*res;
					pfd->revents |= POLLNVAL;
					break;
				default:
					/*
					 * Poll silently swallows any unknown
					 * errors except in the case of POLLPRI
					 * (OOB/urgent data).
					 *
					 * ALWAYS filter out EOPNOTSUPP errors
					 * from filters, common applications
					 * set POLLPRI|POLLRDBAND and most
					 * filters do not support EVFILT_EXCEPT.
					 *
					 * We also filter out ENODEV since
					 * dev_dkqfilter returns ENODEV if
					 * EOPNOTSUPP is returned in an
					 * inner call.
					 *
					 * XXX: fix this
					 */
					if (kevp[i].filter != EVFILT_READ &&
					    kevp[i].filter != EVFILT_WRITE &&
					    kevp[i].data != EOPNOTSUPP &&
					    kevp[i].data != ENODEV) {
						if (count_res)
							++*res;
						pfd->revents |= POLLERR;
					}
					break;
				}
				if (pfd->revents == 0 && nseldebug) {
					kprintf("poll index EV_ERROR %ju fd %d "
						"filter %d error %jd\n",
						(uintmax_t)pi, pfd->fd,
						kevp[i].filter,
						(intmax_t)kevp[i].data);
				}

				/*
				 * Silently deregister any unhandled EV_ERROR
				 * condition (usually EOPNOTSUPP).
				 */
				if (pfd->revents == 0)
					goto deregister;
				continue;
			}

			switch (kevp[i].filter) {
			case EVFILT_READ:
				/*
				 * NODATA on the read side can indicate a
				 * half-closed situation and not necessarily
				 * a disconnect, so depend on the user
				 * issuing a read() and getting 0 bytes back.
				 *
				 * If EV_HUP is set the peer completely
				 * disconnected and we can set POLLHUP.
				 * Linux can return POLLHUP even if read
				 * data has not been drained, so we should
				 * too.
				 */
				/* if (kevp[i].flags & EV_NODATA) */ {
					if (kevp[i].flags & EV_HUP)
						pfd->revents |= POLLHUP;
				}
				if ((kevp[i].flags & EV_EOF) &&
				    kevp[i].fflags != 0)
					pfd->revents |= POLLERR;
				if (pfd->events & POLLIN)
					pfd->revents |= POLLIN;
				if (pfd->events & POLLRDNORM)
					pfd->revents |= POLLRDNORM;
				break;
			case EVFILT_WRITE:
				/*
				 * As per the OpenGroup, POLLHUP is mutually
				 * exclusive with the writability flags.  I
				 * consider this a bit broken but...
				 *
				 * In this case a disconnect is implied even
				 * for a half-closed (write side) situation.
				 */
				if (kevp[i].flags & EV_EOF) {
					pfd->revents |= POLLHUP;
					if (kevp[i].fflags != 0)
						pfd->revents |= POLLERR;
				} else {
					if (pfd->events & POLLOUT)
						pfd->revents |= POLLOUT;
					if (pfd->events & POLLWRNORM)
						pfd->revents |= POLLWRNORM;
				}
				break;
			case EVFILT_EXCEPT:
				/*
				 * EV_NODATA should never be tagged for this
				 * filter.
				 */
				if (pfd->events & POLLPRI)
					pfd->revents |= POLLPRI;
				if (pfd->events & POLLRDBAND)
					pfd->revents |= POLLRDBAND;
				break;
			}

			if (nseldebug) {
				kprintf("poll index %ju/%d fd %d "
					"revents %08x\n", (uintmax_t)pi,
					pkap->nfds, pfd->fd, pfd->revents);
			}

			if (count_res && pfd->revents)
				++*res;
		}

		/*
		 * We must deregister any kqueue poll event that does not
		 * set poll return bits to prevent a live-lock.
		 */
		if (pfd->revents == 0) {
			krateprintf(&krate_poll,
				    "poll index %ju no-action %ju/%d "
				    "events=%08x kevpfilt=%d/%08x\n",
				    (uintmax_t)pi, (uintmax_t)kevp[i].ident,
				    pfd->fd, pfd->events,
				    kevp[i].filter, kevp[i].flags);
			goto deregister;
		}
	}

	return (0);
}
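/*
 * Illustrative example (not from the original sources): a pollfd asking
 * for POLLIN on a TCP socket whose peer has fully closed typically
 * comes back from the read filter with EV_EOF|EV_HUP set, so the code
 * above reports POLLIN|POLLHUP -- readable (to drain buffered data and
 * observe the EOF) and hung up at the same time, matching the Linux
 * behavior described in the EVFILT_READ comment.
 */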
static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res, int flags)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	flags |= KEVENT_AUTO_STALE | KEVENT_UNIQUE_NOTES;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	if (nfds == 0 && ts)
		return (dotimeout_only(ts));

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);

	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
				    poll_copyin, poll_copyout, ts, flags);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

	ka.lwp->lwp_kqueue_serial += nfds;

	return (error);
}

static int
socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	return (0);
}

static int
socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	++*res;
	return (0);
}

extern struct fileops socketops;

/*
 * NOTE: Callers of socket_wait() must already have a reference on the
 *	 socket.
 */
int
socket_wait(struct socket *so, struct timespec *ts, int *res)
{
	struct thread *td = curthread;
	struct file *fp;
	struct kqueue kq;
	struct kevent kev;
	int error, fd;
	int n;

	if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
		return (error);

	fp->f_type = DTYPE_SOCKET;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &socketops;
	fp->f_data = so;
	fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);
	fsetfdflags(td->td_proc->p_fd, fd, UF_EXCLOSE);

	bzero(&kq, sizeof(kq));
	kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
	n = 1;
	if ((error = kqueue_register(&kq, &kev, &n, 0)) != 0) {
		fdrop(fp);
		return (error);
	}

	error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
			    socket_wait_copyout, ts, 0);

	EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL);
	n = 1;
	kqueue_register(&kq, &kev, &n, 0);
	fp->f_ops = &badfileops;
	fdrop(fp);

	return (error);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation...  OpenBSD uses select ops.
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct sysmsg *sysmsg, const struct openbsd_poll_args *uap)
{
	return (sys_poll(sysmsg, (const struct poll_args *)uap));
}

/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
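/*
 * Illustrative sketch (not part of the original file): how a kernel
 * caller that already holds a reference on 'so' might use socket_wait()
 * to wait up to five seconds for the socket to become readable.
 */
#if 0
	struct timespec ts = { 5, 0 };		/* 5 second timeout */
	int res = 0;
	int error;

	error = socket_wait(so, &ts, &res);	/* 'so' held by the caller */
	if (error == 0 && res > 0) {
		/* socket is readable */
	}
#endif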