/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/malloc.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/socketops.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/spinlock2.h>
#include <sys/signal2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static struct krate krate_poll = { .freq = 1 };

typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE
};

struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};

struct poll_kevent_copyin_args {
	struct lwp	*lwp;
	struct pollfd	*fds;
	int		nfds;
	int		pfds;
	int		error;
};

static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
		       int *res, int flags);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);

/*
 * Read system call.
119 * 120 * MPSAFE 121 */ 122 int 123 sys_read(struct sysmsg *sysmsg, const struct read_args *uap) 124 { 125 struct thread *td = curthread; 126 struct uio auio; 127 struct iovec aiov; 128 int error; 129 130 if ((ssize_t)uap->nbyte < 0) 131 error = EINVAL; 132 133 aiov.iov_base = uap->buf; 134 aiov.iov_len = uap->nbyte; 135 auio.uio_iov = &aiov; 136 auio.uio_iovcnt = 1; 137 auio.uio_offset = -1; 138 auio.uio_resid = uap->nbyte; 139 auio.uio_rw = UIO_READ; 140 auio.uio_segflg = UIO_USERSPACE; 141 auio.uio_td = td; 142 143 error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult); 144 return(error); 145 } 146 147 /* 148 * Positioned (Pread) read system call 149 * 150 * MPSAFE 151 */ 152 int 153 sys_extpread(struct sysmsg *sysmsg, const struct extpread_args *uap) 154 { 155 struct thread *td = curthread; 156 struct uio auio; 157 struct iovec aiov; 158 int error; 159 int flags; 160 161 if ((ssize_t)uap->nbyte < 0) 162 return(EINVAL); 163 164 aiov.iov_base = uap->buf; 165 aiov.iov_len = uap->nbyte; 166 auio.uio_iov = &aiov; 167 auio.uio_iovcnt = 1; 168 auio.uio_offset = uap->offset; 169 auio.uio_resid = uap->nbyte; 170 auio.uio_rw = UIO_READ; 171 auio.uio_segflg = UIO_USERSPACE; 172 auio.uio_td = td; 173 174 flags = uap->flags & O_FMASK; 175 if (uap->offset != (off_t)-1) 176 flags |= O_FOFFSET; 177 178 error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult); 179 return(error); 180 } 181 182 /* 183 * Scatter read system call. 184 * 185 * MPSAFE 186 */ 187 int 188 sys_readv(struct sysmsg *sysmsg, const struct readv_args *uap) 189 { 190 struct thread *td = curthread; 191 struct uio auio; 192 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 193 int error; 194 195 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 196 &auio.uio_resid); 197 if (error) 198 return (error); 199 auio.uio_iov = iov; 200 auio.uio_iovcnt = uap->iovcnt; 201 auio.uio_offset = -1; 202 auio.uio_rw = UIO_READ; 203 auio.uio_segflg = UIO_USERSPACE; 204 auio.uio_td = td; 205 206 error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult); 207 208 iovec_free(&iov, aiov); 209 return (error); 210 } 211 212 213 /* 214 * Scatter positioned read system call. 215 * 216 * MPSAFE 217 */ 218 int 219 sys_extpreadv(struct sysmsg *sysmsg, const struct extpreadv_args *uap) 220 { 221 struct thread *td = curthread; 222 struct uio auio; 223 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 224 int error; 225 int flags; 226 227 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 228 &auio.uio_resid); 229 if (error) 230 return (error); 231 auio.uio_iov = iov; 232 auio.uio_iovcnt = uap->iovcnt; 233 auio.uio_offset = uap->offset; 234 auio.uio_rw = UIO_READ; 235 auio.uio_segflg = UIO_USERSPACE; 236 auio.uio_td = td; 237 238 flags = uap->flags & O_FMASK; 239 if (uap->offset != (off_t)-1) 240 flags |= O_FOFFSET; 241 242 error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult); 243 244 iovec_free(&iov, aiov); 245 return(error); 246 } 247 248 /* 249 * MPSAFE 250 */ 251 int 252 kern_preadv(int fd, struct uio *auio, int flags, size_t *res) 253 { 254 struct thread *td = curthread; 255 struct file *fp; 256 int error; 257 258 fp = holdfp(td, fd, FREAD); 259 if (fp == NULL) 260 return (EBADF); 261 if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) { 262 error = ESPIPE; 263 } else { 264 error = dofileread(fd, fp, auio, flags, res); 265 } 266 dropfp(td, fd, fp); 267 268 return(error); 269 } 270 271 /* 272 * Common code for readv and preadv that reads data in 273 * from a file using the passed in uio, offset, and flags. 
274 * 275 * MPALMOSTSAFE - ktrace needs help 276 */ 277 static int 278 dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 279 { 280 int error; 281 size_t len; 282 #ifdef KTRACE 283 struct thread *td = curthread; 284 struct iovec *ktriov = NULL; 285 struct uio ktruio; 286 #endif 287 288 #ifdef KTRACE 289 /* 290 * if tracing, save a copy of iovec 291 */ 292 if (KTRPOINT(td, KTR_GENIO)) { 293 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 294 295 ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK); 296 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 297 ktruio = *auio; 298 } 299 #endif 300 len = auio->uio_resid; 301 error = fo_read(fp, auio, fp->f_cred, flags); 302 if (error) { 303 if (auio->uio_resid != len && (error == ERESTART || 304 error == EINTR || error == EWOULDBLOCK)) 305 error = 0; 306 } 307 #ifdef KTRACE 308 if (ktriov != NULL) { 309 if (error == 0) { 310 ktruio.uio_iov = ktriov; 311 ktruio.uio_resid = len - auio->uio_resid; 312 ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error); 313 } 314 kfree(ktriov, M_TEMP); 315 } 316 #endif 317 if (error == 0) 318 *res = len - auio->uio_resid; 319 320 return(error); 321 } 322 323 /* 324 * Write system call 325 * 326 * MPSAFE 327 */ 328 int 329 sys_write(struct sysmsg *sysmsg, const struct write_args *uap) 330 { 331 struct thread *td = curthread; 332 struct uio auio; 333 struct iovec aiov; 334 int error; 335 336 if ((ssize_t)uap->nbyte < 0) 337 error = EINVAL; 338 339 aiov.iov_base = (void *)(uintptr_t)uap->buf; 340 aiov.iov_len = uap->nbyte; 341 auio.uio_iov = &aiov; 342 auio.uio_iovcnt = 1; 343 auio.uio_offset = -1; 344 auio.uio_resid = uap->nbyte; 345 auio.uio_rw = UIO_WRITE; 346 auio.uio_segflg = UIO_USERSPACE; 347 auio.uio_td = td; 348 349 error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult); 350 351 return(error); 352 } 353 354 /* 355 * Pwrite system call 356 * 357 * MPSAFE 358 */ 359 int 360 sys_extpwrite(struct sysmsg *sysmsg, const struct extpwrite_args *uap) 361 { 362 struct thread *td = curthread; 363 struct uio auio; 364 struct iovec aiov; 365 int error; 366 int flags; 367 368 if ((ssize_t)uap->nbyte < 0) 369 error = EINVAL; 370 371 aiov.iov_base = (void *)(uintptr_t)uap->buf; 372 aiov.iov_len = uap->nbyte; 373 auio.uio_iov = &aiov; 374 auio.uio_iovcnt = 1; 375 auio.uio_offset = uap->offset; 376 auio.uio_resid = uap->nbyte; 377 auio.uio_rw = UIO_WRITE; 378 auio.uio_segflg = UIO_USERSPACE; 379 auio.uio_td = td; 380 381 flags = uap->flags & O_FMASK; 382 if (uap->offset != (off_t)-1) 383 flags |= O_FOFFSET; 384 error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult); 385 return(error); 386 } 387 388 /* 389 * MPSAFE 390 */ 391 int 392 sys_writev(struct sysmsg *sysmsg, const struct writev_args *uap) 393 { 394 struct thread *td = curthread; 395 struct uio auio; 396 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 397 int error; 398 399 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 400 &auio.uio_resid); 401 if (error) 402 return (error); 403 auio.uio_iov = iov; 404 auio.uio_iovcnt = uap->iovcnt; 405 auio.uio_offset = -1; 406 auio.uio_rw = UIO_WRITE; 407 auio.uio_segflg = UIO_USERSPACE; 408 auio.uio_td = td; 409 410 error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult); 411 412 iovec_free(&iov, aiov); 413 return (error); 414 } 415 416 417 /* 418 * Gather positioned write system call 419 * 420 * MPSAFE 421 */ 422 int 423 sys_extpwritev(struct sysmsg *sysmsg, const struct extpwritev_args *uap) 424 { 425 struct thread *td = curthread; 426 struct uio auio; 427 
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	fp = holdfp(td, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}
	dropfp(td, fd, fp);

	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}

/*
 * Ioctl system call.
 *
 * MPSAFE
 */
int
sys_ioctl(struct sysmsg *sysmsg, const struct ioctl_args *uap)
{
	int error;

	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, sysmsg);
	return (error);
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
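 *
 * Emulation layers register their command translation maps with
 * mapped_ioctl_register_handler(); the native sys_ioctl() path calls
 * in here with map == NULL.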
555 * 556 * MPSAFE 557 */ 558 int 559 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, 560 struct sysmsg *msg) 561 { 562 struct thread *td = curthread; 563 struct proc *p = td->td_proc; 564 struct ucred *cred; 565 struct file *fp; 566 struct ioctl_map_range *iomc = NULL; 567 int error; 568 u_int size; 569 u_long ocom = com; 570 caddr_t data, memp; 571 int tmp; 572 #define STK_PARAMS 128 573 union { 574 char stkbuf[STK_PARAMS]; 575 long align; 576 } ubuf; 577 578 KKASSERT(p); 579 cred = td->td_ucred; 580 memp = NULL; 581 582 fp = holdfp(td, fd, FREAD|FWRITE); 583 if (fp == NULL) 584 return(EBADF); 585 586 if (map != NULL) { /* obey translation map */ 587 u_long maskcmd; 588 struct ioctl_map_entry *e; 589 590 maskcmd = com & map->mask; 591 592 lwkt_gettoken(&mioctl_token); 593 LIST_FOREACH(e, &map->mapping, entries) { 594 for (iomc = e->cmd_ranges; iomc->start != 0 || 595 iomc->maptocmd != 0 || iomc->wrapfunc != NULL || 596 iomc->mapfunc != NULL; 597 iomc++) { 598 if (maskcmd >= iomc->start && 599 maskcmd <= iomc->end) 600 break; 601 } 602 603 /* Did we find a match? */ 604 if (iomc->start != 0 || iomc->maptocmd != 0 || 605 iomc->wrapfunc != NULL || iomc->mapfunc != NULL) 606 break; 607 } 608 lwkt_reltoken(&mioctl_token); 609 610 if (iomc == NULL || 611 (iomc->start == 0 && iomc->maptocmd == 0 612 && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) { 613 krateprintf(&krate_poll, 614 "%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) " 615 "not implemented\n", 616 map->sys, fd, maskcmd, 617 (int)((maskcmd >> 8) & 0xff), 618 (int)(maskcmd & 0xff)); 619 error = EINVAL; 620 goto done; 621 } 622 623 /* 624 * If it's a non-range one to one mapping, maptocmd should be 625 * correct. If it's a ranged one to one mapping, we pass the 626 * original value of com, and for a range mapped to a different 627 * range, we always need a mapping function to translate the 628 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff 629 */ 630 if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) { 631 com = iomc->maptocmd; 632 } else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) { 633 if (iomc->mapfunc != NULL) 634 com = iomc->mapfunc(iomc->start, iomc->end, 635 iomc->start, iomc->end, 636 com, com); 637 } else { 638 if (iomc->mapfunc != NULL) { 639 com = iomc->mapfunc(iomc->start, iomc->end, 640 iomc->maptocmd, iomc->maptoend, 641 com, ocom); 642 } else { 643 krateprintf(&krate_poll, 644 "%s: Invalid mapping for fd=%d, " 645 "cmd=%#lx ('%c',%d)\n", 646 map->sys, fd, maskcmd, 647 (int)((maskcmd >> 8) & 0xff), 648 (int)(maskcmd & 0xff)); 649 error = EINVAL; 650 goto done; 651 } 652 } 653 } 654 655 switch (com) { 656 case FIONCLEX: 657 error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE); 658 goto done; 659 case FIOCLEX: 660 error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE); 661 goto done; 662 } 663 664 /* 665 * Interpret high order word to find amount of data to be 666 * copied to/from the user's address space. 
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	if ((com & IOC_VOID) == 0 && size > sizeof(ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = ubuf.stkbuf;
	}
	if (com & IOC_VOID) {
		*(caddr_t *)data = uspc_data;
	} else if (com & IOC_IN) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error)
				goto done;
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
done:
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
	dropfp(td, fd, fp);

	return (error);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
		     M_WAITOK | M_ZERO);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	lwkt_gettoken(&mioctl_token);
	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
	lwkt_reltoken(&mioctl_token);

	return (0);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;
	int error = EINVAL;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	lwkt_gettoken(&mioctl_token);
	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges == he->cmd_ranges) {
			LIST_REMOVE(ne, entries);
			kfree(ne, M_IOCTLMAP);
			error = 0;
			break;
		}
	}
	lwkt_reltoken(&mioctl_token);
	return (error);
}

static int	nseldebug;
SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");

/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct sysmsg *sysmsg, const struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof(ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &sysmsg->sysmsg_result);

	return (error);
}

/*
 * Pselect system call.
 */
int
sys_pselect(struct sysmsg *sysmsg, const struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof(kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
		sigirefs_wait(lp->lwp_proc);
	}

	/*
	 * Do real job.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &sysmsg->sysmsg_result);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
			sigirefs_wait(lp->lwp_proc);
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}

static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OLDAPI | NOTE_OOB;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return (0);
			/* NOT REACHED */
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0,
				       (void *)(uintptr_t)
					skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;

				if (nseldebug) {
					kprintf("select fd %d filter %d "
						"serial %ju\n", fd, filter,
						(uintmax_t)
						skap->lwp->lwp_kqueue_serial);
				}
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		skap->active_set++;
		skap->proc_fds = 0;
	}

	return (0);
}

static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i;
	int n;

	skap = (struct select_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Filter out and delete spurious events
		 */
		if ((uint64_t)(uintptr_t)kevp[i].udata !=
		    skap->lwp->lwp_kqueue_serial) {
			panic("select_copyout: unexpected udata");
deregister:
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			n = 1;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev, &n, 0);
			if (nseldebug) {
				kprintf("select fd %ju mismatched serial %ju\n",
					(uintmax_t)kevp[i].ident,
					(uintmax_t)skap->lwp->lwp_kqueue_serial);
			}
			continue;
		}

		/*
		 * Handle errors
		 */
		if (kevp[i].flags & EV_ERROR) {
			int error = kevp[i].data;

			switch (error) {
			case EBADF:
				/*
				 * A bad file descriptor is considered a
				 * fatal error for select, bail out.
				 */
				skap->error = error;
				*res = -1;
				return error;

			default:
				/*
				 * Select silently swallows any unknown errors
				 * for descriptors in the read or write sets.
				 *
				 * ALWAYS filter out EOPNOTSUPP errors from
				 * filters (at least until all filters support
				 * EVFILT_EXCEPT).
				 *
				 * We also filter out ENODEV since dev_dkqfilter
				 * returns ENODEV if EOPNOTSUPP is returned in an
				 * inner call.
				 *
				 * XXX: fix this
				 */
				if (kevp[i].filter != EVFILT_READ &&
				    kevp[i].filter != EVFILT_WRITE &&
				    error != EOPNOTSUPP &&
				    error != ENODEV) {
					skap->error = error;
					*res = -1;
					return error;
				}
				break;
			}

			/*
			 * We must deregister any unsupported select events
			 * to avoid a live-lock.
			 */
			if (nseldebug) {
				kprintf("select fd %ju filter %d error %d\n",
					(uintmax_t)kevp[i].ident,
					kevp[i].filter, error);
			}
			goto deregister;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}

/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}

/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}

static int
dotimeout_only(struct timespec *ts)
{
	return (nanosleep1(ts, NULL));
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd == 0 && ts)
		return (dotimeout_only(ts));

	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	/* kap->read_set = NULL; not needed */
	kap->write_set = NULL;
	kap->except_set = NULL;

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited; 0x7FFFFFFF accomplishes this
	 *	 (at most nd * 3 events can be registered anyway).
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts,
			    KEVENT_AUTO_STALE);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * An error from an individual event that should be passed
	 * back to userland (EBADF)
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

	kap->lwp->lwp_kqueue_serial += kap->num_fds;

	return (error);
}

/*
 * Poll system call.
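 *
 * The timeout is given in milliseconds; INFTIM (-1) means wait
 * indefinitely.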
1230 * 1231 * MPSAFE 1232 */ 1233 int 1234 sys_poll(struct sysmsg *sysmsg, const struct poll_args *uap) 1235 { 1236 struct timespec ts, *tsp; 1237 int error; 1238 1239 if (uap->timeout != INFTIM) { 1240 if (uap->timeout < 0) 1241 return (EINVAL); 1242 ts.tv_sec = uap->timeout / 1000; 1243 ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000; 1244 tsp = &ts; 1245 } else { 1246 tsp = NULL; 1247 } 1248 1249 error = dopoll(uap->nfds, uap->fds, tsp, &sysmsg->sysmsg_result, 0); 1250 1251 return (error); 1252 } 1253 1254 /* 1255 * Ppoll system call. 1256 * 1257 * MPSAFE 1258 */ 1259 int 1260 sys_ppoll(struct sysmsg *sysmsg, const struct ppoll_args *uap) 1261 { 1262 struct thread *td = curthread; 1263 struct lwp *lp = td->td_lwp; 1264 struct timespec *ktsp, kts; 1265 sigset_t sigmask; 1266 int error; 1267 1268 /* 1269 * Get timeout if any. 1270 */ 1271 if (uap->ts != NULL) { 1272 error = copyin(uap->ts, &kts, sizeof (kts)); 1273 if (error) 1274 return (error); 1275 ktsp = &kts; 1276 } else { 1277 ktsp = NULL; 1278 } 1279 1280 /* 1281 * Install temporary signal mask if any provided. 1282 */ 1283 if (uap->sigmask != NULL) { 1284 error = copyin(uap->sigmask, &sigmask, sizeof(sigmask)); 1285 if (error) 1286 return (error); 1287 lwkt_gettoken(&lp->lwp_proc->p_token); 1288 lp->lwp_oldsigmask = lp->lwp_sigmask; 1289 SIG_CANTMASK(sigmask); 1290 lp->lwp_sigmask = sigmask; 1291 lwkt_reltoken(&lp->lwp_proc->p_token); 1292 sigirefs_wait(lp->lwp_proc); 1293 } 1294 1295 error = dopoll(uap->nfds, uap->fds, ktsp, &sysmsg->sysmsg_result, 1296 ktsp != NULL ? KEVENT_TIMEOUT_PRECISE : 0); 1297 1298 if (uap->sigmask != NULL) { 1299 lwkt_gettoken(&lp->lwp_proc->p_token); 1300 /* dopoll() responsible for turning ERESTART into EINTR */ 1301 KKASSERT(error != ERESTART); 1302 if (error == EINTR) { 1303 /* 1304 * We can't restore the previous signal mask now 1305 * because it could block the signal that interrupted 1306 * us. So make a note to restore it after executing 1307 * the handler. 1308 */ 1309 lp->lwp_flags |= LWP_OLDMASK; 1310 } else { 1311 /* 1312 * No handler to run. Restore previous mask immediately. 1313 */ 1314 lp->lwp_sigmask = lp->lwp_oldsigmask; 1315 sigirefs_wait(lp->lwp_proc); 1316 } 1317 lwkt_reltoken(&lp->lwp_proc->p_token); 1318 } 1319 1320 return (error); 1321 } 1322 1323 static int 1324 poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events) 1325 { 1326 struct poll_kevent_copyin_args *pkap; 1327 struct pollfd *pfd; 1328 struct kevent *kev; 1329 int kev_count; 1330 1331 pkap = (struct poll_kevent_copyin_args *)arg; 1332 1333 while (pkap->pfds < pkap->nfds) { 1334 pfd = &pkap->fds[pkap->pfds]; 1335 1336 /* Clear return events */ 1337 pfd->revents = 0; 1338 1339 /* Do not check if fd is equal to -1 */ 1340 if (pfd->fd == -1) { 1341 ++pkap->pfds; 1342 continue; 1343 } 1344 1345 /* 1346 * NOTE: pfd->events == 0 implies POLLHUP in BSDs. Used 1347 * by at least sshd and X11 udev support. 1348 */ 1349 kev_count = 0; 1350 if (pfd->events == 0) 1351 kev_count++; 1352 if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM)) 1353 kev_count++; 1354 if (pfd->events & (POLLOUT | POLLWRNORM)) 1355 kev_count++; 1356 if (pfd->events & (POLLPRI | POLLRDBAND)) 1357 kev_count++; 1358 1359 if (*events + kev_count > maxevents) 1360 return (0); 1361 1362 /* 1363 * NOTE: A combined serial number and poll array index is 1364 * stored in kev->udata. 1365 * 1366 * NOTE: Events will be registered with KEVENT_UNIQUE_NOTES 1367 * set, using kev->data for the uniqifier. 
		 *	 is implied in the actual registration.
		 */
		kev = &kevp[*events];

		/*
		 * Implied POLLHUP
		 */
		if (pfd->events == 0) {
			int notes = NOTE_OLDAPI | NOTE_HUPONLY;

			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       notes, pkap->pfds, (void *)(uintptr_t)
			       (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal read events
		 */
		if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM)) {
			int notes = NOTE_OLDAPI;
			if ((pfd->events & (POLLIN | POLLRDNORM)) == 0)
				notes |= NOTE_HUPONLY;

			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       notes, pkap->pfds, (void *)(uintptr_t)
			       (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal write events
		 */
		if (pfd->events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, pkap->pfds, (void *)(uintptr_t)
			       (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal exceptional events
		 */
		if (pfd->events & (POLLPRI | POLLRDBAND)) {
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI | NOTE_OOB, pkap->pfds,
			       (void *)(uintptr_t)
			       (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		if (nseldebug) {
			kprintf("poll index %d/%d fd %d events %08x "
				"serial %ju\n", pkap->pfds, pkap->nfds - 1,
				pfd->fd, pfd->events,
				(uintmax_t)pkap->lwp->lwp_kqueue_serial);
		}

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}

static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int count_res;
	int i;
	int n;
	uint64_t pi;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Extract the poll array index and delete spurious events.
		 * We can easily tell if the serial number is incorrect
		 * by checking whether the extracted index is out of range.
		 */
		pi = (uint64_t)(uintptr_t)kevp[i].udata -
		     pkap->lwp->lwp_kqueue_serial;
		if (pi >= pkap->nfds) {
			panic("poll_copyout: unexpected udata");
deregister:
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kev.data = pi;	/* uniquifier */
			n = 1;
			kqueue_register(&pkap->lwp->lwp_kqueue, &kev, &n,
					KEVENT_UNIQUE_NOTES);
			if (nseldebug) {
				kprintf("poll index %ju out of range against "
					"serial %ju\n", (uintmax_t)pi,
					(uintmax_t)pkap->lwp->lwp_kqueue_serial);
			}
			continue;
		}

		/*
		 * Locate the pollfd and process events
		 */
		pfd = &pkap->fds[pi];
		if (kevp[i].ident == pfd->fd) {
			/*
			 * A single descriptor may generate an error against
			 * more than one filter, make sure to set the
			 * appropriate flags but do not increment (*res)
			 * more than once.
			 */
			count_res = (pfd->revents == 0);
			if (kevp[i].flags & EV_ERROR) {
				switch (kevp[i].data) {
				case EBADF:
				case POLLNVAL:
					/* Bad file descriptor */
					if (count_res)
						++*res;
					pfd->revents |= POLLNVAL;
					break;
				default:
					/*
					 * Poll silently swallows any unknown
					 * errors except in the case of POLLPRI
					 * (OOB/urgent data).
1493 * 1494 * ALWAYS filter out EOPNOTSUPP errors 1495 * from filters, common applications 1496 * set POLLPRI|POLLRDBAND and most 1497 * filters do not support EVFILT_EXCEPT. 1498 * 1499 * We also filter out ENODEV since 1500 * dev_dkqfilter returns ENODEV if 1501 * EOPNOTSUPP is returned in an 1502 * inner call. 1503 * 1504 * XXX: fix this 1505 */ 1506 if (kevp[i].filter != EVFILT_READ && 1507 kevp[i].filter != EVFILT_WRITE && 1508 kevp[i].data != EOPNOTSUPP && 1509 kevp[i].data != ENODEV) { 1510 if (count_res) 1511 ++*res; 1512 pfd->revents |= POLLERR; 1513 } 1514 break; 1515 } 1516 if (pfd->revents == 0 && nseldebug) { 1517 kprintf("poll index EV_ERROR %ju fd %d " 1518 "filter %d error %jd\n", 1519 (uintmax_t)pi, pfd->fd, 1520 kevp[i].filter, 1521 (intmax_t)kevp[i].data); 1522 } 1523 1524 /* 1525 * Silently deregister any unhandled EV_ERROR 1526 * condition (usually EOPNOTSUPP). 1527 */ 1528 if (pfd->revents == 0) 1529 goto deregister; 1530 continue; 1531 } 1532 1533 switch (kevp[i].filter) { 1534 case EVFILT_READ: 1535 /* 1536 * NODATA on the read side can indicate a 1537 * half-closed situation and not necessarily 1538 * a disconnect, so depend on the user 1539 * issuing a read() and getting 0 bytes back. 1540 * 1541 * If EV_HUP is set the peer completely 1542 * disconnected and we can set POLLHUP. 1543 * Linux can return POLLHUP even if read 1544 * data has not been drained, so we should 1545 * too. 1546 */ 1547 /* if (kevp[i].flags & EV_NODATA) */ { 1548 if (kevp[i].flags & EV_HUP) 1549 pfd->revents |= POLLHUP; 1550 } 1551 if ((kevp[i].flags & EV_EOF) && 1552 kevp[i].fflags != 0) 1553 pfd->revents |= POLLERR; 1554 if (pfd->events & POLLIN) 1555 pfd->revents |= POLLIN; 1556 if (pfd->events & POLLRDNORM) 1557 pfd->revents |= POLLRDNORM; 1558 break; 1559 case EVFILT_WRITE: 1560 /* 1561 * As per the OpenGroup POLLHUP is mutually 1562 * exclusive with the writability flags. I 1563 * consider this a bit broken but... 1564 * 1565 * In this case a disconnect is implied even 1566 * for a half-closed (write side) situation. 1567 */ 1568 if (kevp[i].flags & EV_EOF) { 1569 pfd->revents |= POLLHUP; 1570 if (kevp[i].fflags != 0) 1571 pfd->revents |= POLLERR; 1572 } else { 1573 if (pfd->events & POLLOUT) 1574 pfd->revents |= POLLOUT; 1575 if (pfd->events & POLLWRNORM) 1576 pfd->revents |= POLLWRNORM; 1577 } 1578 break; 1579 case EVFILT_EXCEPT: 1580 /* 1581 * EV_NODATA should never be tagged for this 1582 * filter. 1583 */ 1584 if (pfd->events & POLLPRI) 1585 pfd->revents |= POLLPRI; 1586 if (pfd->events & POLLRDBAND) 1587 pfd->revents |= POLLRDBAND; 1588 break; 1589 } 1590 1591 if (nseldebug) { 1592 kprintf("poll index %ju/%d fd %d " 1593 "revents %08x\n", (uintmax_t)pi, pkap->nfds, 1594 pfd->fd, pfd->revents); 1595 } 1596 1597 if (count_res && pfd->revents) 1598 ++*res; 1599 } 1600 1601 /* 1602 * We must deregister any kqueue poll event that does not 1603 * set poll return bits to prevent a live-lock. 
		 */
		if (pfd->revents == 0) {
			krateprintf(&krate_poll,
				    "poll index %ju no-action %ju/%d "
				    "events=%08x kevpfilt=%d/%08x\n",
				    (uintmax_t)pi, (uintmax_t)kevp[i].ident,
				    pfd->fd, pfd->events,
				    kevp[i].filter, kevp[i].flags);
			goto deregister;
		}
	}

	return (0);
}

static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res, int flags)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	flags |= KEVENT_AUTO_STALE | KEVENT_UNIQUE_NOTES;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	if (nfds == 0 && ts)
		return (dotimeout_only(ts));

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);

	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
				    poll_copyin, poll_copyout, ts, flags);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

	ka.lwp->lwp_kqueue_serial += nfds;

	return (error);
}

static int
socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	return (0);
}

static int
socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	++*res;
	return (0);
}

extern struct fileops socketops;

/*
 * NOTE: Callers of socket_wait() must already have a reference on the
 *	 socket.
 */
int
socket_wait(struct socket *so, struct timespec *ts, int *res)
{
	struct thread *td = curthread;
	struct file *fp;
	struct kqueue kq;
	struct kevent kev;
	int error, fd;
	int n;

	if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
		return (error);

	fp->f_type = DTYPE_SOCKET;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &socketops;
	fp->f_data = so;
	fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);
	fsetfdflags(td->td_proc->p_fd, fd, UF_EXCLOSE);

	bzero(&kq, sizeof(kq));
	kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
	n = 1;
	if ((error = kqueue_register(&kq, &kev, &n, 0)) != 0) {
		fdrop(fp);
		return (error);
	}

	error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
			    socket_wait_copyout, ts, 0);

	EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL);
	n = 1;
	kqueue_register(&kq, &kev, &n, 0);
	fp->f_ops = &badfileops;
	fdrop(fp);

	return (error);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct sysmsg *sysmsg, const struct openbsd_poll_args *uap)
{
	return (sys_poll(sysmsg, (const struct poll_args *)uap));
}

/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}