1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $ 40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $ 41 */ 42 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysproto.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/socketvar.h> 55 #include <sys/uio.h> 56 #include <sys/kernel.h> 57 #include <sys/kern_syscall.h> 58 #include <sys/malloc.h> 59 #include <sys/mapped_ioctl.h> 60 #include <sys/poll.h> 61 #include <sys/queue.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/buf.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_page.h> 71 #include <sys/file2.h> 72 73 #include <machine/limits.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, 81 struct timeval *tv, int *res); 82 static int pollscan (struct proc *, struct pollfd *, u_int, int *); 83 static int selscan (struct proc *, fd_mask **, fd_mask **, 84 int, int *); 85 static int dofileread(int, struct file *, struct uio *, int, size_t *); 86 static int dofilewrite(int, struct file *, struct uio *, int, size_t *); 87 88 /* 89 * Read system call. 90 * 91 * MPSAFE 92 */ 93 int 94 sys_read(struct read_args *uap) 95 { 96 struct thread *td = curthread; 97 struct uio auio; 98 struct iovec aiov; 99 int error; 100 101 if ((ssize_t)uap->nbyte < 0) 102 error = EINVAL; 103 104 aiov.iov_base = uap->buf; 105 aiov.iov_len = uap->nbyte; 106 auio.uio_iov = &aiov; 107 auio.uio_iovcnt = 1; 108 auio.uio_offset = -1; 109 auio.uio_resid = uap->nbyte; 110 auio.uio_rw = UIO_READ; 111 auio.uio_segflg = UIO_USERSPACE; 112 auio.uio_td = td; 113 114 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult); 115 return(error); 116 } 117 118 /* 119 * Positioned (Pread) read system call 120 * 121 * MPSAFE 122 */ 123 int 124 sys_extpread(struct extpread_args *uap) 125 { 126 struct thread *td = curthread; 127 struct uio auio; 128 struct iovec aiov; 129 int error; 130 int flags; 131 132 if ((ssize_t)uap->nbyte < 0) 133 return(EINVAL); 134 135 aiov.iov_base = uap->buf; 136 aiov.iov_len = uap->nbyte; 137 auio.uio_iov = &aiov; 138 auio.uio_iovcnt = 1; 139 auio.uio_offset = uap->offset; 140 auio.uio_resid = uap->nbyte; 141 auio.uio_rw = UIO_READ; 142 auio.uio_segflg = UIO_USERSPACE; 143 auio.uio_td = td; 144 145 flags = uap->flags & O_FMASK; 146 if (uap->offset != (off_t)-1) 147 flags |= O_FOFFSET; 148 149 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult); 150 return(error); 151 } 152 153 /* 154 * Scatter read system call. 155 * 156 * MPSAFE 157 */ 158 int 159 sys_readv(struct readv_args *uap) 160 { 161 struct thread *td = curthread; 162 struct uio auio; 163 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 164 int error; 165 166 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 167 &auio.uio_resid); 168 if (error) 169 return (error); 170 auio.uio_iov = iov; 171 auio.uio_iovcnt = uap->iovcnt; 172 auio.uio_offset = -1; 173 auio.uio_rw = UIO_READ; 174 auio.uio_segflg = UIO_USERSPACE; 175 auio.uio_td = td; 176 177 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult); 178 179 iovec_free(&iov, aiov); 180 return (error); 181 } 182 183 184 /* 185 * Scatter positioned read system call. 186 * 187 * MPSAFE 188 */ 189 int 190 sys_extpreadv(struct extpreadv_args *uap) 191 { 192 struct thread *td = curthread; 193 struct uio auio; 194 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 195 int error; 196 int flags; 197 198 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 199 &auio.uio_resid); 200 if (error) 201 return (error); 202 auio.uio_iov = iov; 203 auio.uio_iovcnt = uap->iovcnt; 204 auio.uio_offset = uap->offset; 205 auio.uio_rw = UIO_READ; 206 auio.uio_segflg = UIO_USERSPACE; 207 auio.uio_td = td; 208 209 flags = uap->flags & O_FMASK; 210 if (uap->offset != (off_t)-1) 211 flags |= O_FOFFSET; 212 213 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult); 214 215 iovec_free(&iov, aiov); 216 return(error); 217 } 218 219 /* 220 * MPSAFE 221 */ 222 int 223 kern_preadv(int fd, struct uio *auio, int flags, size_t *res) 224 { 225 struct thread *td = curthread; 226 struct proc *p = td->td_proc; 227 struct file *fp; 228 int error; 229 230 KKASSERT(p); 231 232 fp = holdfp(p->p_fd, fd, FREAD); 233 if (fp == NULL) 234 return (EBADF); 235 if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) { 236 error = ESPIPE; 237 } else { 238 error = dofileread(fd, fp, auio, flags, res); 239 } 240 fdrop(fp); 241 return(error); 242 } 243 244 /* 245 * Common code for readv and preadv that reads data in 246 * from a file using the passed in uio, offset, and flags. 247 * 248 * MPALMOSTSAFE - ktrace needs help 249 */ 250 static int 251 dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 252 { 253 struct thread *td = curthread; 254 int error; 255 size_t len; 256 #ifdef KTRACE 257 struct iovec *ktriov = NULL; 258 struct uio ktruio; 259 #endif 260 261 #ifdef KTRACE 262 /* 263 * if tracing, save a copy of iovec 264 */ 265 if (KTRPOINT(td, KTR_GENIO)) { 266 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 267 268 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 269 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 270 ktruio = *auio; 271 } 272 #endif 273 len = auio->uio_resid; 274 error = fo_read(fp, auio, fp->f_cred, flags); 275 if (error) { 276 if (auio->uio_resid != len && (error == ERESTART || 277 error == EINTR || error == EWOULDBLOCK)) 278 error = 0; 279 } 280 #ifdef KTRACE 281 if (ktriov != NULL) { 282 if (error == 0) { 283 ktruio.uio_iov = ktriov; 284 ktruio.uio_resid = len - auio->uio_resid; 285 get_mplock(); 286 ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error); 287 rel_mplock(); 288 } 289 FREE(ktriov, M_TEMP); 290 } 291 #endif 292 if (error == 0) 293 *res = len - auio->uio_resid; 294 295 return(error); 296 } 297 298 /* 299 * Write system call 300 * 301 * MPSAFE 302 */ 303 int 304 sys_write(struct write_args *uap) 305 { 306 struct thread *td = curthread; 307 struct uio auio; 308 struct iovec aiov; 309 int error; 310 311 if ((ssize_t)uap->nbyte < 0) 312 error = EINVAL; 313 314 aiov.iov_base = (void *)(uintptr_t)uap->buf; 315 aiov.iov_len = uap->nbyte; 316 auio.uio_iov = &aiov; 317 auio.uio_iovcnt = 1; 318 auio.uio_offset = -1; 319 auio.uio_resid = uap->nbyte; 320 auio.uio_rw = UIO_WRITE; 321 auio.uio_segflg = UIO_USERSPACE; 322 auio.uio_td = td; 323 324 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult); 325 326 return(error); 327 } 328 329 /* 330 * Pwrite system call 331 * 332 * MPSAFE 333 */ 334 int 335 sys_extpwrite(struct extpwrite_args *uap) 336 { 337 struct thread *td = curthread; 338 struct uio auio; 339 struct iovec aiov; 340 int error; 341 int flags; 342 343 if ((ssize_t)uap->nbyte < 0) 344 error = EINVAL; 345 346 aiov.iov_base = (void *)(uintptr_t)uap->buf; 347 aiov.iov_len = uap->nbyte; 348 auio.uio_iov = &aiov; 349 auio.uio_iovcnt = 1; 350 auio.uio_offset = uap->offset; 351 auio.uio_resid = uap->nbyte; 352 auio.uio_rw = UIO_WRITE; 353 auio.uio_segflg = UIO_USERSPACE; 354 auio.uio_td = td; 355 356 flags = uap->flags & O_FMASK; 357 if (uap->offset != (off_t)-1) 358 flags |= O_FOFFSET; 359 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult); 360 return(error); 361 } 362 363 /* 364 * MPSAFE 365 */ 366 int 367 sys_writev(struct writev_args *uap) 368 { 369 struct thread *td = curthread; 370 struct uio auio; 371 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 372 int error; 373 374 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 375 &auio.uio_resid); 376 if (error) 377 return (error); 378 auio.uio_iov = iov; 379 auio.uio_iovcnt = uap->iovcnt; 380 auio.uio_offset = -1; 381 auio.uio_rw = UIO_WRITE; 382 auio.uio_segflg = UIO_USERSPACE; 383 auio.uio_td = td; 384 385 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult); 386 387 iovec_free(&iov, aiov); 388 return (error); 389 } 390 391 392 /* 393 * Gather positioned write system call 394 * 395 * MPSAFE 396 */ 397 int 398 sys_extpwritev(struct extpwritev_args *uap) 399 { 400 struct thread *td = curthread; 401 struct uio auio; 402 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 403 int error; 404 int flags; 405 406 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 407 &auio.uio_resid); 408 if (error) 409 return (error); 410 auio.uio_iov = iov; 411 auio.uio_iovcnt = uap->iovcnt; 412 auio.uio_offset = uap->offset; 413 auio.uio_rw = UIO_WRITE; 414 auio.uio_segflg = UIO_USERSPACE; 415 auio.uio_td = td; 416 417 flags = uap->flags & O_FMASK; 418 if (uap->offset != (off_t)-1) 419 flags |= O_FOFFSET; 420 421 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult); 422 423 iovec_free(&iov, aiov); 424 return(error); 425 } 426 427 /* 428 * MPSAFE 429 */ 430 int 431 kern_pwritev(int fd, struct uio *auio, int flags, size_t *res) 432 { 433 struct thread *td = curthread; 434 struct proc *p = td->td_proc; 435 struct file *fp; 436 int error; 437 438 KKASSERT(p); 439 440 fp = holdfp(p->p_fd, fd, FWRITE); 441 if (fp == NULL) 442 return (EBADF); 443 else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) { 444 error = ESPIPE; 445 } else { 446 error = dofilewrite(fd, fp, auio, flags, res); 447 } 448 449 fdrop(fp); 450 return (error); 451 } 452 453 /* 454 * Common code for writev and pwritev that writes data to 455 * a file using the passed in uio, offset, and flags. 456 * 457 * MPALMOSTSAFE - ktrace needs help 458 */ 459 static int 460 dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 461 { 462 struct thread *td = curthread; 463 struct lwp *lp = td->td_lwp; 464 int error; 465 size_t len; 466 #ifdef KTRACE 467 struct iovec *ktriov = NULL; 468 struct uio ktruio; 469 #endif 470 471 #ifdef KTRACE 472 /* 473 * if tracing, save a copy of iovec and uio 474 */ 475 if (KTRPOINT(td, KTR_GENIO)) { 476 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 477 478 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 479 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 480 ktruio = *auio; 481 } 482 #endif 483 len = auio->uio_resid; 484 error = fo_write(fp, auio, fp->f_cred, flags); 485 if (error) { 486 if (auio->uio_resid != len && (error == ERESTART || 487 error == EINTR || error == EWOULDBLOCK)) 488 error = 0; 489 /* Socket layer is responsible for issuing SIGPIPE. */ 490 if (error == EPIPE) { 491 get_mplock(); 492 lwpsignal(lp->lwp_proc, lp, SIGPIPE); 493 rel_mplock(); 494 } 495 } 496 #ifdef KTRACE 497 if (ktriov != NULL) { 498 if (error == 0) { 499 ktruio.uio_iov = ktriov; 500 ktruio.uio_resid = len - auio->uio_resid; 501 get_mplock(); 502 ktrgenio(lp, fd, UIO_WRITE, &ktruio, error); 503 rel_mplock(); 504 } 505 FREE(ktriov, M_TEMP); 506 } 507 #endif 508 if (error == 0) 509 *res = len - auio->uio_resid; 510 511 return(error); 512 } 513 514 /* 515 * Ioctl system call 516 * 517 * MPALMOSTSAFE 518 */ 519 int 520 sys_ioctl(struct ioctl_args *uap) 521 { 522 int error; 523 524 get_mplock(); 525 error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg); 526 rel_mplock(); 527 return (error); 528 } 529 530 struct ioctl_map_entry { 531 const char *subsys; 532 struct ioctl_map_range *cmd_ranges; 533 LIST_ENTRY(ioctl_map_entry) entries; 534 }; 535 536 /* 537 * The true heart of all ioctl syscall handlers (native, emulation). 538 * If map != NULL, it will be searched for a matching entry for com, 539 * and appropriate conversions/conversion functions will be utilized. 540 */ 541 int 542 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, 543 struct sysmsg *msg) 544 { 545 struct thread *td = curthread; 546 struct proc *p = td->td_proc; 547 struct ucred *cred; 548 struct file *fp; 549 struct ioctl_map_range *iomc = NULL; 550 int error; 551 u_int size; 552 u_long ocom = com; 553 caddr_t data, memp; 554 int tmp; 555 #define STK_PARAMS 128 556 union { 557 char stkbuf[STK_PARAMS]; 558 long align; 559 } ubuf; 560 561 KKASSERT(p); 562 cred = td->td_ucred; 563 564 fp = holdfp(p->p_fd, fd, FREAD|FWRITE); 565 if (fp == NULL) 566 return(EBADF); 567 568 if (map != NULL) { /* obey translation map */ 569 u_long maskcmd; 570 struct ioctl_map_entry *e; 571 572 maskcmd = com & map->mask; 573 574 LIST_FOREACH(e, &map->mapping, entries) { 575 for (iomc = e->cmd_ranges; iomc->start != 0 || 576 iomc->maptocmd != 0 || iomc->wrapfunc != NULL || 577 iomc->mapfunc != NULL; 578 iomc++) { 579 if (maskcmd >= iomc->start && 580 maskcmd <= iomc->end) 581 break; 582 } 583 584 /* Did we find a match? */ 585 if (iomc->start != 0 || iomc->maptocmd != 0 || 586 iomc->wrapfunc != NULL || iomc->mapfunc != NULL) 587 break; 588 } 589 590 if (iomc == NULL || 591 (iomc->start == 0 && iomc->maptocmd == 0 592 && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) { 593 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n", 594 map->sys, fd, maskcmd, 595 (int)((maskcmd >> 8) & 0xff), 596 (int)(maskcmd & 0xff)); 597 error = EINVAL; 598 goto done; 599 } 600 601 /* 602 * If it's a non-range one to one mapping, maptocmd should be 603 * correct. If it's a ranged one to one mapping, we pass the 604 * original value of com, and for a range mapped to a different 605 * range, we always need a mapping function to translate the 606 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff 607 */ 608 if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) { 609 com = iomc->maptocmd; 610 } else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) { 611 if (iomc->mapfunc != NULL) 612 com = iomc->mapfunc(iomc->start, iomc->end, 613 iomc->start, iomc->end, 614 com, com); 615 } else { 616 if (iomc->mapfunc != NULL) { 617 com = iomc->mapfunc(iomc->start, iomc->end, 618 iomc->maptocmd, iomc->maptoend, 619 com, ocom); 620 } else { 621 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n", 622 map->sys, fd, maskcmd, 623 (int)((maskcmd >> 8) & 0xff), 624 (int)(maskcmd & 0xff)); 625 error = EINVAL; 626 goto done; 627 } 628 } 629 } 630 631 switch (com) { 632 case FIONCLEX: 633 error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE); 634 goto done; 635 case FIOCLEX: 636 error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE); 637 goto done; 638 } 639 640 /* 641 * Interpret high order word to find amount of data to be 642 * copied to/from the user's address space. 643 */ 644 size = IOCPARM_LEN(com); 645 if (size > IOCPARM_MAX) { 646 error = ENOTTY; 647 goto done; 648 } 649 650 memp = NULL; 651 if (size > sizeof (ubuf.stkbuf)) { 652 memp = kmalloc(size, M_IOCTLOPS, M_WAITOK); 653 data = memp; 654 } else { 655 data = ubuf.stkbuf; 656 } 657 if ((com & IOC_IN) != 0) { 658 if (size != 0) { 659 error = copyin(uspc_data, data, (size_t)size); 660 if (error) { 661 if (memp != NULL) 662 kfree(memp, M_IOCTLOPS); 663 goto done; 664 } 665 } else { 666 *(caddr_t *)data = uspc_data; 667 } 668 } else if ((com & IOC_OUT) != 0 && size) { 669 /* 670 * Zero the buffer so the user always 671 * gets back something deterministic. 672 */ 673 bzero(data, (size_t)size); 674 } else if ((com & IOC_VOID) != 0) { 675 *(caddr_t *)data = uspc_data; 676 } 677 678 switch (com) { 679 case FIONBIO: 680 if ((tmp = *(int *)data)) 681 fp->f_flag |= FNONBLOCK; 682 else 683 fp->f_flag &= ~FNONBLOCK; 684 error = 0; 685 break; 686 687 case FIOASYNC: 688 if ((tmp = *(int *)data)) 689 fp->f_flag |= FASYNC; 690 else 691 fp->f_flag &= ~FASYNC; 692 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg); 693 break; 694 695 default: 696 /* 697 * If there is a override function, 698 * call it instead of directly routing the call 699 */ 700 if (map != NULL && iomc->wrapfunc != NULL) 701 error = iomc->wrapfunc(fp, com, ocom, data, cred); 702 else 703 error = fo_ioctl(fp, com, data, cred, msg); 704 /* 705 * Copy any data to user, size was 706 * already set and checked above. 707 */ 708 if (error == 0 && (com & IOC_OUT) != 0 && size != 0) 709 error = copyout(data, uspc_data, (size_t)size); 710 break; 711 } 712 if (memp != NULL) 713 kfree(memp, M_IOCTLOPS); 714 done: 715 fdrop(fp); 716 return(error); 717 } 718 719 int 720 mapped_ioctl_register_handler(struct ioctl_map_handler *he) 721 { 722 struct ioctl_map_entry *ne; 723 724 KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL && 725 he->subsys != NULL && *he->subsys != '\0'); 726 727 ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK); 728 729 ne->subsys = he->subsys; 730 ne->cmd_ranges = he->cmd_ranges; 731 732 LIST_INSERT_HEAD(&he->map->mapping, ne, entries); 733 734 return(0); 735 } 736 737 int 738 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he) 739 { 740 struct ioctl_map_entry *ne; 741 742 KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL); 743 744 LIST_FOREACH(ne, &he->map->mapping, entries) { 745 if (ne->cmd_ranges != he->cmd_ranges) 746 continue; 747 LIST_REMOVE(ne, entries); 748 kfree(ne, M_IOCTLMAP); 749 return(0); 750 } 751 return(EINVAL); 752 } 753 754 static int nselcoll; /* Select collisions since boot */ 755 int selwait; 756 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 757 758 /* 759 * Select system call. 760 * 761 * MPALMOSTSAFE 762 */ 763 int 764 sys_select(struct select_args *uap) 765 { 766 struct timeval ktv; 767 struct timeval *ktvp; 768 int error; 769 770 /* 771 * Get timeout if any. 772 */ 773 if (uap->tv != NULL) { 774 error = copyin(uap->tv, &ktv, sizeof (ktv)); 775 if (error) 776 return (error); 777 error = itimerfix(&ktv); 778 if (error) 779 return (error); 780 ktvp = &ktv; 781 } else { 782 ktvp = NULL; 783 } 784 785 /* 786 * Do real work. 787 */ 788 get_mplock(); 789 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 790 &uap->sysmsg_result); 791 rel_mplock(); 792 793 return (error); 794 } 795 796 797 /* 798 * Pselect system call. 799 * 800 * MPALMOSTSAFE 801 */ 802 int 803 sys_pselect(struct pselect_args *uap) 804 { 805 struct thread *td = curthread; 806 struct lwp *lp = td->td_lwp; 807 struct timespec kts; 808 struct timeval ktv; 809 struct timeval *ktvp; 810 sigset_t sigmask; 811 int error; 812 813 /* 814 * Get timeout if any and convert it. 815 * Round up during conversion to avoid timeout going off early. 816 */ 817 if (uap->ts != NULL) { 818 error = copyin(uap->ts, &kts, sizeof (kts)); 819 if (error) 820 return (error); 821 ktv.tv_sec = kts.tv_sec; 822 ktv.tv_usec = (kts.tv_nsec + 999) / 1000; 823 error = itimerfix(&ktv); 824 if (error) 825 return (error); 826 ktvp = &ktv; 827 } else { 828 ktvp = NULL; 829 } 830 831 /* 832 * Install temporary signal mask if any provided. 833 */ 834 if (uap->sigmask != NULL) { 835 error = copyin(uap->sigmask, &sigmask, sizeof(sigmask)); 836 if (error) 837 return (error); 838 get_mplock(); 839 lp->lwp_oldsigmask = lp->lwp_sigmask; 840 SIG_CANTMASK(sigmask); 841 lp->lwp_sigmask = sigmask; 842 } else { 843 get_mplock(); 844 } 845 846 /* 847 * Do real job. 848 */ 849 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 850 &uap->sysmsg_result); 851 852 if (uap->sigmask != NULL) { 853 /* doselect() responsible for turning ERESTART into EINTR */ 854 KKASSERT(error != ERESTART); 855 if (error == EINTR) { 856 /* 857 * We can't restore the previous signal mask now 858 * because it could block the signal that interrupted 859 * us. So make a note to restore it after executing 860 * the handler. 861 */ 862 lp->lwp_flag |= LWP_OLDMASK; 863 } else { 864 /* 865 * No handler to run. Restore previous mask immediately. 866 */ 867 lp->lwp_sigmask = lp->lwp_oldsigmask; 868 } 869 } 870 rel_mplock(); 871 872 return (error); 873 } 874 875 /* 876 * Common code for sys_select() and sys_pselect(). 877 * 878 * in, out and ex are userland pointers. tv must point to validated 879 * kernel-side timeout value or NULL for infinite timeout. res must 880 * point to syscall return value. 881 */ 882 static int 883 doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv, 884 int *res) 885 { 886 struct lwp *lp = curthread->td_lwp; 887 struct proc *p = curproc; 888 889 /* 890 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 891 * infds with the new FD_SETSIZE of 1024, and more than enough for 892 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 893 * of 256. 894 */ 895 fd_mask s_selbits[howmany(2048, NFDBITS)]; 896 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 897 struct timeval atv, rtv, ttv; 898 int ncoll, error, timo; 899 u_int nbufbytes, ncpbytes, nfdbits; 900 901 if (nd < 0) 902 return (EINVAL); 903 if (nd > p->p_fd->fd_nfiles) 904 nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 905 906 /* 907 * Allocate just enough bits for the non-null fd_sets. Use the 908 * preallocated auto buffer if possible. 909 */ 910 nfdbits = roundup(nd, NFDBITS); 911 ncpbytes = nfdbits / NBBY; 912 nbufbytes = 0; 913 if (in != NULL) 914 nbufbytes += 2 * ncpbytes; 915 if (ou != NULL) 916 nbufbytes += 2 * ncpbytes; 917 if (ex != NULL) 918 nbufbytes += 2 * ncpbytes; 919 if (nbufbytes <= sizeof s_selbits) 920 selbits = &s_selbits[0]; 921 else 922 selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK); 923 924 /* 925 * Assign pointers into the bit buffers and fetch the input bits. 926 * Put the output buffers together so that they can be bzeroed 927 * together. 928 */ 929 sbp = selbits; 930 #define getbits(name, x) \ 931 do { \ 932 if (name == NULL) \ 933 ibits[x] = NULL; \ 934 else { \ 935 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 936 obits[x] = sbp; \ 937 sbp += ncpbytes / sizeof *sbp; \ 938 error = copyin(name, ibits[x], ncpbytes); \ 939 if (error != 0) \ 940 goto done; \ 941 } \ 942 } while (0) 943 getbits(in, 0); 944 getbits(ou, 1); 945 getbits(ex, 2); 946 #undef getbits 947 if (nbufbytes != 0) 948 bzero(selbits, nbufbytes / 2); 949 950 if (tv != NULL) { 951 atv = *tv; 952 getmicrouptime(&rtv); 953 timevaladd(&atv, &rtv); 954 } else { 955 atv.tv_sec = 0; 956 atv.tv_usec = 0; 957 } 958 timo = 0; 959 retry: 960 ncoll = nselcoll; 961 lp->lwp_flag |= LWP_SELECT; 962 error = selscan(p, ibits, obits, nd, res); 963 if (error || *res) 964 goto done; 965 if (atv.tv_sec || atv.tv_usec) { 966 getmicrouptime(&rtv); 967 if (timevalcmp(&rtv, &atv, >=)) 968 goto done; 969 ttv = atv; 970 timevalsub(&ttv, &rtv); 971 timo = ttv.tv_sec > 24 * 60 * 60 ? 972 24 * 60 * 60 * hz : tvtohz_high(&ttv); 973 } 974 crit_enter(); 975 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 976 crit_exit(); 977 goto retry; 978 } 979 lp->lwp_flag &= ~LWP_SELECT; 980 981 error = tsleep((caddr_t)&selwait, PCATCH, "select", timo); 982 983 crit_exit(); 984 if (error == 0) 985 goto retry; 986 done: 987 lp->lwp_flag &= ~LWP_SELECT; 988 /* select is not restarted after signals... */ 989 if (error == ERESTART) 990 error = EINTR; 991 if (error == EWOULDBLOCK) 992 error = 0; 993 #define putbits(name, x) \ 994 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 995 error = error2; 996 if (error == 0) { 997 int error2; 998 999 putbits(in, 0); 1000 putbits(ou, 1); 1001 putbits(ex, 2); 1002 #undef putbits 1003 } 1004 if (selbits != &s_selbits[0]) 1005 kfree(selbits, M_SELECT); 1006 return (error); 1007 } 1008 1009 static int 1010 selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res) 1011 { 1012 int msk, i, fd; 1013 fd_mask bits; 1014 struct file *fp; 1015 int n = 0; 1016 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 1017 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 1018 1019 for (msk = 0; msk < 3; msk++) { 1020 if (ibits[msk] == NULL) 1021 continue; 1022 for (i = 0; i < nfd; i += NFDBITS) { 1023 bits = ibits[msk][i/NFDBITS]; 1024 /* ffs(int mask) not portable, fd_mask is long */ 1025 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 1026 if (!(bits & 1)) 1027 continue; 1028 fp = holdfp(p->p_fd, fd, -1); 1029 if (fp == NULL) 1030 return (EBADF); 1031 if (fo_poll(fp, flag[msk], fp->f_cred)) { 1032 obits[msk][(fd)/NFDBITS] |= 1033 ((fd_mask)1 << ((fd) % NFDBITS)); 1034 n++; 1035 } 1036 fdrop(fp); 1037 } 1038 } 1039 } 1040 *res = n; 1041 return (0); 1042 } 1043 1044 /* 1045 * Poll system call. 1046 * 1047 * MPALMOSTSAFE 1048 */ 1049 int 1050 sys_poll(struct poll_args *uap) 1051 { 1052 struct pollfd *bits; 1053 struct pollfd smallbits[32]; 1054 struct timeval atv, rtv, ttv; 1055 int ncoll, error = 0, timo; 1056 u_int nfds; 1057 size_t ni; 1058 struct lwp *lp = curthread->td_lwp; 1059 struct proc *p = curproc; 1060 1061 nfds = uap->nfds; 1062 /* 1063 * This is kinda bogus. We have fd limits, but that is not 1064 * really related to the size of the pollfd array. Make sure 1065 * we let the process use at least FD_SETSIZE entries and at 1066 * least enough for the current limits. We want to be reasonably 1067 * safe, but not overly restrictive. 1068 */ 1069 if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE) 1070 return (EINVAL); 1071 ni = nfds * sizeof(struct pollfd); 1072 if (ni > sizeof(smallbits)) 1073 bits = kmalloc(ni, M_TEMP, M_WAITOK); 1074 else 1075 bits = smallbits; 1076 error = copyin(uap->fds, bits, ni); 1077 if (error) 1078 goto done; 1079 if (uap->timeout != INFTIM) { 1080 atv.tv_sec = uap->timeout / 1000; 1081 atv.tv_usec = (uap->timeout % 1000) * 1000; 1082 if (itimerfix(&atv)) { 1083 error = EINVAL; 1084 goto done; 1085 } 1086 getmicrouptime(&rtv); 1087 timevaladd(&atv, &rtv); 1088 } else { 1089 atv.tv_sec = 0; 1090 atv.tv_usec = 0; 1091 } 1092 timo = 0; 1093 retry: 1094 ncoll = nselcoll; 1095 lp->lwp_flag |= LWP_SELECT; 1096 get_mplock(); 1097 error = pollscan(p, bits, nfds, &uap->sysmsg_result); 1098 rel_mplock(); 1099 if (error || uap->sysmsg_result) 1100 goto done; 1101 if (atv.tv_sec || atv.tv_usec) { 1102 getmicrouptime(&rtv); 1103 if (timevalcmp(&rtv, &atv, >=)) 1104 goto done; 1105 ttv = atv; 1106 timevalsub(&ttv, &rtv); 1107 timo = ttv.tv_sec > 24 * 60 * 60 ? 1108 24 * 60 * 60 * hz : tvtohz_high(&ttv); 1109 } 1110 crit_enter(); 1111 tsleep_interlock(&selwait, PCATCH); 1112 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 1113 crit_exit(); 1114 goto retry; 1115 } 1116 lp->lwp_flag &= ~LWP_SELECT; 1117 error = tsleep(&selwait, PCATCH | PINTERLOCKED, "poll", timo); 1118 crit_exit(); 1119 if (error == 0) 1120 goto retry; 1121 done: 1122 lp->lwp_flag &= ~LWP_SELECT; 1123 /* poll is not restarted after signals... */ 1124 if (error == ERESTART) 1125 error = EINTR; 1126 if (error == EWOULDBLOCK) 1127 error = 0; 1128 if (error == 0) { 1129 error = copyout(bits, uap->fds, ni); 1130 if (error) 1131 goto out; 1132 } 1133 out: 1134 if (ni > sizeof(smallbits)) 1135 kfree(bits, M_TEMP); 1136 return (error); 1137 } 1138 1139 static int 1140 pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res) 1141 { 1142 int i; 1143 struct file *fp; 1144 int n = 0; 1145 1146 for (i = 0; i < nfd; i++, fds++) { 1147 if (fds->fd >= p->p_fd->fd_nfiles) { 1148 fds->revents = POLLNVAL; 1149 n++; 1150 } else if (fds->fd < 0) { 1151 fds->revents = 0; 1152 } else { 1153 fp = holdfp(p->p_fd, fds->fd, -1); 1154 if (fp == NULL) { 1155 fds->revents = POLLNVAL; 1156 n++; 1157 } else { 1158 /* 1159 * Note: backend also returns POLLHUP and 1160 * POLLERR if appropriate. 1161 */ 1162 fds->revents = fo_poll(fp, fds->events, 1163 fp->f_cred); 1164 if (fds->revents != 0) 1165 n++; 1166 fdrop(fp); 1167 } 1168 } 1169 } 1170 *res = n; 1171 return (0); 1172 } 1173 1174 /* 1175 * OpenBSD poll system call. 1176 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1177 * 1178 * MPSAFE 1179 */ 1180 int 1181 sys_openbsd_poll(struct openbsd_poll_args *uap) 1182 { 1183 return (sys_poll((struct poll_args *)uap)); 1184 } 1185 1186 /*ARGSUSED*/ 1187 int 1188 seltrue(cdev_t dev, int events) 1189 { 1190 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1191 } 1192 1193 /* 1194 * Record a select request. A global wait must be used since a process/thread 1195 * might go away after recording its request. 1196 */ 1197 void 1198 selrecord(struct thread *selector, struct selinfo *sip) 1199 { 1200 struct proc *p; 1201 struct lwp *lp = NULL; 1202 1203 if (selector->td_lwp == NULL) 1204 panic("selrecord: thread needs a process"); 1205 1206 if (sip->si_pid == selector->td_proc->p_pid && 1207 sip->si_tid == selector->td_lwp->lwp_tid) 1208 return; 1209 if (sip->si_pid && (p = pfind(sip->si_pid))) 1210 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid); 1211 if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) { 1212 sip->si_flags |= SI_COLL; 1213 } else { 1214 sip->si_pid = selector->td_proc->p_pid; 1215 sip->si_tid = selector->td_lwp->lwp_tid; 1216 } 1217 } 1218 1219 /* 1220 * Do a wakeup when a selectable event occurs. 1221 */ 1222 void 1223 selwakeup(struct selinfo *sip) 1224 { 1225 struct proc *p; 1226 struct lwp *lp = NULL; 1227 1228 if (sip->si_pid == 0) 1229 return; 1230 if (sip->si_flags & SI_COLL) { 1231 nselcoll++; 1232 sip->si_flags &= ~SI_COLL; 1233 wakeup((caddr_t)&selwait); /* YYY fixable */ 1234 } 1235 p = pfind(sip->si_pid); 1236 sip->si_pid = 0; 1237 if (p == NULL) 1238 return; 1239 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid); 1240 if (lp == NULL) 1241 return; 1242 1243 crit_enter(); 1244 if (lp->lwp_wchan == (caddr_t)&selwait) { 1245 /* 1246 * Flag the process to break the tsleep when 1247 * setrunnable is called, but only call setrunnable 1248 * here if the process is not in a stopped state. 1249 */ 1250 lp->lwp_flag |= LWP_BREAKTSLEEP; 1251 if (p->p_stat != SSTOP) 1252 setrunnable(lp); 1253 } else if (lp->lwp_flag & LWP_SELECT) { 1254 lp->lwp_flag &= ~LWP_SELECT; 1255 } 1256 crit_exit(); 1257 } 1258 1259