/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/malloc.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/socketops.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/spinlock2.h>
#include <sys/signal2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static struct krate krate_poll = { .freq = 1 };

typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;
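
/*
 * kfd_set is a minimal kernel-side fd_set: getbits() copies small user
 * descriptor sets into a caller-supplied kfd_set on the stack and only
 * falls back to kmalloc() when the set does not fit.
 */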

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE
};

struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};

struct poll_kevent_copyin_args {
	struct lwp	*lwp;		/* Pointer to our lwp */
	struct pollfd	*fds;		/* Kernel copy of the pollfd array */
	int		nfds;		/* Number of pollfd entries (syscall arg) */
	int		pfds;		/* Entries processed so far */
	int		error;		/* Returned to userland */
};

static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
		       int *res, int flags);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);

/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct sysmsg *sysmsg, const struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);
	return(error);
}

/*
 * Positioned (Pread) read system call
 *
 * MPSAFE
 */
int
sys_extpread(struct sysmsg *sysmsg, const struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
	return(error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct sysmsg *sysmsg, const struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct sysmsg *sysmsg, const struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	fp = holdfp(td, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	dropfp(td, fd, fp);

	return(error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	int error;
	size_t len;
#ifdef KTRACE
	struct thread *td = curthread;
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call
 *
 * MPSAFE
 */
int
sys_write(struct sysmsg *sysmsg, const struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);

	return(error);
}

/*
 * Pwrite system call
 *
 * MPSAFE
 */
int
sys_extpwrite(struct sysmsg *sysmsg, const struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;
	error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
	return(error);
}

/*
 * MPSAFE
 */
int
sys_writev(struct sysmsg *sysmsg, const struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Gather positioned write system call
 *
 * MPSAFE
 */
int
sys_extpwritev(struct sysmsg *sysmsg, const struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	fp = holdfp(td, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}
	dropfp(td, fd, fp);

	return(error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call
 *
 * MPSAFE
 */
int
sys_ioctl(struct sysmsg *sysmsg, const struct ioctl_args *uap)
{
	int error;

	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, sysmsg);
	return (error);
}
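
/*
 * Note that the native sys_ioctl() path passes map == NULL to
 * mapped_ioctl(), so no command translation occurs; emulation layers
 * supply their own ioctl_map.
 */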

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 *
 * MPSAFE
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;
	memp = NULL;

	fp = holdfp(td, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		lwkt_gettoken(&mioctl_token);
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}
		lwkt_reltoken(&mioctl_token);

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0 &&
		     iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			krateprintf(&krate_poll,
				    "%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) "
				    "not implemented\n",
				    map->sys, fd, maskcmd,
				    (int)((maskcmd >> 8) & 0xff),
				    (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end &&
		    iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd &&
			   iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd,
						    iomc->maptoend,
						    com, ocom);
			} else {
				krateprintf(&krate_poll,
					    "%s: Invalid mapping for fd=%d, "
					    "cmd=%#lx ('%c',%d)\n",
					    map->sys, fd, maskcmd,
					    (int)((maskcmd >> 8) & 0xff),
					    (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	if ((com & IOC_VOID) == 0 && size > sizeof(ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = ubuf.stkbuf;
	}
	if (com & IOC_VOID) {
		*(caddr_t *)data = uspc_data;
	} else if (com & IOC_IN) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error)
				goto done;
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	}

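	/*
	 * For reference, the standard BSD <sys/ioccom.h> encoding drives
	 * the logic above: e.g. FIONBIO is _IOW('f', 126, int), so IOC_IN
	 * is set and IOCPARM_LEN(FIONBIO) == sizeof(int), causing the int
	 * argument to be copied in before the handler runs.
	 */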
	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
done:
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
	dropfp(td, fd, fp);

	return(error);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
		     M_WAITOK | M_ZERO);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	lwkt_gettoken(&mioctl_token);
	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
	lwkt_reltoken(&mioctl_token);

	return(0);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;
	int error = EINVAL;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	lwkt_gettoken(&mioctl_token);
	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges == he->cmd_ranges) {
			LIST_REMOVE(ne, entries);
			kfree(ne, M_IOCTLMAP);
			error = 0;
			break;
		}
	}
	lwkt_reltoken(&mioctl_token);
	return(error);
}
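
/*
 * Usage sketch (hypothetical subsystem): an emulation layer typically
 * fills in a struct ioctl_map_handler with its ioctl_map and cmd_ranges
 * table, calls mapped_ioctl_register_handler() at module load, and calls
 * mapped_ioctl_unregister_handler() with the same handler at unload.
 */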

static int nseldebug;
SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");

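/*
 * select() and poll() below are implemented on top of the kqueue core:
 * kern_kevent() invokes a copyin callback to batch-register kevents
 * built from the fd_set bits or pollfd entries, and a copyout callback
 * to translate returned kevents back into select bits or pollfd revents.
 */
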
/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct sysmsg *sysmsg, const struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &sysmsg->sysmsg_result);

	return (error);
}

/*
 * Pselect system call.
 */
int
sys_pselect(struct sysmsg *sysmsg, const struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
		sigirefs_wait(lp->lwp_proc);
	}

	/*
	 * Do real job.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &sysmsg->sysmsg_result);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
			sigirefs_wait(lp->lwp_proc);
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}

static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OLDAPI | NOTE_OOB;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return(0);
			/* NOT REACHED */
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0,
				       (void *)(uintptr_t)
					skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;

				if (nseldebug) {
					kprintf("select fd %d filter %d "
						"serial %ju\n", fd, filter,
						(uintmax_t)
						skap->lwp->lwp_kqueue_serial);
				}
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		skap->active_set++;
		skap->proc_fds = 0;
	}

	return (0);
}

static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i;
	int n;

	skap = (struct select_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Filter out and delete spurious events
		 */
		if ((uint64_t)(uintptr_t)kevp[i].udata !=
		    skap->lwp->lwp_kqueue_serial)
		{
			panic("select_copyout: unexpected udata");
deregister:
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			n = 1;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev, &n, 0);
			if (nseldebug) {
				kprintf("select fd %ju mismatched serial %ju\n",
					(uintmax_t)kevp[i].ident,
					(uintmax_t)skap->lwp->lwp_kqueue_serial);
			}
			continue;
		}

		/*
		 * Handle errors
		 */
		if (kevp[i].flags & EV_ERROR) {
			int error = kevp[i].data;

			switch (error) {
			case EBADF:
				/*
				 * A bad file descriptor is considered a
				 * fatal error for select, bail out.
				 */
				skap->error = error;
				*res = -1;
				return error;

			default:
				/*
				 * Select silently swallows any unknown errors
				 * for descriptors in the read or write sets.
				 *
				 * ALWAYS filter out EOPNOTSUPP errors from
				 * filters (at least until all filters support
				 * EVFILT_EXCEPT)
				 *
				 * We also filter out ENODEV since dev_dkqfilter
				 * returns ENODEV if EOPNOTSUPP is returned in
				 * an inner call.
				 *
				 * XXX: fix this
				 */
				if (kevp[i].filter != EVFILT_READ &&
				    kevp[i].filter != EVFILT_WRITE &&
				    error != EOPNOTSUPP &&
				    error != ENODEV) {
					skap->error = error;
					*res = -1;
					return error;
				}
				break;
			}

			/*
			 * We must deregister any unsupported select events
			 * to avoid a live-lock.
			 */
			if (nseldebug) {
				kprintf("select fd %ju filter %d error %d\n",
					(uintmax_t)kevp[i].ident,
					kevp[i].filter, error);
			}
			goto deregister;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}

/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}

/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}

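/*
 * select()/poll() with no descriptors degenerates into a pure sleep on
 * the supplied timeout, implemented with nanosleep1() below.
 */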
static int
dotimeout_only(struct timespec *ts)
{
	return(nanosleep1(ts, NULL));
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd == 0 && ts)
		return (dotimeout_only(ts));

	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	/* kap->read_set = NULL; not needed */
	kap->write_set = NULL;
	kap->except_set = NULL;

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited; passing 0x7FFFFFFF accomplishes
	 *	 this.
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts,
			    KEVENT_AUTO_STALE);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * An error from an individual event that should be passed
	 * back to userland (EBADF)
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

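	/*
	 * Bump the serial number so that any stale registrations still
	 * sitting in the kqueue (tagged with an older serial in udata)
	 * are detected and deregistered by select_copyout() on a later
	 * call.
	 */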
	kap->lwp->lwp_kqueue_serial += kap->num_fds;

	return (error);
}

/*
 * Poll system call.
 *
 * MPSAFE
 */
int
sys_poll(struct sysmsg *sysmsg, const struct poll_args *uap)
{
	struct timespec ts, *tsp;
	int error;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = dopoll(uap->nfds, uap->fds, tsp, &sysmsg->sysmsg_result, 0);

	return (error);
}

/*
 * Ppoll system call.
 *
 * MPSAFE
 */
int
sys_ppoll(struct sysmsg *sysmsg, const struct ppoll_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
		sigirefs_wait(lp->lwp_proc);
	}

	error = dopoll(uap->nfds, uap->fds, ktsp, &sysmsg->sysmsg_result,
		       ktsp != NULL ? KEVENT_TIMEOUT_PRECISE : 0);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* dopoll() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
			sigirefs_wait(lp->lwp_proc);
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}

static int
poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent *kev;
	int kev_count;

	pkap = (struct poll_kevent_copyin_args *)arg;

	while (pkap->pfds < pkap->nfds) {
		pfd = &pkap->fds[pkap->pfds];

		/* Clear return events */
		pfd->revents = 0;

		/* Do not check if fd is equal to -1 */
		if (pfd->fd == -1) {
			++pkap->pfds;
			continue;
		}

		/*
		 * NOTE: pfd->events == 0 implies POLLHUP in BSDs.  Used
		 *	 by at least sshd and X11 udev support.
		 */
		kev_count = 0;
		if (pfd->events == 0)
			kev_count++;
		if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM))
			kev_count++;
		if (pfd->events & (POLLOUT | POLLWRNORM))
			kev_count++;
		if (pfd->events & (POLLPRI | POLLRDBAND))
			kev_count++;

		if (*events + kev_count > maxevents)
			return (0);

		/*
		 * NOTE: A combined serial number and poll array index is
		 *	 stored in kev->udata.
		 *
		 * NOTE: Events will be registered with KEVENT_UNIQUE_NOTES
		 *	 set, using kev->data for the uniqifier.  kev->data
		 *	 is implied in the actual registration.
		 */
		kev = &kevp[*events];

		/*
		 * Implied POLLHUP
		 */
		if (pfd->events == 0) {
			int notes = NOTE_OLDAPI | NOTE_HUPONLY;

			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       notes, pkap->pfds, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal read events
		 */
		if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM)) {
			int notes = NOTE_OLDAPI;
			if ((pfd->events & (POLLIN | POLLRDNORM)) == 0)
				notes |= NOTE_HUPONLY;

			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       notes, pkap->pfds, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal write events
		 */
		if (pfd->events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, pkap->pfds, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		/*
		 * Nominal exceptional events
		 */
		if (pfd->events & (POLLPRI | POLLRDBAND)) {
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI | NOTE_OOB, pkap->pfds,
			       (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		if (nseldebug) {
			kprintf("poll index %d/%d fd %d events %08x "
				"serial %ju\n", pkap->pfds, pkap->nfds - 1,
				pfd->fd, pfd->events,
				(uintmax_t)pkap->lwp->lwp_kqueue_serial);
		}

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}

static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int count_res;
	int i;
	int n;
	uint64_t pi;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Extract the poll array index and delete spurious events.
		 * We can easily tell if the serial number is incorrect
		 * by checking whether the extracted index is out of range.
		 */
		pi = (uint64_t)(uintptr_t)kevp[i].udata -
		     pkap->lwp->lwp_kqueue_serial;
		if (pi >= pkap->nfds) {
			panic("poll_copyout: unexpected udata");
deregister:
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kev.data = pi;	/* uniquifier */
			n = 1;
			kqueue_register(&pkap->lwp->lwp_kqueue, &kev, &n,
					KEVENT_UNIQUE_NOTES);
			if (nseldebug) {
				kprintf("poll index %ju out of range against "
					"serial %ju\n", (uintmax_t)pi,
					(uintmax_t)pkap->lwp->lwp_kqueue_serial);
			}
			continue;
		}

		/*
		 * Locate the pollfd and process events
		 */
		pfd = &pkap->fds[pi];
		if (kevp[i].ident == pfd->fd) {
			/*
			 * A single descriptor may generate an error against
			 * more than one filter, make sure to set the
			 * appropriate flags but do not increment (*res)
			 * more than once.
			 */
			count_res = (pfd->revents == 0);
			if (kevp[i].flags & EV_ERROR) {
				switch(kevp[i].data) {
				case EBADF:
				case POLLNVAL:
					/* Bad file descriptor */
					if (count_res)
						++*res;
					pfd->revents |= POLLNVAL;
					break;
				default:
					/*
					 * Poll silently swallows any unknown
					 * errors except in the case of POLLPRI
					 * (OOB/urgent data).
					 *
					 * ALWAYS filter out EOPNOTSUPP errors
					 * from filters, common applications
					 * set POLLPRI|POLLRDBAND and most
					 * filters do not support
					 * EVFILT_EXCEPT.
					 *
					 * We also filter out ENODEV since
					 * dev_dkqfilter returns ENODEV if
					 * EOPNOTSUPP is returned in an
					 * inner call.
					 *
					 * XXX: fix this
					 */
					if (kevp[i].filter != EVFILT_READ &&
					    kevp[i].filter != EVFILT_WRITE &&
					    kevp[i].data != EOPNOTSUPP &&
					    kevp[i].data != ENODEV) {
						if (count_res)
							++*res;
						pfd->revents |= POLLERR;
					}
					break;
				}
				if (pfd->revents == 0 && nseldebug) {
					kprintf("poll index EV_ERROR %ju fd %d "
						"filter %d error %jd\n",
						(uintmax_t)pi, pfd->fd,
						kevp[i].filter,
						(intmax_t)kevp[i].data);
				}

				/*
				 * Silently deregister any unhandled EV_ERROR
				 * condition (usually EOPNOTSUPP).
				 */
				if (pfd->revents == 0)
					goto deregister;
				continue;
			}

			switch (kevp[i].filter) {
			case EVFILT_READ:
				/*
				 * NODATA on the read side can indicate a
				 * half-closed situation and not necessarily
				 * a disconnect, so depend on the user
				 * issuing a read() and getting 0 bytes back.
				 *
				 * If EV_HUP is set the peer completely
				 * disconnected and we can set POLLHUP.
				 * Linux can return POLLHUP even if read
				 * data has not been drained, so we should
				 * too.
				 */
				/* if (kevp[i].flags & EV_NODATA) */ {
					if (kevp[i].flags & EV_HUP)
						pfd->revents |= POLLHUP;
				}
				if ((kevp[i].flags & EV_EOF) &&
				    kevp[i].fflags != 0)
					pfd->revents |= POLLERR;
				if (pfd->events & POLLIN)
					pfd->revents |= POLLIN;
				if (pfd->events & POLLRDNORM)
					pfd->revents |= POLLRDNORM;
				break;
			case EVFILT_WRITE:
				/*
				 * As per the OpenGroup POLLHUP is mutually
				 * exclusive with the writability flags.  I
				 * consider this a bit broken but...
				 *
				 * In this case a disconnect is implied even
				 * for a half-closed (write side) situation.
				 */
				if (kevp[i].flags & EV_EOF) {
					pfd->revents |= POLLHUP;
					if (kevp[i].fflags != 0)
						pfd->revents |= POLLERR;
				} else {
					if (pfd->events & POLLOUT)
						pfd->revents |= POLLOUT;
					if (pfd->events & POLLWRNORM)
						pfd->revents |= POLLWRNORM;
				}
				break;
			case EVFILT_EXCEPT:
				/*
				 * EV_NODATA should never be tagged for this
				 * filter.
				 */
				if (pfd->events & POLLPRI)
					pfd->revents |= POLLPRI;
				if (pfd->events & POLLRDBAND)
					pfd->revents |= POLLRDBAND;
				break;
			}

			if (nseldebug) {
				kprintf("poll index %ju/%d fd %d "
					"revents %08x\n", (uintmax_t)pi,
					pkap->nfds, pfd->fd, pfd->revents);
			}

			if (count_res && pfd->revents)
				++*res;
		}

		/*
		 * We must deregister any kqueue poll event that does not
		 * set poll return bits to prevent a live-lock.
		 */
		if (pfd->revents == 0) {
			krateprintf(&krate_poll,
				    "poll index %ju no-action %ju/%d "
				    "events=%08x kevpfilt=%d/%08x\n",
				    (uintmax_t)pi, (uintmax_t)kevp[i].ident,
				    pfd->fd, pfd->events,
				    kevp[i].filter, kevp[i].flags);
			goto deregister;
		}
	}

	return (0);
}

static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res, int flags)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	flags |= KEVENT_AUTO_STALE | KEVENT_UNIQUE_NOTES;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	if (nfds == 0 && ts)
		return (dotimeout_only(ts));

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);

	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
				    poll_copyin, poll_copyout, ts, flags);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

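	/*
	 * As in doselect(), advance the serial number so that stale
	 * registrations left in the kqueue decode to out-of-range poll
	 * indices in poll_copyout() and are deregistered on a later call.
	 */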
	ka.lwp->lwp_kqueue_serial += nfds;

	return (error);
}

static int
socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	return (0);
}

static int
socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	++*res;
	return (0);
}
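
/*
 * The trivial copyin/copyout callbacks above are all socket_wait() needs:
 * it registers its single read event directly with kqueue_register() and
 * only counts the events kern_kevent() returns.
 */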

extern struct fileops socketops;

/*
 * NOTE: Callers of socket_wait() must already have a reference on the
 *	 socket.
 */
int
socket_wait(struct socket *so, struct timespec *ts, int *res)
{
	struct thread *td = curthread;
	struct file *fp;
	struct kqueue kq;
	struct kevent kev;
	int error, fd;
	int n;

	if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
		return (error);

	fp->f_type = DTYPE_SOCKET;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &socketops;
	fp->f_data = so;
	fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);
	fsetfdflags(td->td_proc->p_fd, fd, UF_EXCLOSE);

	bzero(&kq, sizeof(kq));
	kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
	n = 1;
	if ((error = kqueue_register(&kq, &kev, &n, 0)) != 0) {
		fdrop(fp);
		return (error);
	}

	error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
			    socket_wait_copyout, ts, 0);

	EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL);
	n = 1;
	kqueue_register(&kq, &kev, &n, 0);
	fp->f_ops = &badfileops;
	fdrop(fp);

	return (error);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct sysmsg *sysmsg, const struct openbsd_poll_args *uap)
{
	return (sys_poll(sysmsg, (const struct poll_args *)uap));
}

/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}