1*ad71ebb5Sriastradh /* $NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $ */
2e714af64Sthorpej
3e714af64Sthorpej /*-
4e714af64Sthorpej * Copyright (c) 2020 The NetBSD Foundation, Inc.
5e714af64Sthorpej * All rights reserved.
6e714af64Sthorpej *
7e714af64Sthorpej * This code is derived from software contributed to The NetBSD Foundation
8e714af64Sthorpej * by Jason R. Thorpe.
9e714af64Sthorpej *
10e714af64Sthorpej * Redistribution and use in source and binary forms, with or without
11e714af64Sthorpej * modification, are permitted provided that the following conditions
12e714af64Sthorpej * are met:
13e714af64Sthorpej * 1. Redistributions of source code must retain the above copyright
14e714af64Sthorpej * notice, this list of conditions and the following disclaimer.
15e714af64Sthorpej * 2. Redistributions in binary form must reproduce the above copyright
16e714af64Sthorpej * notice, this list of conditions and the following disclaimer in the
17e714af64Sthorpej * documentation and/or other materials provided with the distribution.
18e714af64Sthorpej *
19e714af64Sthorpej * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20e714af64Sthorpej * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21e714af64Sthorpej * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22e714af64Sthorpej * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23e714af64Sthorpej * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24e714af64Sthorpej * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25e714af64Sthorpej * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26e714af64Sthorpej * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27e714af64Sthorpej * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28e714af64Sthorpej * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29e714af64Sthorpej * POSSIBILITY OF SUCH DAMAGE.
30e714af64Sthorpej */
31e714af64Sthorpej
32e714af64Sthorpej #include <sys/cdefs.h>
33*ad71ebb5Sriastradh __KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $");
34e714af64Sthorpej
35e714af64Sthorpej /*
36e714af64Sthorpej * eventfd
37e714af64Sthorpej *
38e714af64Sthorpej * Eventfd objects present a simple counting object associated with a
39e714af64Sthorpej * file descriptor. Writes and reads to this file descriptor increment
40e714af64Sthorpej * and decrement the count, respectively. When the count is non-zero,
41e714af64Sthorpej * the descriptor is considered "readable", and when less than the max
42e714af64Sthorpej * value (EVENTFD_MAXVAL), is considered "writable".
43e714af64Sthorpej *
44e714af64Sthorpej * This implementation is API compatible with the Linux eventfd(2)
45e714af64Sthorpej * interface.
46e714af64Sthorpej */
47e714af64Sthorpej
4863b02c12Sskrll #include <sys/param.h>
49e714af64Sthorpej #include <sys/types.h>
50e714af64Sthorpej #include <sys/condvar.h>
51e714af64Sthorpej #include <sys/eventfd.h>
52e714af64Sthorpej #include <sys/file.h>
53e714af64Sthorpej #include <sys/filedesc.h>
54e714af64Sthorpej #include <sys/kauth.h>
55e714af64Sthorpej #include <sys/mutex.h>
56e714af64Sthorpej #include <sys/poll.h>
57e714af64Sthorpej #include <sys/proc.h>
58e714af64Sthorpej #include <sys/select.h>
59e714af64Sthorpej #include <sys/stat.h>
60e714af64Sthorpej #include <sys/syscallargs.h>
61e714af64Sthorpej #include <sys/uio.h>
62e714af64Sthorpej
63e714af64Sthorpej struct eventfd {
64e714af64Sthorpej kmutex_t efd_lock;
65e714af64Sthorpej kcondvar_t efd_read_wait;
66e714af64Sthorpej kcondvar_t efd_write_wait;
67e714af64Sthorpej struct selinfo efd_read_sel;
68e714af64Sthorpej struct selinfo efd_write_sel;
69e714af64Sthorpej eventfd_t efd_val;
70e714af64Sthorpej int64_t efd_nwaiters;
71e714af64Sthorpej bool efd_restarting;
72e714af64Sthorpej bool efd_is_semaphore;
73e714af64Sthorpej
74e714af64Sthorpej /*
75e714af64Sthorpej * Information kept for stat(2).
76e714af64Sthorpej */
77e714af64Sthorpej struct timespec efd_btime; /* time created */
78e714af64Sthorpej struct timespec efd_mtime; /* last write */
79e714af64Sthorpej struct timespec efd_atime; /* last read */
80e714af64Sthorpej };
81e714af64Sthorpej
82e714af64Sthorpej #define EVENTFD_MAXVAL (UINT64_MAX - 1)
83e714af64Sthorpej
84e714af64Sthorpej /*
85e714af64Sthorpej * eventfd_create:
86e714af64Sthorpej *
87e714af64Sthorpej * Create an eventfd object.
88e714af64Sthorpej */
89e714af64Sthorpej static struct eventfd *
eventfd_create(unsigned int const val,int const flags)90e714af64Sthorpej eventfd_create(unsigned int const val, int const flags)
91e714af64Sthorpej {
92e714af64Sthorpej struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);
93e714af64Sthorpej
94e714af64Sthorpej mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
95e714af64Sthorpej cv_init(&efd->efd_read_wait, "efdread");
96e714af64Sthorpej cv_init(&efd->efd_write_wait, "efdwrite");
97e714af64Sthorpej selinit(&efd->efd_read_sel);
98e714af64Sthorpej selinit(&efd->efd_write_sel);
99e714af64Sthorpej efd->efd_val = val;
100e714af64Sthorpej efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
101e714af64Sthorpej getnanotime(&efd->efd_btime);
102e714af64Sthorpej
103e714af64Sthorpej /* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */
104e714af64Sthorpej
105e714af64Sthorpej return efd;
106e714af64Sthorpej }
107e714af64Sthorpej
108e714af64Sthorpej /*
109e714af64Sthorpej * eventfd_destroy:
110e714af64Sthorpej *
111e714af64Sthorpej * Destroy an eventfd object.
112e714af64Sthorpej */
113e714af64Sthorpej static void
eventfd_destroy(struct eventfd * const efd)114e714af64Sthorpej eventfd_destroy(struct eventfd * const efd)
115e714af64Sthorpej {
116e714af64Sthorpej
117e714af64Sthorpej KASSERT(efd->efd_nwaiters == 0);
118e714af64Sthorpej
119e714af64Sthorpej cv_destroy(&efd->efd_read_wait);
120e714af64Sthorpej cv_destroy(&efd->efd_write_wait);
121e714af64Sthorpej
122e714af64Sthorpej seldestroy(&efd->efd_read_sel);
123e714af64Sthorpej seldestroy(&efd->efd_write_sel);
124e714af64Sthorpej
125e714af64Sthorpej mutex_destroy(&efd->efd_lock);
126a8c609a1Sthorpej
127a8c609a1Sthorpej kmem_free(efd, sizeof(*efd));
128e714af64Sthorpej }
129e714af64Sthorpej
130e714af64Sthorpej /*
131e714af64Sthorpej * eventfd_wait:
132e714af64Sthorpej *
133e714af64Sthorpej * Block on an eventfd. Handles non-blocking, as well as
134e714af64Sthorpej * the restart cases.
135e714af64Sthorpej */
136e714af64Sthorpej static int
eventfd_wait(struct eventfd * const efd,int const fflag,bool const is_write)137e714af64Sthorpej eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
138e714af64Sthorpej {
139e714af64Sthorpej kcondvar_t *waitcv;
140e714af64Sthorpej int error;
141e714af64Sthorpej
142e714af64Sthorpej if (fflag & FNONBLOCK) {
143e714af64Sthorpej return EAGAIN;
144e714af64Sthorpej }
145e714af64Sthorpej
146e714af64Sthorpej /*
147d7d2fb3eSthorpej * We're going to block. Check if we need to return ERESTART.
148e714af64Sthorpej */
149d7d2fb3eSthorpej if (efd->efd_restarting) {
150d7d2fb3eSthorpej return ERESTART;
151e714af64Sthorpej }
152e714af64Sthorpej
153e714af64Sthorpej if (is_write) {
154e714af64Sthorpej waitcv = &efd->efd_write_wait;
155e714af64Sthorpej } else {
156e714af64Sthorpej waitcv = &efd->efd_read_wait;
157e714af64Sthorpej }
158e714af64Sthorpej
159e714af64Sthorpej efd->efd_nwaiters++;
160e714af64Sthorpej KASSERT(efd->efd_nwaiters > 0);
161e714af64Sthorpej error = cv_wait_sig(waitcv, &efd->efd_lock);
162e714af64Sthorpej efd->efd_nwaiters--;
163e714af64Sthorpej KASSERT(efd->efd_nwaiters >= 0);
164e714af64Sthorpej
165e714af64Sthorpej /*
166e714af64Sthorpej * If a restart was triggered while we were asleep, we need
167d7d2fb3eSthorpej * to return ERESTART if no other error was returned.
168e714af64Sthorpej */
169e714af64Sthorpej if (efd->efd_restarting) {
170e714af64Sthorpej if (error == 0) {
171e714af64Sthorpej error = ERESTART;
172e714af64Sthorpej }
173e714af64Sthorpej }
174e714af64Sthorpej
175e714af64Sthorpej return error;
176e714af64Sthorpej }
177e714af64Sthorpej
178e714af64Sthorpej /*
179e714af64Sthorpej * eventfd_wake:
180e714af64Sthorpej *
181e714af64Sthorpej * Wake LWPs block on an eventfd.
182e714af64Sthorpej */
183e714af64Sthorpej static void
eventfd_wake(struct eventfd * const efd,bool const is_write)184e714af64Sthorpej eventfd_wake(struct eventfd * const efd, bool const is_write)
185e714af64Sthorpej {
186e714af64Sthorpej kcondvar_t *waitcv = NULL;
187e714af64Sthorpej struct selinfo *sel;
188e714af64Sthorpej int pollev;
189e714af64Sthorpej
190e714af64Sthorpej if (is_write) {
191e714af64Sthorpej waitcv = &efd->efd_read_wait;
192e714af64Sthorpej sel = &efd->efd_read_sel;
193e714af64Sthorpej pollev = POLLIN | POLLRDNORM;
194e714af64Sthorpej } else {
195e714af64Sthorpej waitcv = &efd->efd_write_wait;
196e714af64Sthorpej sel = &efd->efd_write_sel;
197e714af64Sthorpej pollev = POLLOUT | POLLWRNORM;
198e714af64Sthorpej }
199e714af64Sthorpej cv_broadcast(waitcv);
200e714af64Sthorpej selnotify(sel, pollev, NOTE_SUBMIT);
201e714af64Sthorpej }
202e714af64Sthorpej
203e714af64Sthorpej /*
204e714af64Sthorpej * eventfd file operations
205e714af64Sthorpej */
206e714af64Sthorpej
207e714af64Sthorpej static int
eventfd_fop_read(file_t * const fp,off_t * const offset,struct uio * const uio,kauth_cred_t const cred,int const flags)208e714af64Sthorpej eventfd_fop_read(file_t * const fp, off_t * const offset,
209e714af64Sthorpej struct uio * const uio, kauth_cred_t const cred, int const flags)
210e714af64Sthorpej {
211e714af64Sthorpej struct eventfd * const efd = fp->f_eventfd;
212e714af64Sthorpej int const fflag = fp->f_flag;
213e714af64Sthorpej eventfd_t return_value;
214e714af64Sthorpej int error;
215e714af64Sthorpej
216e714af64Sthorpej if (uio->uio_resid < sizeof(eventfd_t)) {
217e714af64Sthorpej return EINVAL;
218e714af64Sthorpej }
219e714af64Sthorpej
220e714af64Sthorpej mutex_enter(&efd->efd_lock);
221e714af64Sthorpej
222e714af64Sthorpej while (efd->efd_val == 0) {
223e714af64Sthorpej if ((error = eventfd_wait(efd, fflag, false)) != 0) {
224e714af64Sthorpej mutex_exit(&efd->efd_lock);
225e714af64Sthorpej return error;
226e714af64Sthorpej }
227e714af64Sthorpej }
228e714af64Sthorpej
229e714af64Sthorpej if (efd->efd_is_semaphore) {
230e714af64Sthorpej return_value = 1;
231e714af64Sthorpej efd->efd_val--;
232e714af64Sthorpej } else {
233e714af64Sthorpej return_value = efd->efd_val;
234e714af64Sthorpej efd->efd_val = 0;
235e714af64Sthorpej }
236e714af64Sthorpej
237e714af64Sthorpej getnanotime(&efd->efd_atime);
238e714af64Sthorpej eventfd_wake(efd, false);
239e714af64Sthorpej
240e714af64Sthorpej mutex_exit(&efd->efd_lock);
241e714af64Sthorpej
242e714af64Sthorpej error = uiomove(&return_value, sizeof(return_value), uio);
243e714af64Sthorpej
244e714af64Sthorpej return error;
245e714af64Sthorpej }
246e714af64Sthorpej
247e714af64Sthorpej static int
eventfd_fop_write(file_t * const fp,off_t * const offset,struct uio * const uio,kauth_cred_t const cred,int const flags)248e714af64Sthorpej eventfd_fop_write(file_t * const fp, off_t * const offset,
249e714af64Sthorpej struct uio * const uio, kauth_cred_t const cred, int const flags)
250e714af64Sthorpej {
251e714af64Sthorpej struct eventfd * const efd = fp->f_eventfd;
252e714af64Sthorpej int const fflag = fp->f_flag;
253e714af64Sthorpej eventfd_t write_value;
254e714af64Sthorpej int error;
255e714af64Sthorpej
256e714af64Sthorpej if (uio->uio_resid < sizeof(eventfd_t)) {
257e714af64Sthorpej return EINVAL;
258e714af64Sthorpej }
259e714af64Sthorpej
260e714af64Sthorpej if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
261e714af64Sthorpej return error;
262e714af64Sthorpej }
263e714af64Sthorpej
264e714af64Sthorpej if (write_value > EVENTFD_MAXVAL) {
265e714af64Sthorpej error = EINVAL;
266e714af64Sthorpej goto out;
267e714af64Sthorpej }
268e714af64Sthorpej
269e714af64Sthorpej mutex_enter(&efd->efd_lock);
270e714af64Sthorpej
271e714af64Sthorpej KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
272e714af64Sthorpej while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
273e714af64Sthorpej if ((error = eventfd_wait(efd, fflag, true)) != 0) {
274e714af64Sthorpej mutex_exit(&efd->efd_lock);
275e714af64Sthorpej goto out;
276e714af64Sthorpej }
277e714af64Sthorpej }
278e714af64Sthorpej
279e714af64Sthorpej efd->efd_val += write_value;
280e714af64Sthorpej KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
281e714af64Sthorpej
282e714af64Sthorpej getnanotime(&efd->efd_mtime);
283e714af64Sthorpej eventfd_wake(efd, true);
284e714af64Sthorpej
285e714af64Sthorpej mutex_exit(&efd->efd_lock);
286e714af64Sthorpej
287e714af64Sthorpej out:
288e714af64Sthorpej if (error) {
289e714af64Sthorpej /*
290e714af64Sthorpej * Undo the effect of uiomove() so that the error
291e714af64Sthorpej * gets reported correctly; see dofilewrite().
292e714af64Sthorpej */
293e714af64Sthorpej uio->uio_resid += sizeof(write_value);
294e714af64Sthorpej }
295e714af64Sthorpej return error;
296e714af64Sthorpej }
297e714af64Sthorpej
298e714af64Sthorpej static int
eventfd_ioctl(file_t * const fp,u_long const cmd,void * const data)29987d9925dSthorpej eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
30087d9925dSthorpej {
30187d9925dSthorpej struct eventfd * const efd = fp->f_eventfd;
30287d9925dSthorpej
30387d9925dSthorpej switch (cmd) {
30487d9925dSthorpej case FIONBIO:
30587d9925dSthorpej return 0;
30687d9925dSthorpej
30787d9925dSthorpej case FIONREAD:
30887d9925dSthorpej mutex_enter(&efd->efd_lock);
30987d9925dSthorpej *(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
31087d9925dSthorpej mutex_exit(&efd->efd_lock);
31187d9925dSthorpej return 0;
31287d9925dSthorpej
31387d9925dSthorpej case FIONWRITE:
31487d9925dSthorpej *(int *)data = 0;
31587d9925dSthorpej return 0;
31687d9925dSthorpej
31787d9925dSthorpej case FIONSPACE:
31887d9925dSthorpej /*
31987d9925dSthorpej * FIONSPACE doesn't really work for eventfd, because the
32087d9925dSthorpej * writability depends on the contents (value) being written.
32187d9925dSthorpej */
32287d9925dSthorpej break;
32387d9925dSthorpej
32487d9925dSthorpej default:
32587d9925dSthorpej break;
32687d9925dSthorpej }
32787d9925dSthorpej
32887d9925dSthorpej return EPASSTHROUGH;
32987d9925dSthorpej }
33087d9925dSthorpej
33187d9925dSthorpej static int
eventfd_fop_poll(file_t * const fp,int const events)332e714af64Sthorpej eventfd_fop_poll(file_t * const fp, int const events)
333e714af64Sthorpej {
334e714af64Sthorpej struct eventfd * const efd = fp->f_eventfd;
335e714af64Sthorpej int revents = 0;
336e714af64Sthorpej
337e714af64Sthorpej /*
338e714af64Sthorpej * Note that Linux will return POLLERR if the eventfd count
339e714af64Sthorpej * overflows, but that is not possible in the normal read/write
340e714af64Sthorpej * API, only with Linux kernel-internal interfaces. So, this
341e714af64Sthorpej * implementation never returns POLLERR.
342e714af64Sthorpej *
343e714af64Sthorpej * Also note that the Linux eventfd(2) man page does not
344e714af64Sthorpej * specifically discuss returning POLLRDNORM, but we check
345e714af64Sthorpej * for that event in addition to POLLIN.
346e714af64Sthorpej */
347e714af64Sthorpej
348e714af64Sthorpej mutex_enter(&efd->efd_lock);
349e714af64Sthorpej
350e714af64Sthorpej if (events & (POLLIN | POLLRDNORM)) {
351e714af64Sthorpej if (efd->efd_val != 0) {
352e714af64Sthorpej revents |= events & (POLLIN | POLLRDNORM);
353e714af64Sthorpej } else {
354e714af64Sthorpej selrecord(curlwp, &efd->efd_read_sel);
355e714af64Sthorpej }
356e714af64Sthorpej }
357e714af64Sthorpej
358e714af64Sthorpej if (events & (POLLOUT | POLLWRNORM)) {
359e714af64Sthorpej if (efd->efd_val < EVENTFD_MAXVAL) {
360e714af64Sthorpej revents |= events & (POLLOUT | POLLWRNORM);
361e714af64Sthorpej } else {
362e714af64Sthorpej selrecord(curlwp, &efd->efd_write_sel);
363e714af64Sthorpej }
364e714af64Sthorpej }
365e714af64Sthorpej
366e714af64Sthorpej mutex_exit(&efd->efd_lock);
367e714af64Sthorpej
368e714af64Sthorpej return revents;
369e714af64Sthorpej }
370e714af64Sthorpej
371e714af64Sthorpej static int
eventfd_fop_stat(file_t * const fp,struct stat * const st)372e714af64Sthorpej eventfd_fop_stat(file_t * const fp, struct stat * const st)
373e714af64Sthorpej {
374e714af64Sthorpej struct eventfd * const efd = fp->f_eventfd;
375e714af64Sthorpej
376e714af64Sthorpej memset(st, 0, sizeof(*st));
377e714af64Sthorpej
378e714af64Sthorpej mutex_enter(&efd->efd_lock);
379e714af64Sthorpej st->st_size = (off_t)efd->efd_val;
380e714af64Sthorpej st->st_blksize = sizeof(eventfd_t);
381e714af64Sthorpej st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
382e714af64Sthorpej st->st_blocks = 1;
383e714af64Sthorpej st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
384e714af64Sthorpej st->st_atimespec = efd->efd_atime;
385e714af64Sthorpej st->st_mtimespec = efd->efd_mtime;
386e714af64Sthorpej st->st_uid = kauth_cred_geteuid(fp->f_cred);
387e714af64Sthorpej st->st_gid = kauth_cred_getegid(fp->f_cred);
388e714af64Sthorpej mutex_exit(&efd->efd_lock);
389e714af64Sthorpej
390e714af64Sthorpej return 0;
391e714af64Sthorpej }
392e714af64Sthorpej
393e714af64Sthorpej static int
eventfd_fop_close(file_t * const fp)394e714af64Sthorpej eventfd_fop_close(file_t * const fp)
395e714af64Sthorpej {
396e714af64Sthorpej struct eventfd * const efd = fp->f_eventfd;
397e714af64Sthorpej
398e714af64Sthorpej fp->f_eventfd = NULL;
399e714af64Sthorpej eventfd_destroy(efd);
400e714af64Sthorpej
401e714af64Sthorpej return 0;
402e714af64Sthorpej }
403e714af64Sthorpej
404e714af64Sthorpej static void
eventfd_filt_read_detach(struct knote * const kn)405e714af64Sthorpej eventfd_filt_read_detach(struct knote * const kn)
406e714af64Sthorpej {
407e714af64Sthorpej struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
408e714af64Sthorpej
409e714af64Sthorpej mutex_enter(&efd->efd_lock);
410e714af64Sthorpej KASSERT(kn->kn_hook == efd);
411e714af64Sthorpej selremove_knote(&efd->efd_read_sel, kn);
412e714af64Sthorpej mutex_exit(&efd->efd_lock);
413e714af64Sthorpej }
414e714af64Sthorpej
415e714af64Sthorpej static int
eventfd_filt_read(struct knote * const kn,long const hint)416e714af64Sthorpej eventfd_filt_read(struct knote * const kn, long const hint)
417e714af64Sthorpej {
418e714af64Sthorpej struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
419ec9c6f37Sthorpej int rv;
420e714af64Sthorpej
421e714af64Sthorpej if (hint & NOTE_SUBMIT) {
422e714af64Sthorpej KASSERT(mutex_owned(&efd->efd_lock));
423e714af64Sthorpej } else {
424e714af64Sthorpej mutex_enter(&efd->efd_lock);
425e714af64Sthorpej }
426e714af64Sthorpej
427e714af64Sthorpej kn->kn_data = (int64_t)efd->efd_val;
428ec9c6f37Sthorpej rv = (eventfd_t)kn->kn_data > 0;
429e714af64Sthorpej
430e714af64Sthorpej if ((hint & NOTE_SUBMIT) == 0) {
431e714af64Sthorpej mutex_exit(&efd->efd_lock);
432e714af64Sthorpej }
433e714af64Sthorpej
434ec9c6f37Sthorpej return rv;
435e714af64Sthorpej }
436e714af64Sthorpej
437e714af64Sthorpej static const struct filterops eventfd_read_filterops = {
4386b6dcbbaSthorpej .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
439e714af64Sthorpej .f_detach = eventfd_filt_read_detach,
440e714af64Sthorpej .f_event = eventfd_filt_read,
441e714af64Sthorpej };
442e714af64Sthorpej
443e714af64Sthorpej static void
eventfd_filt_write_detach(struct knote * const kn)444e714af64Sthorpej eventfd_filt_write_detach(struct knote * const kn)
445e714af64Sthorpej {
446e714af64Sthorpej struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
447e714af64Sthorpej
448e714af64Sthorpej mutex_enter(&efd->efd_lock);
449e714af64Sthorpej KASSERT(kn->kn_hook == efd);
450e714af64Sthorpej selremove_knote(&efd->efd_write_sel, kn);
451e714af64Sthorpej mutex_exit(&efd->efd_lock);
452e714af64Sthorpej }
453e714af64Sthorpej
454e714af64Sthorpej static int
eventfd_filt_write(struct knote * const kn,long const hint)455e714af64Sthorpej eventfd_filt_write(struct knote * const kn, long const hint)
456e714af64Sthorpej {
457e714af64Sthorpej struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
458ec9c6f37Sthorpej int rv;
459e714af64Sthorpej
460e714af64Sthorpej if (hint & NOTE_SUBMIT) {
461e714af64Sthorpej KASSERT(mutex_owned(&efd->efd_lock));
462e714af64Sthorpej } else {
463e714af64Sthorpej mutex_enter(&efd->efd_lock);
464e714af64Sthorpej }
465e714af64Sthorpej
466e714af64Sthorpej kn->kn_data = (int64_t)efd->efd_val;
467ec9c6f37Sthorpej rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
468e714af64Sthorpej
469e714af64Sthorpej if ((hint & NOTE_SUBMIT) == 0) {
470e714af64Sthorpej mutex_exit(&efd->efd_lock);
471e714af64Sthorpej }
472e714af64Sthorpej
473ec9c6f37Sthorpej return rv;
474e714af64Sthorpej }
475e714af64Sthorpej
476e714af64Sthorpej static const struct filterops eventfd_write_filterops = {
4776b6dcbbaSthorpej .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
478e714af64Sthorpej .f_detach = eventfd_filt_write_detach,
479e714af64Sthorpej .f_event = eventfd_filt_write,
480e714af64Sthorpej };
481e714af64Sthorpej
482e714af64Sthorpej static int
eventfd_fop_kqfilter(file_t * const fp,struct knote * const kn)483e714af64Sthorpej eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
484e714af64Sthorpej {
485e714af64Sthorpej struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
486e714af64Sthorpej struct selinfo *sel;
487e714af64Sthorpej
488e714af64Sthorpej switch (kn->kn_filter) {
489e714af64Sthorpej case EVFILT_READ:
490e714af64Sthorpej sel = &efd->efd_read_sel;
491e714af64Sthorpej kn->kn_fop = &eventfd_read_filterops;
492e714af64Sthorpej break;
493e714af64Sthorpej
494e714af64Sthorpej case EVFILT_WRITE:
495e714af64Sthorpej sel = &efd->efd_write_sel;
496e714af64Sthorpej kn->kn_fop = &eventfd_write_filterops;
497e714af64Sthorpej break;
498e714af64Sthorpej
499e714af64Sthorpej default:
500e714af64Sthorpej return EINVAL;
501e714af64Sthorpej }
502e714af64Sthorpej
503e714af64Sthorpej kn->kn_hook = efd;
504e714af64Sthorpej
505e714af64Sthorpej mutex_enter(&efd->efd_lock);
506e714af64Sthorpej selrecord_knote(sel, kn);
507e714af64Sthorpej mutex_exit(&efd->efd_lock);
508e714af64Sthorpej
509e714af64Sthorpej return 0;
510e714af64Sthorpej }
511e714af64Sthorpej
512e714af64Sthorpej static void
eventfd_fop_restart(file_t * const fp)513e714af64Sthorpej eventfd_fop_restart(file_t * const fp)
514e714af64Sthorpej {
515e714af64Sthorpej struct eventfd * const efd = fp->f_eventfd;
516e714af64Sthorpej
517e714af64Sthorpej /*
518e714af64Sthorpej * Unblock blocked reads/writes in order to allow close() to complete.
519e714af64Sthorpej * System calls return ERESTART so that the fd is revalidated.
520e714af64Sthorpej */
521e714af64Sthorpej
522e714af64Sthorpej mutex_enter(&efd->efd_lock);
523e714af64Sthorpej
524e714af64Sthorpej if (efd->efd_nwaiters != 0) {
525e714af64Sthorpej efd->efd_restarting = true;
526e714af64Sthorpej cv_broadcast(&efd->efd_read_wait);
527e714af64Sthorpej cv_broadcast(&efd->efd_write_wait);
528e714af64Sthorpej }
529e714af64Sthorpej
530e714af64Sthorpej mutex_exit(&efd->efd_lock);
531e714af64Sthorpej }
532e714af64Sthorpej
533e714af64Sthorpej static const struct fileops eventfd_fileops = {
534e714af64Sthorpej .fo_name = "eventfd",
535e714af64Sthorpej .fo_read = eventfd_fop_read,
536e714af64Sthorpej .fo_write = eventfd_fop_write,
53787d9925dSthorpej .fo_ioctl = eventfd_ioctl,
538e714af64Sthorpej .fo_fcntl = fnullop_fcntl,
539e714af64Sthorpej .fo_poll = eventfd_fop_poll,
540e714af64Sthorpej .fo_stat = eventfd_fop_stat,
541e714af64Sthorpej .fo_close = eventfd_fop_close,
542e714af64Sthorpej .fo_kqfilter = eventfd_fop_kqfilter,
543e714af64Sthorpej .fo_restart = eventfd_fop_restart,
544e714af64Sthorpej };
545e714af64Sthorpej
546e714af64Sthorpej /*
547e714af64Sthorpej * eventfd(2) system call
548e714af64Sthorpej */
549e714af64Sthorpej int
do_eventfd(struct lwp * const l,unsigned int const val,int const flags,register_t * retval)550e714af64Sthorpej do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
551e714af64Sthorpej register_t *retval)
552e714af64Sthorpej {
553e714af64Sthorpej file_t *fp;
554e714af64Sthorpej int fd, error;
555e714af64Sthorpej
556e714af64Sthorpej if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
557e714af64Sthorpej return EINVAL;
558e714af64Sthorpej }
559e714af64Sthorpej
560e714af64Sthorpej if ((error = fd_allocfile(&fp, &fd)) != 0) {
561e714af64Sthorpej return error;
562e714af64Sthorpej }
563e714af64Sthorpej
564e714af64Sthorpej fp->f_flag = FREAD | FWRITE;
565e714af64Sthorpej if (flags & EFD_NONBLOCK) {
566e714af64Sthorpej fp->f_flag |= FNONBLOCK;
567e714af64Sthorpej }
568e714af64Sthorpej fp->f_type = DTYPE_EVENTFD;
569e714af64Sthorpej fp->f_ops = &eventfd_fileops;
570e714af64Sthorpej fp->f_eventfd = eventfd_create(val, flags);
571e714af64Sthorpej fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
572e714af64Sthorpej fd_affix(curproc, fp, fd);
573e714af64Sthorpej
574e714af64Sthorpej *retval = fd;
575e714af64Sthorpej return 0;
576e714af64Sthorpej }
577e714af64Sthorpej
578e714af64Sthorpej int
sys_eventfd(struct lwp * l,const struct sys_eventfd_args * uap,register_t * retval)579e714af64Sthorpej sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
580e714af64Sthorpej register_t *retval)
581e714af64Sthorpej {
582e714af64Sthorpej /* {
583e714af64Sthorpej syscallarg(unsigned int) val;
584e714af64Sthorpej syscallarg(int) flags;
585e714af64Sthorpej } */
586e714af64Sthorpej
587e714af64Sthorpej return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
588e714af64Sthorpej }
589