xref: /netbsd-src/lib/libc/sys/kqueue.2 (revision 181254a7b1bdde6873432bffef2d2decc4b5c22f)
1.\"	$NetBSD: kqueue.2,v 1.50 2019/12/23 01:46:09 kamil Exp $
2.\"
3.\" Copyright (c) 2000 Jonathan Lemon
4.\" All rights reserved.
5.\"
6.\" Copyright (c) 2001, 2002, 2003 The NetBSD Foundation, Inc.
7.\" All rights reserved.
8.\"
9.\" Portions of this documentation is derived from text contributed by
10.\" Luke Mewburn.
11.\"
12.\" Redistribution and use in source and binary forms, with or without
13.\" modification, are permitted provided that the following conditions
14.\" are met:
15.\" 1. Redistributions of source code must retain the above copyright
16.\"    notice, this list of conditions and the following disclaimer.
17.\" 2. Redistributions in binary form must reproduce the above copyright
18.\"    notice, this list of conditions and the following disclaimer in the
19.\"    documentation and/or other materials provided with the distribution.
20.\"
21.\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND
22.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.\" $FreeBSD: src/lib/libc/sys/kqueue.2,v 1.22 2001/06/27 19:55:57 dd Exp $
34.\"
35.Dd December 22, 2019
36.Dt KQUEUE 2
37.Os
38.Sh NAME
39.Nm kqueue ,
40.Nm kqueue1 ,
41.Nm kevent ,
42.Nm EV_SET
43.Nd kernel event notification mechanism
44.Sh LIBRARY
45.Lb libc
46.Sh SYNOPSIS
47.In sys/event.h
48.In sys/time.h
49.Ft int
50.Fn kqueue "void"
51.Ft int
52.Fn kqueue1 "int flags"
53.Ft int
54.Fn kevent "int kq" "const struct kevent *changelist" "size_t nchanges" "struct kevent *eventlist" "size_t nevents" "const struct timespec *timeout"
55.Fn EV_SET "&kev" ident filter flags fflags data udata
56.Sh DESCRIPTION
57.Fn kqueue
58provides a generic method of notifying the user when an event
59happens or a condition holds, based on the results of small
60pieces of kernel code termed filters.
61A kevent is identified by the (ident, filter) pair; a kqueue may hold
62at most one kevent for any given (ident, filter) pair.
63.Pp
64The filter is executed upon the initial registration of a kevent
65in order to detect whether a preexisting condition is present, and is also
66executed whenever an event is passed to the filter for evaluation.
67If the filter determines that the condition should be reported,
68then the kevent is placed on the kqueue for the user to retrieve.
69.Pp
70The filter is also run when the user attempts to retrieve the kevent
71from the kqueue.
72If the filter indicates that the condition that triggered
73the event no longer holds, the kevent is removed from the kqueue and
74is not returned.
75.Pp
76Multiple events which trigger the filter do not result in multiple
77kevents being placed on the kqueue; instead, the filter will aggregate
78the events into a single struct kevent.
79Calling
80.Xr close 2
81on a file descriptor will remove any kevents that reference the descriptor.
82.Pp
83.Fn kqueue
84creates a new kernel event queue and returns a descriptor.
85.Pp
86The
87.Fn kqueue1
88function also allows setting the following
89.Fa flags
90on the returned file descriptor:
91.Bl -column O_NONBLOCK -offset indent
92.It Dv O_CLOEXEC
93Set the close on exec property.
94.It Dv O_NONBLOCK
95Set non-blocking I/O.
96.It Dv O_NOSIGPIPE
97Return
98.Er EPIPE
99instead of raising
100.Dv SIGPIPE .
101.El
102.Pp
103The queue is not inherited by a child created with
104.Xr fork 2 .
105.\" However, if
106.\" .Xr rfork 2
107.\" is called without the
108.\" .Dv RFFDG
109.\" flag, then the descriptor table is shared,
110.\" which will allow sharing of the kqueue between two processes.
111.Pp
112.Fn kevent
113is used to register events with the queue, and return any pending
114events to the user.
115.Fa changelist
116is a pointer to an array of
117.Va kevent
118structures, as defined in
119.In sys/event.h .
120All changes contained in the
121.Fa changelist
122are applied before any pending events are read from the queue.
123.Fa nchanges
124gives the size of
125.Fa changelist .
126.Fa eventlist
127is a pointer to an array of kevent structures.
128.Fa nevents
129determines the size of
130.Fa eventlist .
131If
132.Fa timeout
133is a
134.No non- Ns Dv NULL
135pointer, it specifies a maximum interval to wait
136for an event, which will be interpreted as a
137.Li struct timespec .
138If
139.Fa timeout
140is a
141.Dv NULL
142pointer,
143.Fn kevent
144waits indefinitely.
145To effect a poll, the
146.Fa timeout
147argument should be
148.No non- Ns Dv NULL ,
149pointing to a zero-valued
150.Xr timespec 3
151structure.
152The same array may be used for the
153.Fa changelist
154and
155.Fa eventlist .
156.Pp
157.Fn EV_SET
158is a macro which is provided for ease of initializing a kevent structure.
159This macro does not evaluate its parameters multiple times.
160.Pp
161The
162.Va kevent
163structure is defined as:
164.Bd -literal
165struct kevent {
166	uintptr_t ident;	/* identifier for this event */
167	uint32_t  filter;	/* filter for event */
168	uint32_t  flags;	/* action flags for kqueue */
169	uint32_t  fflags;	/* filter flag value */
170	int64_t   data;		/* filter data value */
171	void     *udata;	/* opaque user data identifier */
172};
173.Ed
174.Pp
175The fields of
176.Fa struct kevent
177are:
178.Bl -tag -width XXXfilter -offset indent
179.It ident
180Value used to identify this event.
181The exact interpretation is determined by the attached filter,
182but often is a file descriptor.
183.It filter
184Identifies the kernel filter used to process this event.
185There are pre-defined system filters (which are described below), and
186other filters may be added by kernel subsystems as necessary.
187.It flags
188Actions to perform on the event.
189.It fflags
190Filter-specific flags.
191.It data
192Filter-specific data value.
193.It udata
194Opaque user-defined value passed through the kernel unchanged.
195.El
196.Pp
197The
198.Va flags
199field can contain the following values:
200.Bl -tag -width XXXEV_ONESHOT -offset indent
201.It Dv EV_ADD
202Adds the event to the kqueue.
203Re-adding an existing event will modify the parameters of the original
204event, and not result in a duplicate entry.
205Adding an event automatically enables it,
206unless overridden by the EV_DISABLE flag.
207.It Dv EV_ENABLE
208Permit
209.Fn kevent
210to return the event if it is triggered.
211.It Dv EV_DISABLE
212Disable the event so
213.Fn kevent
214will not return it.
215The filter itself is not disabled.
216.It Dv EV_DISPATCH
217Disable the event source immediately after delivery of an event.
218See
219.Dv EV_DISABLE
220above.
221.It Dv EV_DELETE
222Removes the event from the kqueue.
223Events which are attached to file descriptors are automatically deleted
224on the last close of the descriptor.
225.It Dv EV_RECEIPT
226This flag is useful for making bulk changes to a kqueue without draining
227any pending events.
228When passed as input, it forces
229.Dv EV_ERROR
230to always be returned.
231When a filter is successfully added the
232.Va data
233field will be zero.
234.It Dv EV_ONESHOT
235Causes the event to return only the first occurrence of the filter
236being triggered.
237After the user retrieves the event from the kqueue, it is deleted.
238.It Dv EV_CLEAR
239After the event is retrieved by the user, its state is reset.
240This is useful for filters which report state transitions
241instead of the current state.
242Note that some filters may automatically set this flag internally.
243.It Dv EV_EOF
244Filters may set this flag to indicate a filter-specific EOF condition.
245.It Dv EV_ERROR
246See
247.Sx RETURN VALUES
248below.
249.El
250.Ss Filters
251Filters are identified by a number.
252There are two types of filters: pre-defined filters which
253are described below, and third-party filters that may be added with
254.Xr kfilter_register 9
255by kernel sub-systems, third-party device drivers, or loadable
256kernel modules.
257.Pp
258As a third-party filter is referenced by a well-known name instead
259of a statically assigned number, two
260.Xr ioctl 2 Ns s
261are supported on the file descriptor returned by
262.Fn kqueue
263to map a filter name to a filter number, and vice-versa (passing
264arguments in a structure described below):
265.Bl -tag -width KFILTER_BYFILTER -offset indent
266.It Dv KFILTER_BYFILTER
267Map
268.Va filter
269to
270.Va name ,
271which is of size
272.Va len .
273.It Dv KFILTER_BYNAME
274Map
275.Va name
276to
277.Va filter .
278.Va len
279is ignored.
280.El
281.Pp
282The following structure is used to pass arguments in and out of the
283.Xr ioctl 2 :
284.Bd -literal -offset indent
285struct kfilter_mapping {
286	char	 *name;		/* name to lookup or return */
287	size_t	 len;		/* length of name */
288	uint32_t filter;	/* filter to lookup or return */
289};
290.Ed
291.Pp
292Arguments may be passed to and from the filter via the
293.Va fflags
294and
295.Va data
296fields in the kevent structure.
297.Pp
298The predefined system filters are:
299.Bl -tag -width EVFILT_SIGNAL
300.It Dv EVFILT_READ
301Takes a descriptor as the identifier, and returns whenever
302there is data available to read.
303The behavior of the filter is slightly different depending
304on the descriptor type.
305.Bl -tag -width 2n
306.It Sockets
307Sockets which have previously been passed to
308.Xr listen 2
309return when there is an incoming connection pending.
310.Va data
311contains the size of the listen backlog (i.e., the number of
312connections ready to be accepted with
313.Xr accept 2 . )
314.Pp
315Other socket descriptors return when there is data to be read,
316subject to the
317.Dv SO_RCVLOWAT
318value of the socket buffer.
319This may be overridden with a per-filter low water mark at the
320time the filter is added by setting the
321NOTE_LOWAT
322flag in
323.Va fflags ,
324and specifying the new low water mark in
325.Va data .
326On return,
327.Va data
328contains the number of bytes in the socket buffer.
329.Pp
330If the read direction of the socket has been shut down, then the filter
331also sets EV_EOF in
332.Va flags ,
333and returns the socket error (if any) in
334.Va fflags .
335It is possible for EOF to be returned (indicating the connection is gone)
336while there is still data pending in the socket buffer.
337.It Vnodes
338Returns when the file pointer is not at the end of file.
339.Va data
340contains the offset from current position to end of file,
341and may be negative.
342.It "Fifos, Pipes"
343Returns when there is data to read;
344.Va data
345contains the number of bytes available.
346.Pp
347When the last writer disconnects, the filter will set EV_EOF in
348.Va flags .
349This may be cleared by passing in EV_CLEAR, at which point the
350filter will resume waiting for data to become available before
351returning.
352.El
353.It Dv EVFILT_WRITE
354Takes a descriptor as the identifier, and returns whenever
355it is possible to write to the descriptor.
356For sockets, pipes, fifos, and ttys,
357.Va data
358will contain the amount of space remaining in the write buffer.
359The filter will set EV_EOF when the reader disconnects, and for
360the fifo case, this may be cleared by use of EV_CLEAR.
361Note that this filter is not supported for vnodes.
362.Pp
363For sockets, the low water mark and socket error handling are
364identical to the EVFILT_READ case.
365.It Dv EVFILT_AIO
366This is not implemented in
367.Nx .
368.ig
369The sigevent portion of the AIO request is filled in, with
370.Va sigev_notify_kqueue
371containing the descriptor of the kqueue that the event should
372be attached to,
373.Va sigev_value
374containing the udata value, and
375.Va sigev_notify
376set to SIGEV_EVENT.
377When the aio_* function is called, the event will be registered
378with the specified kqueue, and the
379.Va ident
380argument set to the
381.Fa struct aiocb
382returned by the aio_* function.
383The filter returns under the same conditions as aio_error.
384.Pp
385Alternatively, a kevent structure may be initialized, with
386.Va ident
387containing the descriptor of the kqueue, and the
388address of the kevent structure placed in the
389.Va aio_lio_opcode
390field of the AIO request.
391However, this approach will not work on
392architectures with 64-bit pointers, and should be considered deprecated.
393..
394.It Dv EVFILT_VNODE
395Takes a file descriptor as the identifier and the events to watch for in
396.Va fflags ,
397and returns when one or more of the requested events occurs on the descriptor.
398The events to monitor are:
399.Bl -tag -width XXNOTE_RENAME
400.It Dv NOTE_DELETE
401.Xr unlink 2
402was called on the file referenced by the descriptor.
403.It Dv NOTE_WRITE
404A write occurred on the file referenced by the descriptor.
405.It Dv NOTE_EXTEND
406The file referenced by the descriptor was extended.
407.It Dv NOTE_ATTRIB
408The file referenced by the descriptor had its attributes changed.
409.It Dv NOTE_LINK
410The link count on the file changed.
411.It Dv NOTE_RENAME
412The file referenced by the descriptor was renamed.
413.It Dv NOTE_REVOKE
414Access to the file was revoked via
415.Xr revoke 2
416or the underlying file system was unmounted.
417.El
418.Pp
419On return,
420.Va fflags
421contains the events which triggered the filter.
422.It Dv EVFILT_PROC
423Takes the process ID to monitor as the identifier and the events to watch for
424in
425.Va fflags ,
426and returns when the process performs one or more of the requested events.
427If a process can normally see another process, it can attach an event to it.
428The events to monitor are:
429.Bl -tag -width XXNOTE_TRACKERR
430.It Dv NOTE_EXIT
431The process has exited.
432The exit code of the process is stored in
433.Va data .
434.It Dv NOTE_FORK
435The process has called
436.Xr fork 2 .
437.It Dv NOTE_EXEC
438The process has executed a new program via
439.Xr execve 2
440or a similar call.
441.It Dv NOTE_TRACK
442Follow a process across
443.Xr fork 2
444calls.
445The parent process will return with NOTE_TRACK set in the
446.Va fflags
447field, while the child process will return with NOTE_CHILD set in
448.Va fflags
449and the parent PID in
450.Va data .
451.It Dv NOTE_TRACKERR
452This flag is returned if the system was unable to attach an event to
453the child process, usually due to resource limitations.
454.El
455.Pp
456On return,
457.Va fflags
458contains the events which triggered the filter.
459.It Dv EVFILT_SIGNAL
460Takes the signal number to monitor as the identifier and returns
461when the given signal is delivered to the current process.
462This coexists with the
463.Xr signal 3
464and
465.Xr sigaction 2
466facilities, and has a lower precedence.
467The filter will record
468all attempts to deliver a signal to a process, even if the signal has
469been marked as SIG_IGN.
470Event notification happens after normal signal delivery processing.
471.Va data
472returns the number of times the signal has occurred since the last call to
473.Fn kevent .
474This filter automatically sets the EV_CLEAR flag internally.
475.It Dv EVFILT_TIMER
476Establishes an arbitrary timer identified by
477.Va ident .
478When adding a timer,
479.Va data
480specifies the timeout period in milliseconds.
481The timer will be periodic unless EV_ONESHOT is specified.
482On return,
483.Va data
484contains the number of times the timeout has expired since the last call to
485.Fn kevent .
486This filter automatically sets the EV_CLEAR flag internally.
487.It Dv EVFILT_FS
488Establishes a file system monitor.
489Currently it only monitors file system mount and unmount actions.
490.El
491.Sh RETURN VALUES
492.Fn kqueue
493creates a new kernel event queue and returns a file descriptor.
494If there was an error creating the kernel event queue, a value of \-1 is
495returned and
496.Dv errno
497is set to indicate the error.
498.Pp
499.Fn kevent
500returns the number of events placed in the
501.Fa eventlist ,
502up to the value given by
503.Fa nevents .
504If an error occurs while processing an element of the
505.Fa changelist
506and there is enough room in the
507.Fa eventlist ,
508then the event will be placed in the
509.Fa eventlist
510with
511.Dv EV_ERROR
512set in
513.Va flags
514and the system error in
515.Va data .
516Otherwise,
517.Dv \-1
518will be returned, and
519.Dv errno
520will be set to indicate the error condition.
521If the time limit expires, then
522.Fn kevent
523returns 0.
524.Sh EXAMPLES
525The following example program monitors a file (provided to it as the first
526argument) and prints information about some common events it receives
527notifications for:
528.Bd -literal -offset indent
529#include <sys/types.h>
530#include <sys/event.h>
531#include <sys/time.h>
532#include <stdio.h>
533#include <unistd.h>
534#include <stdlib.h>
535#include <fcntl.h>
536#include <err.h>
537
538int
539main(int argc, char *argv[])
540{
541        int fd, kq, nev;
542        struct kevent ev;
543        static const struct timespec tout = { 1, 0 };
544
545        if ((fd = open(argv[1], O_RDONLY)) == -1)
546                err(1, "Cannot open `%s'", argv[1]);
547
548        if ((kq = kqueue()) == -1)
549                err(1, "Cannot create kqueue");
550
551        EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_ENABLE | EV_CLEAR,
552            NOTE_DELETE|NOTE_WRITE|NOTE_EXTEND|NOTE_ATTRIB|NOTE_LINK|
553            NOTE_RENAME|NOTE_REVOKE, 0, 0);
554        if (kevent(kq, &ev, 1, NULL, 0, &tout) == -1)
555                err(1, "kevent");
556        for (;;) {
557                nev = kevent(kq, NULL, 0, &ev, 1, &tout);
558                if (nev == -1)
559                        err(1, "kevent");
560                if (nev == 0)
561                        continue;
562                if (ev.fflags & NOTE_DELETE) {
563                        printf("deleted ");
564                        ev.fflags &= ~NOTE_DELETE;
565                }
566                if (ev.fflags & NOTE_WRITE) {
567                        printf("written ");
568                        ev.fflags &= ~NOTE_WRITE;
569                }
570                if (ev.fflags & NOTE_EXTEND) {
571                        printf("extended ");
572                        ev.fflags &= ~NOTE_EXTEND;
573                }
574                if (ev.fflags & NOTE_ATTRIB) {
575                        printf("chmod/chown/utimes ");
576                        ev.fflags &= ~NOTE_ATTRIB;
577                }
578                if (ev.fflags & NOTE_LINK) {
579                        printf("hardlinked ");
580                        ev.fflags &= ~NOTE_LINK;
581                }
582                if (ev.fflags & NOTE_RENAME) {
583                        printf("renamed ");
584                        ev.fflags &= ~NOTE_RENAME;
585                }
586                if (ev.fflags & NOTE_REVOKE) {
587                        printf("revoked ");
588                        ev.fflags &= ~NOTE_REVOKE;
589                }
590                printf("\\n");
591                if (ev.fflags)
592                        warnx("unknown event 0x%x\\n", ev.fflags);
593        }
594}
595.Ed
596.Sh ERRORS
597The
598.Fn kqueue
599function fails if:
600.Bl -tag -width Er
601.It Bq Er EMFILE
602The per-process descriptor table is full.
603.It Bq Er ENFILE
604The system file table is full.
605.It Bq Er ENOMEM
606The kernel failed to allocate enough memory for the kernel queue.
607.El
608.Pp
609The
610.Fn kevent
611function fails if:
612.Bl -tag -width Er
613.It Bq Er EACCES
614The process does not have permission to register a filter.
615.It Bq Er EBADF
616The specified descriptor is invalid.
617.It Bq Er EFAULT
618There was an error reading or writing the
619.Va kevent
620structure.
621.It Bq Er EINTR
622A signal was delivered before the timeout expired and before any
623events were placed on the kqueue for return.
624All changes contained in the
625.Fa changelist
626are applied before returning this error.
627.It Bq Er EINVAL
628The specified time limit or filter is invalid.
629.It Bq Er ENOENT
630The event could not be found to be modified or deleted.
631.It Bq Er ENOMEM
632No memory was available to register the event.
633.It Bq Er EOPNOTSUPP
634This type of file descriptor is not supported for
635.Fn kevent
636operations.
637.It Bq Er ESRCH
638The specified process to attach to does not exist.
639.El
640.Sh SEE ALSO
641.\" .Xr aio_error 2 ,
642.\" .Xr aio_read 2 ,
643.\" .Xr aio_return 2 ,
644.Xr fork 2 ,
645.Xr ioctl 2 ,
646.Xr listen 2 ,
647.Xr poll 2 ,
648.Xr read 2 ,
649.Xr select 2 ,
650.Xr sigaction 2 ,
651.Xr unlink 2 ,
652.Xr write 2 ,
653.Xr signal 3 ,
654.Xr timespec 3 ,
655.Xr kfilter_register 9 ,
656.Xr knote 9
657.Rs
658.%A Jonathan Lemon
659.%T "Kqueue: A Generic and Scalable Event Notification Facility"
660.%I USENIX Association
661.%B Proceedings of the FREENIX Track: 2001 USENIX Annual Technical Conference
662.%D June 25-30, 2001
663.%U http://www.usenix.org/event/usenix01/freenix01/full_papers/lemon/lemon.pdf
664.Re
665.Sh HISTORY
666The
667.Fn kqueue
668and
669.Fn kevent
670functions first appeared in
671.Fx 4.1 ,
672and then in
673.Nx 2.0 .
674The
675.Fn kqueue1
676function first appeared in
677.Nx 6.0 .
678.Pp
679The
680.Fn EV_SET
681macro was protected from evaluating its first argument multiple times in
682.Nx 8.0 .
683.Pp
684The
685.Va udata
686type was changed from intptr_t to void * in
687.Nx 10.0 .
688