xref: /netbsd-src/lib/libc/sys/kqueue.2 (revision 1b9578b8c2c1f848eeb16dabbfd7d1f0d9fdefbd)
1.\"	$NetBSD: kqueue.2,v 1.31 2011/06/26 16:42:41 christos Exp $
2.\"
3.\" Copyright (c) 2000 Jonathan Lemon
4.\" All rights reserved.
5.\"
6.\" Copyright (c) 2001, 2002, 2003 The NetBSD Foundation, Inc.
7.\" All rights reserved.
8.\"
9.\" Portions of this documentation is derived from text contributed by
10.\" Luke Mewburn.
11.\"
12.\" Redistribution and use in source and binary forms, with or without
13.\" modification, are permitted provided that the following conditions
14.\" are met:
15.\" 1. Redistributions of source code must retain the above copyright
16.\"    notice, this list of conditions and the following disclaimer.
17.\" 2. Redistributions in binary form must reproduce the above copyright
18.\"    notice, this list of conditions and the following disclaimer in the
19.\"    documentation and/or other materials provided with the distribution.
20.\"
21.\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND
22.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.\" $FreeBSD: src/lib/libc/sys/kqueue.2,v 1.22 2001/06/27 19:55:57 dd Exp $
34.\"
35.Dd June 24, 2011
36.Dt KQUEUE 2
37.Os
38.Sh NAME
39.Nm kqueue ,
40.Nm kqueue1 ,
41.Nm kevent
42.Nd kernel event notification mechanism
43.Sh LIBRARY
44.Lb libc
45.Sh SYNOPSIS
46.In sys/event.h
47.In sys/time.h
48.Ft int
49.Fn kqueue "void"
50.Ft int
51.Fn kqueue1 "int flags"
52.Ft int
53.Fn kevent "int kq" "const struct kevent *changelist" "size_t nchanges" "struct kevent *eventlist" "size_t nevents" "const struct timespec *timeout"
54.Fn EV_SET "\*[Am]kev" ident filter flags fflags data udata
55.Sh DESCRIPTION
56.Fn kqueue
57provides a generic method of notifying the user when an event
58happens or a condition holds, based on the results of small
59pieces of kernel code termed filters.
60A kevent is identified by the (ident, filter) pair; there may only
61be one unique kevent per kqueue.
62.Pp
63The filter is executed upon the initial registration of a kevent
64in order to detect whether a preexisting condition is present, and is also
65executed whenever an event is passed to the filter for evaluation.
66If the filter determines that the condition should be reported,
67then the kevent is placed on the kqueue for the user to retrieve.
68.Pp
69The filter is also run when the user attempts to retrieve the kevent
70from the kqueue.
71If the filter indicates that the condition that triggered
72the event no longer holds, the kevent is removed from the kqueue and
73is not returned.
74.Pp
75Multiple events which trigger the filter do not result in multiple
76kevents being placed on the kqueue; instead, the filter will aggregate
77the events into a single struct kevent.
78Calling
79.Fn close
80on a file descriptor will remove any kevents that reference the descriptor.
81.Pp
82.Fn kqueue
83creates a new kernel event queue and returns a descriptor.
84.Pp
85The
86.Fn kqueue1
87function also allows the following
88.Fa flags
89to be set on the returned file descriptor:
90.Bl -column O_NONBLOCK -offset indent
91.It Dv O_CLOEXEC
92Set the close on exec property.
93.It Dv O_NONBLOCK
94Set non-blocking I/O.
95.El
96The queue is not inherited by a child created with
97.Xr fork 2 .
98.\" However, if
99.\" .Xr rfork 2
100.\" is called without the
101.\" .Dv RFFDG
102.\" flag, then the descriptor table is shared,
103.\" which will allow sharing of the kqueue between two processes.
104.Pp
105.Fn kevent
106is used to register events with the queue, and return any pending
107events to the user.
108.Fa changelist
109is a pointer to an array of
110.Va kevent
111structures, as defined in
112.In sys/event.h .
113All changes contained in the
114.Fa changelist
115are applied before any pending events are read from the queue.
116.Fa nchanges
117gives the size of
118.Fa changelist .
119.Fa eventlist
120is a pointer to an array of kevent structures.
121.Fa nevents
122determines the size of
123.Fa eventlist .
124If
125.Fa timeout
126is a
127.No non- Ns Dv NULL
128pointer, it specifies a maximum interval to wait
129for an event, which will be interpreted as a struct timespec.
130If
131.Fa timeout
132is a
133.Dv NULL
134pointer,
135.Fn kevent
136waits indefinitely.
137To effect a poll, the
138.Fa timeout
139argument should be
140.No non- Ns Dv NULL ,
141pointing to a zero-valued
142.Va timespec
143structure.
144The same array may be used for the
145.Fa changelist
146and
147.Fa eventlist .
148.Pp
149.Fn EV_SET
150is a macro which is provided for ease of initializing a
151kevent structure.
152.Pp
153The
154.Va kevent
155structure is defined as:
156.Bd -literal
157struct kevent {
158	uintptr_t ident;	/* identifier for this event */
159	uint32_t  filter;	/* filter for event */
160	uint32_t  flags;	/* action flags for kqueue */
161	uint32_t  fflags;	/* filter flag value */
162	int64_t   data;		/* filter data value */
163	intptr_t  udata;	/* opaque user data identifier */
164};
165.Ed
166.Pp
167The fields of
168.Fa struct kevent
169are:
170.Bl -tag -width XXXfilter -offset indent
171.It ident
172Value used to identify this event.
173The exact interpretation is determined by the attached filter,
174but often is a file descriptor.
175.It filter
176Identifies the kernel filter used to process this event.
177There are pre-defined system filters (which are described below), and
178other filters may be added by kernel subsystems as necessary.
179.It flags
180Actions to perform on the event.
181.It fflags
182Filter-specific flags.
183.It data
184Filter-specific data value.
185.It udata
186Opaque user-defined value passed through the kernel unchanged.
187.El
188.Pp
189The
190.Va flags
191field can contain the following values:
192.Bl -tag -width XXXEV_ONESHOT -offset indent
193.It EV_ADD
194Adds the event to the kqueue.
195Re-adding an existing event will modify the parameters of the original
196event, and not result in a duplicate entry.
197Adding an event automatically enables it,
198unless overridden by the EV_DISABLE flag.
199.It EV_ENABLE
200Permit
201.Fn kevent
202to return the event if it is triggered.
203.It EV_DISABLE
204Disable the event so
205.Fn kevent
206will not return it.
207The filter itself is not disabled.
208.It EV_DELETE
209Removes the event from the kqueue.
210Events which are attached to file descriptors are automatically deleted
211on the last close of the descriptor.
212.It EV_ONESHOT
213Causes the event to return only the first occurrence of the filter
214being triggered.
215After the user retrieves the event from the kqueue, it is deleted.
216.It EV_CLEAR
217After the event is retrieved by the user, its state is reset.
218This is useful for filters which report state transitions
219instead of the current state.
220Note that some filters may automatically set this flag internally.
221.It EV_EOF
222Filters may set this flag to indicate a filter-specific EOF condition.
223.It EV_ERROR
224See
225.Sx RETURN VALUES
226below.
227.El
228.Ss Filters
229Filters are identified by a number.
230There are two types of filters: pre-defined filters which
231are described below, and third-party filters that may be added with
232.Xr kfilter_register 9
233by kernel sub-systems, third-party device drivers, or loadable
234kernel modules.
235.Pp
236As a third-party filter is referenced by a well-known name instead
237of a statically assigned number, two
238.Xr ioctl 2 Ns s
239are supported on the file descriptor returned by
240.Fn kqueue
241to map a filter name to a filter number, and vice-versa (passing
242arguments in a structure described below):
243.Bl -tag -width KFILTER_BYFILTER -offset indent
244.It KFILTER_BYFILTER
245Map
246.Va filter
247to
248.Va name ,
249which is of size
250.Va len .
251.It KFILTER_BYNAME
252Map
253.Va name
254to
255.Va filter .
256.Va len
257is ignored.
258.El
259.Pp
260The following structure is used to pass arguments in and out of the
261.Xr ioctl 2 :
262.Bd -literal -offset indent
263struct kfilter_mapping {
264	char	 *name;		/* name to lookup or return */
265	size_t	 len;		/* length of name */
266	uint32_t filter;	/* filter to lookup or return */
267};
268.Ed
269.Pp
270Arguments may be passed to and from the filter via the
271.Va fflags
272and
273.Va data
274fields in the kevent structure.
275.Pp
276The predefined system filters are:
277.Bl -tag -width EVFILT_SIGNAL
278.It EVFILT_READ
279Takes a descriptor as the identifier, and returns whenever
280there is data available to read.
281The behavior of the filter is slightly different depending
282on the descriptor type.
283.Pp
284.Bl -tag -width 2n
285.It Sockets
286Sockets which have previously been passed to
287.Fn listen
288return when there is an incoming connection pending.
289.Va data
290contains the size of the listen backlog (i.e., the number of
291connections ready to be accepted with
292.Xr accept 2 . )
293.Pp
294Other socket descriptors return when there is data to be read,
295subject to the
296.Dv SO_RCVLOWAT
297value of the socket buffer.
298This may be overridden with a per-filter low water mark at the
299time the filter is added by setting the
300NOTE_LOWAT
301flag in
302.Va fflags ,
303and specifying the new low water mark in
304.Va data .
305On return,
306.Va data
307contains the number of bytes in the socket buffer.
308.Pp
309If the read direction of the socket has been shut down, then the filter
310also sets EV_EOF in
311.Va flags ,
312and returns the socket error (if any) in
313.Va fflags .
314It is possible for EOF to be returned (indicating the connection is gone)
315while there is still data pending in the socket buffer.
316.It Vnodes
317Returns when the file pointer is not at the end of file.
318.Va data
319contains the offset from current position to end of file,
320and may be negative.
321.It "Fifos, Pipes"
322Returns when there is data to read;
323.Va data
324contains the number of bytes available.
325.Pp
326When the last writer disconnects, the filter will set EV_EOF in
327.Va flags .
328This may be cleared by passing in EV_CLEAR, at which point the
329filter will resume waiting for data to become available before
330returning.
331.El
332.It EVFILT_WRITE
333Takes a descriptor as the identifier, and returns whenever
334it is possible to write to the descriptor.
335For sockets, pipes, fifos, and ttys,
336.Va data
337will contain the amount of space remaining in the write buffer.
338The filter will set EV_EOF when the reader disconnects, and for
339the fifo case, this may be cleared by use of EV_CLEAR.
340Note that this filter is not supported for vnodes.
341.Pp
342For sockets, the low water mark and socket error handling are
343identical to the EVFILT_READ case.
344.It EVFILT_AIO
345This is not implemented in
346.Nx .
347.ig
348The sigevent portion of the AIO request is filled in, with
349.Va sigev_notify_kqueue
350containing the descriptor of the kqueue that the event should
351be attached to,
352.Va sigev_value
353containing the udata value, and
354.Va sigev_notify
355set to SIGEV_EVENT.
356When the aio_* function is called, the event will be registered
357with the specified kqueue, and the
358.Va ident
359argument set to the
360.Fa struct aiocb
361returned by the aio_* function.
362The filter returns under the same conditions as aio_error.
363.Pp
364Alternatively, a kevent structure may be initialized, with
365.Va ident
366containing the descriptor of the kqueue, and the
367address of the kevent structure placed in the
368.Va aio_lio_opcode
369field of the AIO request.
370However, this approach will not work on
371architectures with 64-bit pointers, and should be considered deprecated.
372..
373.It EVFILT_VNODE
374Takes a file descriptor as the identifier and the events to watch for in
375.Va fflags ,
376and returns when one or more of the requested events occurs on the descriptor.
377The events to monitor are:
378.Bl -tag -width XXNOTE_RENAME
379.It NOTE_DELETE
380.Fn unlink
381was called on the file referenced by the descriptor.
382.It NOTE_WRITE
383A write occurred on the file referenced by the descriptor.
384.It NOTE_EXTEND
385The file referenced by the descriptor was extended.
386.It NOTE_ATTRIB
387The file referenced by the descriptor had its attributes changed.
388.It NOTE_LINK
389The link count on the file changed.
390.It NOTE_RENAME
391The file referenced by the descriptor was renamed.
392.It NOTE_REVOKE
393Access to the file was revoked via
394.Xr revoke 2
395or the underlying filesystem was unmounted.
396.El
397.Pp
398On return,
399.Va fflags
400contains the events which triggered the filter.
401.It EVFILT_PROC
402Takes the process ID to monitor as the identifier and the events to watch for
403in
404.Va fflags ,
405and returns when the process performs one or more of the requested events.
406If a process can normally see another process, it can attach an event to it.
407The events to monitor are:
408.Bl -tag -width XXNOTE_TRACKERR
409.It NOTE_EXIT
410The process has exited.
411.It NOTE_FORK
412The process has called
413.Fn fork .
414.It NOTE_EXEC
415The process has executed a new process via
416.Xr execve 2
417or similar call.
418.It NOTE_TRACK
419Follow a process across
420.Fn fork
421calls.
422The parent process will return with NOTE_TRACK set in the
423.Va fflags
424field, while the child process will return with NOTE_CHILD set in
425.Va fflags
426and the parent PID in
427.Va data .
428.It NOTE_TRACKERR
429This flag is returned if the system was unable to attach an event to
430the child process, usually due to resource limitations.
431.El
432.Pp
433On return,
434.Va fflags
435contains the events which triggered the filter.
436.It EVFILT_SIGNAL
437Takes the signal number to monitor as the identifier and returns
438when the given signal is delivered to the current process.
439This coexists with the
440.Fn signal
441and
442.Fn sigaction
443facilities, and has a lower precedence.
444The filter will record
445all attempts to deliver a signal to a process, even if the signal has
446been marked as SIG_IGN.
447Event notification happens after normal signal delivery processing.
448.Va data
449returns the number of times the signal has occurred since the last call to
450.Fn kevent .
451This filter automatically sets the EV_CLEAR flag internally.
452.It EVFILT_TIMER
453Establishes an arbitrary timer identified by
454.Va ident .
455When adding a timer,
456.Va data
457specifies the timeout period in milliseconds.
458The timer will be periodic unless EV_ONESHOT is specified.
459On return,
460.Va data
461contains the number of times the timeout has expired since the last call to
462.Fn kevent .
463This filter automatically sets the EV_CLEAR flag internally.
464.El
465.Sh RETURN VALUES
466.Fn kqueue
467creates a new kernel event queue and returns a file descriptor.
468If there was an error creating the kernel event queue, a value of \-1 is
469returned and errno is set to indicate the error.
470.Pp
471.Fn kevent
472returns the number of events placed in the
473.Fa eventlist ,
474up to the value given by
475.Fa nevents .
476If an error occurs while processing an element of the
477.Fa changelist
478and there is enough room in the
479.Fa eventlist ,
480then the event will be placed in the
481.Fa eventlist
482with
483.Dv EV_ERROR
484set in
485.Va flags
486and the system error in
487.Va data .
488Otherwise,
489.Dv \-1
490will be returned, and
491.Dv errno
492will be set to indicate the error condition.
493If the time limit expires, then
494.Fn kevent
495returns 0.
496.Sh EXAMPLES
497The following example program monitors a file (provided to it as the first
498argument) and prints information about some common events it receives
499notifications for:
500.Bd -literal -offset indent
501#include \*[Lt]sys/types.h\*[Gt]
502#include \*[Lt]sys/event.h\*[Gt]
503#include \*[Lt]sys/time.h\*[Gt]
504#include \*[Lt]stdio.h\*[Gt]
505#include \*[Lt]unistd.h\*[Gt]
506#include \*[Lt]stdlib.h\*[Gt]
507#include \*[Lt]fcntl.h\*[Gt]
508#include \*[Lt]err.h\*[Gt]
509
510int
511main(int argc, char *argv[])
512{
513        int fd, kq, nev;
514        struct kevent ev;
515        static const struct timespec tout = { 1, 0 };
516
517        if ((fd = open(argv[1], O_RDONLY)) == -1)
518                err(1, "Cannot open `%s'", argv[1]);
519
520        if ((kq = kqueue()) == -1)
521                err(1, "Cannot create kqueue");
522
523        EV_SET(\*[Am]ev, fd, EVFILT_VNODE, EV_ADD | EV_ENABLE | EV_CLEAR,
524            NOTE_DELETE|NOTE_WRITE|NOTE_EXTEND|NOTE_ATTRIB|NOTE_LINK|
525            NOTE_RENAME|NOTE_REVOKE, 0, 0);
526        if (kevent(kq, \*[Am]ev, 1, NULL, 0, \*[Am]tout) == -1)
527                err(1, "kevent");
528        for (;;) {
529                nev = kevent(kq, NULL, 0, \*[Am]ev, 1, \*[Am]tout);
530                if (nev == -1)
531                        err(1, "kevent");
532                if (nev == 0)
533                        continue;
534                if (ev.fflags \*[Am] NOTE_DELETE) {
535                        printf("deleted ");
536                        ev.fflags \*[Am]= ~NOTE_DELETE;
537                }
538                if (ev.fflags \*[Am] NOTE_WRITE) {
539                        printf("written ");
540                        ev.fflags \*[Am]= ~NOTE_WRITE;
541                }
542                if (ev.fflags \*[Am] NOTE_EXTEND) {
543                        printf("extended ");
544                        ev.fflags \*[Am]= ~NOTE_EXTEND;
545                }
546                if (ev.fflags \*[Am] NOTE_ATTRIB) {
547                        printf("chmod/chown/utimes ");
548                        ev.fflags \*[Am]= ~NOTE_ATTRIB;
549                }
550                if (ev.fflags \*[Am] NOTE_LINK) {
551                        printf("hardlinked ");
552                        ev.fflags \*[Am]= ~NOTE_LINK;
553                }
554                if (ev.fflags \*[Am] NOTE_RENAME) {
555                        printf("renamed ");
556                        ev.fflags \*[Am]= ~NOTE_RENAME;
557                }
558                if (ev.fflags \*[Am] NOTE_REVOKE) {
559                        printf("revoked ");
560                        ev.fflags \*[Am]= ~NOTE_REVOKE;
561                }
562                printf("\\n");
563                if (ev.fflags)
564                        warnx("unknown event 0x%x\\n", ev.fflags);
565        }
566}
567.Ed
568.Sh ERRORS
569The
570.Fn kqueue
571function fails if:
572.Bl -tag -width Er
573.It Bq Er EMFILE
574The per-process descriptor table is full.
575.It Bq Er ENFILE
576The system file table is full.
577.It Bq Er ENOMEM
578The kernel failed to allocate enough memory for the kernel queue.
579.El
580.Pp
581The
582.Fn kevent
583function fails if:
584.Bl -tag -width Er
585.It Bq Er EACCES
586The process does not have permission to register a filter.
587.It Bq Er EBADF
588The specified descriptor is invalid.
589.It Bq Er EFAULT
590There was an error reading or writing the
591.Va kevent
592structure.
593.It Bq Er EINTR
594A signal was delivered before the timeout expired and before any
595events were placed on the kqueue for return.
596.It Bq Er EINVAL
597The specified time limit or filter is invalid.
598.It Bq Er ENOENT
599The event could not be found to be modified or deleted.
600.It Bq Er ENOMEM
601No memory was available to register the event.
602.It Bq Er ESRCH
603The specified process to attach to does not exist.
604.El
605.Sh SEE ALSO
606.\" .Xr aio_error 2 ,
607.\" .Xr aio_read 2 ,
608.\" .Xr aio_return 2 ,
609.Xr ioctl 2 ,
610.Xr poll 2 ,
611.Xr read 2 ,
612.Xr select 2 ,
613.Xr sigaction 2 ,
614.Xr write 2 ,
615.Xr signal 3 ,
616.Xr kfilter_register 9 ,
617.Xr knote 9
618.Rs
619.%A Jonathan Lemon
620.%T "Kqueue: A Generic and Scalable Event Notification Facility"
621.%I USENIX Association
622.%B Proceedings of the FREENIX Track: 2001 USENIX Annual Technical Conference
623.%D June 25-30, 2001
624.%U http://www.usenix.org/event/usenix01/freenix01/full_papers/lemon/lemon.pdf
625.Re
626.Sh HISTORY
627The
628.Fn kqueue
629and
630.Fn kevent
631functions first appeared in
632.Fx 4.1 ,
633and then in
634.Nx 2.0 .
635The
636.Fn kqueue1
637function first appeared in
638.Nx 6.0 .
639