xref: /netbsd-src/lib/libc/sys/kqueue.2 (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1.\"	$NetBSD: kqueue.2,v 1.29 2010/04/13 10:45:46 wiz Exp $
2.\"
3.\" Copyright (c) 2000 Jonathan Lemon
4.\" All rights reserved.
5.\"
6.\" Copyright (c) 2001, 2002, 2003 The NetBSD Foundation, Inc.
7.\" All rights reserved.
8.\"
9.\" Portions of this documentation is derived from text contributed by
10.\" Luke Mewburn.
11.\"
12.\" Redistribution and use in source and binary forms, with or without
13.\" modification, are permitted provided that the following conditions
14.\" are met:
15.\" 1. Redistributions of source code must retain the above copyright
16.\"    notice, this list of conditions and the following disclaimer.
17.\" 2. Redistributions in binary form must reproduce the above copyright
18.\"    notice, this list of conditions and the following disclaimer in the
19.\"    documentation and/or other materials provided with the distribution.
20.\"
21.\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND
22.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.\" $FreeBSD: src/lib/libc/sys/kqueue.2,v 1.22 2001/06/27 19:55:57 dd Exp $
34.\"
35.Dd April 13, 2010
36.Dt KQUEUE 2
37.Os
38.Sh NAME
39.Nm kqueue ,
40.Nm kevent
41.Nd kernel event notification mechanism
42.Sh LIBRARY
43.Lb libc
44.Sh SYNOPSIS
45.In sys/event.h
46.In sys/time.h
47.Ft int
48.Fn kqueue "void"
49.Ft int
50.Fn kevent "int kq" "const struct kevent *changelist" "size_t nchanges" "struct kevent *eventlist" "size_t nevents" "const struct timespec *timeout"
51.Fn EV_SET "\*[Am]kev" ident filter flags fflags data udata
52.Sh DESCRIPTION
53.Fn kqueue
54provides a generic method of notifying the user when an event
55happens or a condition holds, based on the results of small
56pieces of kernel code termed filters.
57A kevent is identified by the (ident, filter) pair; there may only
58be one unique kevent per kqueue.
59.Pp
60The filter is executed upon the initial registration of a kevent
61in order to detect whether a preexisting condition is present, and is also
62executed whenever an event is passed to the filter for evaluation.
63If the filter determines that the condition should be reported,
64then the kevent is placed on the kqueue for the user to retrieve.
65.Pp
66The filter is also run when the user attempts to retrieve the kevent
67from the kqueue.
68If the filter indicates that the condition that triggered
69the event no longer holds, the kevent is removed from the kqueue and
70is not returned.
71.Pp
72Multiple events which trigger the filter do not result in multiple
73kevents being placed on the kqueue; instead, the filter will aggregate
74the events into a single struct kevent.
75Calling
76.Fn close
77on a file descriptor will remove any kevents that reference the descriptor.
78.Pp
79.Fn kqueue
80creates a new kernel event queue and returns a descriptor.
81The queue is not inherited by a child created with
82.Xr fork 2 .
83.\" However, if
84.\" .Xr rfork 2
85.\" is called without the
86.\" .Dv RFFDG
87.\" flag, then the descriptor table is shared,
88.\" which will allow sharing of the kqueue between two processes.
89.Pp
90.Fn kevent
91is used to register events with the queue, and return any pending
92events to the user.
93.Fa changelist
94is a pointer to an array of
95.Va kevent
96structures, as defined in
97.In sys/event.h .
98All changes contained in the
99.Fa changelist
100are applied before any pending events are read from the queue.
101.Fa nchanges
102gives the size of
103.Fa changelist .
104.Fa eventlist
105is a pointer to an array of kevent structures.
106.Fa nevents
107determines the size of
108.Fa eventlist .
109If
110.Fa timeout
111is a
112.No non- Ns Dv NULL
113pointer, it specifies a maximum interval to wait
114for an event, which will be interpreted as a struct timespec.
115If
116.Fa timeout
117is a
118.Dv NULL
119pointer,
120.Fn kevent
121waits indefinitely.
122To effect a poll, the
123.Fa timeout
124argument should be
125.No non- Ns Dv NULL ,
126pointing to a zero-valued
127.Va timespec
128structure.
129The same array may be used for the
130.Fa changelist
131and
132.Fa eventlist .
133.Pp
134.Fn EV_SET
135is a macro which is provided for ease of initializing a
136kevent structure.
137.Pp
138The
139.Va kevent
140structure is defined as:
141.Bd -literal
142struct kevent {
143	uintptr_t ident;	/* identifier for this event */
144	uint32_t  filter;	/* filter for event */
145	uint32_t  flags;	/* action flags for kqueue */
146	uint32_t  fflags;	/* filter flag value */
147	int64_t   data;		/* filter data value */
148	intptr_t  udata;	/* opaque user data identifier */
149};
150.Ed
151.Pp
152The fields of
153.Fa struct kevent
154are:
155.Bl -tag -width XXXfilter -offset indent
156.It ident
157Value used to identify this event.
158The exact interpretation is determined by the attached filter,
159but often is a file descriptor.
160.It filter
161Identifies the kernel filter used to process this event.
162There are pre-defined system filters (which are described below), and
163other filters may be added by kernel subsystems as necessary.
164.It flags
165Actions to perform on the event.
166.It fflags
167Filter-specific flags.
168.It data
169Filter-specific data value.
170.It udata
171Opaque user-defined value passed through the kernel unchanged.
172.El
173.Pp
174The
175.Va flags
176field can contain the following values:
177.Bl -tag -width XXXEV_ONESHOT -offset indent
178.It EV_ADD
179Adds the event to the kqueue.
180Re-adding an existing event will modify the parameters of the original
181event, and not result in a duplicate entry.
182Adding an event automatically enables it,
183unless overridden by the EV_DISABLE flag.
184.It EV_ENABLE
185Permit
186.Fn kevent
187to return the event if it is triggered.
188.It EV_DISABLE
189Disable the event so
190.Fn kevent
191will not return it.
192The filter itself is not disabled.
193.It EV_DELETE
194Removes the event from the kqueue.
195Events which are attached to file descriptors are automatically deleted
196on the last close of the descriptor.
197.It EV_ONESHOT
198Causes the event to return only the first occurrence of the filter
199being triggered.
200After the user retrieves the event from the kqueue, it is deleted.
201.It EV_CLEAR
202After the event is retrieved by the user, its state is reset.
203This is useful for filters which report state transitions
204instead of the current state.
205Note that some filters may automatically set this flag internally.
206.It EV_EOF
207Filters may set this flag to indicate filter-specific EOF condition.
208.It EV_ERROR
209See
210.Sx RETURN VALUES
211below.
212.El
213.Ss Filters
214Filters are identified by a number.
215There are two types of filters: pre-defined filters which
216are described below, and third-party filters that may be added with
217.Xr kfilter_register 9
218by kernel sub-systems, third-party device drivers, or loadable
219kernel modules.
220.Pp
221As a third-party filter is referenced by a well-known name instead
222of a statically assigned number, two
223.Xr ioctl 2 Ns s
224are supported on the file descriptor returned by
225.Fn kqueue
226to map a filter name to a filter number, and vice-versa (passing
227arguments in a structure described below):
228.Bl -tag -width KFILTER_BYFILTER -offset indent
229.It KFILTER_BYFILTER
230Map
231.Va filter
232to
233.Va name ,
234which is of size
235.Va len .
236.It KFILTER_BYNAME
237Map
238.Va name
239to
240.Va filter .
241.Va len
242is ignored.
243.El
244.Pp
245The following structure is used to pass arguments in and out of the
246.Xr ioctl 2 :
247.Bd -literal -offset indent
248struct kfilter_mapping {
249	char	 *name;		/* name to lookup or return */
250	size_t	 len;		/* length of name */
251	uint32_t filter;	/* filter to lookup or return */
252};
253.Ed
254.Pp
255Arguments may be passed to and from the filter via the
256.Va fflags
257and
258.Va data
259fields in the kevent structure.
260.Pp
261The predefined system filters are:
262.Bl -tag -width EVFILT_SIGNAL
263.It EVFILT_READ
264Takes a descriptor as the identifier, and returns whenever
265there is data available to read.
266The behavior of the filter is slightly different depending
267on the descriptor type.
268.Pp
269.Bl -tag -width 2n
270.It Sockets
271Sockets which have previously been passed to
272.Fn listen
273return when there is an incoming connection pending.
274.Va data
275contains the size of the listen backlog (i.e., the number of
276connections ready to be accepted with
277.Xr accept 2 . )
278.Pp
279Other socket descriptors return when there is data to be read,
280subject to the
281.Dv SO_RCVLOWAT
282value of the socket buffer.
283This may be overridden with a per-filter low water mark at the
284time the filter is added by setting the
285NOTE_LOWAT
286flag in
287.Va fflags ,
288and specifying the new low water mark in
289.Va data .
290On return,
291.Va data
292contains the number of bytes in the socket buffer.
293.Pp
294If the read direction of the socket has been shut down, then the filter
295also sets EV_EOF in
296.Va flags ,
297and returns the socket error (if any) in
298.Va fflags .
299It is possible for EOF to be returned (indicating the connection is gone)
300while there is still data pending in the socket buffer.
301.It Vnodes
302Returns when the file pointer is not at the end of file.
303.Va data
304contains the offset from current position to end of file,
305and may be negative.
306.It "Fifos, Pipes"
307Returns when there is data to read;
308.Va data
309contains the number of bytes available.
310.Pp
311When the last writer disconnects, the filter will set EV_EOF in
312.Va flags .
313This may be cleared by passing in EV_CLEAR, at which point the
314filter will resume waiting for data to become available before
315returning.
316.El
317.It EVFILT_WRITE
318Takes a descriptor as the identifier, and returns whenever
319it is possible to write to the descriptor.
320For sockets, pipes, fifos, and ttys,
321.Va data
322will contain the amount of space remaining in the write buffer.
323The filter will set EV_EOF when the reader disconnects, and for
324the fifo case, this may be cleared by use of EV_CLEAR.
325Note that this filter is not supported for vnodes.
326.Pp
327For sockets, the low water mark and socket error handling are
328identical to the EVFILT_READ case.
329.It EVFILT_AIO
330This is not implemented in
331.Nx .
332.ig
333The sigevent portion of the AIO request is filled in, with
334.Va sigev_notify_kqueue
335containing the descriptor of the kqueue that the event should
336be attached to,
337.Va sigev_value
338containing the udata value, and
339.Va sigev_notify
340set to SIGEV_EVENT.
341When the aio_* function is called, the event will be registered
342with the specified kqueue, and the
343.Va ident
344argument set to the
345.Fa struct aiocb
346returned by the aio_* function.
347The filter returns under the same conditions as aio_error.
348.Pp
349Alternatively, a kevent structure may be initialized, with
350.Va ident
351containing the descriptor of the kqueue, and the
352address of the kevent structure placed in the
353.Va aio_lio_opcode
354field of the AIO request.
355However, this approach will not work on
356architectures with 64-bit pointers, and should be considered deprecated.
357..
358.It EVFILT_VNODE
359Takes a file descriptor as the identifier and the events to watch for in
360.Va fflags ,
361and returns when one or more of the requested events occurs on the descriptor.
362The events to monitor are:
363.Bl -tag -width XXNOTE_RENAME
364.It NOTE_DELETE
365.Fn unlink
366was called on the file referenced by the descriptor.
367.It NOTE_WRITE
368A write occurred on the file referenced by the descriptor.
369.It NOTE_EXTEND
370The file referenced by the descriptor was extended.
371.It NOTE_ATTRIB
372The file referenced by the descriptor had its attributes changed.
373.It NOTE_LINK
374The link count on the file changed.
375.It NOTE_RENAME
376The file referenced by the descriptor was renamed.
377.It NOTE_REVOKE
378Access to the file was revoked via
379.Xr revoke 2
380or the underlying filesystem was unmounted.
381.El
382.Pp
383On return,
384.Va fflags
385contains the events which triggered the filter.
386.It EVFILT_PROC
387Takes the process ID to monitor as the identifier and the events to watch for
388in
389.Va fflags ,
390and returns when the process performs one or more of the requested events.
391If a process can normally see another process, it can attach an event to it.
392The events to monitor are:
393.Bl -tag -width XXNOTE_TRACKERR
394.It NOTE_EXIT
395The process has exited.
396.It NOTE_FORK
397The process has called
398.Fn fork .
399.It NOTE_EXEC
400The process has executed a new process via
401.Xr execve 2
402or similar call.
403.It NOTE_TRACK
404Follow a process across
405.Fn fork
406calls.
407The parent process will return with NOTE_TRACK set in the
408.Va fflags
409field, while the child process will return with NOTE_CHILD set in
410.Va fflags
411and the parent PID in
412.Va data .
413.It NOTE_TRACKERR
414This flag is returned if the system was unable to attach an event to
415the child process, usually due to resource limitations.
416.El
417.Pp
418On return,
419.Va fflags
420contains the events which triggered the filter.
421.It EVFILT_SIGNAL
422Takes the signal number to monitor as the identifier and returns
423when the given signal is delivered to the current process.
424This coexists with the
425.Fn signal
426and
427.Fn sigaction
428facilities, and has a lower precedence.
429The filter will record
430all attempts to deliver a signal to a process, even if the signal has
431been marked as SIG_IGN.
432Event notification happens after normal signal delivery processing.
433.Va data
434returns the number of times the signal has occurred since the last call to
435.Fn kevent .
436This filter automatically sets the EV_CLEAR flag internally.
437.It EVFILT_TIMER
438Establishes an arbitrary timer identified by
439.Va ident .
440When adding a timer,
441.Va data
442specifies the timeout period in milliseconds.
443The timer will be periodic unless EV_ONESHOT is specified.
444On return,
445.Va data
446contains the number of times the timeout has expired since the last call to
447.Fn kevent .
448This filter automatically sets the EV_CLEAR flag internally.
449.El
450.Sh RETURN VALUES
451.Fn kqueue
452creates a new kernel event queue and returns a file descriptor.
453If there was an error creating the kernel event queue, a value of \-1 is
454returned and errno set.
455.Pp
456.Fn kevent
457returns the number of events placed in the
458.Fa eventlist ,
459up to the value given by
460.Fa nevents .
461If an error occurs while processing an element of the
462.Fa changelist
463and there is enough room in the
464.Fa eventlist ,
465then the event will be placed in the
466.Fa eventlist
467with
468.Dv EV_ERROR
469set in
470.Va flags
471and the system error in
472.Va data .
473Otherwise,
474.Dv \-1
475will be returned, and
476.Dv errno
477will be set to indicate the error condition.
478If the time limit expires, then
479.Fn kevent
480returns 0.
481.Sh EXAMPLES
482The following example program monitors a file (provided to it as the first
483argument) and prints information about some common events it receives
484notifications for:
485.Bd -literal -offset indent
486#include \*[Lt]sys/types.h\*[Gt]
487#include \*[Lt]sys/event.h\*[Gt]
488#include \*[Lt]sys/time.h\*[Gt]
489#include \*[Lt]stdio.h\*[Gt]
490#include \*[Lt]unistd.h\*[Gt]
491#include \*[Lt]stdlib.h\*[Gt]
492#include \*[Lt]fcntl.h\*[Gt]
493#include \*[Lt]err.h\*[Gt]
494
495int
496main(int argc, char *argv[])
497{
498        int fd, kq, nev;
499        struct kevent ev;
500        static const struct timespec tout = { 1, 0 };
501
502        if ((fd = open(argv[1], O_RDONLY)) == -1)
503                err(1, "Cannot open `%s'", argv[1]);
504
505        if ((kq = kqueue()) == -1)
506                err(1, "Cannot create kqueue");
507
508        EV_SET(\*[Am]ev, fd, EVFILT_VNODE, EV_ADD | EV_ENABLE | EV_CLEAR,
509            NOTE_DELETE|NOTE_WRITE|NOTE_EXTEND|NOTE_ATTRIB|NOTE_LINK|
510            NOTE_RENAME|NOTE_REVOKE, 0, 0);
511        if (kevent(kq, \*[Am]ev, 1, NULL, 0, \*[Am]tout) == -1)
512                err(1, "kevent");
513        for (;;) {
514                nev = kevent(kq, NULL, 0, \*[Am]ev, 1, \*[Am]tout);
515                if (nev == -1)
516                        err(1, "kevent");
517                if (nev == 0)
518                        continue;
519                if (ev.fflags \*[Am] NOTE_DELETE) {
520                        printf("deleted ");
521                        ev.fflags \*[Am]= ~NOTE_DELETE;
522                }
523                if (ev.fflags \*[Am] NOTE_WRITE) {
524                        printf("written ");
525                        ev.fflags \*[Am]= ~NOTE_WRITE;
526                }
527                if (ev.fflags \*[Am] NOTE_EXTEND) {
528                        printf("extended ");
529                        ev.fflags \*[Am]= ~NOTE_EXTEND;
530                }
531                if (ev.fflags \*[Am] NOTE_ATTRIB) {
532                        printf("chmod/chown/utimes ");
533                        ev.fflags \*[Am]= ~NOTE_ATTRIB;
534                }
535                if (ev.fflags \*[Am] NOTE_LINK) {
536                        printf("hardlinked ");
537                        ev.fflags \*[Am]= ~NOTE_LINK;
538                }
539                if (ev.fflags \*[Am] NOTE_RENAME) {
540                        printf("renamed ");
541                        ev.fflags \*[Am]= ~NOTE_RENAME;
542                }
543                if (ev.fflags \*[Am] NOTE_REVOKE) {
544                        printf("revoked ");
545                        ev.fflags \*[Am]= ~NOTE_REVOKE;
546                }
547                printf("\\n");
548                if (ev.fflags)
549                        warnx("unknown event 0x%x", ev.fflags);
550        }
551}
552.Ed
553.Sh ERRORS
554The
555.Fn kqueue
556function fails if:
557.Bl -tag -width Er
558.It Bq Er EMFILE
559The per-process descriptor table is full.
560.It Bq Er ENFILE
561The system file table is full.
562.It Bq Er ENOMEM
563The kernel failed to allocate enough memory for the kernel queue.
564.El
565.Pp
566The
567.Fn kevent
568function fails if:
569.Bl -tag -width Er
570.It Bq Er EACCES
571The process does not have permission to register a filter.
572.It Bq Er EBADF
573The specified descriptor is invalid.
574.It Bq Er EFAULT
575There was an error reading or writing the
576.Va kevent
577structure.
578.It Bq Er EINTR
579A signal was delivered before the timeout expired and before any
580events were placed on the kqueue for return.
581.It Bq Er EINVAL
582The specified time limit or filter is invalid.
583.It Bq Er ENOENT
584The event could not be found to be modified or deleted.
585.It Bq Er ENOMEM
586No memory was available to register the event.
587.It Bq Er ESRCH
588The specified process to attach to does not exist.
589.El
590.Sh SEE ALSO
591.\" .Xr aio_error 2 ,
592.\" .Xr aio_read 2 ,
593.\" .Xr aio_return 2 ,
594.Xr ioctl 2 ,
595.Xr poll 2 ,
596.Xr read 2 ,
597.Xr select 2 ,
598.Xr sigaction 2 ,
599.Xr write 2 ,
600.Xr signal 3 ,
601.Xr kfilter_register 9 ,
602.Xr knote 9
603.Rs
604.%A Jonathan Lemon
605.%T "Kqueue: A Generic and Scalable Event Notification Facility"
606.%I USENIX Association
607.%B Proceedings of the FREENIX Track: 2001 USENIX Annual Technical Conference
608.%D June 25-30, 2001
609.%U http://www.usenix.org/event/usenix01/freenix01/full_papers/lemon/lemon.pdf
610.Re
611.Sh HISTORY
612The
613.Fn kqueue
614and
615.Fn kevent
616functions first appeared in
617.Fx 4.1 ,
618and then in
619.Nx 2.0 .
620