xref: /netbsd-src/sys/compat/linux/common/linux_inotify.c (revision ba958ad86f5e190a72c2d7504e3b550ca7e18afc)
1 /*	$NetBSD: linux_inotify.c,v 1.4 2023/08/23 19:17:59 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Theodore Preduta.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 #include <sys/cdefs.h>
32 __KERNEL_RCSID(0, "$NetBSD: linux_inotify.c,v 1.4 2023/08/23 19:17:59 christos Exp $");
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/bitops.h>
37 #include <sys/dirent.h>
38 #include <sys/event.h>
39 #include <sys/eventvar.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/filedesc.h>
43 #include <sys/fcntl.h>
44 #include <sys/poll.h>
45 #include <sys/proc.h>
46 #include <sys/selinfo.h>
47 #include <sys/select.h>
48 #include <sys/signal.h>
49 #include <sys/vnode.h>
50 
51 #include <sys/syscallargs.h>
52 
53 #include <compat/linux/common/linux_machdep.h>
54 #include <compat/linux/common/linux_fcntl.h>
55 #include <compat/linux/common/linux_inotify.h>
56 #include <compat/linux/common/linux_ipc.h>
57 #include <compat/linux/common/linux_sched.h>
58 #include <compat/linux/common/linux_sem.h>
59 #include <compat/linux/common/linux_signal.h>
60 
61 #include <compat/linux/linux_syscallargs.h>
62 
63 /*
64  * inotify(2).  This interface allows the user to get file system
65  * events and (unlike kqueue(2)) their order is strictly preserved.
66  * While nice, the API has sufficient gotchas that mean we don't want
67  * to add native entry points for it.  They are:
68  *
69  * - Because data is returned via read(2), this API is prone to
70  *   unaligned memory accesses.  There is a note in the Linux man page
71  *   that says the name field of struct linux_inotify_event *can* be
72  *   used for alignment purposes.  In practice, even Linux doesn't
73  *   always do this, so for simplicity, we don't ever do this.
74  */
75 
/* Upper bound on the number of events kept in one inotify queue. */
#define	LINUX_INOTIFY_MAX_QUEUED	16384
/* Size of the scratch array that receives the events of one kevent. */
#define	LINUX_INOTIFY_MAX_FROM_KEVENT	3

/* Debug messages go through uprintf() only when DEBUG_LINUX is enabled. */
#if DEBUG_LINUX
#define	DPRINTF(x) uprintf x
#else
#define	DPRINTF(x) __nothing
#endif
84 
/*
 * One pending inotify event: the Linux event header together with
 * storage for the optional name that accompanies it.
 */
struct inotify_entry {
	TAILQ_ENTRY(inotify_entry)	ie_entries;	/* link in ifd_qhead */
	char				ie_name[NAME_MAX + 1];	/* NUL-terminated name */
	struct linux_inotify_event	ie_event;	/* wd, mask, cookie, len */
};
90 
/*
 * Snapshot of the contents of a watched directory: ide_count
 * (name, file number) pairs stored in a flexible array member.
 * Watches on non-directories use a snapshot with ide_count == 0.
 */
struct inotify_dir_entries {
	size_t	ide_count;		/* number of valid ide_entries */
	struct inotify_dir_entry {
		char	name[NAME_MAX + 1];	/* entry name */
		ino_t	fileno;			/* entry file number */
	} ide_entries[];
};
/*
 * Number of bytes needed for a snapshot holding count entries.  The
 * argument is parenthesized so that expressions such as "n + 1" are
 * multiplied as a whole.
 */
#define	INOTIFY_DIR_ENTRIES_SIZE(count)	(sizeof(struct inotify_dir_entries) \
    + (count) * sizeof(struct inotify_dir_entry))
100 
/*
 * Per-descriptor state of one inotify instance; stored in f_data of
 * the file created by do_inotify_init().
 */
struct inotifyfd {
	int		ifd_kqfd;	/* kqueue fd used by this inotify */
					/* instance */
	struct selinfo	ifd_sel;	/* for EVFILT_READ by epoll */
	kmutex_t	ifd_lock;	/* lock for ifd_sel, ifd_wds and */
					/* ifd_nwds */

	struct inotify_dir_entries **ifd_wds;
					/* indexed by watch descriptor; */
					/* NULL means "not a watch" */
					/* for directories: snapshot of the */
					/* directory state */
					/* for files: an inotify_dir_entries */
					/* with ide_count == 0 */
	size_t		ifd_nwds;	/* number of slots in ifd_wds, i.e. */
					/* the largest storable watch */
					/* descriptor + 1 */

	TAILQ_HEAD(, inotify_entry) ifd_qhead;	/* queue of pending events */
	size_t		ifd_qcount;	/* number of pending events */
	kcondvar_t	ifd_qcv;	/* condvar for blocking reads */
	kmutex_t	ifd_qlock;	/* lock for ifd_q* and interlock */
					/* for ifd_qcv */
};
123 
/*
 * One row of a translation table: an inotify mask bit (or bits) and
 * the kevent fflags it corresponds to.
 */
struct inotify_kevent_mask_pair {
	uint32_t inotify;	/* LINUX_IN_* bits */
	uint32_t kevent;	/* NOTE_* bits */
};
128 
/* Helpers used by the syscall entry points and the event translation. */
static int	inotify_kev_fetch_changes(void *, const struct kevent *,
    struct kevent *, size_t, int);
static int	do_inotify_init(struct lwp *, register_t *, int);
static int	inotify_close_wd(struct inotifyfd *, int);
static uint32_t	inotify_mask_to_kevent_fflags(uint32_t, enum vtype);
static void	do_kevent_to_inotify(int32_t, uint32_t, uint32_t,
    struct inotify_entry *, size_t *, char *);
static int	kevent_to_inotify(struct inotifyfd *, int, enum vtype, uint32_t,
    uint32_t, struct inotify_entry *, size_t *);
static int	inotify_readdir(file_t *, struct dirent *, int *, bool);
static struct inotify_dir_entries *get_inotify_dir_entries(int, bool);

/* Callbacks referenced by inotify_filtops and inotify_read_filtops. */
static int	inotify_filt_attach(struct knote *);
static void	inotify_filt_detach(struct knote *);
static int	inotify_filt_event(struct knote *, long);
static void	inotify_read_filt_detach(struct knote *);
static int	inotify_read_filt_event(struct knote *, long);

/* Callbacks referenced by inotify_fileops. */
static int	inotify_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int	inotify_close(file_t *);
static int	inotify_poll(file_t *, int);
static int	inotify_kqfilter(file_t *, struct knote *);
static void	inotify_restart(file_t *);
152 
/* Name and id under which the custom filter is registered with kqueue. */
static const char inotify_filtname[] = "LINUX_INOTIFY";
static int inotify_filtid;

/*
 * "fake" EVFILT_VNODE filter: inotify_filt_attach() hooks the knote
 * onto the vnode behind a watch descriptor and presents itself as
 * EVFILT_VNODE.
 */
static const struct filterops inotify_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = inotify_filt_attach,
	.f_detach = inotify_filt_detach,
	.f_event = inotify_filt_event,
	.f_touch = NULL,
};
164 
/*
 * EVFILT_READ attached to inotifyfd (to support watching via epoll).
 * There is no attach callback here; the table says attachment is done
 * through the file's .fo_kqfilter operation.
 */
static const struct filterops inotify_read_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL, /* attached via .fo_kqfilter */
	.f_detach = inotify_read_filt_detach,
	.f_event = inotify_read_filt_event,
	.f_touch = NULL,
};
173 
/*
 * File operations of an inotify descriptor.  Only read, poll, close,
 * kqfilter and restart have inotify-specific handlers; the remaining
 * slots use the generic fbadop_*, fnullop_* and eopnotsupp stubs.
 */
static const struct fileops inotify_fileops = {
	.fo_name = "inotify",
	.fo_read = inotify_read,
	.fo_write = fbadop_write,
	.fo_ioctl = fbadop_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = inotify_poll,
	.fo_stat = fbadop_stat,
	.fo_close = inotify_close,
	.fo_kqfilter = inotify_kqfilter,
	.fo_restart = inotify_restart,
	.fo_fpathconf = (void *)eopnotsupp,
};
187 
/*
 * inotify -> kevent translations consulted for every watch, whatever
 * the type of the watched vnode (see inotify_mask_to_kevent_fflags()).
 */
static const struct inotify_kevent_mask_pair common_inotify_to_kevent[] = {
	{ .inotify = LINUX_IN_ATTRIB,		.kevent = NOTE_ATTRIB, },
	{ .inotify = LINUX_IN_CLOSE_NOWRITE,	.kevent = NOTE_CLOSE, },
	{ .inotify = LINUX_IN_OPEN,		.kevent = NOTE_OPEN, },
	{ .inotify = LINUX_IN_MOVE_SELF,	.kevent = NOTE_RENAME, },
};
static const size_t common_inotify_to_kevent_len =
    __arraycount(common_inotify_to_kevent);

/* Additional inotify -> kevent translations used for VREG watches. */
static const struct inotify_kevent_mask_pair vreg_inotify_to_kevent[] = {
	{ .inotify = LINUX_IN_ACCESS,		.kevent = NOTE_READ, },
	{ .inotify = LINUX_IN_ATTRIB,		.kevent = NOTE_ATTRIB|NOTE_LINK, },
	{ .inotify = LINUX_IN_CLOSE_WRITE,	.kevent = NOTE_CLOSE_WRITE, },
	{ .inotify = LINUX_IN_MODIFY,		.kevent = NOTE_WRITE, },
};
static const size_t vreg_inotify_to_kevent_len =
    __arraycount(vreg_inotify_to_kevent);

/*
 * Additional inotify -> kevent translations used for VDIR watches.
 * Create, delete and both move events all map to NOTE_WRITE; they are
 * told apart later by handle_write().
 */
static const struct inotify_kevent_mask_pair vdir_inotify_to_kevent[] = {
	{ .inotify = LINUX_IN_ACCESS,		.kevent = NOTE_READ, },
	{ .inotify = LINUX_IN_CREATE,		.kevent = NOTE_WRITE, },
	{ .inotify = LINUX_IN_DELETE,		.kevent = NOTE_WRITE, },
	{ .inotify = LINUX_IN_MOVED_FROM,	.kevent = NOTE_WRITE, },
	{ .inotify = LINUX_IN_MOVED_TO,		.kevent = NOTE_WRITE, },
};
static const size_t vdir_inotify_to_kevent_len =
    __arraycount(vdir_inotify_to_kevent);
216 
/*
 * kevent -> inotify translations applied to every delivered event,
 * whatever the type of the watched vnode (see kevent_to_inotify()).
 */
static const struct inotify_kevent_mask_pair common_kevent_to_inotify[] = {
	{ .kevent = NOTE_ATTRIB,	.inotify = LINUX_IN_ATTRIB, },
	{ .kevent = NOTE_CLOSE,		.inotify = LINUX_IN_CLOSE_NOWRITE, },
	{ .kevent = NOTE_CLOSE_WRITE,	.inotify = LINUX_IN_CLOSE_WRITE, },
	{ .kevent = NOTE_OPEN,		.inotify = LINUX_IN_OPEN, },
	{ .kevent = NOTE_READ,		.inotify = LINUX_IN_ACCESS, },
	{ .kevent = NOTE_RENAME,	.inotify = LINUX_IN_MOVE_SELF, },
	{ .kevent = NOTE_REVOKE,	.inotify = LINUX_IN_UNMOUNT, },
};
static const size_t common_kevent_to_inotify_len =
    __arraycount(common_kevent_to_inotify);

/* Additional kevent -> inotify translations applied to VREG watches. */
static const struct inotify_kevent_mask_pair vreg_kevent_to_inotify[] = {
	{ .kevent = NOTE_DELETE|NOTE_LINK, .inotify = LINUX_IN_ATTRIB, },
	{ .kevent = NOTE_WRITE,		.inotify = LINUX_IN_MODIFY, },
};
static const size_t vreg_kevent_to_inotify_len =
    __arraycount(vreg_kevent_to_inotify);
235 
236 /*
237  * Register the custom kfilter for inotify.
238  */
239 int
240 linux_inotify_init(void)
241 {
242 	return kfilter_register(inotify_filtname, &inotify_filtops,
243 	    &inotify_filtid);
244 }
245 
246 /*
247  * Unregister the custom kfilter for inotify.
248  */
249 int
250 linux_inotify_fini(void)
251 {
252 	return kfilter_unregister(inotify_filtname);
253 }
254 
255 /*
256  * Copyin callback used by kevent.  This copies already converted
257  * filters from kernel memory to the kevent internal kernel memory.
258  * Hence the memcpy instead of copyin.
259  */
260 static int
261 inotify_kev_fetch_changes(void *ctx, const struct kevent *changelist,
262     struct kevent *changes, size_t index, int n)
263 {
264 	memcpy(changes, changelist + index, n * sizeof(*changes));
265 
266 	return 0;
267 }
268 
269 /*
270  * Initialize a new inotify fd.
271  */
272 static int
273 do_inotify_init(struct lwp *l, register_t *retval, int flags)
274 {
275 	file_t *fp;
276 	int error, fd;
277 	struct proc *p = l->l_proc;
278 	struct inotifyfd *ifd;
279 	struct sys_kqueue1_args kqa;
280 
281 	if (flags & ~(LINUX_IN_ALL_FLAGS))
282 		return EINVAL;
283 
284 	ifd = kmem_zalloc(sizeof(*ifd), KM_SLEEP);
285 	mutex_init(&ifd->ifd_lock, MUTEX_DEFAULT, IPL_NONE);
286 	mutex_init(&ifd->ifd_qlock, MUTEX_DEFAULT, IPL_NONE);
287 	cv_init(&ifd->ifd_qcv, "inotify");
288 	selinit(&ifd->ifd_sel);
289 	TAILQ_INIT(&ifd->ifd_qhead);
290 
291 	ifd->ifd_nwds = 1;
292 	ifd->ifd_wds = kmem_zalloc(ifd->ifd_nwds * sizeof(*ifd->ifd_wds),
293 	KM_SLEEP);
294 
295 	SCARG(&kqa, flags) = 0;
296 	if (flags & LINUX_IN_NONBLOCK)
297 		SCARG(&kqa, flags) |= O_NONBLOCK;
298 	error = sys_kqueue1(l, &kqa, retval);
299 	if (error != 0)
300 		goto leave0;
301 	ifd->ifd_kqfd = *retval;
302 
303 	error = fd_allocfile(&fp, &fd);
304 	if (error != 0)
305 		goto leave1;
306 
307 	fp->f_flag = FREAD;
308 	if (flags & LINUX_IN_NONBLOCK)
309 		fp->f_flag |= FNONBLOCK;
310 	fp->f_type = DTYPE_MISC;
311 	fp->f_ops = &inotify_fileops;
312 	fp->f_data = ifd;
313 	fd_set_exclose(l, fd, (flags & LINUX_IN_CLOEXEC) != 0);
314 	fd_affix(p, fp, fd);
315 
316 	*retval = fd;
317 	return 0;
318 
319 leave1:
320 	KASSERT(fd_getfile(ifd->ifd_kqfd) != NULL);
321 	fd_close(ifd->ifd_kqfd);
322 leave0:
323 	kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
324 	kmem_free(ifd, sizeof(*ifd));
325 
326 	mutex_destroy(&ifd->ifd_lock);
327 	mutex_destroy(&ifd->ifd_qlock);
328 	cv_destroy(&ifd->ifd_qcv);
329 	seldestroy(&ifd->ifd_sel);
330 
331 	return error;
332 }
333 
334 #ifndef __aarch64__
335 /*
336  * inotify_init(2).  Initialize a new inotify fd with flags=0.
337  */
338 int
339 linux_sys_inotify_init(struct lwp *l, const void *v, register_t *retval)
340 {
341 	return do_inotify_init(l, retval, 0);
342 }
343 #endif
344 
345 /*
346  * inotify_init(2).  Initialize a new inotify fd with the given flags.
347  */
348 int
349 linux_sys_inotify_init1(struct lwp *l,
350     const struct linux_sys_inotify_init1_args *uap, register_t *retval)
351 {
352 	/* {
353 		syscallarg(int) flags;
354 	} */
355 
356 	return do_inotify_init(l, retval, SCARG(uap, flags));
357 }
358 
359 /*
360  * Convert inotify mask to the fflags of an equivalent kevent.
361  */
362 static uint32_t
363 inotify_mask_to_kevent_fflags(uint32_t mask, enum vtype type)
364 {
365 	const struct inotify_kevent_mask_pair *type_inotify_to_kevent;
366 	uint32_t fflags;
367 	size_t i, type_inotify_to_kevent_len;
368 
369 	switch (type) {
370 	case VREG:
371 	case VDIR:
372 	case VLNK:
373 		break;
374 
375 	default:
376 		return 0;
377 	}
378 
379 	/* flags that all watches could have */
380 	fflags = NOTE_DELETE|NOTE_REVOKE;
381 	for (i = 0; i < common_inotify_to_kevent_len; i++)
382 		if (mask & common_inotify_to_kevent[i].inotify)
383 			fflags |= common_inotify_to_kevent[i].kevent;
384 
385 	/* flags that depend on type */
386 	switch (type) {
387 	case VREG:
388 		type_inotify_to_kevent = vreg_inotify_to_kevent;
389 		type_inotify_to_kevent_len = vreg_inotify_to_kevent_len;
390 		break;
391 
392 	case VDIR:
393 		type_inotify_to_kevent = vdir_inotify_to_kevent;
394 		type_inotify_to_kevent_len = vdir_inotify_to_kevent_len;
395 		break;
396 
397 	default:
398 		type_inotify_to_kevent_len = 0;
399 		break;
400 	}
401 	for (i = 0; i < type_inotify_to_kevent_len; i++)
402 		if (mask & type_inotify_to_kevent[i].inotify)
403 			fflags |= type_inotify_to_kevent[i].kevent;
404 
405 	return fflags;
406 }
407 
408 /*
409  * inotify_add_watch(2).  Open a fd for pathname (if desired by mask)
410  * track it and add an equivalent kqueue event for it in
411  * ifd->ifd_kqfd.
412  */
413 int
414 linux_sys_inotify_add_watch(struct lwp *l,
415     const struct linux_sys_inotify_add_watch_args *uap, register_t *retval)
416 {
417 	/* {
418 		syscallarg(int) fd;
419 		syscallarg(const char *) pathname;
420 		syscallarg(uint32_t) mask;
421 	} */
422 	int wd, i, error = 0;
423 	file_t *fp, *wp, *cur_fp;
424 	struct inotifyfd *ifd;
425 	struct inotify_dir_entries **new_wds;
426 	struct knote *kn, *tmpkn;
427 	struct sys_open_args oa;
428 	struct kevent kev;
429 	struct vnode *wvp;
430 	namei_simple_flags_t sflags;
431 	struct kevent_ops k_ops = {
432 		.keo_private = NULL,
433 		.keo_fetch_timeout = NULL,
434 		.keo_fetch_changes = inotify_kev_fetch_changes,
435 		.keo_put_events = NULL,
436 	};
437 	const int fd = SCARG(uap, fd);
438 	const uint32_t mask = SCARG(uap, mask);
439 
440 	if (mask & ~LINUX_IN_ADD_KNOWN)
441 		return EINVAL;
442 
443 	fp = fd_getfile(fd);
444 	if (fp == NULL)
445 		return EBADF;
446 
447 	if (fp->f_ops != &inotify_fileops) {
448 		/* not an inotify fd */
449 		error = EBADF;
450 		goto leave0;
451 	}
452 
453 	ifd = fp->f_data;
454 
455 	mutex_enter(&ifd->ifd_lock);
456 
457 	if (mask & LINUX_IN_DONT_FOLLOW)
458 		sflags = NSM_NOFOLLOW_TRYEMULROOT;
459 	else
460 		sflags = NSM_FOLLOW_TRYEMULROOT;
461 	error = namei_simple_user(SCARG(uap, pathname), sflags, &wvp);
462 	if (error != 0)
463 		goto leave1;
464 
465 	/* Check to see if we already have a descriptor to wd's file. */
466         wd = -1;
467 	for (i = 0; i < ifd->ifd_nwds; i++) {
468 		if (ifd->ifd_wds[i] != NULL) {
469 			cur_fp = fd_getfile(i);
470 			if (cur_fp == NULL) {
471 				DPRINTF(("%s: wd=%d was closed externally\n",
472 				    __func__, i));
473 				error = EBADF;
474 				goto leave1;
475 			}
476 			if (cur_fp->f_type != DTYPE_VNODE) {
477 				DPRINTF(("%s: wd=%d was replaced "
478 				    "with a non-vnode\n", __func__, i));
479 				error = EBADF;
480 			}
481 			if (error == 0 && cur_fp->f_vnode == wvp)
482 				wd = i;
483 			fd_putfile(i);
484 			if (error != 0)
485 				goto leave1;
486 
487 			if (wd != -1)
488 				break;
489 		}
490 	}
491 
492 	if (wd == -1) {
493 		/*
494 		 * If we do not have a descriptor to wd's file, we
495 		 * need to open the watch descriptor.
496 		 */
497 		SCARG(&oa, path) = SCARG(uap, pathname);
498 		SCARG(&oa, mode) = 0;
499 		SCARG(&oa, flags) = O_RDONLY;
500 		if (mask & LINUX_IN_DONT_FOLLOW)
501 			SCARG(&oa, flags) |= O_NOFOLLOW;
502 		if (mask & LINUX_IN_ONLYDIR)
503 			SCARG(&oa, flags) |= O_DIRECTORY;
504 
505 		error = sys_open(l, &oa, retval);
506 		if (error != 0)
507 			goto leave1;
508 		wd = *retval;
509 		wp = fd_getfile(wd);
510 	        KASSERT(wp != NULL);
511 		KASSERT(wp->f_type == DTYPE_VNODE);
512 
513 		/* translate the flags */
514 		memset(&kev, 0, sizeof(kev));
515 		EV_SET(&kev, wd, inotify_filtid, EV_ADD|EV_ENABLE,
516 		    NOTE_DELETE|NOTE_REVOKE, 0, ifd);
517 		if (mask & LINUX_IN_ONESHOT)
518 			kev.flags |= EV_ONESHOT;
519 		kev.fflags |= inotify_mask_to_kevent_fflags(mask,
520 		    wp->f_vnode->v_type);
521 
522 		error = kevent1(retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL,
523 		    &k_ops);
524 		if (error != 0) {
525 			KASSERT(fd_getfile(wd) != NULL);
526 			fd_close(wd);
527 		} else {
528 			/* Success! */
529 			*retval = wd;
530 
531 			/* Resize ifd_nwds to accomodate wd. */
532 			if (wd+1 > ifd->ifd_nwds) {
533 				new_wds = kmem_zalloc(
534 				    (wd+1) * sizeof(*ifd->ifd_wds), KM_SLEEP);
535 				memcpy(new_wds, ifd->ifd_wds,
536 				    ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
537 
538 				kmem_free(ifd->ifd_wds,
539 				    ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
540 
541 				ifd->ifd_wds = new_wds;
542 				ifd->ifd_nwds = wd+1;
543 			}
544 
545 			ifd->ifd_wds[wd] = get_inotify_dir_entries(wd, true);
546 		}
547 	} else {
548 		/*
549 		 * If we do have a descriptor to wd's file, try to edit
550 		 * the relevant knote.
551 		 */
552 		if (mask & LINUX_IN_MASK_CREATE) {
553 			error = EEXIST;
554 			goto leave1;
555 		}
556 
557 		wp = fd_getfile(wd);
558 		if (wp == NULL) {
559 			DPRINTF(("%s: wd=%d was closed externally "
560 			    "(race, probably)\n", __func__, wd));
561 			error = EBADF;
562 			goto leave1;
563 		}
564 		if (wp->f_type != DTYPE_VNODE) {
565 			DPRINTF(("%s: wd=%d was replace with a non-vnode "
566 			    "(race, probably)\n", __func__, wd));
567 			error = EBADF;
568 			goto leave2;
569 		}
570 
571 		kev.fflags = NOTE_DELETE | NOTE_REVOKE
572 		    | inotify_mask_to_kevent_fflags(mask, wp->f_vnode->v_type);
573 
574 		mutex_enter(wp->f_vnode->v_interlock);
575 
576 		/*
577 		 * XXX We are forced to find the appropriate knote
578 		 * manually because we cannot create a custom f_touch
579 		 * function for inotify_filtops.  See filter_touch()
580 		 * in kern_event.c for details.
581 		 */
582 	        SLIST_FOREACH_SAFE(kn, &wp->f_vnode->v_klist->vk_klist,
583 		    kn_selnext, tmpkn) {
584 			if (kn->kn_fop == &inotify_filtops
585 			    && ifd == kn->kn_kevent.udata) {
586 				mutex_enter(&kn->kn_kq->kq_lock);
587 				if (mask & LINUX_IN_MASK_ADD)
588 					kn->kn_sfflags |= kev.fflags;
589 				else
590 					kn->kn_sfflags = kev.fflags;
591 				wp->f_vnode->v_klist->vk_interest |=
592 				    kn->kn_sfflags;
593 				mutex_exit(&kn->kn_kq->kq_lock);
594 			}
595 		}
596 
597 		mutex_exit(wp->f_vnode->v_interlock);
598 
599 		/* Success! */
600 		*retval = wd;
601 	}
602 
603 leave2:
604 	fd_putfile(wd);
605 leave1:
606 	mutex_exit(&ifd->ifd_lock);
607 leave0:
608 	fd_putfile(fd);
609 	return error;
610 }
611 
612 /*
613  * Remove a wd from ifd and close wd.
614  */
615 static int
616 inotify_close_wd(struct inotifyfd *ifd, int wd)
617 {
618 	file_t *wp;
619 	int error;
620 	register_t retval;
621 	struct kevent kev;
622 	struct kevent_ops k_ops = {
623 		.keo_private = NULL,
624 		.keo_fetch_timeout = NULL,
625 		.keo_fetch_changes = inotify_kev_fetch_changes,
626 		.keo_put_events = NULL,
627 	};
628 
629 	mutex_enter(&ifd->ifd_lock);
630 
631 	KASSERT(0 <= wd && wd < ifd->ifd_nwds && ifd->ifd_wds[wd] != NULL);
632 
633 	kmem_free(ifd->ifd_wds[wd],
634 	    INOTIFY_DIR_ENTRIES_SIZE(ifd->ifd_wds[wd]->ide_count));
635 	ifd->ifd_wds[wd] = NULL;
636 
637 	mutex_exit(&ifd->ifd_lock);
638 
639 	wp = fd_getfile(wd);
640 	if (wp == NULL) {
641 		DPRINTF(("%s: wd=%d is already closed\n", __func__, wd));
642 		return 0;
643 	}
644 	KASSERT(!mutex_owned(wp->f_vnode->v_interlock));
645 
646 	memset(&kev, 0, sizeof(kev));
647 	EV_SET(&kev, wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
648 	error = kevent1(&retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL, &k_ops);
649 	if (error != 0)
650 		DPRINTF(("%s: attempt to disable all events for wd=%d "
651 		    "had error=%d\n", __func__, wd, error));
652 
653 	return fd_close(wd);
654 }
655 
656 /*
657  * inotify_rm_watch(2).  Close wd and remove it from ifd->ifd_wds.
658  */
659 int
660 linux_sys_inotify_rm_watch(struct lwp *l,
661     const struct linux_sys_inotify_rm_watch_args *uap, register_t *retval)
662 {
663 	/* {
664 		syscallarg(int) fd;
665 		syscallarg(int) wd;
666 	} */
667 	struct inotifyfd *ifd;
668 	file_t *fp;
669 	int error = 0;
670 	const int fd = SCARG(uap, fd);
671 	const int wd = SCARG(uap, wd);
672 
673 	fp = fd_getfile(fd);
674 	if (fp == NULL)
675 		return EBADF;
676 	if (fp->f_ops != &inotify_fileops) {
677 		/* not an inotify fd */
678 		error = EINVAL;
679 		goto leave;
680 	}
681 
682 	ifd = fp->f_data;
683 	if (wd < 0 || wd >= ifd->ifd_nwds || ifd->ifd_wds[wd] == NULL) {
684 		error = EINVAL;
685 		goto leave;
686 	}
687 
688 	error = inotify_close_wd(ifd, wd);
689 
690 leave:
691 	fd_putfile(fd);
692 	return error;
693 }
694 
695 /*
696  * Attach the inotify filter.
697  */
698 static int
699 inotify_filt_attach(struct knote *kn)
700 {
701 	file_t *fp = kn->kn_obj;
702 	struct vnode *vp;
703 
704 	KASSERT(fp->f_type == DTYPE_VNODE);
705 	vp = fp->f_vnode;
706 
707 	/*
708 	 * Needs to be set so that we get the same event handling as
709 	 * EVFILT_VNODE.  Otherwise we don't get any events.
710 	 *
711 	 * A consequence of this is that modifications/removals of
712 	 * this knote need to specify EVFILT_VNODE rather than
713 	 * inotify_filtid.
714 	 */
715 	kn->kn_filter = EVFILT_VNODE;
716 
717 	kn->kn_fop = &inotify_filtops;
718 	kn->kn_hook = vp;
719 	vn_knote_attach(vp, kn);
720 
721 	return 0;
722 }
723 
724 /*
725  * Detach the inotify filter.
726  */
727 static void
728 inotify_filt_detach(struct knote *kn)
729 {
730 	struct vnode *vp = (struct vnode *)kn->kn_hook;
731 
732 	vn_knote_detach(vp, kn);
733 }
734 
735 /*
736  * Create a single inotify event.
737  */
738 static void
739 do_kevent_to_inotify(int32_t wd, uint32_t mask, uint32_t cookie,
740     struct inotify_entry *buf, size_t *nbuf, char *name)
741 {
742 	KASSERT(*nbuf < LINUX_INOTIFY_MAX_FROM_KEVENT);
743 
744 	buf += *nbuf;
745 
746 	memset(buf, 0, sizeof(*buf));
747 
748 	buf->ie_event.wd = wd;
749 	buf->ie_event.mask = mask;
750 	buf->ie_event.cookie = cookie;
751 
752 	if (name != NULL) {
753 		buf->ie_event.len = strlen(name) + 1;
754 		KASSERT(buf->ie_event.len < sizeof(buf->ie_name));
755 		strcpy(buf->ie_name, name);
756 	}
757 
758 	++(*nbuf);
759 }
760 
761 /*
762  * Like vn_readdir(), but with vnode locking only if needs_lock is
763  * true (to avoid double locking in some situations).
764  */
765 static int
766 inotify_readdir(file_t *fp, struct dirent *dep, int *done, bool needs_lock)
767 {
768 	struct vnode *vp;
769 	struct iovec iov;
770 	struct uio uio;
771 	int error, eofflag;
772 
773 	KASSERT(fp->f_type == DTYPE_VNODE);
774 	vp = fp->f_vnode;
775 	KASSERT(vp->v_type == VDIR);
776 
777 	iov.iov_base = dep;
778 	iov.iov_len = sizeof(*dep);
779 
780 	uio.uio_iov = &iov;
781 	uio.uio_iovcnt = 1;
782 	uio.uio_rw = UIO_READ;
783 	uio.uio_resid = sizeof(*dep);
784 	UIO_SETUP_SYSSPACE(&uio);
785 
786 	mutex_enter(&fp->f_lock);
787 	uio.uio_offset = fp->f_offset;
788 	mutex_exit(&fp->f_lock);
789 
790 	/* XXX: should pass whether to lock or not */
791 	if (needs_lock)
792 		vn_lock(vp, LK_SHARED | LK_RETRY);
793 	error = VOP_READDIR(vp, &uio, fp->f_cred, &eofflag, NULL, NULL);
794 	if (needs_lock)
795 		VOP_UNLOCK(vp);
796 
797 	mutex_enter(&fp->f_lock);
798 	fp->f_offset = uio.uio_offset;
799 	mutex_exit(&fp->f_lock);
800 
801 	*done = sizeof(*dep) - uio.uio_resid;
802 	return error;
803 }
804 
805 /*
806  * Create (and allocate) an appropriate inotify_dir_entries struct for wd to be
807  * used on ifd_wds of inotifyfd.  If the entries on a directory fail to be read,
808  * NULL is returned.  needs_lock indicates if the vnode's lock is not already
809  * owned.
810  */
811 static struct inotify_dir_entries *
812 get_inotify_dir_entries(int wd, bool needs_lock)
813 {
814 	struct dirent de;
815 	struct dirent *currdep;
816 	struct inotify_dir_entries *idep = NULL;
817 	file_t *wp;
818 	int done, error;
819 	size_t i, decount;
820 
821 	wp = fd_getfile(wd);
822 	if (wp == NULL)
823 		return NULL;
824 	if (wp->f_type != DTYPE_VNODE)
825 		goto leave;
826 
827 	/* for non-directories, we have 0 entries. */
828 	if (wp->f_vnode->v_type != VDIR) {
829 		idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(0), KM_SLEEP);
830 		goto leave;
831 	}
832 
833 	mutex_enter(&wp->f_lock);
834 	wp->f_offset = 0;
835 	mutex_exit(&wp->f_lock);
836 	decount = 0;
837 	for (;;) {
838 		error = inotify_readdir(wp, &de, &done, needs_lock);
839 		if (error != 0)
840 			goto leave;
841 		if (done == 0)
842 			break;
843 
844 		currdep = &de;
845 	        while ((char *)currdep < ((char *)&de) + done) {
846 			decount++;
847 			currdep = _DIRENT_NEXT(currdep);
848 		}
849 	}
850 
851 	idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(decount), KM_SLEEP);
852 	idep->ide_count = decount;
853 
854 	mutex_enter(&wp->f_lock);
855 	wp->f_offset = 0;
856 	mutex_exit(&wp->f_lock);
857 	for (i = 0; i < decount;) {
858 		error = inotify_readdir(wp, &de, &done, needs_lock);
859 		if (error != 0 || done == 0) {
860 			kmem_free(idep, INOTIFY_DIR_ENTRIES_SIZE(decount));
861 			idep = NULL;
862 			goto leave;
863 		}
864 
865 		currdep = &de;
866 		while ((char *)currdep < ((char *)&de) + done) {
867 			idep->ide_entries[i].fileno = currdep->d_fileno;
868 			strcpy(idep->ide_entries[i].name, currdep->d_name);
869 
870 			currdep = _DIRENT_NEXT(currdep);
871 			i++;
872 		}
873 	}
874 
875 leave:
876 	fd_putfile(wd);
877 	return idep;
878 }
879 
880 static size_t
881 find_entry(struct inotify_dir_entries *i1, struct inotify_dir_entries *i2)
882 {
883 	for (size_t i = 0; i < i2->ide_count; i++)
884 		if (i2->ide_entries[i].fileno != i1->ide_entries[i].fileno)
885 			return i;
886 	KASSERTMSG(0, "Entry not found");
887 	return -1;
888 }
889 
890 static void
891 handle_write(struct inotifyfd *ifd, int wd, struct inotify_entry *buf,
892     size_t *nbuf)
893 {
894 	struct inotify_dir_entries *old_idep, *new_idep;
895 	size_t i;
896 
897 	mutex_enter(&ifd->ifd_lock);
898 
899 	old_idep = ifd->ifd_wds[wd];
900 	KASSERT(old_idep != NULL);
901 	new_idep = get_inotify_dir_entries(wd, false);
902 	if (new_idep == NULL) {
903 		DPRINTF(("%s: directory for wd=%d could not be read\n",
904 		    __func__, wd));
905 		mutex_exit(&ifd->ifd_lock);
906 		return;
907 	}
908 
909 
910 	if (old_idep->ide_count < new_idep->ide_count) {
911 		KASSERT(old_idep->ide_count + 1 == new_idep->ide_count);
912 
913 		/* Find the new entry. */
914 		i = find_entry(new_idep, old_idep);
915 		do_kevent_to_inotify(wd, LINUX_IN_CREATE, 0,
916 		    buf, nbuf, new_idep->ide_entries[i].name);
917 		goto out;
918 	}
919 
920 	if (old_idep->ide_count > new_idep->ide_count) {
921 		KASSERT(old_idep->ide_count == new_idep->ide_count + 1);
922 
923 		/* Find the deleted entry. */
924 		i = find_entry(old_idep, new_idep);
925 
926 		do_kevent_to_inotify(wd, LINUX_IN_DELETE, 0,
927 		    buf, nbuf, old_idep->ide_entries[i].name);
928 		goto out;
929 	}
930 
931 	/*
932 	 * XXX Because we are not watching the entire
933 	 * file system, the only time we know for sure
934 	 * that the event is a LINUX_IN_MOVED_FROM/
935 	 * LINUX_IN_MOVED_TO is when the move happens
936 	 * within a single directory...  ie. the number
937 	 * of directory entries has not changed.
938 	 *
939 	 * Otherwise all we can say for sure is that
940 	 * something was created/deleted.  So we issue a
941 	 * LINUX_IN_CREATE/LINUX_IN_DELETE.
942 	 */
943 	ino_t changed = new_idep->ide_entries[new_idep->ide_count - 1].fileno;
944 
945 	/* Find the deleted entry. */
946 	for (i = 0; i < old_idep->ide_count; i++)
947 		if (old_idep->ide_entries[i].fileno == changed)
948 			break;
949 	KASSERT(i != old_idep->ide_count);
950 
951 	do_kevent_to_inotify(wd, LINUX_IN_MOVED_FROM, changed, buf, nbuf,
952 	    old_idep->ide_entries[i].name);
953 
954 	do_kevent_to_inotify(wd, LINUX_IN_MOVED_TO, changed, buf, nbuf,
955 	    new_idep->ide_entries[new_idep->ide_count - 1].name);
956 
957 out:
958 	ifd->ifd_wds[wd] = new_idep;
959 	mutex_exit(&ifd->ifd_lock);
960 }
961 
962 /*
963  * Convert a kevent flags and fflags for EVFILT_VNODE to some number
964  * of inotify events.
965  */
966 static int
967 kevent_to_inotify(struct inotifyfd *ifd, int wd, enum vtype wtype,
968     uint32_t flags, uint32_t fflags, struct inotify_entry *buf,
969     size_t *nbuf)
970 {
971 	struct stat st;
972 	file_t *wp;
973 	size_t i;
974 	int error = 0;
975 
976 	for (i = 0; i < common_kevent_to_inotify_len; i++)
977 		if (fflags & common_kevent_to_inotify[i].kevent)
978 			do_kevent_to_inotify(wd,
979 			    common_kevent_to_inotify[i].inotify, 0, buf, nbuf,
980 			    NULL);
981 
982 	if (wtype == VREG) {
983 		for (i = 0; i < vreg_kevent_to_inotify_len; i++)
984 			if (fflags & vreg_kevent_to_inotify[i].kevent)
985 				do_kevent_to_inotify(wd,
986 				    vreg_kevent_to_inotify[i].inotify, 0,
987 				    buf, nbuf, NULL);
988 	} else if (wtype == VDIR) {
989 		for (i = 0; i < *nbuf; i++)
990 			if (buf[i].ie_event.mask &
991 			    (LINUX_IN_ACCESS|LINUX_IN_ATTRIB
992 		            |LINUX_IN_CLOSE|LINUX_IN_OPEN))
993 				buf[i].ie_event.mask |= LINUX_IN_ISDIR;
994 
995 		/* Need to disambiguate the possible NOTE_WRITEs. */
996 		if (fflags & NOTE_WRITE)
997 			handle_write(ifd, wd, buf, nbuf);
998 	}
999 
1000 	/*
1001 	 * Need to check if wd is actually has a link count of 0 to issue a
1002 	 * LINUX_IN_DELETE_SELF.
1003 	 */
1004 	if (fflags & NOTE_DELETE) {
1005 		wp = fd_getfile(wd);
1006 		KASSERT(wp != NULL);
1007 		KASSERT(wp->f_type == DTYPE_VNODE);
1008 		vn_stat(wp->f_vnode, &st);
1009 		fd_putfile(wd);
1010 
1011 		if (st.st_nlink == 0)
1012 			do_kevent_to_inotify(wd, LINUX_IN_DELETE_SELF, 0,
1013 			    buf, nbuf, NULL);
1014 	}
1015 
1016 	/* LINUX_IN_IGNORED must be the last event issued for wd. */
1017 	if ((flags & EV_ONESHOT) || (fflags & (NOTE_REVOKE|NOTE_DELETE))) {
1018 		do_kevent_to_inotify(wd, LINUX_IN_IGNORED, 0, buf, nbuf, NULL);
1019 		/*
1020 		 * XXX in theory we could call inotify_close_wd(ifd, wd) but if
1021 		 * we get here we must already be holding v_interlock for
1022 		 * wd... so we can't.
1023 		 *
1024 		 * For simplicity we do nothing, and so wd will only be closed
1025 		 * when the inotify fd is closed.
1026 		 */
1027 	}
1028 
1029 	return error;
1030 }
1031 
1032 /*
1033  * Handle an event.  Unlike EVFILT_VNODE, we translate the event to a
1034  * linux_inotify_event and put it in our own custom queue.
1035  */
1036 static int
1037 inotify_filt_event(struct knote *kn, long hint)
1038 {
1039         struct vnode *vp = (struct vnode *)kn->kn_hook;
1040 	struct inotifyfd *ifd;
1041 	struct inotify_entry *cur_ie;
1042 	size_t nbuf, i;
1043 	uint32_t status;
1044 	struct inotify_entry buf[LINUX_INOTIFY_MAX_FROM_KEVENT];
1045 
1046 	/*
1047 	 * If KN_WILLDETACH is set then
1048 	 * 1. kn->kn_kevent.udata has already been trashed with a
1049 	 *    struct lwp *, so we don't have access to a real ifd
1050 	 *    anymore, and
1051 	 * 2. we're about to detach anyways, so we don't really care
1052 	 *    about the events.
1053 	 * (Also because of this we need to get ifd under the same
1054 	 * lock as kn->kn_status.)
1055 	 */
1056 	mutex_enter(&kn->kn_kq->kq_lock);
1057 	status = kn->kn_status;
1058 	ifd = kn->kn_kevent.udata;
1059 	mutex_exit(&kn->kn_kq->kq_lock);
1060 	if (status & KN_WILLDETACH)
1061 		return 0;
1062 
1063 	/*
1064 	 * If we don't care about the NOTEs in hint, we don't generate
1065 	 * any events.
1066 	 */
1067 	hint &= kn->kn_sfflags;
1068 	if (hint == 0)
1069 		return 0;
1070 
1071 	KASSERT(mutex_owned(vp->v_interlock));
1072 	KASSERT(!mutex_owned(&ifd->ifd_lock));
1073 
1074 	mutex_enter(&ifd->ifd_qlock);
1075 
1076 	/*
1077 	 * early out: there's no point even traslating the event if we
1078 	 * have nowhere to put it (and an LINUX_IN_Q_OVERFLOW has
1079 	 * already been added).
1080 	 */
1081 	if (ifd->ifd_qcount >= LINUX_INOTIFY_MAX_QUEUED)
1082 		goto leave;
1083 
1084 	nbuf = 0;
1085 	(void)kevent_to_inotify(ifd, kn->kn_id, vp->v_type, kn->kn_flags,
1086 	    hint, buf, &nbuf);
1087 	for (i = 0; i < nbuf && ifd->ifd_qcount < LINUX_INOTIFY_MAX_QUEUED-1;
1088 	     i++) {
1089 		cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
1090 		memcpy(cur_ie, &buf[i], sizeof(*cur_ie));
1091 
1092 		TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
1093 		ifd->ifd_qcount++;
1094 	}
1095 	/* handle early overflow, by adding an overflow event to the end */
1096 	if (i != nbuf) {
1097 		nbuf = 0;
1098 		cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
1099 		do_kevent_to_inotify(-1, LINUX_IN_Q_OVERFLOW, 0,
1100 		    cur_ie, &nbuf, NULL);
1101 
1102 		TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
1103 		ifd->ifd_qcount++;
1104 	}
1105 
1106 	if (nbuf > 0) {
1107 		cv_signal(&ifd->ifd_qcv);
1108 
1109 		mutex_enter(&ifd->ifd_lock);
1110 		selnotify(&ifd->ifd_sel, 0, 0);
1111 		mutex_exit(&ifd->ifd_lock);
1112 	} else
1113 		DPRINTF(("%s: hint=%lx resulted in 0 inotify events\n",
1114 		    __func__, hint));
1115 
1116 leave:
1117 	mutex_exit(&ifd->ifd_qlock);
1118 	return 0;
1119 }
1120 
1121 /*
1122  * Read inotify events from the queue.
1123  */
1124 static int
1125 inotify_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
1126     int flags)
1127 {
1128 	struct inotify_entry *cur_iep;
1129 	size_t cur_size, nread;
1130 	int error = 0;
1131 	struct inotifyfd *ifd = fp->f_data;
1132 
1133 	mutex_enter(&ifd->ifd_qlock);
1134 
1135 	if (ifd->ifd_qcount == 0) {
1136 		if (fp->f_flag & O_NONBLOCK) {
1137 			error = EAGAIN;
1138 			goto leave;
1139 		}
1140 
1141 		while (ifd->ifd_qcount == 0) {
1142 			/* wait until there is an event to read */
1143 			error = cv_wait_sig(&ifd->ifd_qcv, &ifd->ifd_qlock);
1144 			if (error != 0) {
1145 				error = EINTR;
1146 				goto leave;
1147 			}
1148 		}
1149 	}
1150 
1151 	KASSERT(ifd->ifd_qcount > 0);
1152 	KASSERT(mutex_owned(&ifd->ifd_qlock));
1153 
1154 	nread = 0;
1155 	while (ifd->ifd_qcount > 0) {
1156 		cur_iep = TAILQ_FIRST(&ifd->ifd_qhead);
1157 		KASSERT(cur_iep != NULL);
1158 
1159 		cur_size = sizeof(cur_iep->ie_event) + cur_iep->ie_event.len;
1160 		if (cur_size > uio->uio_resid) {
1161 			if (nread == 0)
1162 				error = EINVAL;
1163 			break;
1164 		}
1165 
1166 		error = uiomove(&cur_iep->ie_event, sizeof(cur_iep->ie_event),
1167 		    uio);
1168 		if (error != 0)
1169 			break;
1170 		error = uiomove(&cur_iep->ie_name, cur_iep->ie_event.len, uio);
1171 		if (error != 0)
1172 			break;
1173 
1174 		/* cleanup */
1175 		TAILQ_REMOVE(&ifd->ifd_qhead, cur_iep, ie_entries);
1176 		kmem_free(cur_iep, sizeof(*cur_iep));
1177 
1178 		nread++;
1179 		ifd->ifd_qcount--;
1180 	}
1181 
1182 leave:
1183 	/* Wake up the next reader, if the queue is not empty. */
1184 	if (ifd->ifd_qcount > 0)
1185 		cv_signal(&ifd->ifd_qcv);
1186 
1187 	mutex_exit(&ifd->ifd_qlock);
1188 	return error;
1189 }
1190 
1191 /*
1192  * Close all the file descriptors associated with fp.
1193  */
1194 static int
1195 inotify_close(file_t *fp)
1196 {
1197 	int error;
1198 	size_t i;
1199 	file_t *kqfp;
1200 	struct inotifyfd *ifd = fp->f_data;
1201 
1202 	for (i = 0; i < ifd->ifd_nwds; i++) {
1203 		if (ifd->ifd_wds[i] != NULL) {
1204 			error = inotify_close_wd(ifd, i);
1205 			if (error != 0)
1206 				return error;
1207 		}
1208 	}
1209 
1210 	/* the reference we need to hold is ifd->ifd_kqfp */
1211 	kqfp = fd_getfile(ifd->ifd_kqfd);
1212 	if (kqfp == NULL) {
1213 		DPRINTF(("%s: kqfp=%d is already closed\n", __func__,
1214 		    ifd->ifd_kqfd));
1215 	} else {
1216 		error = fd_close(ifd->ifd_kqfd);
1217 		if (error != 0)
1218 			return error;
1219 	}
1220 
1221 	mutex_destroy(&ifd->ifd_lock);
1222 	mutex_destroy(&ifd->ifd_qlock);
1223 	cv_destroy(&ifd->ifd_qcv);
1224 	seldestroy(&ifd->ifd_sel);
1225 
1226 	kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
1227 	kmem_free(ifd, sizeof(*ifd));
1228 	fp->f_data = NULL;
1229 
1230 	return 0;
1231 }
1232 
1233 /*
1234  * Check if there are pending read events.
1235  */
1236 static int
1237 inotify_poll(file_t *fp, int events)
1238 {
1239 	int revents;
1240 	struct inotifyfd *ifd = fp->f_data;
1241 
1242 	revents = 0;
1243 	if (events & (POLLIN|POLLRDNORM)) {
1244 		mutex_enter(&ifd->ifd_qlock);
1245 
1246 		if (ifd->ifd_qcount > 0)
1247 			revents |= events & (POLLIN|POLLRDNORM);
1248 
1249 		mutex_exit(&ifd->ifd_qlock);
1250 	}
1251 
1252 	return revents;
1253 }
1254 
1255 /*
1256  * Attach EVFILT_READ to the inotify instance in fp.
1257  *
1258  * This is so you can watch inotify with epoll.  No other kqueue
1259  * filter needs to be supported.
1260  */
1261 static int
1262 inotify_kqfilter(file_t *fp, struct knote *kn)
1263 {
1264 	struct inotifyfd *ifd = fp->f_data;
1265 
1266 	KASSERT(fp == kn->kn_obj);
1267 
1268 	if (kn->kn_filter != EVFILT_READ)
1269 		return EINVAL;
1270 
1271 	kn->kn_fop = &inotify_read_filtops;
1272 	mutex_enter(&ifd->ifd_lock);
1273 	selrecord_knote(&ifd->ifd_sel, kn);
1274 	mutex_exit(&ifd->ifd_lock);
1275 
1276 	return 0;
1277 }
1278 
1279 /*
1280  * Detach a filter from an inotify instance.
1281  */
1282 static void
1283 inotify_read_filt_detach(struct knote *kn)
1284 {
1285 	struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
1286 
1287 	mutex_enter(&ifd->ifd_lock);
1288 	selremove_knote(&ifd->ifd_sel, kn);
1289 	mutex_exit(&ifd->ifd_lock);
1290 }
1291 
1292 /*
1293  * Handle EVFILT_READ events.  Note that nothing is put in kn_data.
1294  */
1295 static int
1296 inotify_read_filt_event(struct knote *kn, long hint)
1297 {
1298 	int rv;
1299 	struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
1300 
1301 	mutex_enter(&ifd->ifd_qlock);
1302 	rv = (ifd->ifd_qcount > 0);
1303 	mutex_exit(&ifd->ifd_qlock);
1304 
1305 	return rv;
1306 }
1307 
1308 /*
1309  * Restart the inotify instance.
1310  */
1311 static void
1312 inotify_restart(file_t *fp)
1313 {
1314 	struct inotifyfd *ifd = fp->f_data;
1315 
1316 	mutex_enter(&ifd->ifd_qlock);
1317 	cv_broadcast(&ifd->ifd_qcv);
1318 	mutex_exit(&ifd->ifd_qlock);
1319 }
1320