xref: /dflybsd-src/sys/vfs/tmpfs/tmpfs_vnops.c (revision a8718c1450be85caa410b3a4cb3d98706d04f20c)
1 /*-
2  * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The NetBSD Foundation
6  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
7  * 2005 program.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $
31  */
32 
33 /*
34  * tmpfs vnode interface.
35  */
36 
37 #include <sys/kernel.h>
38 #include <sys/kern_syscall.h>
39 #include <sys/param.h>
40 #include <sys/fcntl.h>
41 #include <sys/lockf.h>
42 #include <sys/priv.h>
43 #include <sys/proc.h>
44 #include <sys/resourcevar.h>
45 #include <sys/sched.h>
46 #include <sys/stat.h>
47 #include <sys/systm.h>
48 #include <sys/unistd.h>
49 #include <sys/vfsops.h>
50 #include <sys/vnode.h>
51 #include <sys/mountctl.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_pageout.h>
58 #include <vm/vm_pager.h>
59 #include <vm/swap_pager.h>
60 
61 #include <sys/buf2.h>
62 #include <vm/vm_page2.h>
63 
64 #include <vfs/fifofs/fifo.h>
65 #include <vfs/tmpfs/tmpfs_vnops.h>
66 #include "tmpfs.h"
67 
68 static void tmpfs_strategy_done(struct bio *bio);
69 
70 static __inline
71 void
72 tmpfs_knote(struct vnode *vp, int flags)
73 {
74 	if (flags)
75 		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
76 }
77 
78 
79 /* --------------------------------------------------------------------- */
80 
static int
tmpfs_nresolve(struct vop_nresolve_args *v)
{
	struct vnode *dvp = v->a_dvp;
	struct vnode *vp = NULL;
	struct namecache *ncp = v->a_nch->ncp;
	struct tmpfs_node *tnode;
	struct mount *mp;
	struct tmpfs_dirent *de;
	struct tmpfs_node *dnode;
	int error;

	mp = dvp->v_mount;

	dnode = VP_TO_TMPFS_DIR(dvp);

	/*
	 * Look the name up in the directory while holding the directory
	 * node's shared lock.
	 */
	TMPFS_NODE_LOCK_SH(dnode);
	de = tmpfs_dir_lookup(dnode, NULL, ncp);
	if (de == NULL) {
		error = ENOENT;
	} else {
		/*
		 * Allocate a vnode for the node we found.
		 */
		tnode = de->td_node;
		error = tmpfs_alloc_vp(dvp->v_mount, tnode,
				       LK_EXCLUSIVE | LK_RETRY, &vp);
		if (error == 0)
			KKASSERT(vp);
	}

	/* The lookup itself counts as an access of the directory. */
	dnode->tn_status |= TMPFS_NODE_ACCESSED;
	TMPFS_NODE_UNLOCK(dnode);

	/*
	 * Store the result of this lookup in the cache.  Avoid this if the
	 * request was for creation, as it does not improve timings on
	 * empirical tests.
	 */
	if (vp) {
		vn_unlock(vp);
		cache_setvp(v->a_nch, vp);
		vrele(vp);
	} else if (error == ENOENT) {
		/* Record the negative hit so repeated lookups stay cheap. */
		cache_setvp(v->a_nch, NULL);
	}
	return (error);
}
129 
130 static int
131 tmpfs_nlookupdotdot(struct vop_nlookupdotdot_args *v)
132 {
133 	struct vnode *dvp = v->a_dvp;
134 	struct vnode **vpp = v->a_vpp;
135 	struct tmpfs_node *dnode = VP_TO_TMPFS_NODE(dvp);
136 	struct ucred *cred = v->a_cred;
137 	struct mount *mp;
138 	int error;
139 
140 	*vpp = NULL;
141 
142 	mp = dvp->v_mount;
143 
144 	/* Check accessibility of requested node as a first step. */
145 	error = VOP_ACCESS(dvp, VEXEC, cred);
146 	if (error != 0)
147 		return error;
148 
149 	if (dnode->tn_dir.tn_parent != NULL) {
150 		/* Allocate a new vnode on the matching entry. */
151 		error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent,
152 				       LK_EXCLUSIVE | LK_RETRY, vpp);
153 
154 		if (*vpp)
155 			vn_unlock(*vpp);
156 	}
157 	return (*vpp == NULL) ? ENOENT : 0;
158 }
159 
160 /* --------------------------------------------------------------------- */
161 
162 static int
163 tmpfs_ncreate(struct vop_ncreate_args *v)
164 {
165 	struct vnode *dvp = v->a_dvp;
166 	struct vnode **vpp = v->a_vpp;
167 	struct namecache *ncp = v->a_nch->ncp;
168 	struct vattr *vap = v->a_vap;
169 	struct ucred *cred = v->a_cred;
170 	struct mount *mp;
171 	int error;
172 
173 	mp = dvp->v_mount;
174 
175 	KKASSERT(vap->va_type == VREG || vap->va_type == VSOCK);
176 
177 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
178 	if (error == 0) {
179 		cache_setunresolved(v->a_nch);
180 		cache_setvp(v->a_nch, *vpp);
181 		tmpfs_knote(dvp, NOTE_WRITE);
182 	}
183 	return (error);
184 }
185 /* --------------------------------------------------------------------- */
186 
187 static int
188 tmpfs_nmknod(struct vop_nmknod_args *v)
189 {
190 	struct vnode *dvp = v->a_dvp;
191 	struct vnode **vpp = v->a_vpp;
192 	struct namecache *ncp = v->a_nch->ncp;
193 	struct vattr *vap = v->a_vap;
194 	struct ucred *cred = v->a_cred;
195 	int error;
196 
197 	if (vap->va_type != VBLK && vap->va_type != VCHR &&
198 	    vap->va_type != VFIFO) {
199 		return (EINVAL);
200 	}
201 
202 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
203 	if (error == 0) {
204 		cache_setunresolved(v->a_nch);
205 		cache_setvp(v->a_nch, *vpp);
206 		tmpfs_knote(dvp, NOTE_WRITE);
207 	}
208 	return error;
209 }
210 
211 /* --------------------------------------------------------------------- */
212 
213 static int
214 tmpfs_open(struct vop_open_args *v)
215 {
216 	struct vnode *vp = v->a_vp;
217 	int mode = v->a_mode;
218 	struct tmpfs_node *node;
219 	int error;
220 
221 	node = VP_TO_TMPFS_NODE(vp);
222 
223 #if 0
224 	/* The file is still active but all its names have been removed
225 	 * (e.g. by a "rmdir $(pwd)").  It cannot be opened any more as
226 	 * it is about to die. */
227 	if (node->tn_links < 1)
228 		return (ENOENT);
229 #endif
230 
231 	/* If the file is marked append-only, deny write requests. */
232 	if ((node->tn_flags & APPEND) &&
233 	    (mode & (FWRITE | O_APPEND)) == FWRITE) {
234 		error = EPERM;
235 	} else {
236 		error = (vop_stdopen(v));
237 	}
238 
239 	return (error);
240 }
241 
242 /* --------------------------------------------------------------------- */
243 
244 static int
245 tmpfs_close(struct vop_close_args *v)
246 {
247 	struct vnode *vp = v->a_vp;
248 	struct tmpfs_node *node;
249 	int error;
250 
251 	node = VP_TO_TMPFS_NODE(vp);
252 
253 	if (node->tn_links > 0) {
254 		/*
255 		 * Update node times.  No need to do it if the node has
256 		 * been deleted, because it will vanish after we return.
257 		 */
258 		tmpfs_update(vp);
259 	}
260 
261 	error = vop_stdclose(v);
262 
263 	return (error);
264 }
265 
266 /* --------------------------------------------------------------------- */
267 
268 int
269 tmpfs_access(struct vop_access_args *v)
270 {
271 	struct vnode *vp = v->a_vp;
272 	int error;
273 	struct tmpfs_node *node;
274 
275 	node = VP_TO_TMPFS_NODE(vp);
276 
277 	switch (vp->v_type) {
278 	case VDIR:
279 		/* FALLTHROUGH */
280 	case VLNK:
281 		/* FALLTHROUGH */
282 	case VREG:
283 		if ((v->a_mode & VWRITE) &&
284 	            (vp->v_mount->mnt_flag & MNT_RDONLY)) {
285 			error = EROFS;
286 			goto out;
287 		}
288 		break;
289 
290 	case VBLK:
291 		/* FALLTHROUGH */
292 	case VCHR:
293 		/* FALLTHROUGH */
294 	case VSOCK:
295 		/* FALLTHROUGH */
296 	case VFIFO:
297 		break;
298 
299 	default:
300 		error = EINVAL;
301 		goto out;
302 	}
303 
304 	if ((v->a_mode & VWRITE) && (node->tn_flags & IMMUTABLE)) {
305 		error = EPERM;
306 		goto out;
307 	}
308 
309 	error = vop_helper_access(v, node->tn_uid, node->tn_gid,
310 			          node->tn_mode, 0);
311 out:
312 	return error;
313 }
314 
315 /* --------------------------------------------------------------------- */
316 
317 int
318 tmpfs_getattr(struct vop_getattr_args *v)
319 {
320 	struct vnode *vp = v->a_vp;
321 	struct vattr *vap = v->a_vap;
322 	struct tmpfs_node *node;
323 
324 	node = VP_TO_TMPFS_NODE(vp);
325 
326 	tmpfs_update(vp);
327 
328 	TMPFS_NODE_LOCK_SH(node);
329 	vap->va_type = vp->v_type;
330 	vap->va_mode = node->tn_mode;
331 	vap->va_nlink = node->tn_links;
332 	vap->va_uid = node->tn_uid;
333 	vap->va_gid = node->tn_gid;
334 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
335 	vap->va_fileid = node->tn_id;
336 	vap->va_size = node->tn_size;
337 	vap->va_blocksize = PAGE_SIZE;
338 	vap->va_atime.tv_sec = node->tn_atime;
339 	vap->va_atime.tv_nsec = node->tn_atimensec;
340 	vap->va_mtime.tv_sec = node->tn_mtime;
341 	vap->va_mtime.tv_nsec = node->tn_mtimensec;
342 	vap->va_ctime.tv_sec = node->tn_ctime;
343 	vap->va_ctime.tv_nsec = node->tn_ctimensec;
344 	vap->va_gen = node->tn_gen;
345 	vap->va_flags = node->tn_flags;
346 	if (vp->v_type == VBLK || vp->v_type == VCHR) {
347 		vap->va_rmajor = umajor(node->tn_rdev);
348 		vap->va_rminor = uminor(node->tn_rdev);
349 	}
350 	vap->va_bytes = round_page(node->tn_size);
351 	vap->va_filerev = 0;
352 	TMPFS_NODE_UNLOCK(node);
353 
354 	return 0;
355 }
356 
357 /* --------------------------------------------------------------------- */
358 
/*
 * Apply the attribute changes requested in *vap to the node, one
 * category at a time (flags, size, ownership, mode, times).  VNOVAL
 * fields are skipped; processing stops at the first error.
 */
int
tmpfs_setattr(struct vop_setattr_args *v)
{
	struct vnode *vp = v->a_vp;
	struct vattr *vap = v->a_vap;
	struct ucred *cred = v->a_cred;
	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
	int error = 0;
	int kflags = 0;	/* accumulated kqueue NOTE_* flags for watchers */

	TMPFS_NODE_LOCK(node);
	if (error == 0 && (vap->va_flags != VNOVAL)) {
		error = tmpfs_chflags(vp, vap->va_flags, cred);
		kflags |= NOTE_ATTRIB;
	}

	if (error == 0 && (vap->va_size != VNOVAL)) {
		/* Growing the file is an extend; both cases are writes. */
		if (vap->va_size > node->tn_size)
			kflags |= NOTE_WRITE | NOTE_EXTEND;
		else
			kflags |= NOTE_WRITE;
		error = tmpfs_chsize(vp, vap->va_size, cred);
	}

	if (error == 0 && (vap->va_uid != (uid_t)VNOVAL ||
			   vap->va_gid != (gid_t)VNOVAL)) {
		error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred);
		kflags |= NOTE_ATTRIB;
	}

	if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) {
		error = tmpfs_chmod(vp, vap->va_mode, cred);
		kflags |= NOTE_ATTRIB;
	}

	/* Explicit atime/mtime updates (e.g. utimes(2)). */
	if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL &&
	    vap->va_atime.tv_nsec != VNOVAL) ||
	    (vap->va_mtime.tv_sec != VNOVAL &&
	    vap->va_mtime.tv_nsec != VNOVAL) )) {
		error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime,
				      vap->va_vaflags, cred);
		kflags |= NOTE_ATTRIB;
	}

	/*
	 * Update the node times.  We give preference to the error codes
	 * generated by this function rather than the ones that may arise
	 * from tmpfs_update.
	 */
	tmpfs_update(vp);
	TMPFS_NODE_UNLOCK(node);
	tmpfs_knote(vp, kflags);

	return (error);
}
414 
415 /* --------------------------------------------------------------------- */
416 
417 /*
418  * fsync is usually a NOP, but we must take action when unmounting or
419  * when recycling.
420  */
421 static int
422 tmpfs_fsync(struct vop_fsync_args *v)
423 {
424 	struct tmpfs_node *node;
425 	struct vnode *vp = v->a_vp;
426 
427 	node = VP_TO_TMPFS_NODE(vp);
428 
429 	tmpfs_update(vp);
430 	if (vp->v_type == VREG) {
431 		if (vp->v_flag & VRECLAIMED) {
432 			if (node->tn_links == 0)
433 				tmpfs_truncate(vp, 0);
434 			else
435 				vfsync(v->a_vp, v->a_waitfor, 1, NULL, NULL);
436 		}
437 	}
438 	return 0;
439 }
440 
441 /* --------------------------------------------------------------------- */
442 
/*
 * Read from a regular file.  First attempt the VM page-cache shortcut;
 * any remainder is satisfied through the buffer cache (which is backed
 * by the node's anonymous VM object via tmpfs_strategy).
 */
static int
tmpfs_read (struct vop_read_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct tmpfs_node *node;
	off_t base_offset;	/* block-aligned file offset */
	size_t offset;		/* offset within the current block */
	size_t len;
	size_t resid;
	int error;

	/*
	 * Check the basics
	 */
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (vp->v_type != VREG)
		return (EINVAL);

	/*
	 * Extract node, try to shortcut the operation through
	 * the VM page cache, allowing us to avoid buffer cache
	 * overheads.
	 */
	node = VP_TO_TMPFS_NODE(vp);
        resid = uio->uio_resid;
        error = vop_helper_read_shortcut(ap);
        if (error)
                return error;
        if (uio->uio_resid == 0) {
		if (resid)
			goto finished;
		return error;
	}

	/*
	 * Fall-through to our normal read code.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < node->tn_size) {
		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 */
		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
		base_offset = (off_t)uio->uio_offset - offset;
		bp = getcacheblk(vp, base_offset, TMPFS_BLKSIZE, 0);
		if (bp == NULL) {
			error = bread(vp, base_offset, TMPFS_BLKSIZE, &bp);
			if (error) {
				brelse(bp);
				kprintf("tmpfs_read bread error %d\n", error);
				break;
			}

			/*
			 * tmpfs pretty much fiddles directly with the VM
			 * system, don't let it exhaust it or we won't play
			 * nice with other processes.
			 *
			 * Only do this if the VOP is coming from a normal
			 * read/write.  The VM system handles the case for
			 * UIO_NOCOPY.
			 */
			if (uio->uio_segflg != UIO_NOCOPY)
				vm_wait_nominal();
		}
		bp->b_flags |= B_CLUSTEROK;

		/*
		 * Figure out how many bytes we can actually copy this loop.
		 */
		len = TMPFS_BLKSIZE - offset;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		if (len > node->tn_size - uio->uio_offset)
			len = (size_t)(node->tn_size - uio->uio_offset);

		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		bqrelse(bp);
		if (error) {
			kprintf("tmpfs_read uiomove error %d\n", error);
			break;
		}
	}

finished:
	/* Any successful copy counts as an access of the node. */
	TMPFS_NODE_LOCK(node);
	node->tn_status |= TMPFS_NODE_ACCESSED;
	TMPFS_NODE_UNLOCK(node);
	return (error);
}
535 
536 static int
537 tmpfs_write (struct vop_write_args *ap)
538 {
539 	struct buf *bp;
540 	struct vnode *vp = ap->a_vp;
541 	struct uio *uio = ap->a_uio;
542 	struct thread *td = uio->uio_td;
543 	struct tmpfs_node *node;
544 	boolean_t extended;
545 	off_t oldsize;
546 	int error;
547 	off_t base_offset;
548 	size_t offset;
549 	size_t len;
550 	struct rlimit limit;
551 	int trivial = 0;
552 	int kflags = 0;
553 	int seqcount;
554 
555 	error = 0;
556 	if (uio->uio_resid == 0) {
557 		return error;
558 	}
559 
560 	node = VP_TO_TMPFS_NODE(vp);
561 
562 	if (vp->v_type != VREG)
563 		return (EINVAL);
564 	seqcount = ap->a_ioflag >> 16;
565 
566 	oldsize = node->tn_size;
567 	if (ap->a_ioflag & IO_APPEND)
568 		uio->uio_offset = node->tn_size;
569 
570 	/*
571 	 * Check for illegal write offsets.
572 	 */
573 	if (uio->uio_offset + uio->uio_resid >
574 	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
575 		return (EFBIG);
576 	}
577 
578 	/*
579 	 * NOTE: Ignore if UIO does not come from a user thread (e.g. VN).
580 	 */
581 	if (vp->v_type == VREG && td != NULL && td->td_lwp != NULL) {
582 		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
583 		if (error != 0) {
584 			return error;
585 		}
586 		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
587 			ksignal(td->td_proc, SIGXFSZ);
588 			return (EFBIG);
589 		}
590 	}
591 
592 
593 	/*
594 	 * Extend the file's size if necessary
595 	 */
596 	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);
597 
598 	while (uio->uio_resid > 0) {
599 		/*
600 		 * Don't completely blow out running buffer I/O
601 		 * when being hit from the pageout daemon.
602 		 */
603 		if (uio->uio_segflg == UIO_NOCOPY &&
604 		    (ap->a_ioflag & IO_RECURSE) == 0) {
605 			bwillwrite(TMPFS_BLKSIZE);
606 		}
607 
608 		/*
609 		 * Use buffer cache I/O (via tmpfs_strategy)
610 		 */
611 		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
612 		base_offset = (off_t)uio->uio_offset - offset;
613 		len = TMPFS_BLKSIZE - offset;
614 		if (len > uio->uio_resid)
615 			len = uio->uio_resid;
616 
617 		if ((uio->uio_offset + len) > node->tn_size) {
618 			trivial = (uio->uio_offset <= node->tn_size);
619 			error = tmpfs_reg_resize(vp, uio->uio_offset + len,  trivial);
620 			if (error)
621 				break;
622 		}
623 
624 		/*
625 		 * Read to fill in any gaps.  Theoretically we could
626 		 * optimize this if the write covers the entire buffer
627 		 * and is not a UIO_NOCOPY write, however this can lead
628 		 * to a security violation exposing random kernel memory
629 		 * (whatever junk was in the backing VM pages before).
630 		 *
631 		 * So just use bread() to do the right thing.
632 		 */
633 		error = bread(vp, base_offset, TMPFS_BLKSIZE, &bp);
634 		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
635 		if (error) {
636 			kprintf("tmpfs_write uiomove error %d\n", error);
637 			brelse(bp);
638 			break;
639 		}
640 
641 		if (uio->uio_offset > node->tn_size) {
642 			node->tn_size = uio->uio_offset;
643 			kflags |= NOTE_EXTEND;
644 		}
645 		kflags |= NOTE_WRITE;
646 
647 		/*
648 		 * Always try to flush the page in the UIO_NOCOPY case.  This
649 		 * can come from the pageout daemon or during vnode eviction.
650 		 * It is not necessarily going to be marked IO_ASYNC/IO_SYNC.
651 		 *
652 		 * For the normal case we buwrite(), dirtying the underlying
653 		 * VM pages instead of dirtying the buffer and releasing the
654 		 * buffer as a clean buffer.  This allows tmpfs to use
655 		 * essentially all available memory to cache file data.
656 		 * If we used bdwrite() the buffer cache would wind up
657 		 * flushing the data to swap too quickly.
658 		 *
659 		 * But because tmpfs can seriously load the VM system we
660 		 * fall-back to using bdwrite() when free memory starts
661 		 * to get low.  This shifts the load away from the VM system
662 		 * and makes tmpfs act more like a normal filesystem with
663 		 * regards to disk activity.
664 		 *
665 		 * tmpfs pretty much fiddles directly with the VM
666 		 * system, don't let it exhaust it or we won't play
667 		 * nice with other processes.  Only do this if the
668 		 * VOP is coming from a normal read/write.  The VM system
669 		 * handles the case for UIO_NOCOPY.
670 		 */
671 		bp->b_flags |= B_CLUSTEROK;
672 		if (uio->uio_segflg == UIO_NOCOPY) {
673 			/*
674 			 * Flush from the pageout daemon, deal with
675 			 * potentially very heavy tmpfs write activity
676 			 * causing long stalls in the pageout daemon
677 			 * before pages get to free/cache.
678 			 *
679 			 * (a) Under severe pressure setting B_DIRECT will
680 			 *     cause a buffer release to try to free the
681 			 *     underlying pages.
682 			 *
683 			 * (b) Under modest memory pressure the B_RELBUF
684 			 *     alone is sufficient to get the pages moved
685 			 *     to the cache.  We could also force this by
686 			 *     setting B_NOTMETA but that might have other
687 			 *     unintended side-effects (e.g. setting
688 			 *     PG_NOTMETA on the VM page).
689 			 *
690 			 * Hopefully this will unblock the VM system more
691 			 * quickly under extreme tmpfs write load.
692 			 */
693 			if (vm_page_count_min(vm_page_free_hysteresis))
694 				bp->b_flags |= B_DIRECT;
695 			bp->b_flags |= B_AGE | B_RELBUF;
696 			bp->b_act_count = 0;	/* buffer->deactivate pgs */
697 			cluster_awrite(bp);
698 		} else if (vm_page_count_target()) {
699 			/*
700 			 * Normal (userland) write but we are low on memory,
701 			 * run the buffer the buffer cache.
702 			 */
703 			bp->b_act_count = 0;	/* buffer->deactivate pgs */
704 			bdwrite(bp);
705 		} else {
706 			/*
707 			 * Otherwise run the buffer directly through to the
708 			 * backing VM store.
709 			 */
710 			buwrite(bp);
711 			/*vm_wait_nominal();*/
712 		}
713 
714 		if (bp->b_error) {
715 			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
716 			break;
717 		}
718 	}
719 
720 	if (error) {
721 		if (extended) {
722 			(void)tmpfs_reg_resize(vp, oldsize, trivial);
723 			kflags &= ~NOTE_EXTEND;
724 		}
725 		goto done;
726 	}
727 
728 	/*
729 	 * Currently we don't set the mtime on files modified via mmap()
730 	 * because we can't tell the difference between those modifications
731 	 * and an attempt by the pageout daemon to flush tmpfs pages to
732 	 * swap.
733 	 *
734 	 * This is because in order to defer flushes as long as possible
735 	 * buwrite() works by marking the underlying VM pages dirty in
736 	 * order to be able to dispose of the buffer cache buffer without
737 	 * flushing it.
738 	 */
739 	TMPFS_NODE_LOCK(node);
740 	if (uio->uio_segflg != UIO_NOCOPY)
741 		node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED;
742 	if (extended)
743 		node->tn_status |= TMPFS_NODE_CHANGED;
744 
745 	if (node->tn_mode & (S_ISUID | S_ISGID)) {
746 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
747 			node->tn_mode &= ~(S_ISUID | S_ISGID);
748 	}
749 	TMPFS_NODE_UNLOCK(node);
750 done:
751 	tmpfs_knote(vp, kflags);
752 
753 	return(error);
754 }
755 
756 static int
757 tmpfs_advlock (struct vop_advlock_args *ap)
758 {
759 	struct tmpfs_node *node;
760 	struct vnode *vp = ap->a_vp;
761 	int error;
762 
763 	node = VP_TO_TMPFS_NODE(vp);
764 	error = (lf_advlock(ap, &node->tn_advlock, node->tn_size));
765 
766 	return (error);
767 }
768 
769 /*
770  * The strategy function is typically only called when memory pressure
771  * forces the system to attempt to pageout pages.  It can also be called
772  * by [n]vtruncbuf() when a truncation cuts a page in half.  Normal write
773  * operations
774  */
static int
tmpfs_strategy(struct vop_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;
	struct bio *nbio;
	struct buf *bp = bio->bio_buf;
	struct vnode *vp = ap->a_vp;
	struct tmpfs_node *node;
	vm_object_t uobj;
	vm_page_t m;
	int i;

	/* Only regular files have backing store; fail anything else. */
	if (vp->v_type != VREG) {
		bp->b_resid = bp->b_count;
		bp->b_flags |= B_ERROR | B_INVAL;
		bp->b_error = EINVAL;
		biodone(bio);
		return(0);
	}

	node = VP_TO_TMPFS_NODE(vp);

	/* The anonymous VM object holding the file's pages. */
	uobj = node->tn_reg.tn_aobj;

	/*
	 * Don't bother flushing to swap if there is no swap, just
	 * ensure that the pages are marked as needing a commit (still).
	 */
	if (bp->b_cmd == BUF_CMD_WRITE && vm_swap_size == 0) {
		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
			m = bp->b_xio.xio_pages[i];
			vm_page_need_commit(m);
		}
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(bio);
	} else {
		/*
		 * Push a new bio so the swap pager's completion runs
		 * through tmpfs_strategy_done before the original bio
		 * is finished.
		 */
		nbio = push_bio(bio);
		nbio->bio_done = tmpfs_strategy_done;
		nbio->bio_offset = bio->bio_offset;
		swap_pager_strategy(uobj, nbio);
	}
	return 0;
}
819 
820 /*
821  * If we were unable to commit the pages to swap make sure they are marked
822  * as needing a commit (again).  If we were, clear the flag to allow the
823  * pages to be freed.
824  */
825 static void
826 tmpfs_strategy_done(struct bio *bio)
827 {
828 	struct buf *bp;
829 	vm_page_t m;
830 	int i;
831 
832 	bp = bio->bio_buf;
833 
834 	if (bp->b_flags & B_ERROR) {
835 		bp->b_flags &= ~B_ERROR;
836 		bp->b_error = 0;
837 		bp->b_resid = 0;
838 		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
839 			m = bp->b_xio.xio_pages[i];
840 			vm_page_need_commit(m);
841 		}
842 	} else {
843 		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
844 			m = bp->b_xio.xio_pages[i];
845 			vm_page_clear_commit(m);
846 		}
847 	}
848 	bio = pop_bio(bio);
849 	biodone(bio);
850 }
851 
852 static int
853 tmpfs_bmap(struct vop_bmap_args *ap)
854 {
855 	if (ap->a_doffsetp != NULL)
856 		*ap->a_doffsetp = ap->a_loffset;
857 	if (ap->a_runp != NULL)
858 		*ap->a_runp = 0;
859 	if (ap->a_runb != NULL)
860 		*ap->a_runb = 0;
861 
862 	return 0;
863 }
864 
865 /* --------------------------------------------------------------------- */
866 
867 static int
868 tmpfs_nremove(struct vop_nremove_args *v)
869 {
870 	struct vnode *dvp = v->a_dvp;
871 	struct namecache *ncp = v->a_nch->ncp;
872 	struct vnode *vp;
873 	int error;
874 	struct tmpfs_dirent *de;
875 	struct tmpfs_mount *tmp;
876 	struct tmpfs_node *dnode;
877 	struct tmpfs_node *node;
878 	struct mount *mp;
879 
880 	mp = dvp->v_mount;
881 
882 	/*
883 	 * We have to acquire the vp from v->a_nch because we will likely
884 	 * unresolve the namecache entry, and a vrele/vput is needed to
885 	 * trigger the tmpfs_inactive/tmpfs_reclaim sequence.
886 	 *
887 	 * We have to use vget to clear any inactive state on the vnode,
888 	 * otherwise the vnode may remain inactive and thus tmpfs_inactive
889 	 * will not get called when we release it.
890 	 */
891 	error = cache_vget(v->a_nch, v->a_cred, LK_SHARED, &vp);
892 	KKASSERT(vp->v_mount == dvp->v_mount);
893 	KKASSERT(error == 0);
894 	vn_unlock(vp);
895 
896 	if (vp->v_type == VDIR) {
897 		error = EISDIR;
898 		goto out2;
899 	}
900 
901 	dnode = VP_TO_TMPFS_DIR(dvp);
902 	node = VP_TO_TMPFS_NODE(vp);
903 	tmp = VFS_TO_TMPFS(vp->v_mount);
904 
905 	TMPFS_NODE_LOCK(dnode);
906 	de = tmpfs_dir_lookup(dnode, node, ncp);
907 	if (de == NULL) {
908 		error = ENOENT;
909 		goto out;
910 	}
911 
912 	/* Files marked as immutable or append-only cannot be deleted. */
913 	if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
914 	    (dnode->tn_flags & APPEND)) {
915 		error = EPERM;
916 		goto out;
917 	}
918 
919 	/* Remove the entry from the directory; as it is a file, we do not
920 	 * have to change the number of hard links of the directory. */
921 	tmpfs_dir_detach(dnode, de);
922 
923 	/* Free the directory entry we just deleted.  Note that the node
924 	 * referred by it will not be removed until the vnode is really
925 	 * reclaimed. */
926 	tmpfs_free_dirent(tmp, de);
927 
928 	if (node->tn_links > 0) {
929 	        TMPFS_NODE_LOCK(node);
930 		node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
931 	                TMPFS_NODE_MODIFIED;
932 	        TMPFS_NODE_UNLOCK(node);
933 	}
934 
935 	cache_unlink(v->a_nch);
936 	tmpfs_knote(vp, NOTE_DELETE);
937 	error = 0;
938 
939 out:
940 	TMPFS_NODE_UNLOCK(dnode);
941 	if (error == 0)
942 		tmpfs_knote(dvp, NOTE_WRITE);
943 out2:
944 	vrele(vp);
945 
946 	return error;
947 }
948 
949 /* --------------------------------------------------------------------- */
950 
951 static int
952 tmpfs_nlink(struct vop_nlink_args *v)
953 {
954 	struct vnode *dvp = v->a_dvp;
955 	struct vnode *vp = v->a_vp;
956 	struct namecache *ncp = v->a_nch->ncp;
957 	struct tmpfs_dirent *de;
958 	struct tmpfs_node *node;
959 	struct tmpfs_node *dnode;
960 	struct mount *mp;
961 	int error;
962 
963 	if (dvp->v_mount != vp->v_mount)
964 		return(EXDEV);
965 	mp = dvp->v_mount;
966 
967 	KKASSERT(dvp != vp); /* XXX When can this be false? */
968 
969 	node = VP_TO_TMPFS_NODE(vp);
970 	dnode = VP_TO_TMPFS_NODE(dvp);
971 	TMPFS_NODE_LOCK(dnode);
972 
973 	/* XXX: Why aren't the following two tests done by the caller? */
974 
975 	/* Hard links of directories are forbidden. */
976 	if (vp->v_type == VDIR) {
977 		error = EPERM;
978 		goto out;
979 	}
980 
981 	/* Cannot create cross-device links. */
982 	if (dvp->v_mount != vp->v_mount) {
983 		error = EXDEV;
984 		goto out;
985 	}
986 
987 	/* Ensure that we do not overflow the maximum number of links imposed
988 	 * by the system. */
989 	KKASSERT(node->tn_links <= LINK_MAX);
990 	if (node->tn_links >= LINK_MAX) {
991 		error = EMLINK;
992 		goto out;
993 	}
994 
995 	/* We cannot create links of files marked immutable or append-only. */
996 	if (node->tn_flags & (IMMUTABLE | APPEND)) {
997 		error = EPERM;
998 		goto out;
999 	}
1000 
1001 	/* Allocate a new directory entry to represent the node. */
1002 	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node,
1003 				   ncp->nc_name, ncp->nc_nlen, &de);
1004 	if (error != 0)
1005 		goto out;
1006 
1007 	/* Insert the new directory entry into the appropriate directory. */
1008 	tmpfs_dir_attach(dnode, de);
1009 
1010 	/* vp link count has changed, so update node times. */
1011 
1012 	TMPFS_NODE_LOCK(node);
1013 	node->tn_status |= TMPFS_NODE_CHANGED;
1014 	TMPFS_NODE_UNLOCK(node);
1015 	tmpfs_update(vp);
1016 
1017 	tmpfs_knote(vp, NOTE_LINK);
1018 	cache_setunresolved(v->a_nch);
1019 	cache_setvp(v->a_nch, vp);
1020 	error = 0;
1021 
1022 out:
1023 	TMPFS_NODE_UNLOCK(dnode);
1024 	if (error == 0)
1025 		tmpfs_knote(dvp, NOTE_WRITE);
1026 	return error;
1027 }
1028 
1029 /* --------------------------------------------------------------------- */
1030 
1031 static int
1032 tmpfs_nrename(struct vop_nrename_args *v)
1033 {
1034 	struct vnode *fdvp = v->a_fdvp;
1035 	struct namecache *fncp = v->a_fnch->ncp;
1036 	struct vnode *fvp = fncp->nc_vp;
1037 	struct vnode *tdvp = v->a_tdvp;
1038 	struct namecache *tncp = v->a_tnch->ncp;
1039 	struct vnode *tvp;
1040 	struct tmpfs_dirent *de, *tde;
1041 	struct tmpfs_mount *tmp;
1042 	struct tmpfs_node *fdnode;
1043 	struct tmpfs_node *fnode;
1044 	struct tmpfs_node *tnode;
1045 	struct tmpfs_node *tdnode;
1046 	struct mount *mp;
1047 	char *newname;
1048 	char *oldname;
1049 	int error;
1050 
1051 	mp = fdvp->v_mount;
1052 	KKASSERT(fdvp->v_mount == fvp->v_mount);
1053 
1054 	/*
1055 	 * Because tvp can get overwritten we have to vget it instead of
1056 	 * just vref or use it, otherwise it's VINACTIVE flag may not get
1057 	 * cleared and the node won't get destroyed.
1058 	 */
1059 	error = cache_vget(v->a_tnch, v->a_cred, LK_SHARED, &tvp);
1060 	if (error == 0) {
1061 		tnode = VP_TO_TMPFS_NODE(tvp);
1062 		vn_unlock(tvp);
1063 	} else {
1064 		tnode = NULL;
1065 	}
1066 
1067 	/* Disallow cross-device renames.
1068 	 * XXX Why isn't this done by the caller? */
1069 	if (fvp->v_mount != tdvp->v_mount ||
1070 	    (tvp != NULL && fvp->v_mount != tvp->v_mount)) {
1071 		error = EXDEV;
1072 		goto out;
1073 	}
1074 
1075 	tmp = VFS_TO_TMPFS(tdvp->v_mount);
1076 	tdnode = VP_TO_TMPFS_DIR(tdvp);
1077 
1078 	/* If source and target are the same file, there is nothing to do. */
1079 	if (fvp == tvp) {
1080 		error = 0;
1081 		goto out;
1082 	}
1083 
1084 	fdnode = VP_TO_TMPFS_DIR(fdvp);
1085 	fnode = VP_TO_TMPFS_NODE(fvp);
1086 	TMPFS_NODE_LOCK(fdnode);
1087 	de = tmpfs_dir_lookup(fdnode, fnode, fncp);
1088 	TMPFS_NODE_UNLOCK(fdnode);	/* XXX depend on namecache lock */
1089 
1090 	/* Avoid manipulating '.' and '..' entries. */
1091 	if (de == NULL) {
1092 		error = ENOENT;
1093 		goto out_locked;
1094 	}
1095 	KKASSERT(de->td_node == fnode);
1096 
1097 	/*
1098 	 * If replacing an entry in the target directory and that entry
1099 	 * is a directory, it must be empty.
1100 	 *
1101 	 * Kern_rename gurantees the destination to be a directory
1102 	 * if the source is one (it does?).
1103 	 */
1104 	if (tvp != NULL) {
1105 		KKASSERT(tnode != NULL);
1106 
1107 		if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
1108 		    (tdnode->tn_flags & (APPEND | IMMUTABLE))) {
1109 			error = EPERM;
1110 			goto out_locked;
1111 		}
1112 
1113 		if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) {
1114 			if (tnode->tn_size > 0) {
1115 				error = ENOTEMPTY;
1116 				goto out_locked;
1117 			}
1118 		} else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) {
1119 			error = ENOTDIR;
1120 			goto out_locked;
1121 		} else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) {
1122 			error = EISDIR;
1123 			goto out_locked;
1124 		} else {
1125 			KKASSERT(fnode->tn_type != VDIR &&
1126 				tnode->tn_type != VDIR);
1127 		}
1128 	}
1129 
1130 	if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
1131 	    (fdnode->tn_flags & (APPEND | IMMUTABLE))) {
1132 		error = EPERM;
1133 		goto out_locked;
1134 	}
1135 
1136 	/*
1137 	 * Ensure that we have enough memory to hold the new name, if it
1138 	 * has to be changed.
1139 	 */
1140 	if (fncp->nc_nlen != tncp->nc_nlen ||
1141 	    bcmp(fncp->nc_name, tncp->nc_name, fncp->nc_nlen) != 0) {
1142 		newname = kmalloc(tncp->nc_nlen + 1, tmp->tm_name_zone,
1143 				  M_WAITOK | M_NULLOK);
1144 		if (newname == NULL) {
1145 			error = ENOSPC;
1146 			goto out_locked;
1147 		}
1148 		bcopy(tncp->nc_name, newname, tncp->nc_nlen);
1149 		newname[tncp->nc_nlen] = '\0';
1150 	} else {
1151 		newname = NULL;
1152 	}
1153 
1154 	/*
1155 	 * Unlink entry from source directory.  Note that the kernel has
1156 	 * already checked for illegal recursion cases (renaming a directory
1157 	 * into a subdirectory of itself).
1158 	 */
1159 	if (fdnode != tdnode) {
1160 		tmpfs_dir_detach(fdnode, de);
1161 	} else {
1162 		/* XXX depend on namecache lock */
1163 		TMPFS_NODE_LOCK(fdnode);
1164 		KKASSERT(de == tmpfs_dir_lookup(fdnode, fnode, fncp));
1165 		RB_REMOVE(tmpfs_dirtree, &fdnode->tn_dir.tn_dirtree, de);
1166 		RB_REMOVE(tmpfs_dirtree_cookie,
1167 			  &fdnode->tn_dir.tn_cookietree, de);
1168 		TMPFS_NODE_UNLOCK(fdnode);
1169 	}
1170 
1171 	/*
1172 	 * Handle any name change.  Swap with newname, we will
1173 	 * deallocate it at the end.
1174 	 */
1175 	if (newname != NULL) {
1176 #if 0
1177 		TMPFS_NODE_LOCK(fnode);
1178 		fnode->tn_status |= TMPFS_NODE_CHANGED;
1179 		TMPFS_NODE_UNLOCK(fnode);
1180 #endif
1181 		oldname = de->td_name;
1182 		de->td_name = newname;
1183 		de->td_namelen = (uint16_t)tncp->nc_nlen;
1184 		newname = oldname;
1185 	}
1186 
1187 	/*
1188 	 * If we are overwriting an entry, we have to remove the old one
1189 	 * from the target directory.
1190 	 */
1191 	if (tvp != NULL) {
1192 		/* Remove the old entry from the target directory. */
1193 		TMPFS_NODE_LOCK(tdnode);
1194 		tde = tmpfs_dir_lookup(tdnode, tnode, tncp);
1195 		tmpfs_dir_detach(tdnode, tde);
1196 		TMPFS_NODE_UNLOCK(tdnode);
1197 		tmpfs_knote(tdnode->tn_vnode, NOTE_DELETE);
1198 
1199 		/*
1200 		 * Free the directory entry we just deleted.  Note that the
1201 		 * node referred by it will not be removed until the vnode is
1202 		 * really reclaimed.
1203 		 */
1204 		tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde);
1205 		/*cache_inval_vp(tvp, CINV_DESTROY);*/
1206 	}
1207 
1208 	/*
1209 	 * Link entry to target directory.  If the entry
1210 	 * represents a directory move the parent linkage
1211 	 * as well.
1212 	 */
1213 	if (fdnode != tdnode) {
1214 		if (de->td_node->tn_type == VDIR) {
1215 			TMPFS_VALIDATE_DIR(fnode);
1216 		}
1217 		tmpfs_dir_attach(tdnode, de);
1218 	} else {
1219 		TMPFS_NODE_LOCK(tdnode);
1220 		tdnode->tn_status |= TMPFS_NODE_MODIFIED;
1221 		RB_INSERT(tmpfs_dirtree, &tdnode->tn_dir.tn_dirtree, de);
1222 		RB_INSERT(tmpfs_dirtree_cookie,
1223 			  &tdnode->tn_dir.tn_cookietree, de);
1224 		TMPFS_NODE_UNLOCK(tdnode);
1225 	}
1226 
1227 	/*
1228 	 * Finish up
1229 	 */
1230 	if (newname) {
1231 		kfree(newname, tmp->tm_name_zone);
1232 		newname = NULL;
1233 	}
1234 	cache_rename(v->a_fnch, v->a_tnch);
1235 	tmpfs_knote(v->a_fdvp, NOTE_WRITE);
1236 	tmpfs_knote(v->a_tdvp, NOTE_WRITE);
1237 	if (fnode->tn_vnode)
1238 		tmpfs_knote(fnode->tn_vnode, NOTE_RENAME);
1239 	error = 0;
1240 
1241 out_locked:
1242 	;
1243 out:
1244 	if (tvp)
1245 		vrele(tvp);
1246 	return error;
1247 }
1248 
1249 /* --------------------------------------------------------------------- */
1250 
1251 static int
1252 tmpfs_nmkdir(struct vop_nmkdir_args *v)
1253 {
1254 	struct vnode *dvp = v->a_dvp;
1255 	struct vnode **vpp = v->a_vpp;
1256 	struct namecache *ncp = v->a_nch->ncp;
1257 	struct vattr *vap = v->a_vap;
1258 	struct ucred *cred = v->a_cred;
1259 	struct mount *mp;
1260 	int error;
1261 
1262 	mp = dvp->v_mount;
1263 
1264 	KKASSERT(vap->va_type == VDIR);
1265 
1266 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
1267 	if (error == 0) {
1268 		cache_setunresolved(v->a_nch);
1269 		cache_setvp(v->a_nch, *vpp);
1270 		tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK);
1271 	}
1272 	return error;
1273 }
1274 
1275 /* --------------------------------------------------------------------- */
1276 
/*
 * Remove the directory referenced by v->a_nch from its parent dvp.
 *
 * Returns ENOTDIR if the victim is not a directory, ENOTEMPTY if it
 * still contains entries, EPERM when file flags (NOUNLINK, IMMUTABLE,
 * APPEND) forbid the operation, and 0 on success.  On success the
 * namecache entry is unlinked and the vnode released, which eventually
 * triggers reclamation of the node.
 */
static int
tmpfs_nrmdir(struct vop_nrmdir_args *v)
{
	struct vnode *dvp = v->a_dvp;
	struct namecache *ncp = v->a_nch->ncp;
	struct vnode *vp;
	struct tmpfs_dirent *de;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *dnode;
	struct tmpfs_node *node;
	struct mount *mp;		/* NOTE(review): assigned but unused */
	int error;

	mp = dvp->v_mount;

	/*
	 * We have to acquire the vp from v->a_nch because we will likely
	 * unresolve the namecache entry, and a vrele/vput is needed to
	 * trigger the tmpfs_inactive/tmpfs_reclaim sequence.
	 *
	 * We have to use vget to clear any inactive state on the vnode,
	 * otherwise the vnode may remain inactive and thus tmpfs_inactive
	 * will not get called when we release it.
	 */
	error = cache_vget(v->a_nch, v->a_cred, LK_SHARED, &vp);
	KKASSERT(error == 0);
	vn_unlock(vp);

	/*
	 * Prevalidate so we don't hit an assertion later
	 */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	tmp = VFS_TO_TMPFS(dvp->v_mount);
	dnode = VP_TO_TMPFS_DIR(dvp);
	node = VP_TO_TMPFS_DIR(vp);

	/*
	 * Directories with more than two entries ('.' and '..') cannot
	 * be removed.
	 */
	if (node->tn_size > 0) {
		error = ENOTEMPTY;
		goto out;
	}

	if ((dnode->tn_flags & APPEND)
	    || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
		error = EPERM;
		goto out;
	}

	/*
	 * This invariant holds only if we are not trying to
	 * remove "..".  We checked for that above so this is safe now.
	 */
	KKASSERT(node->tn_dir.tn_parent == dnode);

	/*
	 * Get the directory entry associated with node (vp).  This
	 * was filled by tmpfs_lookup while looking up the entry.
	 */
	TMPFS_NODE_LOCK(dnode);
	de = tmpfs_dir_lookup(dnode, node, ncp);
	KKASSERT(TMPFS_DIRENT_MATCHES(de, ncp->nc_name, ncp->nc_nlen));

	/*
	 * Check flags to see if we are allowed to remove the directory.
	 * NOTE(review): repeats the unlocked flag test above, this time
	 * while holding the dnode lock.
	 */
	if ((dnode->tn_flags & APPEND) ||
	    node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) {
		error = EPERM;
		TMPFS_NODE_UNLOCK(dnode);
		goto out;
	}

	/* Detach the directory entry from the directory (dnode). */
	tmpfs_dir_detach(dnode, de);
	TMPFS_NODE_UNLOCK(dnode);

	/* No vnode should be allocated for this entry from this point */
	TMPFS_NODE_LOCK(dnode);
	TMPFS_ASSERT_ELOCKED(dnode);
	TMPFS_NODE_LOCK(node);
	TMPFS_ASSERT_ELOCKED(node);

	/*
	 * Must set parent linkage to NULL (tested by ncreate to disallow
	 * the creation of new files/dirs in a deleted directory)
	 */
	node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
			   TMPFS_NODE_MODIFIED;

	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
			    TMPFS_NODE_MODIFIED;

	TMPFS_NODE_UNLOCK(node);
	TMPFS_NODE_UNLOCK(dnode);

	/* Free the directory entry we just deleted.  Note that the node
	 * referred by it will not be removed until the vnode is really
	 * reclaimed. */
	tmpfs_free_dirent(tmp, de);

	/* Release the deleted vnode (will destroy the node, notify
	 * interested parties and clean it from the cache). */

	TMPFS_NODE_LOCK(dnode);
	dnode->tn_status |= TMPFS_NODE_CHANGED;
	TMPFS_NODE_UNLOCK(dnode);
	tmpfs_update(dvp);

	cache_unlink(v->a_nch);
	tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK);
	error = 0;

out:
	vrele(vp);

	return error;
}
1399 
1400 /* --------------------------------------------------------------------- */
1401 
1402 static int
1403 tmpfs_nsymlink(struct vop_nsymlink_args *v)
1404 {
1405 	struct vnode *dvp = v->a_dvp;
1406 	struct vnode **vpp = v->a_vpp;
1407 	struct namecache *ncp = v->a_nch->ncp;
1408 	struct vattr *vap = v->a_vap;
1409 	struct ucred *cred = v->a_cred;
1410 	char *target = v->a_target;
1411 	int error;
1412 
1413 	vap->va_type = VLNK;
1414 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, target);
1415 	if (error == 0) {
1416 		tmpfs_knote(*vpp, NOTE_WRITE);
1417 		cache_setunresolved(v->a_nch);
1418 		cache_setvp(v->a_nch, *vpp);
1419 	}
1420 	return error;
1421 }
1422 
1423 /* --------------------------------------------------------------------- */
1424 
/*
 * Read directory entries from vp into uio.  uio_offset is a tmpfs
 * directory cookie: the synthetic '.' and '..' entries are emitted
 * first, then the real entries from the cookie-ordered RB tree.  When
 * the caller (NFS) asks for them, a cookie array describing the seek
 * offset after each emitted entry is allocated and filled in.
 */
static int
tmpfs_readdir(struct vop_readdir_args *v)
{
	struct vnode *vp = v->a_vp;
	struct uio *uio = v->a_uio;
	int *eofflag = v->a_eofflag;
	off_t **cookies = v->a_cookies;
	int *ncookies = v->a_ncookies;
	struct tmpfs_mount *tmp;
	int error;
	off_t startoff;
	off_t cnt = 0;		/* number of entries emitted */
	struct tmpfs_node *node;

	/* This operation only makes sense on directory nodes. */
	if (vp->v_type != VDIR) {
		return ENOTDIR;
	}

	tmp = VFS_TO_TMPFS(vp->v_mount);
	node = VP_TO_TMPFS_DIR(vp);
	startoff = uio->uio_offset;

	/* Synthesize '.' when starting from the beginning. */
	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) {
		error = tmpfs_dir_getdotdent(node, uio);
		if (error != 0) {
			/* outok expects the node lock to be held */
			TMPFS_NODE_LOCK_SH(node);
			goto outok;
		}
		cnt++;
	}

	/* Synthesize '..' next. */
	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) {
		/* may lock parent, cannot hold node lock */
		error = tmpfs_dir_getdotdotdent(tmp, node, uio);
		if (error != 0) {
			TMPFS_NODE_LOCK_SH(node);
			goto outok;
		}
		cnt++;
	}

	/* Emit the real entries under a shared node lock. */
	TMPFS_NODE_LOCK_SH(node);
	error = tmpfs_dir_getdents(node, uio, &cnt);

outok:
	/* -1 is the in-band "stopped early, no error" indicator. */
	KKASSERT(error >= -1);

	if (error == -1)
		error = 0;

	if (eofflag != NULL)
		*eofflag =
		    (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF);

	/* Update NFS-related variables. */
	if (error == 0 && cookies != NULL && ncookies != NULL) {
		off_t i;
		off_t off = startoff;
		struct tmpfs_dirent *de = NULL;

		*ncookies = cnt;
		*cookies = kmalloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK);

		/*
		 * Re-walk the cookie tree in the same order the entries
		 * were emitted above, recording the cookie that follows
		 * each entry.
		 */
		for (i = 0; i < cnt; i++) {
			KKASSERT(off != TMPFS_DIRCOOKIE_EOF);
			if (off == TMPFS_DIRCOOKIE_DOT) {
				off = TMPFS_DIRCOOKIE_DOTDOT;
			} else {
				if (off == TMPFS_DIRCOOKIE_DOTDOT) {
					de = RB_MIN(tmpfs_dirtree_cookie,
						&node->tn_dir.tn_cookietree);
				} else if (de != NULL) {
					de = RB_NEXT(tmpfs_dirtree_cookie,
					       &node->tn_dir.tn_cookietree, de);
				} else {
					/* resuming mid-directory: find the
					 * entry for the start cookie first */
					de = tmpfs_dir_lookupbycookie(node,
								      off);
					KKASSERT(de != NULL);
					de = RB_NEXT(tmpfs_dirtree_cookie,
					       &node->tn_dir.tn_cookietree, de);
				}
				if (de == NULL)
					off = TMPFS_DIRCOOKIE_EOF;
				else
					off = tmpfs_dircookie(de);
			}
			(*cookies)[i] = off;
		}
		/* The walk must land exactly where getdents stopped. */
		KKASSERT(uio->uio_offset == off);
	}

	node->tn_status |= TMPFS_NODE_ACCESSED;
	TMPFS_NODE_UNLOCK(node);
	return error;
}
1521 
1522 /* --------------------------------------------------------------------- */
1523 
1524 static int
1525 tmpfs_readlink(struct vop_readlink_args *v)
1526 {
1527 	struct vnode *vp = v->a_vp;
1528 	struct uio *uio = v->a_uio;
1529 	int error;
1530 	struct tmpfs_node *node;
1531 
1532 	KKASSERT(uio->uio_offset == 0);
1533 	KKASSERT(vp->v_type == VLNK);
1534 
1535 	node = VP_TO_TMPFS_NODE(vp);
1536 	TMPFS_NODE_LOCK_SH(node);
1537 	error = uiomove(node->tn_link,
1538 			MIN(node->tn_size, uio->uio_resid), uio);
1539 	node->tn_status |= TMPFS_NODE_ACCESSED;
1540 	TMPFS_NODE_UNLOCK(node);
1541 	return error;
1542 }
1543 
1544 /* --------------------------------------------------------------------- */
1545 
/*
 * Last active reference on a vnode is being dropped.  If the node has
 * been deleted (no links left), truncate and recycle the vnode right
 * away so its backing memory is recovered immediately; otherwise leave
 * the vnode cached.
 */
static int
tmpfs_inactive(struct vop_inactive_args *v)
{
	struct vnode *vp = v->a_vp;
	struct tmpfs_node *node;
	struct mount *mp;

	mp = vp->v_mount;
	lwkt_gettoken(&mp->mnt_token);
	node = VP_TO_TMPFS_NODE(vp);

	/*
	 * Degenerate case: no tmpfs node attached to this vnode, just
	 * recycle it.
	 */
	if (node == NULL) {
		vrecycle(vp);
		lwkt_reltoken(&mp->mnt_token);
		return(0);
	}

	/*
	 * Get rid of unreferenced deleted vnodes sooner rather than
	 * later so the data memory can be recovered immediately.
	 *
	 * We must truncate the vnode to prevent the normal reclamation
	 * path from flushing the data for the removed file to disk.
	 */
	TMPFS_NODE_LOCK(node);
	if ((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0 &&
	    node->tn_links == 0)
	{
		/* NOTE(review): DOOMED is set before dropping the lock,
		 * presumably to fence off concurrent vnode association —
		 * confirm against tmpfs_alloc_vp. */
		node->tn_vpstate = TMPFS_VNODE_DOOMED;
		TMPFS_NODE_UNLOCK(node);
		if (node->tn_type == VREG)
			tmpfs_truncate(vp, 0);
		vrecycle(vp);
	} else {
		TMPFS_NODE_UNLOCK(node);
	}
	lwkt_reltoken(&mp->mnt_token);

	return 0;
}
1589 
1590 /* --------------------------------------------------------------------- */
1591 
/*
 * Reclaim a vnode: sever the vnode <-> tmpfs node association and, if
 * the node was deleted by the user (no links remain), free the node's
 * data structures as well.
 */
int
tmpfs_reclaim(struct vop_reclaim_args *v)
{
	struct vnode *vp = v->a_vp;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *node;
	struct mount *mp;

	mp = vp->v_mount;
	lwkt_gettoken(&mp->mnt_token);

	node = VP_TO_TMPFS_NODE(vp);
	tmp = VFS_TO_TMPFS(vp->v_mount);
	KKASSERT(mp == tmp->tm_mount);

	/* Break the vp <-> node association. */
	tmpfs_free_vp(vp);

	/*
	 * If the node referenced by this vnode was deleted by the
	 * user, we must free its associated data structures now that
	 * the vnode is being reclaimed.
	 *
	 * Directories have an extra link ref.
	 */
	TMPFS_NODE_LOCK(node);
	if ((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0 &&
	    node->tn_links == 0) {
		node->tn_vpstate = TMPFS_VNODE_DOOMED;
		tmpfs_free_node(tmp, node);
		/* eats the lock */
	} else {
		TMPFS_NODE_UNLOCK(node);
	}
	lwkt_reltoken(&mp->mnt_token);

	/* tmpfs_free_vp() must have cleared the private data pointer. */
	KKASSERT(vp->v_data == NULL);
	return 0;
}
1630 
1631 /* --------------------------------------------------------------------- */
1632 
1633 static int
1634 tmpfs_mountctl(struct vop_mountctl_args *ap)
1635 {
1636 	struct tmpfs_mount *tmp;
1637 	struct mount *mp;
1638 	int rc;
1639 
1640 	mp = ap->a_head.a_ops->head.vv_mount;
1641 	lwkt_gettoken(&mp->mnt_token);
1642 
1643 	switch (ap->a_op) {
1644 	case (MOUNTCTL_SET_EXPORT):
1645 		tmp = (struct tmpfs_mount *) mp->mnt_data;
1646 
1647 		if (ap->a_ctllen != sizeof(struct export_args))
1648 			rc = (EINVAL);
1649 		else
1650 			rc = vfs_export(mp, &tmp->tm_export,
1651 					(const struct export_args *) ap->a_ctl);
1652 		break;
1653 	default:
1654 		rc = vop_stdmountctl(ap);
1655 		break;
1656 	}
1657 
1658 	lwkt_reltoken(&mp->mnt_token);
1659 	return (rc);
1660 }
1661 
1662 /* --------------------------------------------------------------------- */
1663 
1664 static int
1665 tmpfs_print(struct vop_print_args *v)
1666 {
1667 	struct vnode *vp = v->a_vp;
1668 
1669 	struct tmpfs_node *node;
1670 
1671 	node = VP_TO_TMPFS_NODE(vp);
1672 
1673 	kprintf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n",
1674 	    node, node->tn_flags, node->tn_links);
1675 	kprintf("\tmode 0%o, owner %d, group %d, size %ju, status 0x%x\n",
1676 	    node->tn_mode, node->tn_uid, node->tn_gid,
1677 	    (uintmax_t)node->tn_size, node->tn_status);
1678 
1679 	if (vp->v_type == VFIFO)
1680 		fifo_printinfo(vp);
1681 
1682 	kprintf("\n");
1683 
1684 	return 0;
1685 }
1686 
1687 /* --------------------------------------------------------------------- */
1688 
1689 static int
1690 tmpfs_pathconf(struct vop_pathconf_args *v)
1691 {
1692 	int name = v->a_name;
1693 	register_t *retval = v->a_retval;
1694 
1695 	int error;
1696 
1697 	error = 0;
1698 
1699 	switch (name) {
1700 	case _PC_LINK_MAX:
1701 		*retval = LINK_MAX;
1702 		break;
1703 
1704 	case _PC_NAME_MAX:
1705 		*retval = NAME_MAX;
1706 		break;
1707 
1708 	case _PC_PATH_MAX:
1709 		*retval = PATH_MAX;
1710 		break;
1711 
1712 	case _PC_PIPE_BUF:
1713 		*retval = PIPE_BUF;
1714 		break;
1715 
1716 	case _PC_CHOWN_RESTRICTED:
1717 		*retval = 1;
1718 		break;
1719 
1720 	case _PC_NO_TRUNC:
1721 		*retval = 1;
1722 		break;
1723 
1724 	case _PC_SYNC_IO:
1725 		*retval = 1;
1726 		break;
1727 
1728 	case _PC_FILESIZEBITS:
1729 		*retval = 0; /* XXX Don't know which value should I return. */
1730 		break;
1731 
1732 	default:
1733 		error = EINVAL;
1734 	}
1735 
1736 	return error;
1737 }
1738 
1739 /************************************************************************
1740  *                          KQFILTER OPS                                *
1741  ************************************************************************/
1742 
static void filt_tmpfsdetach(struct knote *kn);
static int filt_tmpfsread(struct knote *kn, long hint);
static int filt_tmpfswrite(struct knote *kn, long hint);
static int filt_tmpfsvnode(struct knote *kn, long hint);

/*
 * kqueue filter ops tables: all fd-based and MPSAFE, no attach hook,
 * shared detach handler.
 */
static struct filterops tmpfsread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_tmpfsdetach, filt_tmpfsread };
static struct filterops tmpfswrite_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_tmpfsdetach, filt_tmpfswrite };
static struct filterops tmpfsvnode_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_tmpfsdetach, filt_tmpfsvnode };
1757 
1758 static int
1759 tmpfs_kqfilter (struct vop_kqfilter_args *ap)
1760 {
1761 	struct vnode *vp = ap->a_vp;
1762 	struct knote *kn = ap->a_kn;
1763 
1764 	switch (kn->kn_filter) {
1765 	case EVFILT_READ:
1766 		kn->kn_fop = &tmpfsread_filtops;
1767 		break;
1768 	case EVFILT_WRITE:
1769 		kn->kn_fop = &tmpfswrite_filtops;
1770 		break;
1771 	case EVFILT_VNODE:
1772 		kn->kn_fop = &tmpfsvnode_filtops;
1773 		break;
1774 	default:
1775 		return (EOPNOTSUPP);
1776 	}
1777 
1778 	kn->kn_hook = (caddr_t)vp;
1779 
1780 	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1781 
1782 	return(0);
1783 }
1784 
1785 static void
1786 filt_tmpfsdetach(struct knote *kn)
1787 {
1788 	struct vnode *vp = (void *)kn->kn_hook;
1789 
1790 	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1791 }
1792 
1793 static int
1794 filt_tmpfsread(struct knote *kn, long hint)
1795 {
1796 	struct vnode *vp = (void *)kn->kn_hook;
1797 	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
1798 	off_t off;
1799 
1800 	if (hint == NOTE_REVOKE) {
1801 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
1802 		return(1);
1803 	}
1804 
1805 	/*
1806 	 * Interlock against MP races when performing this function.
1807 	 */
1808 	TMPFS_NODE_LOCK_SH(node);
1809 	off = node->tn_size - kn->kn_fp->f_offset;
1810 	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
1811 	if (kn->kn_sfflags & NOTE_OLDAPI) {
1812 		TMPFS_NODE_UNLOCK(node);
1813 		return(1);
1814 	}
1815 	if (kn->kn_data == 0) {
1816 		kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
1817 	}
1818 	TMPFS_NODE_UNLOCK(node);
1819 	return (kn->kn_data != 0);
1820 }
1821 
1822 static int
1823 filt_tmpfswrite(struct knote *kn, long hint)
1824 {
1825 	if (hint == NOTE_REVOKE)
1826 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
1827 	kn->kn_data = 0;
1828 	return (1);
1829 }
1830 
1831 static int
1832 filt_tmpfsvnode(struct knote *kn, long hint)
1833 {
1834 	if (kn->kn_sfflags & hint)
1835 		kn->kn_fflags |= hint;
1836 	if (hint == NOTE_REVOKE) {
1837 		kn->kn_flags |= (EV_EOF | EV_NODATA);
1838 		return (1);
1839 	}
1840 	return (kn->kn_fflags != 0);
1841 }
1842 
1843 
1844 /* --------------------------------------------------------------------- */
1845 
1846 /*
1847  * vnode operations vector used for files stored in a tmpfs file system.
1848  */
struct vop_ops tmpfs_vnode_vops = {
	.vop_default =			vop_defaultop,
	/* paging handled by the standard pagers */
	.vop_getpages = 		vop_stdgetpages,
	.vop_putpages = 		vop_stdputpages,
	/* namecache-based namespace operations */
	.vop_ncreate =			tmpfs_ncreate,
	.vop_nresolve =			tmpfs_nresolve,
	.vop_nlookupdotdot =		tmpfs_nlookupdotdot,
	.vop_nmknod =			tmpfs_nmknod,
	/* per-vnode file operations */
	.vop_open =			tmpfs_open,
	.vop_close =			tmpfs_close,
	.vop_access =			tmpfs_access,
	.vop_getattr =			tmpfs_getattr,
	.vop_setattr =			tmpfs_setattr,
	.vop_read =			tmpfs_read,
	.vop_write =			tmpfs_write,
	.vop_fsync =			tmpfs_fsync,
	.vop_mountctl =			tmpfs_mountctl,
	.vop_nremove =			tmpfs_nremove,
	.vop_nlink =			tmpfs_nlink,
	.vop_nrename =			tmpfs_nrename,
	.vop_nmkdir =			tmpfs_nmkdir,
	.vop_nrmdir =			tmpfs_nrmdir,
	.vop_nsymlink =			tmpfs_nsymlink,
	.vop_readdir =			tmpfs_readdir,
	.vop_readlink =			tmpfs_readlink,
	/* lifecycle */
	.vop_inactive =			tmpfs_inactive,
	.vop_reclaim =			tmpfs_reclaim,
	.vop_print =			tmpfs_print,
	.vop_pathconf =			tmpfs_pathconf,
	.vop_bmap =			tmpfs_bmap,
	.vop_strategy =			tmpfs_strategy,
	.vop_advlock =			tmpfs_advlock,
	.vop_kqfilter =			tmpfs_kqfilter
};
1883