/*	$OpenBSD: tmpfs_subr.c,v 1.22 2018/05/28 16:02:08 visa Exp $	*/
/*	$NetBSD: tmpfs_subr.c,v 1.79 2012/03/13 18:40:50 elad Exp $	*/

/*
 * Copyright (c) 2005-2011 The NetBSD Foundation, Inc.
 * Copyright (c) 2013 Pedro Martelletto
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program, and by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Efficient memory file system: interfaces for inode and directory entry
 * construction, destruction and manipulation.
 *
 * Reference counting
 *
 *	The link count of an inode (tmpfs_node_t::tn_links) is used as a
 *	reference counter.  However, it has slightly different semantics.
 *
 *	For directories, the link count represents the directory entries
 *	which refer to the directory; in other words, it is the count of
 *	sub-directories.  It also takes into account the virtual '.' entry
 *	(which has no real entry in the list).  For files, the link count
 *	represents the hard links.  Since only empty directories can be
 *	removed, the link count satisfies the reference counting
 *	requirements closely enough.  Note: to check whether a directory
 *	is empty, the inode size (tmpfs_node_t::tn_size) can be used.
 *
 *	The inode itself, as an object, gathers its first reference when a
 *	directory entry is attached via tmpfs_dir_attach(9).  For instance,
 *	after a regular tmpfs_create(), a file would have a link count of 1,
 *	while a directory after tmpfs_mkdir() would have 2 (due to '.').
 *
 * Reclamation
 *
 *	tmpfs inodes rely on a combination of vnode reference counting and
 *	link counting.  That is, an inode can only be destroyed if its
 *	associated vnode is inactive.  The destruction is done on vnode
 *	reclamation, i.e. in tmpfs_reclaim().  Note that tmpfs_node_t::tn_links
 *	being 0 is a destruction criterion.
 *
 *	If an inode has references within the file system (tn_links > 0) and
 *	its inactive vnode gets reclaimed/recycled, then the association is
 *	broken in tmpfs_reclaim().  In such a case, the inode will always pass
 *	tmpfs_lookup() and thus tmpfs_vnode_get() to associate a new vnode.
 *
 * Lock order
 *
 *	tmpfs_node_t::tn_nlock ->
 *		struct vnode::v_vlock ->
 *			struct vnode::v_interlock
 */

#include <sys/param.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>

#include <uvm/uvm_aobj.h>

#include <tmpfs/tmpfs.h>
#include <tmpfs/tmpfs_vnops.h>

/* Local functions. */
void	tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
int	tmpfs_dir_getdotents(tmpfs_node_t *, struct dirent *, struct uio *);

/*
 * tmpfs_alloc_node: allocate a new inode of the specified type and
 * insert it into the list of the specified mount point.
 */
int
tmpfs_alloc_node(tmpfs_mount_t *tmp, enum vtype type, uid_t uid, gid_t gid,
    mode_t mode, char *target, dev_t rdev, tmpfs_node_t **node)
{
	tmpfs_node_t *nnode;
	struct uvm_object *uobj;

	nnode = tmpfs_node_get(tmp);
	if (nnode == NULL) {
		return ENOSPC;
	}

	/* Initially, no references and no associations. */
	nnode->tn_links = 0;
	nnode->tn_vnode = NULL;
	nnode->tn_dirent_hint = NULL;

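	/*
	 * Allocate an inode number.  tm_acc_lock serializes updates to
	 * tm_highest_inode; a wrap-around to zero means the number space
	 * is exhausted, which is reported as ENOSPC.
	 */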
	rw_enter_write(&tmp->tm_acc_lock);
	nnode->tn_id = ++tmp->tm_highest_inode;
	if (nnode->tn_id == 0) {
		--tmp->tm_highest_inode;
		rw_exit_write(&tmp->tm_acc_lock);
		tmpfs_node_put(tmp, nnode);
		return ENOSPC;
	}
	rw_exit_write(&tmp->tm_acc_lock);

	/* Generic initialization. */
	nnode->tn_type = type;
	nnode->tn_size = 0;
	nnode->tn_flags = 0;
	nnode->tn_lockf = NULL;
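	/*
	 * The generation number shares tn_gen with flag bits such as
	 * TMPFS_RECLAIMING_BIT; TMPFS_NODE_GEN_MASK keeps the randomly
	 * chosen generation clear of those bits.
	 */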
	nnode->tn_gen = TMPFS_NODE_GEN_MASK & arc4random();

	nanotime(&nnode->tn_atime);
	nnode->tn_birthtime = nnode->tn_atime;
	nnode->tn_ctime = nnode->tn_atime;
	nnode->tn_mtime = nnode->tn_atime;

	/* XXX pedro: we should check for UID_MAX and GID_MAX instead. */
	KASSERT(uid != VNOVAL && gid != VNOVAL && mode != VNOVAL);

	nnode->tn_uid = uid;
	nnode->tn_gid = gid;
	nnode->tn_mode = mode;

	/* Type-specific initialization. */
	switch (nnode->tn_type) {
	case VBLK:
	case VCHR:
		/* Character/block special device. */
		KASSERT(rdev != VNOVAL);
		nnode->tn_spec.tn_dev.tn_rdev = rdev;
		break;
	case VDIR:
		/* Directory. */
		TAILQ_INIT(&nnode->tn_spec.tn_dir.tn_dir);
		nnode->tn_spec.tn_dir.tn_parent = NULL;
		nnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
		nnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;

		/* Extra link count for the virtual '.' entry. */
		nnode->tn_links++;
		break;
	case VFIFO:
	case VSOCK:
		break;
	case VLNK:
		/* Symbolic link.  Target specifies the file name. */
		KASSERT(target && strlen(target) < MAXPATHLEN);

		nnode->tn_size = strlen(target);
		if (nnode->tn_size == 0) {
			nnode->tn_spec.tn_lnk.tn_link = NULL;
			break;
		}
		nnode->tn_spec.tn_lnk.tn_link =
		    tmpfs_strname_alloc(tmp, nnode->tn_size);
		if (nnode->tn_spec.tn_lnk.tn_link == NULL) {
			tmpfs_node_put(tmp, nnode);
			return ENOSPC;
		}
		memcpy(nnode->tn_spec.tn_lnk.tn_link, target, nnode->tn_size);
		break;
	case VREG:
		/* Regular file.  Create an underlying UVM object. */
		uobj = uao_create(0, UAO_FLAG_CANFAIL);
		if (uobj == NULL) {
			tmpfs_node_put(tmp, nnode);
			return ENOSPC;
		}
		nnode->tn_spec.tn_reg.tn_aobj = uobj;
		nnode->tn_spec.tn_reg.tn_aobj_pages = 0;
		nnode->tn_spec.tn_reg.tn_aobj_pgptr = (vaddr_t)NULL;
		nnode->tn_spec.tn_reg.tn_aobj_pgnum = (voff_t)-1;
		break;
	default:
		KASSERT(0);
	}

	rw_init(&nnode->tn_nlock, "tvlk");

	rw_enter_write(&tmp->tm_lock);
	LIST_INSERT_HEAD(&tmp->tm_nodes, nnode, tn_entries);
	rw_exit_write(&tmp->tm_lock);

	*node = nnode;
	return 0;
}

/*
 * tmpfs_free_node: remove the inode from the mount point's list and
 * destroy its structures.
 */
void
tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
{
	size_t objsz;

	rw_enter_write(&tmp->tm_lock);
	LIST_REMOVE(node, tn_entries);
	rw_exit_write(&tmp->tm_lock);

	switch (node->tn_type) {
	case VLNK:
		if (node->tn_size > 0) {
			KASSERT(node->tn_size <= SIZE_MAX);
			tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
			    node->tn_size);
		}
		break;
	case VREG:
		/*
		 * Calculate the size of inode data, decrease the used-memory
		 * counter, and destroy the underlying UVM object (if any).
		 */
		objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
		if (objsz != 0) {
			tmpfs_mem_decr(tmp, objsz);
		}
		if (node->tn_spec.tn_reg.tn_aobj != NULL) {
			uao_detach(node->tn_spec.tn_reg.tn_aobj);
			node->tn_spec.tn_reg.tn_aobj = NULL;
		}
		break;
	case VDIR:
		KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
		    node == tmp->tm_root);
		break;
	default:
		break;
	}

	rw_enter_write(&tmp->tm_acc_lock);
	if (node->tn_id == tmp->tm_highest_inode)
		--tmp->tm_highest_inode;
	rw_exit_write(&tmp->tm_acc_lock);

	/* mutex_destroy(&node->tn_nlock); */
	tmpfs_node_put(tmp, node);
}

/*
 * tmpfs_vnode_get: allocate or reclaim a vnode for a specified inode.
 *
 * => Must be called with tmpfs_node_t::tn_nlock held.
 * => Returns vnode (*vpp) locked.
 */
int
tmpfs_vnode_get(struct mount *mp, tmpfs_node_t *node, struct vnode **vpp)
{
	struct vnode *vp, *nvp;
	/* kmutex_t *slock; */
	int error;
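	/*
	 * The RECLAIMING bit marks the inode while a vget() attempt is in
	 * progress with tn_nlock dropped.  If vget() fails with ENOENT,
	 * the vnode was reclaimed concurrently, so retake the lock and
	 * retry (presumably tmpfs_reclaim() consults this bit; see the
	 * TMPFS_NODE_RECLAIMING() check below).
	 */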
again:
	/* If there is already a vnode, try to reclaim it. */
	if ((vp = node->tn_vnode) != NULL) {
		/* atomic_or_ulong(&node->tn_gen, TMPFS_RECLAIMING_BIT); */
		node->tn_gen |= TMPFS_RECLAIMING_BIT;
		rw_exit_write(&node->tn_nlock);
		error = vget(vp, LK_EXCLUSIVE);
		if (error == ENOENT) {
			rw_enter_write(&node->tn_nlock);
			goto again;
		}
		/* atomic_and_ulong(&node->tn_gen, ~TMPFS_RECLAIMING_BIT); */
		node->tn_gen &= ~TMPFS_RECLAIMING_BIT;
		*vpp = vp;
		return error;
	}
	if (TMPFS_NODE_RECLAIMING(node)) {
		/* atomic_and_ulong(&node->tn_gen, ~TMPFS_RECLAIMING_BIT); */
		node->tn_gen &= ~TMPFS_RECLAIMING_BIT;
	}

	/*
	 * Get a new vnode and associate it with our inode.  Share the
	 * lock with underlying UVM object, if there is one (VREG case).
	 */
#if 0
	if (node->tn_type == VREG) {
		struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
		slock = uobj->vmobjlock;
	} else {
		slock = NULL;
	}
#endif
	error = getnewvnode(VT_TMPFS, mp, &tmpfs_vops, &vp);
	if (error) {
		rw_exit_write(&node->tn_nlock);
		return error;
	}

	rrw_init_flags(&node->tn_vlock, "tnode", RWL_DUPOK | RWL_IS_VNODE);
	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
	case VCHR:
		vp->v_op = &tmpfs_specvops;
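		/*
		 * checkalias() may return a pre-existing vnode for this
		 * device; if so, hand our inode over to it and dispose of
		 * the vnode we just allocated.
		 */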
		if ((nvp = checkalias(vp, node->tn_spec.tn_dev.tn_rdev, mp))) {
			nvp->v_data = vp->v_data;
			vp->v_data = NULL;
			vp->v_op = &spec_vops;
			vrele(vp);
			vgone(vp);
			vp = nvp;
			node->tn_vnode = vp;
		}
		break;
	case VDIR:
		vp->v_flag |= node->tn_spec.tn_dir.tn_parent == node ?
		    VROOT : 0;
		break;
#ifdef FIFO
	case VFIFO:
		vp->v_op = &tmpfs_fifovops;
		break;
#endif
	case VLNK:
	case VREG:
	case VSOCK:
		break;
	default:
		KASSERT(0);
	}

	uvm_vnp_setsize(vp, node->tn_size);
	vp->v_data = node;
	node->tn_vnode = vp;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	rw_exit_write(&node->tn_nlock);

	KASSERT(VOP_ISLOCKED(vp));
	*vpp = vp;
	return 0;
}

/*
 * tmpfs_alloc_file: allocate a new file of the specified type and add it
 * to the parent directory.
 *
 * => Credentials of the caller are used.
 */
int
tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
    struct componentname *cnp, char *target)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
	tmpfs_dirent_t *de;
	int error;

	KASSERT(VOP_ISLOCKED(dvp));
	*vpp = NULL;

	/* Check for the maximum number of links limit. */
	if (vap->va_type == VDIR) {
		if (dnode->tn_links == LINK_MAX) {
			error = EMLINK;
			goto out;
		}
		KASSERT(dnode->tn_links < LINK_MAX);
	}

	if (TMPFS_DIRSEQ_FULL(dnode)) {
		error = ENOSPC;
		goto out;
	}

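	/* A link count of zero means the parent directory was removed. */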
	if (dnode->tn_links == 0) {
		error = ENOENT;
		goto out;
	}

	/* Allocate a node that represents the new file. */
	error = tmpfs_alloc_node(tmp, vap->va_type, cnp->cn_cred->cr_uid,
	    dnode->tn_gid, vap->va_mode, target, vap->va_rdev, &node);
	if (error)
		goto out;

	/* Allocate a directory entry that points to the new file. */
	error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
	if (error) {
		tmpfs_free_node(tmp, node);
		goto out;
	}

	/* Get a vnode for the new file. */
	rw_enter_write(&node->tn_nlock);
	error = tmpfs_vnode_get(dvp->v_mount, node, vpp);
	if (error) {
		tmpfs_free_dirent(tmp, de);
		tmpfs_free_node(tmp, node);
		goto out;
	}

	/* Associate inode and attach the entry into the directory. */
	tmpfs_dir_attach(dnode, de, node);

out:
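	/*
	 * On success, release the pathname buffer unless the caller asked
	 * to keep it (SAVESTART).
	 */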
	if (error == 0 && (cnp->cn_flags & SAVESTART) == 0)
		pool_put(&namei_pool, cnp->cn_pnbuf);
	return error;
}

/*
 * tmpfs_alloc_dirent: allocate a new directory entry for the inode.
 * The directory entry contains a path name component.
 */
int
tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
    tmpfs_dirent_t **de)
{
	tmpfs_dirent_t *nde;

	nde = tmpfs_dirent_get(tmp);
	if (nde == NULL)
		return ENOSPC;

	nde->td_name = tmpfs_strname_alloc(tmp, len);
	if (nde->td_name == NULL) {
		tmpfs_dirent_put(tmp, nde);
		return ENOSPC;
	}
	nde->td_namelen = len;
	memcpy(nde->td_name, name, len);
	nde->td_seq = TMPFS_DIRSEQ_NONE;

	*de = nde;
	return 0;
}

/*
 * tmpfs_free_dirent: free a directory entry.
 */
void
tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
{

	KASSERT(de->td_node == NULL);
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
	tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
	tmpfs_dirent_put(tmp, de);
}

/*
 * tmpfs_dir_attach: associate the directory entry with the specified inode,
 * and attach the entry to the directory specified by the vnode.
 *
 * => Increases link count on the associated node.
 * => Increases link count on directory node, if our node is VDIR.
 *    It is the caller's responsibility to check for the LINK_MAX limit.
 * => Triggers kqueue events here.
 */
void
tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
{
	struct vnode *dvp = dnode->tn_vnode;
	int events = NOTE_WRITE;

	KASSERT(dvp != NULL);
	KASSERT(VOP_ISLOCKED(dvp));

	/* Get a new sequence number. */
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
	de->td_seq = tmpfs_dir_getseq(dnode, de);

	/* Associate directory entry and the inode. */
	de->td_node = node;
	KASSERT(node->tn_links < LINK_MAX);
	node->tn_links++;

	/* Save the hint (might overwrite). */
	node->tn_dirent_hint = de;

	/* Insert the entry to the directory (parent of inode). */
	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
	dnode->tn_size += sizeof(tmpfs_dirent_t);
	tmpfs_update(dnode, TMPFS_NODE_STATUSALL);
	uvm_vnp_setsize(dvp, dnode->tn_size);

	if (node->tn_type == VDIR) {
		/* Set parent. */
		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
		node->tn_spec.tn_dir.tn_parent = dnode;

		/* Increase the link count of parent. */
		KASSERT(dnode->tn_links < LINK_MAX);
		dnode->tn_links++;
		events |= NOTE_LINK;

		TMPFS_VALIDATE_DIR(node);
	}
	VN_KNOTE(dvp, events);
}

/*
 * tmpfs_dir_detach: disassociate the directory entry from its inode,
 * and detach the entry from the directory specified by the vnode.
 *
 * => Decreases link count on the associated node.
 * => Decreases the link count on directory node, if our node is VDIR.
 * => Triggers kqueue events here.
 */
void
tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	tmpfs_node_t *node = de->td_node;
	struct vnode *vp, *dvp = dnode->tn_vnode;
	int events = NOTE_WRITE;

	KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));

	/* Disassociate the inode and entry. */
	de->td_node = NULL;
	node->tn_dirent_hint = NULL;

	KASSERT(node->tn_links > 0);
	node->tn_links--;
	if ((vp = node->tn_vnode) != NULL) {
		KASSERT(VOP_ISLOCKED(vp));
		VN_KNOTE(vp, node->tn_links ? NOTE_LINK : NOTE_DELETE);
	}

	/* If a directory, decrease the link count of its parent. */
	if (node->tn_type == VDIR) {
		KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
		node->tn_spec.tn_dir.tn_parent = NULL;

		KASSERT(dnode->tn_links > 0);
		dnode->tn_links--;
		events |= NOTE_LINK;
	}

	/* Remove the entry from the directory. */
	if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
		dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
	}
	TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);

	dnode->tn_size -= sizeof(tmpfs_dirent_t);
	tmpfs_update(dnode, TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED);
	tmpfs_dir_putseq(dnode, de);
	if (dvp) {
		tmpfs_update(dnode, 0);
		uvm_vnp_setsize(dvp, dnode->tn_size);
		VN_KNOTE(dvp, events);
	}
}

/*
 * tmpfs_dir_lookup: find a directory entry in the specified inode.
 *
 * Note that the . and .. components are not allowed as they do not
 * physically exist within directories.
 */
tmpfs_dirent_t *
tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
{
	const char *name = cnp->cn_nameptr;
	const uint16_t nlen = cnp->cn_namelen;
	tmpfs_dirent_t *de;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	KASSERT(nlen != 1 || !(name[0] == '.'));
	KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
	TMPFS_VALIDATE_DIR(node);

	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		if (de->td_namelen != nlen)
			continue;
		if (memcmp(de->td_name, name, nlen) != 0)
			continue;
		break;
	}
	tmpfs_update(node, TMPFS_NODE_ACCESSED);
	return de;
}

/*
 * tmpfs_dir_cached: get a cached directory entry if it is valid.  Used to
 * avoid an unnecessary tmpfs_dir_lookup().
 *
 * => The vnode must be locked.
 */
tmpfs_dirent_t *
tmpfs_dir_cached(tmpfs_node_t *node)
{
	tmpfs_dirent_t *de = node->tn_dirent_hint;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));

	if (de == NULL) {
		return NULL;
	}
	KASSERT(de->td_node == node);

	/*
	 * Directories always have a valid hint.  For files, check if there
	 * are any hard links.  If there are, the hint might be invalid.
	 */
	return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
}

/*
 * tmpfs_dir_getseq: get a per-directory sequence number for the entry.
 */
uint64_t
tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	uint64_t seq = de->td_seq;

	TMPFS_VALIDATE_DIR(dnode);

	if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
		/* Already set. */
		KASSERT(seq >= TMPFS_DIRSEQ_START);
		return seq;
	}

	/*
	 * The "." and ".." and the end-of-directory have reserved numbers.
	 * The other sequence numbers are allocated incrementally.
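	 * ("." is TMPFS_DIRSEQ_DOT, ".." is TMPFS_DIRSEQ_DOTDOT and
	 * end-of-directory is TMPFS_DIRSEQ_EOF; regular entries start at
	 * TMPFS_DIRSEQ_START.)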
	 */

	seq = dnode->tn_spec.tn_dir.tn_next_seq;
	KASSERT(seq >= TMPFS_DIRSEQ_START);
	KASSERT(seq < TMPFS_DIRSEQ_END);
	dnode->tn_spec.tn_dir.tn_next_seq++;
	return seq;
}

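/*
 * tmpfs_dir_putseq: release the sequence number of a detached entry;
 * reset or roll back the per-directory counter where possible.
 */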
void
tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	uint64_t seq = de->td_seq;

	TMPFS_VALIDATE_DIR(dnode);
	KASSERT(seq == TMPFS_DIRSEQ_NONE || seq >= TMPFS_DIRSEQ_START);
	KASSERT(seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END);

	de->td_seq = TMPFS_DIRSEQ_NONE;

	/* Empty?  We can reset. */
	if (dnode->tn_size == 0) {
		dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
	} else if (seq != TMPFS_DIRSEQ_NONE &&
	    seq == dnode->tn_spec.tn_dir.tn_next_seq - 1) {
		dnode->tn_spec.tn_dir.tn_next_seq--;
	}
}

/*
 * tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
 */
tmpfs_dirent_t *
tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
{
	tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;

	TMPFS_VALIDATE_DIR(node);

	/*
	 * First, check the cache.  If it does not match, perform a lookup.
	 */
	if (de && de->td_seq == seq) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		return de;
	}

	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		if (de->td_seq == seq)
			return de;
	}
	return NULL;
}

/*
 * tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
 * dot meta entries, that is, "." or "..".  Copies the entry to the
 * UIO space.
 */
int
tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
{
	tmpfs_dirent_t *de;
	off_t next = 0;
	int error;

	switch (uio->uio_offset) {
	case TMPFS_DIRSEQ_DOT:
		dp->d_fileno = node->tn_id;
		strlcpy(dp->d_name, ".", sizeof(dp->d_name));
		next = TMPFS_DIRSEQ_DOTDOT;
		break;
	case TMPFS_DIRSEQ_DOTDOT:
		dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
		strlcpy(dp->d_name, "..", sizeof(dp->d_name));
		de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
		next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
		break;
	default:
		KASSERT(false);
	}
	dp->d_type = DT_DIR;
	dp->d_namlen = strlen(dp->d_name);
	dp->d_reclen = DIRENT_SIZE(dp);
	dp->d_off = next;

	if (dp->d_reclen > uio->uio_resid) {
		return EJUSTRETURN;
	}

	if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
		return error;
	}

	uio->uio_offset = next;
	return error;
}

/*
 * tmpfs_dir_getdents: helper function for tmpfs_readdir.
 *
 * => Returns as many directory entries as fit in the uio space.
 * => The read starts at uio->uio_offset.
 */
int
tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio)
{
	tmpfs_dirent_t *de, *next_de;
	struct dirent dent;
	int error = 0;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	TMPFS_VALIDATE_DIR(node);
	memset(&dent, 0, sizeof(dent));

	if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
			goto done;
		}
	}
	if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
			goto done;
		}
	}
	/* Done if we reached the end. */
	if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
		goto done;
	}

	/* Locate the directory entry at the given sequence number. */
	de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
	if (de == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Read as many entries as possible; i.e., until we reach the end
	 * of the directory or we exhaust UIO space.
	 */
	do {
		dent.d_fileno = de->td_node->tn_id;
		switch (de->td_node->tn_type) {
		case VBLK:
			dent.d_type = DT_BLK;
			break;
		case VCHR:
			dent.d_type = DT_CHR;
			break;
		case VDIR:
			dent.d_type = DT_DIR;
			break;
		case VFIFO:
			dent.d_type = DT_FIFO;
			break;
		case VLNK:
			dent.d_type = DT_LNK;
			break;
		case VREG:
			dent.d_type = DT_REG;
			break;
		case VSOCK:
			dent.d_type = DT_SOCK;
			break;
		default:
			KASSERT(0);
		}
		dent.d_namlen = de->td_namelen;
		KASSERT(de->td_namelen < sizeof(dent.d_name));
		memcpy(dent.d_name, de->td_name, de->td_namelen);
		dent.d_name[de->td_namelen] = '\0';
		dent.d_reclen = DIRENT_SIZE(&dent);

		next_de = TAILQ_NEXT(de, td_entries);
		if (next_de == NULL)
			dent.d_off = TMPFS_DIRSEQ_EOF;
		else
			dent.d_off = tmpfs_dir_getseq(node, next_de);

		if (dent.d_reclen > uio->uio_resid) {
			/* Exhausted UIO space. */
			error = EJUSTRETURN;
			break;
		}

		/* Copy out the directory entry and continue. */
		error = uiomove(&dent, dent.d_reclen, uio);
		if (error) {
			break;
		}
		de = TAILQ_NEXT(de, td_entries);

	} while (uio->uio_resid > 0 && de);

	/* Cache the last entry or clear and mark EOF. */
	uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
	node->tn_spec.tn_dir.tn_readdir_lastp = de;
done:
	tmpfs_update(node, TMPFS_NODE_ACCESSED);

	if (error == EJUSTRETURN) {
		/* Exhausted UIO space - just return. */
		error = 0;
	}
	KASSERT(error >= 0);
	return error;
}

/*
 * tmpfs_reg_resize: resize the underlying UVM object associated with the
 * specified regular file.
 */
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
	size_t newpages, oldpages, bytes;
	off_t oldsize;
	vaddr_t pgoff;
	int error;

	KASSERT(vp->v_type == VREG);
	KASSERT(newsize >= 0);

	oldsize = node->tn_size;
	oldpages = round_page(oldsize) >> PAGE_SHIFT;
	newpages = round_page(newsize) >> PAGE_SHIFT;
	KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);

	if (newpages > oldpages) {
		/* Increase the used-memory counter if getting extra pages. */
		bytes = (newpages - oldpages) << PAGE_SHIFT;
		if (tmpfs_mem_incr(tmp, bytes) == 0)
			return ENOSPC;
		if (uao_grow(uobj, newpages) != 0) {
			tmpfs_mem_decr(tmp, bytes);
			return ENOSPC;
		}
	}

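	/*
	 * No failure returns past this point: commit the new size first;
	 * errors while shrinking or zeroing below are treated as fatal.
	 */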
	node->tn_spec.tn_reg.tn_aobj_pages = newpages;
	node->tn_size = newsize;
	uvm_vnp_setsize(vp, newsize);
	uvm_vnp_uncache(vp);

	/*
	 * Free "backing store".
	 */
	if (newpages < oldpages) {
		if (tmpfs_uio_cached(node))
			tmpfs_uio_uncache(node);
		if (uao_shrink(uobj, newpages))
			panic("shrink failed");
		/* Decrease the used-memory counter. */
		tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
	}
	if (newsize > oldsize) {
		if (tmpfs_uio_cached(node))
			tmpfs_uio_uncache(node);
		pgoff = oldsize & PAGE_MASK;
		if (pgoff != 0) {
			/*
			 * Growing from an offset which is not at a page
			 * boundary; zero out unused bytes in current page.
			 */
			error = tmpfs_zeropg(node, trunc_page(oldsize), pgoff);
			if (error)
				panic("tmpfs_zeropg: error %d", error);
		}
		VN_KNOTE(vp, NOTE_EXTEND);
	}
	return 0;
}

/*
 * tmpfs_chflags: change flags of the given vnode.
 */
int
tmpfs_chflags(struct vnode *vp, int flags, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	if (cred->cr_uid != node->tn_uid && (error = suser_ucred(cred)))
		return error;

	if (cred->cr_uid == 0) {
		if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND) &&
		    securelevel > 0)
			return EPERM;
		node->tn_flags = flags;
	} else {
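		/*
		 * Unprivileged users may change only the user flags and
		 * must leave any superuser flags intact.
		 */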
		if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND) ||
		    (flags & UF_SETTABLE) != flags)
			return EPERM;
		node->tn_flags &= SF_SETTABLE;
		node->tn_flags |= (flags & UF_SETTABLE);
	}

	tmpfs_update(node, TMPFS_NODE_CHANGED);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chmod: change access mode on the given vnode.
 */
int
tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	if (cred->cr_uid != node->tn_uid && (error = suser_ucred(cred)))
		return error;
	if (cred->cr_uid != 0) {
		if (vp->v_type != VDIR && (mode & S_ISTXT))
			return EFTYPE;
		if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID))
			return EPERM;
	}

	node->tn_mode = (mode & ALLPERMS);
	tmpfs_update(node, TMPFS_NODE_CHANGED);
	if ((vp->v_flag & VTEXT) && (node->tn_mode & S_ISTXT) == 0)
		uvm_vnp_uncache(vp);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chown: change ownership of the given vnode.
 *
 * => At least one of uid or gid must be different from VNOVAL.
 * => The attribute is unchanged for the VNOVAL case.
 */
int
tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Assign default values if they are unknown. */
	KASSERT(uid != VNOVAL || gid != VNOVAL);
	if (uid == VNOVAL) {
		uid = node->tn_uid;
	}
	if (gid == VNOVAL) {
		gid = node->tn_gid;
	}

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	if ((cred->cr_uid != node->tn_uid || uid != node->tn_uid ||
	    (gid != node->tn_gid && !groupmember(gid, cred))) &&
	    (error = suser_ucred(cred)))
		return error;

	node->tn_uid = uid;
	node->tn_gid = gid;
	tmpfs_update(node, TMPFS_NODE_CHANGED);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chsize: change size of the given vnode.
 */
int
tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);

	KASSERT(VOP_ISLOCKED(vp));

	/* Decide whether this is a valid operation based on the file type. */
	switch (vp->v_type) {
	case VDIR:
		return EISDIR;
	case VREG:
		if (vp->v_mount->mnt_flag & MNT_RDONLY) {
			return EROFS;
		}
		break;
	case VBLK:
	case VCHR:
	case VFIFO:
		/*
		 * Allow modifications of special files even if the file
		 * system is mounted read-only (we are not modifying the
		 * files themselves, but the objects they represent).
		 */
		return 0;
	default:
		return EOPNOTSUPP;
	}

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND)) {
		return EPERM;
	}

	/* Note: tmpfs_truncate() will raise NOTE_EXTEND and NOTE_ATTRIB. */
	return tmpfs_truncate(vp, size);
}

/*
 * tmpfs_chtimes: change access and modification times for the given vnode.
 */
int
tmpfs_chtimes(struct vnode *vp, const struct timespec *atime,
    const struct timespec *mtime, int vaflags, struct ucred *cred,
    struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	if (cred->cr_uid != node->tn_uid && (error = suser_ucred(cred)) &&
	    ((vaflags & VA_UTIMES_NULL) == 0 ||
	    (error = VOP_ACCESS(vp, VWRITE, cred, p))))
		return error;

	if (atime->tv_nsec != VNOVAL)
		node->tn_atime = *atime;

	if (mtime->tv_nsec != VNOVAL)
		node->tn_mtime = *mtime;

	if (mtime->tv_nsec != VNOVAL || (vaflags & VA_UTIMES_CHANGE))
		tmpfs_update(VP_TO_TMPFS_NODE(vp), TMPFS_NODE_CHANGED);

	VN_KNOTE(vp, NOTE_ATTRIB);

	return 0;
}

/*
 * tmpfs_update: update timestamps, et al.
 */
void
tmpfs_update(tmpfs_node_t *node, int flags)
{
	struct timespec nowtm;

	nanotime(&nowtm);

	if (flags & TMPFS_NODE_ACCESSED) {
		node->tn_atime = nowtm;
	}
	if (flags & TMPFS_NODE_MODIFIED) {
		node->tn_mtime = nowtm;
	}
	if (flags & TMPFS_NODE_CHANGED) {
		node->tn_ctime = nowtm;
	}
}

int
tmpfs_truncate(struct vnode *vp, off_t length)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	if (length < 0) {
		error = EINVAL;
		goto out;
	}
	if (node->tn_size == length) {
		error = 0;
		goto out;
	}
	error = tmpfs_reg_resize(vp, length);
	if (error == 0) {
		tmpfs_update(node, TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED);
	}
out:
	return error;
}

int
tmpfs_uio_cached(tmpfs_node_t *node)
{
	int pgnum_valid = (node->tn_pgnum != (voff_t)-1);
	int pgptr_valid = (node->tn_pgptr != (vaddr_t)NULL);

	KASSERT(pgnum_valid == pgptr_valid);
	return pgnum_valid && pgptr_valid;
}

vaddr_t
tmpfs_uio_lookup(tmpfs_node_t *node, voff_t pgnum)
{
	if (tmpfs_uio_cached(node) == 1 && node->tn_pgnum == pgnum)
		return node->tn_pgptr;

	return (vaddr_t)NULL;
}

void
tmpfs_uio_uncache(tmpfs_node_t *node)
{
	KASSERT(node->tn_pgnum != (voff_t)-1);
	KASSERT(node->tn_pgptr != (vaddr_t)NULL);
	uvm_unmap(kernel_map, node->tn_pgptr, node->tn_pgptr + PAGE_SIZE);
	node->tn_pgnum = (voff_t)-1;
	node->tn_pgptr = (vaddr_t)NULL;
}

void
tmpfs_uio_cache(tmpfs_node_t *node, voff_t pgnum, vaddr_t pgptr)
{
	KASSERT(node->tn_pgnum == (voff_t)-1);
	KASSERT(node->tn_pgptr == (vaddr_t)NULL);
	node->tn_pgnum = pgnum;
	node->tn_pgptr = pgptr;
}

/*
 * Be gentle to kernel_map, don't allow more than 4MB in a single transaction.
 */
#define TMPFS_UIO_MAXBYTES	((1 << 22) - PAGE_SIZE)

int
tmpfs_uiomove(tmpfs_node_t *node, struct uio *uio, vsize_t len)
{
	vaddr_t va, pgoff;
	int error, adv;
	voff_t pgnum;
	vsize_t sz;

	pgnum = trunc_page(uio->uio_offset);
	pgoff = uio->uio_offset & PAGE_MASK;

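	/*
	 * Fast path: the transfer fits entirely within a single page that
	 * is already mapped and cached.
	 */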
	if (pgoff + len < PAGE_SIZE) {
		va = tmpfs_uio_lookup(node, pgnum);
		if (va != (vaddr_t)NULL)
			return uiomove((void *)va + pgoff, len, uio);
	}

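	/*
	 * Slow path: map the relevant window of the backing aobj into
	 * kernel_map and copy through the mapping, caching a single-page
	 * mapping for subsequent transfers where possible.
	 */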
	if (len >= TMPFS_UIO_MAXBYTES) {
		sz = TMPFS_UIO_MAXBYTES;
		adv = MADV_NORMAL;
	} else {
		sz = len;
		adv = MADV_SEQUENTIAL;
	}

	if (tmpfs_uio_cached(node))
		tmpfs_uio_uncache(node);

	uao_reference(node->tn_uobj);

	error = uvm_map(kernel_map, &va, round_page(pgoff + sz), node->tn_uobj,
	    trunc_page(uio->uio_offset), 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE,
	    PROT_READ | PROT_WRITE, MAP_INHERIT_NONE, adv, 0));
	if (error) {
		uao_detach(node->tn_uobj); /* Drop reference. */
		return error;
	}

	error = uiomove((void *)va + pgoff, sz, uio);
	if (error == 0 && pgoff + sz < PAGE_SIZE)
		tmpfs_uio_cache(node, pgnum, va);
	else
		uvm_unmap(kernel_map, va, va + round_page(pgoff + sz));

	return error;
}

int
tmpfs_zeropg(tmpfs_node_t *node, voff_t pgnum, vaddr_t pgoff)
{
	vaddr_t va;
	int error;

	KASSERT(tmpfs_uio_cached(node) == 0);

	uao_reference(node->tn_uobj);

	error = uvm_map(kernel_map, &va, PAGE_SIZE, node->tn_uobj, pgnum, 0,
	    UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
	    MAP_INHERIT_NONE, MADV_NORMAL, 0));
	if (error) {
		uao_detach(node->tn_uobj); /* Drop reference. */
		return error;
	}

	bzero((void *)va + pgoff, PAGE_SIZE - pgoff);
	uvm_unmap(kernel_map, va, va + PAGE_SIZE);

	return 0;
}