/*	$OpenBSD: tmpfs_subr.c,v 1.7 2014/11/16 12:31:00 deraadt Exp $	*/
/*	$NetBSD: tmpfs_subr.c,v 1.79 2012/03/13 18:40:50 elad Exp $	*/

/*
 * Copyright (c) 2005-2011 The NetBSD Foundation, Inc.
 * Copyright (c) 2013 Pedro Martelletto
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program, and by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Efficient memory file system: interfaces for inode and directory entry
 * construction, destruction and manipulation.
 *
 * Reference counting
 *
 *	The link count of an inode (tmpfs_node_t::tn_links) is used as a
 *	reference counter, although with slightly different semantics.
 *
 *	For directories, the link count represents the directory entries
 *	which refer to the directory: that is, the count of sub-directories,
 *	plus the virtual '.' entry (which has no real entry in the list).
 *	For files, the link count represents the hard links.  Since only
 *	empty directories can be removed, the link count satisfies the
 *	reference counting requirements well enough.  Note: to check whether
 *	a directory is empty, the inode size (tmpfs_node_t::tn_size) can be
 *	used.
 *
 *	The inode itself, as an object, gains its first reference when a
 *	directory entry is attached via tmpfs_dir_attach(9).  For instance,
 *	after a regular tmpfs_create(), a file would have a link count of 1,
 *	while a directory after tmpfs_mkdir() would have 2 (due to '.').
 *
 * Reclamation
 *
 *	tmpfs inodes rely on a combination of vnode reference counting and
 *	link counting.  That is, an inode can only be destroyed if its
 *	associated vnode is inactive.  The destruction is done on vnode
 *	reclamation, i.e. in tmpfs_reclaim().  Note that a link count
 *	(tmpfs_node_t::tn_links) of 0 is a destruction criterion.
 *
 *	If an inode has references within the file system (tn_links > 0) and
 *	its inactive vnode gets reclaimed/recycled, then the association is
 *	broken in tmpfs_reclaim().  In that case, the inode will always pass
 *	tmpfs_lookup() and thus tmpfs_vnode_get() to associate a new vnode.
 *
 * Lock order
 *
 *	tmpfs_node_t::tn_nlock ->
 *		struct vnode::v_vlock ->
 *			struct vnode::v_interlock
 */
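
/*
 * Illustrative sketch, not compiled in: the link counts one would expect
 * under the scheme described above.  The helper below is hypothetical and
 * exists only to carry the assertions.
 */
#if 0
static void
tmpfs_linkcount_example(tmpfs_node_t *file, tmpfs_node_t *dir)
{
	/* After a regular tmpfs_create(): one entry refers to the file. */
	KASSERT(file->tn_type == VREG && file->tn_links == 1);

	/* After tmpfs_mkdir(): the parent's entry plus the virtual '.'. */
	KASSERT(dir->tn_type == VDIR && dir->tn_links == 2);

	/* An empty directory has no entries in its list, hence zero size. */
	KASSERT(dir->tn_size == 0);
}
#endif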

#if 0
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.79 2012/03/13 18:40:50 elad Exp $");
#endif

#include <sys/param.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>

#include <uvm/uvm_aobj.h>

#include <dev/rndvar.h>

#include <tmpfs/tmpfs.h>
#include <tmpfs/tmpfs_vnops.h>


/* Local functions. */
void	tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
int	tmpfs_dir_getdotents(tmpfs_node_t *, struct dirent *, struct uio *);

/*
 * tmpfs_alloc_node: allocate a new inode of a specified type and
 * insert it into the list of the specified mount point.
 */
int
tmpfs_alloc_node(tmpfs_mount_t *tmp, enum vtype type, uid_t uid, gid_t gid,
    mode_t mode, char *target, dev_t rdev, tmpfs_node_t **node)
{
	tmpfs_node_t *nnode;
	struct uvm_object *uobj;

	nnode = tmpfs_node_get(tmp);
	if (nnode == NULL) {
		return ENOSPC;
	}

	/* Initially, no references and no associations. */
	nnode->tn_links = 0;
	nnode->tn_vnode = NULL;
	nnode->tn_dirent_hint = NULL;

	rw_enter_write(&tmp->tm_acc_lock);
	nnode->tn_id = ++tmp->tm_highest_inode;
	if (nnode->tn_id == 0) {
		--tmp->tm_highest_inode;
		rw_exit_write(&tmp->tm_acc_lock);
		tmpfs_node_put(tmp, nnode);
		return ENOSPC;
	}
	rw_exit_write(&tmp->tm_acc_lock);

	/* Generic initialization. */
	nnode->tn_type = type;
	nnode->tn_size = 0;
	nnode->tn_flags = 0;
	nnode->tn_lockf = NULL;
	nnode->tn_gen = TMPFS_NODE_GEN_MASK & arc4random();

	nanotime(&nnode->tn_atime);
	nnode->tn_birthtime = nnode->tn_atime;
	nnode->tn_ctime = nnode->tn_atime;
	nnode->tn_mtime = nnode->tn_atime;

	/* XXX pedro: we should check for UID_MAX and GID_MAX instead. */
	KASSERT(uid != VNOVAL && gid != VNOVAL && mode != VNOVAL);

	nnode->tn_uid = uid;
	nnode->tn_gid = gid;
	nnode->tn_mode = mode;

	/* Type-specific initialization. */
	switch (nnode->tn_type) {
	case VBLK:
	case VCHR:
		/* Character/block special device. */
		KASSERT(rdev != VNOVAL);
		nnode->tn_spec.tn_dev.tn_rdev = rdev;
		break;
	case VDIR:
		/* Directory. */
		TAILQ_INIT(&nnode->tn_spec.tn_dir.tn_dir);
		nnode->tn_spec.tn_dir.tn_parent = NULL;
		nnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
		nnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;

		/* Extra link count for the virtual '.' entry. */
		nnode->tn_links++;
		break;
	case VFIFO:
	case VSOCK:
		break;
	case VLNK:
		/* Symbolic link.  Target specifies the file name. */
		KASSERT(target && strlen(target) < MAXPATHLEN);

		nnode->tn_size = strlen(target);
		if (nnode->tn_size == 0) {
			nnode->tn_spec.tn_lnk.tn_link = NULL;
			break;
		}
		nnode->tn_spec.tn_lnk.tn_link =
		    tmpfs_strname_alloc(tmp, nnode->tn_size);
		if (nnode->tn_spec.tn_lnk.tn_link == NULL) {
			tmpfs_node_put(tmp, nnode);
			return ENOSPC;
		}
		memcpy(nnode->tn_spec.tn_lnk.tn_link, target, nnode->tn_size);
		break;
	case VREG:
		/* Regular file.  Create an underlying UVM object. */
		uobj = uao_create(0, UAO_FLAG_CANFAIL);
		if (uobj == NULL) {
			tmpfs_node_put(tmp, nnode);
			return ENOSPC;
		}
		nnode->tn_spec.tn_reg.tn_aobj = uobj;
		nnode->tn_spec.tn_reg.tn_aobj_pages = 0;
		nnode->tn_spec.tn_reg.tn_aobj_pgptr = (vaddr_t)NULL;
		nnode->tn_spec.tn_reg.tn_aobj_pgnum = (voff_t)-1;
		break;
	default:
		KASSERT(0);
	}

	rw_init(&nnode->tn_nlock, "tvlk");

	rw_enter_write(&tmp->tm_lock);
	LIST_INSERT_HEAD(&tmp->tm_nodes, nnode, tn_entries);
	rw_exit_write(&tmp->tm_lock);

	*node = nnode;
	return 0;
}
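
/*
 * Sketch of a typical call, not compiled in, assuming a regular file so
 * that the link target and device arguments are unused (NULL and VNOVAL);
 * compare the real caller in tmpfs_alloc_file() below.
 */
#if 0
	tmpfs_node_t *node;
	int error;

	error = tmpfs_alloc_node(tmp, VREG, uid, gid, S_IRUSR | S_IWUSR,
	    NULL, VNOVAL, &node);
	if (error)
		return error;	/* ENOSPC: pool or inode numbers exhausted */
#endif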

/*
 * tmpfs_free_node: remove the inode from a list in the mount point and
 * destroy the inode structures.
 */
void
tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
{
	size_t objsz;

	rw_enter_write(&tmp->tm_lock);
	LIST_REMOVE(node, tn_entries);
	rw_exit_write(&tmp->tm_lock);

	switch (node->tn_type) {
	case VLNK:
		if (node->tn_size > 0) {
			KASSERT(node->tn_size <= SIZE_MAX);
			tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
			    node->tn_size);
		}
		break;
	case VREG:
		/*
		 * Calculate the size of inode data, decrease the used-memory
		 * counter, and destroy the underlying UVM object (if any).
		 */
		objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
		if (objsz != 0) {
			tmpfs_mem_decr(tmp, objsz);
		}
		if (node->tn_spec.tn_reg.tn_aobj != NULL) {
			uao_detach(node->tn_spec.tn_reg.tn_aobj);
			node->tn_spec.tn_reg.tn_aobj = NULL;
		}
		break;
	case VDIR:
		KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
		    node == tmp->tm_root);
		break;
	default:
		break;
	}

	rw_enter_write(&tmp->tm_acc_lock);
	if (node->tn_id == tmp->tm_highest_inode)
		--tmp->tm_highest_inode;
	rw_exit_write(&tmp->tm_acc_lock);

	/* mutex_destroy(&node->tn_nlock); */
	tmpfs_node_put(tmp, node);
}

/*
 * tmpfs_vnode_get: allocate or reclaim a vnode for a specified inode.
 *
 * => Must be called with tmpfs_node_t::tn_nlock held.
 * => Returns vnode (*vpp) locked.
 */
int
tmpfs_vnode_get(struct mount *mp, tmpfs_node_t *node, struct vnode **vpp)
{
	struct vnode *vp, *nvp;
	/* kmutex_t *slock; */
	int error;
again:
	/* If there is already a vnode, try to reclaim it. */
	if ((vp = node->tn_vnode) != NULL) {
		/* atomic_or_ulong(&node->tn_gen, TMPFS_RECLAIMING_BIT); */
		node->tn_gen |= TMPFS_RECLAIMING_BIT;
		rw_exit_write(&node->tn_nlock);
		error = vget(vp, LK_EXCLUSIVE, curproc);
		if (error == ENOENT) {
			rw_enter_write(&node->tn_nlock);
			goto again;
		}
		/* atomic_and_ulong(&node->tn_gen, ~TMPFS_RECLAIMING_BIT); */
		node->tn_gen &= ~TMPFS_RECLAIMING_BIT;
		*vpp = vp;
		return error;
	}
	if (TMPFS_NODE_RECLAIMING(node)) {
		/* atomic_and_ulong(&node->tn_gen, ~TMPFS_RECLAIMING_BIT); */
		node->tn_gen &= ~TMPFS_RECLAIMING_BIT;
	}

	/*
	 * Get a new vnode and associate it with our inode.  Share the
	 * lock with the underlying UVM object, if there is one (VREG case).
	 */
#if 0
	if (node->tn_type == VREG) {
		struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
		slock = uobj->vmobjlock;
	} else {
		slock = NULL;
	}
#endif
	error = getnewvnode(VT_TMPFS, mp, &tmpfs_vops, &vp);
	if (error) {
		rw_exit_write(&node->tn_nlock);
		return error;
	}

	lockinit(&node->tn_vlock, PINOD, "tnode", 0, 0);
	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
	case VCHR:
		vp->v_op = &tmpfs_specvops;
		if ((nvp = checkalias(vp, node->tn_spec.tn_dev.tn_rdev, mp))) {
			nvp->v_data = vp->v_data;
			vp->v_data = NULL;
			vp->v_op = &spec_vops;
			vrele(vp);
			vgone(vp);
			vp = nvp;
			node->tn_vnode = vp;
		}
		break;
	case VDIR:
		vp->v_flag |= node->tn_spec.tn_dir.tn_parent == node ?
		    VROOT : 0;
		break;
#ifdef FIFO
	case VFIFO:
		vp->v_op = &tmpfs_fifovops;
		break;
#endif
	case VLNK:
	case VREG:
	case VSOCK:
		break;
	default:
		KASSERT(0);
	}

	uvm_vnp_setsize(vp, node->tn_size);
	vp->v_data = node;
	node->tn_vnode = vp;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curproc);
	rw_exit_write(&node->tn_nlock);

	KASSERT(VOP_ISLOCKED(vp));
	*vpp = vp;
	return 0;
}
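
/*
 * Sketch of the locking contract above, not compiled in: the caller
 * enters tn_nlock and tmpfs_vnode_get() releases it on every path,
 * returning the vnode locked; compare tmpfs_alloc_file() below.
 */
#if 0
	rw_enter_write(&node->tn_nlock);
	error = tmpfs_vnode_get(mp, node, &vp);	/* drops tn_nlock */
	if (error)
		return error;
	/* ... use the exclusively-locked vnode, then vput() it ... */
#endif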

/*
 * tmpfs_alloc_file: allocate a new file of the specified type and add it
 * to the parent directory.
 *
 * => Credentials of the caller are used.
 */
int
tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
    struct componentname *cnp, char *target)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
	tmpfs_dirent_t *de;
	int error;

	KASSERT(VOP_ISLOCKED(dvp));
	*vpp = NULL;

	/* Check for the maximum number of links. */
	if (vap->va_type == VDIR) {
		if (dnode->tn_links == LINK_MAX) {
			error = EMLINK;
			goto out;
		}
		KASSERT(dnode->tn_links < LINK_MAX);
	}

	if (TMPFS_DIRSEQ_FULL(dnode)) {
		error = ENOSPC;
		goto out;
	}

	/* Allocate a node that represents the new file. */
	error = tmpfs_alloc_node(tmp, vap->va_type, cnp->cn_cred->cr_uid,
	    dnode->tn_gid, vap->va_mode, target, vap->va_rdev, &node);
	if (error)
		goto out;

	/* Allocate a directory entry that points to the new file. */
	error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
	if (error) {
		tmpfs_free_node(tmp, node);
		goto out;
	}

	/* Get a vnode for the new file. */
	rw_enter_write(&node->tn_nlock);
	error = tmpfs_vnode_get(dvp->v_mount, node, vpp);
	if (error) {
		tmpfs_free_dirent(tmp, de);
		tmpfs_free_node(tmp, node);
		goto out;
	}

	/* Associate inode and attach the entry into the directory. */
	tmpfs_dir_attach(dnode, de, node);

out:
	if (error == 0 && (cnp->cn_flags & SAVESTART) == 0)
		pool_put(&namei_pool, cnp->cn_pnbuf);
	vput(dvp);
	return error;
}

/*
 * tmpfs_alloc_dirent: allocates a new directory entry for the inode.
 * The directory entry contains a path name component.
 */
int
tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
    tmpfs_dirent_t **de)
{
	tmpfs_dirent_t *nde;

	nde = tmpfs_dirent_get(tmp);
	if (nde == NULL)
		return ENOSPC;

	nde->td_name = tmpfs_strname_alloc(tmp, len);
	if (nde->td_name == NULL) {
		tmpfs_dirent_put(tmp, nde);
		return ENOSPC;
	}
	nde->td_namelen = len;
	memcpy(nde->td_name, name, len);
	nde->td_seq = TMPFS_DIRSEQ_NONE;

	*de = nde;
	return 0;
}

/*
 * tmpfs_free_dirent: free a directory entry.
 */
void
tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
{

	KASSERT(de->td_node == NULL);
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
	tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
	tmpfs_dirent_put(tmp, de);
}

/*
 * tmpfs_dir_attach: associate a directory entry with the specified inode,
 * and attach the entry to the directory specified by the vnode.
 *
 * => Increases link count on the associated node.
 * => Increases link count on directory node, if our node is VDIR.
 *    It is the caller's responsibility to check for the LINK_MAX limit.
 * => Triggers kqueue events here.
 */
void
tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
{
	struct vnode *dvp = dnode->tn_vnode;
	int events = NOTE_WRITE;

	KASSERT(dvp != NULL);
	KASSERT(VOP_ISLOCKED(dvp));

	/* Get a new sequence number. */
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
	de->td_seq = tmpfs_dir_getseq(dnode, de);

	/* Associate directory entry and the inode. */
	de->td_node = node;
	KASSERT(node->tn_links < LINK_MAX);
	node->tn_links++;

	/* Save the hint (might overwrite). */
	node->tn_dirent_hint = de;

	/* Insert the entry to the directory (parent of inode). */
	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
	dnode->tn_size += sizeof(tmpfs_dirent_t);
	tmpfs_update(dnode, TMPFS_NODE_STATUSALL);
	uvm_vnp_setsize(dvp, dnode->tn_size);

	if (node->tn_type == VDIR) {
		/* Set parent. */
		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
		node->tn_spec.tn_dir.tn_parent = dnode;

		/* Increase the link count of parent. */
		KASSERT(dnode->tn_links < LINK_MAX);
		dnode->tn_links++;
		events |= NOTE_LINK;

		TMPFS_VALIDATE_DIR(node);
	}
	VN_KNOTE(dvp, events);
}
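
/*
 * Sketch of the caller-side protocol noted above, not compiled in: the
 * LINK_MAX check happens before attaching, since tmpfs_dir_attach()
 * only asserts it.
 */
#if 0
	if (node->tn_type == VDIR && dnode->tn_links >= LINK_MAX)
		return EMLINK;
	tmpfs_dir_attach(dnode, de, node);	/* takes the link references */
#endif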

/*
 * tmpfs_dir_detach: disassociate a directory entry and its inode,
 * and detach the entry from the directory specified by the vnode.
 *
 * => Decreases link count on the associated node.
 * => Decreases the link count on directory node, if our node is VDIR.
 * => Triggers kqueue events here.
 */
void
tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	tmpfs_node_t *node = de->td_node;
	struct vnode *vp, *dvp = dnode->tn_vnode;
	int events = NOTE_WRITE;

	KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));

	/* Disassociate the inode and entry. */
	de->td_node = NULL;
	node->tn_dirent_hint = NULL;

	KASSERT(node->tn_links > 0);
	node->tn_links--;
	if ((vp = node->tn_vnode) != NULL) {
		KASSERT(VOP_ISLOCKED(vp));
		VN_KNOTE(vp, node->tn_links ? NOTE_LINK : NOTE_DELETE);
	}

	/* If a directory, decrease the link count of the parent. */
	if (node->tn_type == VDIR) {
		KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
		node->tn_spec.tn_dir.tn_parent = NULL;

		KASSERT(dnode->tn_links > 0);
		dnode->tn_links--;
		events |= NOTE_LINK;
	}

	/* Remove the entry from the directory. */
	if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
		dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
	}
	TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);

	dnode->tn_size -= sizeof(tmpfs_dirent_t);
	tmpfs_update(dnode, TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED);
	tmpfs_dir_putseq(dnode, de);
	if (dvp) {
		tmpfs_update(dnode, 0);
		uvm_vnp_setsize(dvp, dnode->tn_size);
		VN_KNOTE(dvp, events);
	}
}

/*
 * tmpfs_dir_lookup: find a directory entry in the specified inode.
 *
 * Note that the . and .. components are not allowed as they do not
 * physically exist within directories.
 */
tmpfs_dirent_t *
tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
{
	const char *name = cnp->cn_nameptr;
	const uint16_t nlen = cnp->cn_namelen;
	tmpfs_dirent_t *de;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	KASSERT(nlen != 1 || !(name[0] == '.'));
	KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
	TMPFS_VALIDATE_DIR(node);

	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		if (de->td_namelen != nlen)
			continue;
		if (memcmp(de->td_name, name, nlen) != 0)
			continue;
		break;
	}
	tmpfs_update(node, TMPFS_NODE_ACCESSED);
	return de;
}

/*
 * tmpfs_dir_cached: get a cached directory entry if it is valid.  Used to
 * avoid an unnecessary tmpfs_dir_lookup().
 *
 * => The vnode must be locked.
 */
tmpfs_dirent_t *
tmpfs_dir_cached(tmpfs_node_t *node)
{
	tmpfs_dirent_t *de = node->tn_dirent_hint;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));

	if (de == NULL) {
		return NULL;
	}
	KASSERT(de->td_node == node);

	/*
	 * Directories always have a valid hint.  For files, check if there
	 * are any hard links.  If there are, the hint might be invalid.
	 */
	return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
}

/*
 * tmpfs_dir_getseq: get a per-directory sequence number for the entry.
 */
uint64_t
tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	uint64_t seq = de->td_seq;

	TMPFS_VALIDATE_DIR(dnode);

	if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
		/* Already set. */
		KASSERT(seq >= TMPFS_DIRSEQ_START);
		return seq;
	}

	/*
	 * The "." and ".." entries and the end-of-directory marker have
	 * reserved numbers.  The other sequence numbers are allocated
	 * incrementally.
	 */

	seq = dnode->tn_spec.tn_dir.tn_next_seq;
	KASSERT(seq >= TMPFS_DIRSEQ_START);
	KASSERT(seq < TMPFS_DIRSEQ_END);
	dnode->tn_spec.tn_dir.tn_next_seq++;
	return seq;
}
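
/*
 * For illustration, a directory holding two entries would expose the
 * following readdir offsets:
 *
 *	TMPFS_DIRSEQ_DOT	"."  (virtual)
 *	TMPFS_DIRSEQ_DOTDOT	".." (virtual)
 *	TMPFS_DIRSEQ_START	first real entry
 *	TMPFS_DIRSEQ_START + 1	second real entry
 *	TMPFS_DIRSEQ_EOF	end-of-directory marker
 *
 * tmpfs_dir_putseq() below can only reclaim the most recently allocated
 * number (or reset once the directory is empty), so a long-lived
 * directory with churn slowly consumes the sequence space; hence the
 * TMPFS_DIRSEQ_FULL() check in tmpfs_alloc_file().
 */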

void
tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	uint64_t seq = de->td_seq;

	TMPFS_VALIDATE_DIR(dnode);
	KASSERT(seq == TMPFS_DIRSEQ_NONE || seq >= TMPFS_DIRSEQ_START);
	KASSERT(seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END);

	de->td_seq = TMPFS_DIRSEQ_NONE;

	/* Empty?  We can reset. */
	if (dnode->tn_size == 0) {
		dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
	} else if (seq != TMPFS_DIRSEQ_NONE &&
	    seq == dnode->tn_spec.tn_dir.tn_next_seq - 1) {
		dnode->tn_spec.tn_dir.tn_next_seq--;
	}
}

/*
 * tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
 */
tmpfs_dirent_t *
tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
{
	tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;

	TMPFS_VALIDATE_DIR(node);

	/*
	 * First, check the cache.  If it does not match, perform a
	 * full lookup.
	 */
	if (de && de->td_seq == seq) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		return de;
	}

	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		if (de->td_seq == seq)
			return de;
	}
	return NULL;
}

/*
 * tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
 * dot meta-entries, that is, "." or "..".  Copy them to the UIO space.
 */
int
tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
{
	tmpfs_dirent_t *de;
	off_t next = 0;
	int error;

	switch (uio->uio_offset) {
	case TMPFS_DIRSEQ_DOT:
		dp->d_fileno = node->tn_id;
		strlcpy(dp->d_name, ".", sizeof(dp->d_name));
		next = TMPFS_DIRSEQ_DOTDOT;
		break;
	case TMPFS_DIRSEQ_DOTDOT:
		dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
		strlcpy(dp->d_name, "..", sizeof(dp->d_name));
		de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
		next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
		break;
	default:
		KASSERT(false);
	}
	dp->d_type = DT_DIR;
	dp->d_namlen = strlen(dp->d_name);
	dp->d_reclen = DIRENT_SIZE(dp);
	dp->d_off = next;

	if (dp->d_reclen > uio->uio_resid) {
		return EJUSTRETURN;
	}

	if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
		return error;
	}

	uio->uio_offset = next;
	return error;
}

/*
 * tmpfs_dir_getdents: helper function for tmpfs_readdir.
 *
 * => Returns as many directory entries as can fit in the uio space.
 * => The read starts at uio->uio_offset.
 */
int
tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio)
{
	tmpfs_dirent_t *de, *next_de;
	struct dirent dent;
	int error = 0;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	TMPFS_VALIDATE_DIR(node);
	memset(&dent, 0, sizeof(dent));

	if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
			goto done;
		}
	}
	if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
			goto done;
		}
	}
	/* Done if we reached the end. */
	if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
		goto done;
	}

	/* Locate the directory entry at the given sequence number. */
	de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
	if (de == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Read as many entries as possible; i.e., until we reach the end
	 * of the directory or we exhaust UIO space.
	 */
	do {
		dent.d_fileno = de->td_node->tn_id;
		switch (de->td_node->tn_type) {
		case VBLK:
			dent.d_type = DT_BLK;
			break;
		case VCHR:
			dent.d_type = DT_CHR;
			break;
		case VDIR:
			dent.d_type = DT_DIR;
			break;
		case VFIFO:
			dent.d_type = DT_FIFO;
			break;
		case VLNK:
			dent.d_type = DT_LNK;
			break;
		case VREG:
			dent.d_type = DT_REG;
			break;
		case VSOCK:
			dent.d_type = DT_SOCK;
			break;
		default:
			KASSERT(0);
		}
		dent.d_namlen = de->td_namelen;
		KASSERT(de->td_namelen < sizeof(dent.d_name));
		memcpy(dent.d_name, de->td_name, de->td_namelen);
		dent.d_name[de->td_namelen] = '\0';
		dent.d_reclen = DIRENT_SIZE(&dent);

		next_de = TAILQ_NEXT(de, td_entries);
		if (next_de == NULL)
			dent.d_off = TMPFS_DIRSEQ_EOF;
		else
			dent.d_off = tmpfs_dir_getseq(node, next_de);

		if (dent.d_reclen > uio->uio_resid) {
			/* Exhausted UIO space. */
			error = EJUSTRETURN;
			break;
		}

		/* Copy out the directory entry and continue. */
		error = uiomove(&dent, dent.d_reclen, uio);
		if (error) {
			break;
		}
		de = TAILQ_NEXT(de, td_entries);

	} while (uio->uio_resid > 0 && de);

	/* Cache the last entry or clear and mark EOF. */
	uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
	node->tn_spec.tn_dir.tn_readdir_lastp = de;
done:
	tmpfs_update(node, TMPFS_NODE_ACCESSED);

	if (error == EJUSTRETURN) {
		/* Exhausted UIO space - just return. */
		error = 0;
	}
	KASSERT(error >= 0);
	return error;
}

/*
 * tmpfs_reg_resize: resize the underlying UVM object associated with the
 * specified regular file.
 */
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
	size_t newpages, oldpages, bytes;
	off_t oldsize;
	vaddr_t pgoff;
	int error;

	KASSERT(vp->v_type == VREG);
	KASSERT(newsize >= 0);

	oldsize = node->tn_size;
	oldpages = round_page(oldsize) >> PAGE_SHIFT;
	newpages = round_page(newsize) >> PAGE_SHIFT;
	KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);

	if (newpages > oldpages) {
		/* Increase the used-memory counter if getting extra pages. */
		bytes = (newpages - oldpages) << PAGE_SHIFT;
		if (tmpfs_mem_incr(tmp, bytes) == 0)
			return ENOSPC;
		if (uao_grow(uobj, newpages) != 0) {
			tmpfs_mem_decr(tmp, bytes);
			return ENOSPC;
		}
	}

	node->tn_spec.tn_reg.tn_aobj_pages = newpages;
	node->tn_size = newsize;
	uvm_vnp_setsize(vp, newsize);
	uvm_vnp_uncache(vp);

	/*
	 * Free "backing store".
	 */
	if (newpages < oldpages) {
		if (tmpfs_uio_cached(node))
			tmpfs_uio_uncache(node);
		if (uao_shrink(uobj, newpages))
			panic("shrink failed");
		/* Decrease the used-memory counter. */
		tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
	}
	if (newsize > oldsize) {
		if (tmpfs_uio_cached(node))
			tmpfs_uio_uncache(node);
		pgoff = oldsize & PAGE_MASK;
		if (pgoff != 0) {
			/*
			 * Growing from an offset which is not at a page
			 * boundary; zero out unused bytes in current page.
			 */
			error = tmpfs_zeropg(node, trunc_page(oldsize), pgoff);
			if (error)
				panic("tmpfs_zeropg: error %d", error);
		}
		VN_KNOTE(vp, NOTE_EXTEND);
	}
	return 0;
}
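
/*
 * Worked example of the accounting above, assuming 4KB pages: growing a
 * file from 1000 to 10000 bytes goes from 1 to 3 pages, so 8192 bytes
 * are charged via tmpfs_mem_incr() before uao_grow(); since the old size
 * ends mid-page, tmpfs_zeropg() zeroes the stale tail of that page.
 */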

/*
 * tmpfs_chflags: change flags of the given vnode.
 */
int
tmpfs_chflags(struct vnode *vp, int flags, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	if (cred->cr_uid != node->tn_uid && (error = suser_ucred(cred)))
		return error;

	if (cred->cr_uid == 0) {
		if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND) &&
		    securelevel > 0)
			return EPERM;
		node->tn_flags = flags;
	} else {
		if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND) ||
		    (flags & UF_SETTABLE) != flags)
			return EPERM;
		node->tn_flags &= SF_SETTABLE;
		node->tn_flags |= (flags & UF_SETTABLE);
	}

	tmpfs_update(node, TMPFS_NODE_CHANGED);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chmod: change access mode on the given vnode.
 */
int
tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	if (cred->cr_uid != node->tn_uid && (error = suser_ucred(cred)))
		return error;
	if (cred->cr_uid != 0) {
		if (vp->v_type != VDIR && (mode & S_ISTXT))
			return EFTYPE;
		if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID))
			return EPERM;
	}

	node->tn_mode = (mode & ALLPERMS);
	tmpfs_update(node, TMPFS_NODE_CHANGED);
	if ((vp->v_flag & VTEXT) && (node->tn_mode & S_ISTXT) == 0)
		uvm_vnp_uncache(vp);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chown: change ownership of the given vnode.
 *
 * => At least one of uid or gid must be different from VNOVAL.
 * => The attribute is unchanged for the VNOVAL case.
 */
int
tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Assign default values if they are unknown. */
	KASSERT(uid != VNOVAL || gid != VNOVAL);
	if (uid == VNOVAL) {
		uid = node->tn_uid;
	}
	if (gid == VNOVAL) {
		gid = node->tn_gid;
	}

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	if ((cred->cr_uid != node->tn_uid || uid != node->tn_uid ||
	    (gid != node->tn_gid && !groupmember(gid, cred))) &&
	    (error = suser_ucred(cred)))
		return error;

	node->tn_uid = uid;
	node->tn_gid = gid;
	tmpfs_update(node, TMPFS_NODE_CHANGED);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chsize: change size of the given vnode.
 */
int
tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);

	KASSERT(VOP_ISLOCKED(vp));

	/* Decide whether this is a valid operation based on the file type. */
	switch (vp->v_type) {
	case VDIR:
		return EISDIR;
	case VREG:
		if (vp->v_mount->mnt_flag & MNT_RDONLY) {
			return EROFS;
		}
		break;
	case VBLK:
	case VCHR:
	case VFIFO:
		/*
		 * Allow modifications of special files even if the file
		 * system is mounted read-only (we are not modifying the
		 * files themselves, but the objects they represent).
		 */
		return 0;
	default:
		return EOPNOTSUPP;
	}

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND)) {
		return EPERM;
	}

	/* Note: tmpfs_truncate() will raise NOTE_EXTEND and NOTE_ATTRIB. */
	return tmpfs_truncate(vp, size);
}

/*
 * tmpfs_chtimes: change access and modification times for vnode.
 */
int
tmpfs_chtimes(struct vnode *vp, const struct timespec *atime,
    const struct timespec *mtime, int vaflags, struct ucred *cred,
    struct proc *p)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	if (cred->cr_uid != node->tn_uid && (error = suser_ucred(cred)) &&
	    ((vaflags & VA_UTIMES_NULL) == 0 ||
	    (error = VOP_ACCESS(vp, VWRITE, cred, p))))
		return error;

	if (atime->tv_sec != VNOVAL && atime->tv_nsec != VNOVAL)
		node->tn_atime = *atime;

	if (mtime->tv_sec != VNOVAL && mtime->tv_nsec != VNOVAL)
		node->tn_mtime = *mtime;
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_update: update timestamps, et al.
 */
void
tmpfs_update(tmpfs_node_t *node, int flags)
{
	struct timespec nowtm;

	nanotime(&nowtm);

	if (flags & TMPFS_NODE_ACCESSED) {
		node->tn_atime = nowtm;
	}
	if (flags & TMPFS_NODE_MODIFIED) {
		node->tn_mtime = nowtm;
	}
	if (flags & TMPFS_NODE_CHANGED) {
		node->tn_ctime = nowtm;
	}
}

int
tmpfs_truncate(struct vnode *vp, off_t length)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	if (length < 0) {
		error = EINVAL;
		goto out;
	}
	if (node->tn_size == length) {
		error = 0;
		goto out;
	}
	error = tmpfs_reg_resize(vp, length);
	if (error == 0) {
		tmpfs_update(node, TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED);
	}
out:
	return error;
}

int
tmpfs_uio_cached(tmpfs_node_t *node)
{
	int pgnum_valid = (node->tn_pgnum != (voff_t)-1);
	int pgptr_valid = (node->tn_pgptr != (vaddr_t)NULL);
	KASSERT(pgnum_valid == pgptr_valid);
	return pgnum_valid && pgptr_valid;
}

vaddr_t
tmpfs_uio_lookup(tmpfs_node_t *node, voff_t pgnum)
{
	if (tmpfs_uio_cached(node) == 1 && node->tn_pgnum == pgnum)
		return node->tn_pgptr;

	return (vaddr_t)NULL;
}

void
tmpfs_uio_uncache(tmpfs_node_t *node)
{
	KASSERT(node->tn_pgnum != (voff_t)-1);
	KASSERT(node->tn_pgptr != (vaddr_t)NULL);
	uvm_unmap(kernel_map, node->tn_pgptr, node->tn_pgptr + PAGE_SIZE);
	node->tn_pgnum = (voff_t)-1;
	node->tn_pgptr = (vaddr_t)NULL;
}

void
tmpfs_uio_cache(tmpfs_node_t *node, voff_t pgnum, vaddr_t pgptr)
{
	KASSERT(node->tn_pgnum == (voff_t)-1);
	KASSERT(node->tn_pgptr == (vaddr_t)NULL);
	node->tn_pgnum = pgnum;
	node->tn_pgptr = pgptr;
}

/*
 * Be gentle to kernel_map, don't allow more than 4MB in a single transaction.
 */
#define TMPFS_UIO_MAXBYTES	((1 << 22) - PAGE_SIZE)
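
/*
 * Note: (1 << 22) is 4MB; one page is subtracted so that the mapping in
 * tmpfs_uiomove(), which is rounded up to round_page(pgoff + sz), never
 * exceeds 4MB even when the transfer starts at a non-zero page offset.
 */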

int
tmpfs_uiomove(tmpfs_node_t *node, struct uio *uio, vsize_t len)
{
	vaddr_t va, pgoff;
	int error, adv;
	voff_t pgnum;
	vsize_t sz;

	pgnum = trunc_page(uio->uio_offset);
	pgoff = uio->uio_offset & PAGE_MASK;

	if (pgoff + len < PAGE_SIZE) {
		va = tmpfs_uio_lookup(node, pgnum);
		if (va != (vaddr_t)NULL)
			return uiomove((void *)va + pgoff, len, uio);
	}

	if (len >= TMPFS_UIO_MAXBYTES) {
		sz = TMPFS_UIO_MAXBYTES;
		adv = POSIX_MADV_NORMAL;
	} else {
		sz = len;
		adv = POSIX_MADV_SEQUENTIAL;
	}

	if (tmpfs_uio_cached(node))
		tmpfs_uio_uncache(node);

	uao_reference(node->tn_uobj);

	error = uvm_map(kernel_map, &va, round_page(pgoff + sz), node->tn_uobj,
	    trunc_page(uio->uio_offset), 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE,
	    PROT_READ | PROT_WRITE, UVM_INH_NONE, adv, 0));
	if (error) {
		uao_detach(node->tn_uobj); /* Drop reference. */
		return error;
	}

	error = uiomove((void *)va + pgoff, sz, uio);
	if (error == 0 && pgoff + sz < PAGE_SIZE)
		tmpfs_uio_cache(node, pgnum, va);
	else
		uvm_unmap(kernel_map, va, va + round_page(pgoff + sz));

	return error;
}

int
tmpfs_zeropg(tmpfs_node_t *node, voff_t pgnum, vaddr_t pgoff)
{
	vaddr_t va;
	int error;

	KASSERT(tmpfs_uio_cached(node) == 0);

	uao_reference(node->tn_uobj);

	error = uvm_map(kernel_map, &va, PAGE_SIZE, node->tn_uobj, pgnum, 0,
	    UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
	    UVM_INH_NONE, POSIX_MADV_NORMAL, 0));
	if (error) {
		uao_detach(node->tn_uobj); /* Drop reference. */
		return error;
	}

	bzero((void *)va + pgoff, PAGE_SIZE - pgoff);
	uvm_unmap(kernel_map, va, va + PAGE_SIZE);

	return 0;
}