xref: /netbsd-src/sys/fs/tmpfs/tmpfs_subr.c (revision 002edac65260eb3d3e28132e73d495acf2398c97)
1 /*	$NetBSD: tmpfs_subr.c,v 1.96 2014/01/23 10:13:56 hannken Exp $	*/
2 
3 /*
4  * Copyright (c) 2005-2013 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
9  * 2005 program, and by Mindaugas Rasiukevicius.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Efficient memory file system: interfaces for inode and directory entry
35  * construction, destruction and manipulation.
36  *
37  * Reference counting
38  *
39  *	The link count of inode (tmpfs_node_t::tn_links) is used as a
40  *	reference counter.  However, it has slightly different semantics.
41  *
42  *	For directories - link count represents directory entries, which
43  *	refer to the directories.  In other words, it represents the count
44  *	of sub-directories.  It also takes into account the virtual '.'
45  *	entry (which has no real entry in the list).  For files - link count
46  *	represents the hard links.  Since only empty directories can be
47  *	removed - link count aligns the reference counting requirements
48  *	enough.  Note: to check whether directory is not empty, the inode
49  *	size (tmpfs_node_t::tn_size) can be used.
50  *
51  *	The inode itself, as an object, gathers its first reference when
52  *	directory entry is attached via tmpfs_dir_attach(9).  For instance,
53  *	after regular tmpfs_create(), a file would have a link count of 1,
54  *	while directory after tmpfs_mkdir() would have 2 (due to '.').
55  *
56  * Reclamation
57  *
58  *	It should be noted that tmpfs inodes rely on a combination of vnode
59  *	reference counting and link counting.  That is, an inode can only be
60  *	destroyed if its associated vnode is inactive.  The destruction is
61  *	done on vnode reclamation i.e. tmpfs_reclaim().  It should be noted
62  *	that tmpfs_node_t::tn_links being 0 is a destruction criterion.
63  *
64  *	If an inode has references within the file system (tn_links > 0) and
65  *	its inactive vnode gets reclaimed/recycled - then the association is
66  *	broken in tmpfs_reclaim().  In such case, an inode will always pass
67  *	tmpfs_lookup() and thus tmpfs_vnode_get() to associate a new vnode.
68  *
69  * Lock order
70  *
71  *	tmpfs_node_t::tn_vlock ->
72  *		vnode_t::v_vlock ->
73  *			vnode_t::v_interlock
74  */
75 
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.96 2014/01/23 10:13:56 hannken Exp $");
78 
79 #include <sys/param.h>
80 #include <sys/cprng.h>
81 #include <sys/dirent.h>
82 #include <sys/event.h>
83 #include <sys/kmem.h>
84 #include <sys/mount.h>
85 #include <sys/namei.h>
86 #include <sys/time.h>
87 #include <sys/stat.h>
88 #include <sys/systm.h>
89 #include <sys/vnode.h>
90 #include <sys/kauth.h>
91 #include <sys/atomic.h>
92 
93 #include <uvm/uvm.h>
94 
95 #include <miscfs/specfs/specdev.h>
96 #include <miscfs/genfs/genfs.h>
97 #include <fs/tmpfs/tmpfs.h>
98 #include <fs/tmpfs/tmpfs_fifoops.h>
99 #include <fs/tmpfs/tmpfs_specops.h>
100 #include <fs/tmpfs/tmpfs_vnops.h>
101 
102 static void	tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
103 
104 /*
105  * tmpfs_alloc_node: allocate a new inode of a specified type and
106  * insert it into the list of specified mount point.
107  */
108 int
109 tmpfs_alloc_node(tmpfs_mount_t *tmp, enum vtype type, uid_t uid, gid_t gid,
110     mode_t mode, char *target, dev_t rdev, tmpfs_node_t **node)
111 {
112 	tmpfs_node_t *nnode;
113 
114 	nnode = tmpfs_node_get(tmp);
115 	if (nnode == NULL) {
116 		return ENOSPC;
117 	}
118 
119 	/* Initially, no references and no associations. */
120 	nnode->tn_links = 0;
121 	nnode->tn_vnode = NULL;
122 	nnode->tn_dirent_hint = NULL;
123 
124 	/*
125 	 * XXX Where the pool is backed by a map larger than (4GB *
126 	 * sizeof(*nnode)), this may produce duplicate inode numbers
127 	 * for applications that do not understand 64-bit ino_t.
128 	 */
129 	nnode->tn_id = (ino_t)((uintptr_t)nnode / sizeof(*nnode));
130 	/*
131 	 * Make sure the generation number is not zero.
132 	 * tmpfs_inactive() uses generation zero to mark dead nodes.
133 	 */
134 	do {
135 		nnode->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32();
136 	} while (nnode->tn_gen == 0);
137 
138 	/* Generic initialization. */
139 	nnode->tn_type = type;
140 	nnode->tn_size = 0;
141 	nnode->tn_flags = 0;
142 	nnode->tn_lockf = NULL;
143 
144 	vfs_timestamp(&nnode->tn_atime);
145 	nnode->tn_birthtime = nnode->tn_atime;
146 	nnode->tn_ctime = nnode->tn_atime;
147 	nnode->tn_mtime = nnode->tn_atime;
148 
149 	KASSERT(uid != VNOVAL && gid != VNOVAL && mode != VNOVAL);
150 	nnode->tn_uid = uid;
151 	nnode->tn_gid = gid;
152 	nnode->tn_mode = mode;
153 
154 	/* Type-specific initialization. */
155 	switch (nnode->tn_type) {
156 	case VBLK:
157 	case VCHR:
158 		/* Character/block special device. */
159 		KASSERT(rdev != VNOVAL);
160 		nnode->tn_spec.tn_dev.tn_rdev = rdev;
161 		break;
162 	case VDIR:
163 		/* Directory. */
164 		TAILQ_INIT(&nnode->tn_spec.tn_dir.tn_dir);
165 		nnode->tn_spec.tn_dir.tn_parent = NULL;
166 		nnode->tn_spec.tn_dir.tn_seq_arena = NULL;
167 		nnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
168 		nnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
169 
170 		/* Extra link count for the virtual '.' entry. */
171 		nnode->tn_links++;
172 		break;
173 	case VFIFO:
174 	case VSOCK:
175 		break;
176 	case VLNK:
177 		/* Symbolic link.  Target specifies the file name. */
178 		KASSERT(target != NULL);
179 		nnode->tn_size = strlen(target);
180 
181 		if (nnode->tn_size == 0) {
182 			/* Zero-length targets are supported. */
183 			nnode->tn_spec.tn_lnk.tn_link = NULL;
184 			break;
185 		}
186 
187 		KASSERT(nnode->tn_size < MAXPATHLEN);
188 		nnode->tn_size++; /* include the NUL terminator */
189 
190 		nnode->tn_spec.tn_lnk.tn_link =
191 		    tmpfs_strname_alloc(tmp, nnode->tn_size);
192 		if (nnode->tn_spec.tn_lnk.tn_link == NULL) {
193 			tmpfs_node_put(tmp, nnode);
194 			return ENOSPC;
195 		}
196 		memcpy(nnode->tn_spec.tn_lnk.tn_link, target, nnode->tn_size);
197 		break;
198 	case VREG:
199 		/* Regular file.  Create an underlying UVM object. */
200 		nnode->tn_spec.tn_reg.tn_aobj =
201 		    uao_create(INT32_MAX - PAGE_SIZE, 0);
202 		nnode->tn_spec.tn_reg.tn_aobj_pages = 0;
203 		break;
204 	default:
205 		KASSERT(false);
206 	}
207 
208 	mutex_init(&nnode->tn_vlock, MUTEX_DEFAULT, IPL_NONE);
209 
210 	mutex_enter(&tmp->tm_lock);
211 	LIST_INSERT_HEAD(&tmp->tm_nodes, nnode, tn_entries);
212 	mutex_exit(&tmp->tm_lock);
213 
214 	*node = nnode;
215 	return 0;
216 }
217 
218 /*
219  * tmpfs_free_node: remove the inode from a list in the mount point and
220  * destroy the inode structures.
221  */
222 void
223 tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
224 {
225 	size_t objsz;
226 
227 	mutex_enter(&tmp->tm_lock);
228 	LIST_REMOVE(node, tn_entries);
229 	mutex_exit(&tmp->tm_lock);
230 
231 	switch (node->tn_type) {
232 	case VLNK:
233 		if (node->tn_size > 0) {
234 			tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
235 			    node->tn_size);
236 		}
237 		break;
238 	case VREG:
239 		/*
240 		 * Calculate the size of inode data, decrease the used-memory
241 		 * counter, and destroy the unerlying UVM object (if any).
242 		 */
243 		objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
244 		if (objsz != 0) {
245 			tmpfs_mem_decr(tmp, objsz);
246 		}
247 		if (node->tn_spec.tn_reg.tn_aobj != NULL) {
248 			uao_detach(node->tn_spec.tn_reg.tn_aobj);
249 		}
250 		break;
251 	case VDIR:
252 		KASSERT(node->tn_size == 0);
253 		KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL);
254 		KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
255 		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
256 		    node == tmp->tm_root);
257 		break;
258 	default:
259 		break;
260 	}
261 	KASSERT(node->tn_vnode == NULL);
262 	KASSERT(node->tn_links == 0);
263 
264 	mutex_destroy(&node->tn_vlock);
265 	tmpfs_node_put(tmp, node);
266 }
267 
268 /*
269  * tmpfs_vnode_get: allocate or reclaim a vnode for a specified inode.
270  *
271  * => Must be called with tmpfs_node_t::tn_vlock held.
272  * => Returns vnode (*vpp) locked.
273  */
274 int
275 tmpfs_vnode_get(struct mount *mp, tmpfs_node_t *node, vnode_t **vpp)
276 {
277 	vnode_t *vp;
278 	kmutex_t *slock;
279 	int error;
280 again:
281 	/* If there is already a vnode, try to reclaim it. */
282 	if ((vp = node->tn_vnode) != NULL) {
283 		atomic_or_32(&node->tn_gen, TMPFS_RECLAIMING_BIT);
284 		mutex_enter(vp->v_interlock);
285 		mutex_exit(&node->tn_vlock);
286 		error = vget(vp, LK_EXCLUSIVE);
287 		if (error == ENOENT) {
288 			mutex_enter(&node->tn_vlock);
289 			goto again;
290 		}
291 		atomic_and_32(&node->tn_gen, ~TMPFS_RECLAIMING_BIT);
292 		*vpp = vp;
293 		return error;
294 	}
295 	if (TMPFS_NODE_RECLAIMING(node)) {
296 		atomic_and_32(&node->tn_gen, ~TMPFS_RECLAIMING_BIT);
297 	}
298 
299 	/*
300 	 * Get a new vnode and associate it with our inode.  Share the
301 	 * lock with underlying UVM object, if there is one (VREG case).
302 	 */
303 	if (node->tn_type == VREG) {
304 		struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
305 		slock = uobj->vmobjlock;
306 	} else {
307 		slock = NULL;
308 	}
309 	error = getnewvnode(VT_TMPFS, mp, tmpfs_vnodeop_p, slock, &vp);
310 	if (error) {
311 		mutex_exit(&node->tn_vlock);
312 		return error;
313 	}
314 
315 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
316 	vp->v_type = node->tn_type;
317 
318 	/* Type-specific initialization. */
319 	switch (node->tn_type) {
320 	case VBLK:
321 	case VCHR:
322 		vp->v_op = tmpfs_specop_p;
323 		spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
324 		break;
325 	case VDIR:
326 		vp->v_vflag |= node->tn_spec.tn_dir.tn_parent == node ?
327 		    VV_ROOT : 0;
328 		break;
329 	case VFIFO:
330 		vp->v_op = tmpfs_fifoop_p;
331 		break;
332 	case VLNK:
333 	case VREG:
334 	case VSOCK:
335 		break;
336 	default:
337 		KASSERT(false);
338 	}
339 
340 	uvm_vnp_setsize(vp, node->tn_size);
341 	vp->v_data = node;
342 	node->tn_vnode = vp;
343 	mutex_exit(&node->tn_vlock);
344 
345 	KASSERT(VOP_ISLOCKED(vp));
346 	*vpp = vp;
347 	return 0;
348 }
349 
350 /*
351  * tmpfs_construct_node: allocate a new file of specified type and adds it
352  * into the parent directory.
353  *
354  * => Credentials of the caller are used.
355  */
356 int
357 tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
358     struct componentname *cnp, char *target)
359 {
360 	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
361 	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
362 	tmpfs_dirent_t *de, *wde;
363 	int error;
364 
365 	KASSERT(VOP_ISLOCKED(dvp));
366 	*vpp = NULL;
367 
368 	/*
369 	 * If directory was removed, prevent from node creation.  The vnode
370 	 * might still be referenced, but it is about to be reclaimed.
371 	 */
372 	if (dnode->tn_links == 0) {
373 		error = ENOENT;
374 		goto out;
375 	}
376 
377 	/* Check for the maximum number of links limit. */
378 	if (vap->va_type == VDIR) {
379 		/* Check for maximum links limit. */
380 		if (dnode->tn_links == LINK_MAX) {
381 			error = EMLINK;
382 			goto out;
383 		}
384 		KASSERT(dnode->tn_links < LINK_MAX);
385 	}
386 
387 	/* Allocate a node that represents the new file. */
388 	error = tmpfs_alloc_node(tmp, vap->va_type, kauth_cred_geteuid(cnp->cn_cred),
389 	    dnode->tn_gid, vap->va_mode, target, vap->va_rdev, &node);
390 	if (error)
391 		goto out;
392 
393 	/* Allocate a directory entry that points to the new file. */
394 	error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
395 	if (error) {
396 		tmpfs_free_node(tmp, node);
397 		goto out;
398 	}
399 
400 	/* Get a vnode for the new file. */
401 	mutex_enter(&node->tn_vlock);
402 	error = tmpfs_vnode_get(dvp->v_mount, node, vpp);
403 	if (error) {
404 		tmpfs_free_dirent(tmp, de);
405 		tmpfs_free_node(tmp, node);
406 		goto out;
407 	}
408 
409 	/* Remove whiteout before adding the new entry. */
410 	if (cnp->cn_flags & ISWHITEOUT) {
411 		wde = tmpfs_dir_lookup(dnode, cnp);
412 		KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT);
413 		tmpfs_dir_detach(dnode, wde);
414 		tmpfs_free_dirent(tmp, wde);
415 	}
416 
417 	/* Associate inode and attach the entry into the directory. */
418 	tmpfs_dir_attach(dnode, de, node);
419 
420 	/* Make node opaque if requested. */
421 	if (cnp->cn_flags & ISWHITEOUT)
422 		node->tn_flags |= UF_OPAQUE;
423 
424 	/* Update the parent's timestamps. */
425 	tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME);
426 out:
427 	if (error == 0)
428 		VOP_UNLOCK(*vpp);
429 
430 	return error;
431 }
432 
433 /*
434  * tmpfs_alloc_dirent: allocates a new directory entry for the inode.
435  * The directory entry contains a path name component.
436  */
437 int
438 tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
439     tmpfs_dirent_t **de)
440 {
441 	tmpfs_dirent_t *nde;
442 
443 	nde = tmpfs_dirent_get(tmp);
444 	if (nde == NULL)
445 		return ENOSPC;
446 
447 	nde->td_name = tmpfs_strname_alloc(tmp, len);
448 	if (nde->td_name == NULL) {
449 		tmpfs_dirent_put(tmp, nde);
450 		return ENOSPC;
451 	}
452 	nde->td_namelen = len;
453 	memcpy(nde->td_name, name, len);
454 	nde->td_seq = TMPFS_DIRSEQ_NONE;
455 
456 	*de = nde;
457 	return 0;
458 }
459 
460 /*
461  * tmpfs_free_dirent: free a directory entry.
462  */
463 void
464 tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
465 {
466 	KASSERT(de->td_node == NULL);
467 	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
468 	tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
469 	tmpfs_dirent_put(tmp, de);
470 }
471 
472 /*
473  * tmpfs_dir_attach: associate directory entry with a specified inode,
474  * and attach the entry into the directory, specified by vnode.
475  *
476  * => Increases link count on the associated node.
477  * => Increases link count on directory node if our node is VDIR.
478  * => It is caller's responsibility to check for the LINK_MAX limit.
479  * => Triggers kqueue events here.
480  */
481 void
482 tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
483 {
484 	vnode_t *dvp = dnode->tn_vnode;
485 	int events = NOTE_WRITE;
486 
487 	KASSERT(dvp != NULL);
488 	KASSERT(VOP_ISLOCKED(dvp));
489 
490 	/* Get a new sequence number. */
491 	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
492 	de->td_seq = tmpfs_dir_getseq(dnode, de);
493 
494 	/* Associate directory entry and the inode. */
495 	de->td_node = node;
496 	if (node != TMPFS_NODE_WHITEOUT) {
497 		KASSERT(node->tn_links < LINK_MAX);
498 		node->tn_links++;
499 
500 		/* Save the hint (might overwrite). */
501 		node->tn_dirent_hint = de;
502 	} else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) {
503 		/* Flag that there are whiteout entries. */
504 		atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT);
505 	}
506 
507 	/* Insert the entry to the directory (parent of inode). */
508 	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
509 	dnode->tn_size += sizeof(tmpfs_dirent_t);
510 	uvm_vnp_setsize(dvp, dnode->tn_size);
511 
512 	if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) {
513 		/* Set parent. */
514 		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
515 		node->tn_spec.tn_dir.tn_parent = dnode;
516 
517 		/* Increase the link count of parent. */
518 		KASSERT(dnode->tn_links < LINK_MAX);
519 		dnode->tn_links++;
520 		events |= NOTE_LINK;
521 
522 		TMPFS_VALIDATE_DIR(node);
523 	}
524 	VN_KNOTE(dvp, events);
525 }
526 
527 /*
528  * tmpfs_dir_detach: disassociate directory entry and its inode,
529  * and detach the entry from the directory, specified by vnode.
530  *
531  * => Decreases link count on the associated node.
532  * => Decreases the link count on directory node, if our node is VDIR.
533  * => Triggers kqueue events here.
534  *
535  * => Note: dvp and vp may be NULL only if called by tmpfs_unmount().
536  */
537 void
538 tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
539 {
540 	tmpfs_node_t *node = de->td_node;
541 	vnode_t *vp, *dvp = dnode->tn_vnode;
542 	int events = NOTE_WRITE;
543 
544 	KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));
545 
546 	if (__predict_true(node != TMPFS_NODE_WHITEOUT)) {
547 		/* Deassociate the inode and entry. */
548 		node->tn_dirent_hint = NULL;
549 
550 		KASSERT(node->tn_links > 0);
551 		node->tn_links--;
552 
553 		if ((vp = node->tn_vnode) != NULL) {
554 			KASSERT(VOP_ISLOCKED(vp));
555 			VN_KNOTE(vp, node->tn_links ? NOTE_LINK : NOTE_DELETE);
556 		}
557 
558 		/* If directory - decrease the link count of parent. */
559 		if (node->tn_type == VDIR) {
560 			KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
561 			node->tn_spec.tn_dir.tn_parent = NULL;
562 
563 			KASSERT(dnode->tn_links > 0);
564 			dnode->tn_links--;
565 			events |= NOTE_LINK;
566 		}
567 	}
568 	de->td_node = NULL;
569 
570 	/* Remove the entry from the directory. */
571 	if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
572 		dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
573 	}
574 	TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
575 	dnode->tn_size -= sizeof(tmpfs_dirent_t);
576 	tmpfs_dir_putseq(dnode, de);
577 
578 	if (dvp) {
579 		uvm_vnp_setsize(dvp, dnode->tn_size);
580 		VN_KNOTE(dvp, events);
581 	}
582 }
583 
584 /*
585  * tmpfs_dir_lookup: find a directory entry in the specified inode.
586  *
587  * Note that the . and .. components are not allowed as they do not
588  * physically exist within directories.
589  */
590 tmpfs_dirent_t *
591 tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
592 {
593 	const char *name = cnp->cn_nameptr;
594 	const uint16_t nlen = cnp->cn_namelen;
595 	tmpfs_dirent_t *de;
596 
597 	KASSERT(VOP_ISLOCKED(node->tn_vnode));
598 	KASSERT(nlen != 1 || !(name[0] == '.'));
599 	KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
600 	TMPFS_VALIDATE_DIR(node);
601 
602 	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
603 		if (de->td_namelen != nlen)
604 			continue;
605 		if (memcmp(de->td_name, name, nlen) != 0)
606 			continue;
607 		break;
608 	}
609 	return de;
610 }
611 
612 /*
613  * tmpfs_dir_cached: get a cached directory entry if it is valid.  Used to
614  * avoid unnecessary tmpfs_dir_lookup().
615  *
616  * => The vnode must be locked.
617  */
618 tmpfs_dirent_t *
619 tmpfs_dir_cached(tmpfs_node_t *node)
620 {
621 	tmpfs_dirent_t *de = node->tn_dirent_hint;
622 
623 	KASSERT(VOP_ISLOCKED(node->tn_vnode));
624 
625 	if (de == NULL) {
626 		return NULL;
627 	}
628 	KASSERT(de->td_node == node);
629 
630 	/*
631 	 * Directories always have a valid hint.  For files, check if there
632 	 * are any hard links.  If there are - hint might be invalid.
633 	 */
634 	return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
635 }
636 
637 /*
638  * tmpfs_dir_getseq: get a per-directory sequence number for the entry.
639  *
640  * => Shall not be larger than 2^31 for linux32 compatibility.
641  */
642 uint32_t
643 tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
644 {
645 	uint32_t seq = de->td_seq;
646 	vmem_t *seq_arena;
647 	vmem_addr_t off;
648 	int error __diagused;
649 
650 	TMPFS_VALIDATE_DIR(dnode);
651 
652 	if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
653 		/* Already set. */
654 		KASSERT(seq >= TMPFS_DIRSEQ_START);
655 		return seq;
656 	}
657 
658 	/*
659 	 * The "." and ".." and the end-of-directory have reserved numbers.
660 	 * The other sequence numbers are allocated as following:
661 	 *
662 	 * - The first half of the 2^31 is assigned incrementally.
663 	 *
664 	 * - If that range is exceeded, then the second half of 2^31
665 	 * is used, but managed by vmem(9).
666 	 */
667 
668 	seq = dnode->tn_spec.tn_dir.tn_next_seq;
669 	KASSERT(seq >= TMPFS_DIRSEQ_START);
670 
671 	if (__predict_true(seq < TMPFS_DIRSEQ_END)) {
672 		/* First half: just increment and return. */
673 		dnode->tn_spec.tn_dir.tn_next_seq++;
674 		return seq;
675 	}
676 
677 	/*
678 	 * First half exceeded, use the second half.  May need to create
679 	 * vmem(9) arena for the directory first.
680 	 */
681 	if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) {
682 		seq_arena = vmem_create("tmpfscoo", 0,
683 		    TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0,
684 		    VM_SLEEP, IPL_NONE);
685 		dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena;
686 		KASSERT(seq_arena != NULL);
687 	}
688 	error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off);
689 	KASSERT(error == 0);
690 
691 	KASSERT(off < TMPFS_DIRSEQ_END);
692 	seq = off | TMPFS_DIRSEQ_END;
693 	return seq;
694 }
695 
696 static void
697 tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
698 {
699 	vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena;
700 	uint32_t seq = de->td_seq;
701 
702 	TMPFS_VALIDATE_DIR(dnode);
703 
704 	if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) {
705 		/* First half (or no sequence number set yet). */
706 		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
707 	} else {
708 		/* Second half. */
709 		KASSERT(seq_arena != NULL);
710 		KASSERT(seq >= TMPFS_DIRSEQ_END);
711 		seq &= ~TMPFS_DIRSEQ_END;
712 		vmem_free(seq_arena, seq, 1);
713 	}
714 	de->td_seq = TMPFS_DIRSEQ_NONE;
715 
716 	/* Empty?  We can reset. */
717 	if (seq_arena && dnode->tn_size == 0) {
718 		dnode->tn_spec.tn_dir.tn_seq_arena = NULL;
719 		dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
720 		vmem_destroy(seq_arena);
721 	}
722 }
723 
724 /*
725  * tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
726  */
727 tmpfs_dirent_t *
728 tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
729 {
730 	tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;
731 
732 	TMPFS_VALIDATE_DIR(node);
733 
734 	/*
735 	 * First, check the cache.  If does not match - perform a lookup.
736 	 */
737 	if (de && de->td_seq == seq) {
738 		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
739 		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
740 		return de;
741 	}
742 	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
743 		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
744 		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
745 		if (de->td_seq == seq)
746 			return de;
747 	}
748 	return NULL;
749 }
750 
751 /*
752  * tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
753  * dot meta entries, that is, "." or "..".  Copy it to the UIO space.
754  */
755 static int
756 tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
757 {
758 	tmpfs_dirent_t *de;
759 	off_t next = 0;
760 	int error;
761 
762 	switch (uio->uio_offset) {
763 	case TMPFS_DIRSEQ_DOT:
764 		dp->d_fileno = node->tn_id;
765 		strlcpy(dp->d_name, ".", sizeof(dp->d_name));
766 		next = TMPFS_DIRSEQ_DOTDOT;
767 		break;
768 	case TMPFS_DIRSEQ_DOTDOT:
769 		dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
770 		strlcpy(dp->d_name, "..", sizeof(dp->d_name));
771 		de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
772 		next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
773 		break;
774 	default:
775 		KASSERT(false);
776 	}
777 	dp->d_type = DT_DIR;
778 	dp->d_namlen = strlen(dp->d_name);
779 	dp->d_reclen = _DIRENT_SIZE(dp);
780 
781 	if (dp->d_reclen > uio->uio_resid) {
782 		return EJUSTRETURN;
783 	}
784 	if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
785 		return error;
786 	}
787 
788 	uio->uio_offset = next;
789 	return error;
790 }
791 
792 /*
793  * tmpfs_dir_getdents: helper function for tmpfs_readdir.
794  *
795  * => Returns as much directory entries as can fit in the uio space.
796  * => The read starts at uio->uio_offset.
797  */
798 int
799 tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp)
800 {
801 	tmpfs_dirent_t *de;
802 	struct dirent dent;
803 	int error = 0;
804 
805 	KASSERT(VOP_ISLOCKED(node->tn_vnode));
806 	TMPFS_VALIDATE_DIR(node);
807 
808 	/*
809 	 * First check for the "." and ".." cases.
810 	 * Note: tmpfs_dir_getdotents() will "seek" for us.
811 	 */
812 	memset(&dent, 0, sizeof(dent));
813 
814 	if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
815 		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
816 			goto done;
817 		}
818 		(*cntp)++;
819 	}
820 	if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
821 		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
822 			goto done;
823 		}
824 		(*cntp)++;
825 	}
826 
827 	/* Done if we reached the end. */
828 	if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
829 		goto done;
830 	}
831 
832 	/* Locate the directory entry given by the given sequence number. */
833 	de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
834 	if (de == NULL) {
835 		error = EINVAL;
836 		goto done;
837 	}
838 
839 	/*
840 	 * Read as many entries as possible; i.e., until we reach the end
841 	 * of the directory or we exhaust UIO space.
842 	 */
843 	do {
844 		if (de->td_node == TMPFS_NODE_WHITEOUT) {
845 			dent.d_fileno = 1;
846 			dent.d_type = DT_WHT;
847 		} else {
848 			dent.d_fileno = de->td_node->tn_id;
849 			dent.d_type = vtype2dt(de->td_node->tn_type);
850 		}
851 		dent.d_namlen = de->td_namelen;
852 		KASSERT(de->td_namelen < sizeof(dent.d_name));
853 		memcpy(dent.d_name, de->td_name, de->td_namelen);
854 		dent.d_name[de->td_namelen] = '\0';
855 		dent.d_reclen = _DIRENT_SIZE(&dent);
856 
857 		if (dent.d_reclen > uio->uio_resid) {
858 			/* Exhausted UIO space. */
859 			error = EJUSTRETURN;
860 			break;
861 		}
862 
863 		/* Copy out the directory entry and continue. */
864 		error = uiomove(&dent, dent.d_reclen, uio);
865 		if (error) {
866 			break;
867 		}
868 		(*cntp)++;
869 		de = TAILQ_NEXT(de, td_entries);
870 
871 	} while (uio->uio_resid > 0 && de);
872 
873 	/* Cache the last entry or clear and mark EOF. */
874 	uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
875 	node->tn_spec.tn_dir.tn_readdir_lastp = de;
876 done:
877 	tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME);
878 
879 	if (error == EJUSTRETURN) {
880 		/* Exhausted UIO space - just return. */
881 		error = 0;
882 	}
883 	KASSERT(error >= 0);
884 	return error;
885 }
886 
887 /*
888  * tmpfs_reg_resize: resize the underlying UVM object associated with the
889  * specified regular file.
890  */
891 int
892 tmpfs_reg_resize(struct vnode *vp, off_t newsize)
893 {
894 	tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
895 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
896 	struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
897 	size_t newpages, oldpages;
898 	off_t oldsize;
899 
900 	KASSERT(vp->v_type == VREG);
901 	KASSERT(newsize >= 0);
902 
903 	oldsize = node->tn_size;
904 	oldpages = round_page(oldsize) >> PAGE_SHIFT;
905 	newpages = round_page(newsize) >> PAGE_SHIFT;
906 	KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);
907 
908 	if (newpages > oldpages) {
909 		/* Increase the used-memory counter if getting extra pages. */
910 		if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
911 			return ENOSPC;
912 		}
913 	} else if (newsize < oldsize) {
914 		size_t zerolen;
915 
916 		zerolen = MIN(round_page(newsize), node->tn_size) - newsize;
917 		ubc_zerorange(uobj, newsize, zerolen, UBC_UNMAP_FLAG(vp));
918 	}
919 
920 	node->tn_spec.tn_reg.tn_aobj_pages = newpages;
921 	node->tn_size = newsize;
922 	uvm_vnp_setsize(vp, newsize);
923 
924 	/*
925 	 * Free "backing store".
926 	 */
927 	if (newpages < oldpages) {
928 		KASSERT(uobj->vmobjlock == vp->v_interlock);
929 
930 		mutex_enter(uobj->vmobjlock);
931 		uao_dropswap_range(uobj, newpages, oldpages);
932 		mutex_exit(uobj->vmobjlock);
933 
934 		/* Decrease the used-memory counter. */
935 		tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
936 	}
937 	if (newsize > oldsize) {
938 		VN_KNOTE(vp, NOTE_EXTEND);
939 	}
940 	return 0;
941 }
942 
943 /*
944  * tmpfs_chflags: change flags of the given vnode.
945  */
946 int
947 tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l)
948 {
949 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
950 	kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
951 	int error;
952 	bool changing_sysflags = false;
953 
954 	KASSERT(VOP_ISLOCKED(vp));
955 
956 	/* Disallow this operation if the file system is mounted read-only. */
957 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
958 		return EROFS;
959 
960 	/*
961 	 * If the new flags have non-user flags that are different than
962 	 * those on the node, we need special permission to change them.
963 	 */
964 	if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
965 		action |= KAUTH_VNODE_WRITE_SYSFLAGS;
966 		changing_sysflags = true;
967 	}
968 
969 	/*
970 	 * Indicate that this node's flags have system attributes in them if
971 	 * that's the case.
972 	 */
973 	if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
974 		action |= KAUTH_VNODE_HAS_SYSFLAGS;
975 	}
976 
977 	error = kauth_authorize_vnode(cred, action, vp, NULL,
978 	    genfs_can_chflags(cred, vp->v_type, node->tn_uid,
979 	    changing_sysflags));
980 	if (error)
981 		return error;
982 
983 	/*
984 	 * Set the flags. If we're not setting non-user flags, be careful not
985 	 * to overwrite them.
986 	 *
987 	 * XXX: Can't we always assign here? if the system flags are different,
988 	 *      the code above should catch attempts to change them without
989 	 *      proper permissions, and if we're here it means it's okay to
990 	 *      change them...
991 	 */
992 	if (!changing_sysflags) {
993 		/* Clear all user-settable flags and re-set them. */
994 		node->tn_flags &= SF_SETTABLE;
995 		node->tn_flags |= (flags & UF_SETTABLE);
996 	} else {
997 		node->tn_flags = flags;
998 	}
999 	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
1000 	VN_KNOTE(vp, NOTE_ATTRIB);
1001 	return 0;
1002 }
1003 
1004 /*
1005  * tmpfs_chmod: change access mode on the given vnode.
1006  */
1007 int
1008 tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l)
1009 {
1010 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1011 	int error;
1012 
1013 	KASSERT(VOP_ISLOCKED(vp));
1014 
1015 	/* Disallow this operation if the file system is mounted read-only. */
1016 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1017 		return EROFS;
1018 
1019 	/* Immutable or append-only files cannot be modified, either. */
1020 	if (node->tn_flags & (IMMUTABLE | APPEND))
1021 		return EPERM;
1022 
1023 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
1024 	    NULL, genfs_can_chmod(vp->v_type, cred, node->tn_uid, node->tn_gid, mode));
1025 	if (error) {
1026 		return error;
1027 	}
1028 	node->tn_mode = (mode & ALLPERMS);
1029 	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
1030 	VN_KNOTE(vp, NOTE_ATTRIB);
1031 	return 0;
1032 }
1033 
1034 /*
1035  * tmpfs_chown: change ownership of the given vnode.
1036  *
1037  * => At least one of uid or gid must be different than VNOVAL.
1038  * => Attribute is unchanged for VNOVAL case.
1039  */
1040 int
1041 tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l)
1042 {
1043 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1044 	int error;
1045 
1046 	KASSERT(VOP_ISLOCKED(vp));
1047 
1048 	/* Assign default values if they are unknown. */
1049 	KASSERT(uid != VNOVAL || gid != VNOVAL);
1050 	if (uid == VNOVAL) {
1051 		uid = node->tn_uid;
1052 	}
1053 	if (gid == VNOVAL) {
1054 		gid = node->tn_gid;
1055 	}
1056 
1057 	/* Disallow this operation if the file system is mounted read-only. */
1058 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1059 		return EROFS;
1060 
1061 	/* Immutable or append-only files cannot be modified, either. */
1062 	if (node->tn_flags & (IMMUTABLE | APPEND))
1063 		return EPERM;
1064 
1065 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
1066 	    NULL, genfs_can_chown(cred, node->tn_uid, node->tn_gid, uid,
1067 	    gid));
1068 	if (error) {
1069 		return error;
1070 	}
1071 	node->tn_uid = uid;
1072 	node->tn_gid = gid;
1073 	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
1074 	VN_KNOTE(vp, NOTE_ATTRIB);
1075 	return 0;
1076 }
1077 
1078 /*
1079  * tmpfs_chsize: change size of the given vnode.
1080  */
1081 int
1082 tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l)
1083 {
1084 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1085 	const off_t length = size;
1086 	int error;
1087 
1088 	KASSERT(VOP_ISLOCKED(vp));
1089 
1090 	/* Decide whether this is a valid operation based on the file type. */
1091 	switch (vp->v_type) {
1092 	case VDIR:
1093 		return EISDIR;
1094 	case VREG:
1095 		if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1096 			return EROFS;
1097 		}
1098 		break;
1099 	case VBLK:
1100 	case VCHR:
1101 	case VFIFO:
1102 		/*
1103 		 * Allow modifications of special files even if in the file
1104 		 * system is mounted read-only (we are not modifying the
1105 		 * files themselves, but the objects they represent).
1106 		 */
1107 		return 0;
1108 	default:
1109 		return EOPNOTSUPP;
1110 	}
1111 
1112 	/* Immutable or append-only files cannot be modified, either. */
1113 	if (node->tn_flags & (IMMUTABLE | APPEND)) {
1114 		return EPERM;
1115 	}
1116 
1117 	if (length < 0) {
1118 		return EINVAL;
1119 	}
1120 	if (node->tn_size == length) {
1121 		return 0;
1122 	}
1123 
1124 	/* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. */
1125 	if ((error = tmpfs_reg_resize(vp, length)) != 0) {
1126 		return error;
1127 	}
1128 	tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME);
1129 	return 0;
1130 }
1131 
1132 /*
1133  * tmpfs_chtimes: change access and modification times for vnode.
1134  */
1135 int
1136 tmpfs_chtimes(vnode_t *vp, const struct timespec *atime,
1137     const struct timespec *mtime, const struct timespec *btime,
1138     int vaflags, kauth_cred_t cred, lwp_t *l)
1139 {
1140 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1141 	int error;
1142 
1143 	KASSERT(VOP_ISLOCKED(vp));
1144 
1145 	/* Disallow this operation if the file system is mounted read-only. */
1146 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1147 		return EROFS;
1148 
1149 	/* Immutable or append-only files cannot be modified, either. */
1150 	if (node->tn_flags & (IMMUTABLE | APPEND))
1151 		return EPERM;
1152 
1153 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
1154 	    genfs_can_chtimes(vp, vaflags, node->tn_uid, cred));
1155 	if (error)
1156 		return error;
1157 
1158 	if (atime->tv_sec != VNOVAL) {
1159 		node->tn_atime = *atime;
1160 	}
1161 	if (mtime->tv_sec != VNOVAL) {
1162 		node->tn_mtime = *mtime;
1163 	}
1164 	if (btime->tv_sec != VNOVAL) {
1165 		node->tn_birthtime = *btime;
1166 	}
1167 	VN_KNOTE(vp, NOTE_ATTRIB);
1168 	return 0;
1169 }
1170 
1171 /*
1172  * tmpfs_update: update the timestamps as indicated by the flags.
1173  */
1174 void
1175 tmpfs_update(vnode_t *vp, unsigned tflags)
1176 {
1177 	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1178 	struct timespec nowtm;
1179 
1180 	if (tflags == 0) {
1181 		return;
1182 	}
1183 	vfs_timestamp(&nowtm);
1184 
1185 	if (tflags & TMPFS_UPDATE_ATIME) {
1186 		node->tn_atime = nowtm;
1187 	}
1188 	if (tflags & TMPFS_UPDATE_MTIME) {
1189 		node->tn_mtime = nowtm;
1190 	}
1191 	if (tflags & TMPFS_UPDATE_CTIME) {
1192 		node->tn_ctime = nowtm;
1193 	}
1194 }
1195