/*	$NetBSD: tmpfs_subr.c,v 1.108 2020/04/04 20:49:30 ad Exp $	*/

/*
 * Copyright (c) 2005-2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program, and by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Efficient memory file system: interfaces for inode and directory entry
 * construction, destruction and manipulation.
 *
 * Reference counting
 *
 *	The link count of an inode (tmpfs_node_t::tn_links) is used as a
 *	reference counter.  However, it has slightly different semantics.
 *
 *	For directories, the link count represents the directory entries
 *	which refer to the directory, i.e. the count of sub-directories.
 *	It also accounts for the virtual '.' entry (which has no real
 *	entry in the list).  For files, the link count represents the
 *	hard links.  Since only empty directories can be removed, the
 *	link count aligns closely enough with the reference counting
 *	requirements.  Note: to check whether a directory is empty, the
 *	inode size (tmpfs_node_t::tn_size) can be used.
 *
 *	The inode itself, as an object, gathers its first reference when a
 *	directory entry is attached via tmpfs_dir_attach(9).  For instance,
 *	after a regular tmpfs_create(), a file would have a link count of 1,
 *	while a directory after tmpfs_mkdir() would have 2 (due to '.').
 *	See the illustrative sketch below.
 *
 * Reclamation
 *
 *	tmpfs inodes rely on a combination of vnode reference counting
 *	and link counting.  That is, an inode can only be destroyed if its
 *	associated vnode is inactive.  The destruction is done on vnode
 *	reclamation, i.e. in tmpfs_reclaim().  A tmpfs_node_t::tn_links
 *	count of 0 is a destruction criterion.
 *
 *	If an inode has references within the file system (tn_links > 0) and
 *	its inactive vnode gets reclaimed/recycled, then the association is
 *	broken in tmpfs_reclaim().  In such a case, the inode will always
 *	pass tmpfs_lookup() and thus vcache_get() to associate a new vnode.
 *
 * Lock order
 *
 *	vnode_t::v_vlock ->
 *		vnode_t::v_interlock
 */
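
/*
 * Illustrative example of the reference counting above (a sketch based
 * on the stated semantics, not text from the original file), assuming a
 * fresh tmpfs mounted at /t:
 *
 *	mkdir /t/A	 ->  A: tn_links == 2 ('.' plus the entry in /t)
 *	mkdir /t/A/B	 ->  B: tn_links == 2, A: tn_links == 3
 *	touch /t/A/f	 ->  f: tn_links == 1
 *	ln /t/A/f /t/A/g ->  f: tn_links == 2 (two hard links)
 */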

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.108 2020/04/04 20:49:30 ad Exp $");

#include <sys/param.h>
#include <sys/cprng.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
#include <fs/tmpfs/tmpfs_specops.h>
#include <fs/tmpfs/tmpfs_vnops.h>

static void	tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);

/*
 * Initialize vnode with tmpfs node.
 */
static void
tmpfs_init_vnode(struct vnode *vp, tmpfs_node_t *node)
{
	krwlock_t *slock;

	KASSERT(node->tn_vnode == NULL);

	/* Share the interlock with the node. */
	if (node->tn_type == VREG) {
		slock = node->tn_spec.tn_reg.tn_aobj->vmobjlock;
		rw_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
	}

	vp->v_tag = VT_TMPFS;
	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (vp->v_type) {
	case VBLK:
	case VCHR:
		vp->v_op = tmpfs_specop_p;
		spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
		break;
	case VFIFO:
		vp->v_op = tmpfs_fifoop_p;
		break;
	case VDIR:
		if (node->tn_spec.tn_dir.tn_parent == node)
			vp->v_vflag |= VV_ROOT;
		/* FALLTHROUGH */
	case VLNK:
	case VREG:
	case VSOCK:
		vp->v_op = tmpfs_vnodeop_p;
		break;
	default:
		panic("bad node type %d", vp->v_type);
		break;
	}

	vp->v_data = node;
	node->tn_vnode = vp;
	uvm_vnp_setsize(vp, node->tn_size);
	KASSERT(node->tn_mode != VNOVAL);
	cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
}

/*
 * tmpfs_loadvnode: initialise a vnode for a specified inode.
 */
int
tmpfs_loadvnode(struct mount *mp, struct vnode *vp,
    const void *key, size_t key_len, const void **new_key)
{
	tmpfs_node_t *node;

	KASSERT(key_len == sizeof(node));
	memcpy(&node, key, key_len);

	if (node->tn_links == 0)
		return ENOENT;

	tmpfs_init_vnode(vp, node);

	*new_key = &vp->v_data;

	return 0;
}
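
/*
 * Minimal usage sketch (an illustration, assuming the call sites named
 * in the header comment): tmpfs_lookup() associates a vnode through
 * vcache_get(9), with the node pointer itself as the cache key, which
 * tmpfs_loadvnode() copies back out above.
 *
 *	tmpfs_node_t *node = ...;
 *	struct vnode *vp;
 *	int error = vcache_get(mp, &node, sizeof(node), &vp);
 */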

/*
 * tmpfs_newvnode: allocate a new inode of a specified type and
 * attach the vnode.
 */
int
tmpfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
    struct vattr *vap, kauth_cred_t cred, void *extra,
    size_t *key_len, const void **new_key)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp);
	tmpfs_node_t *node, *dnode;

	if (dvp != NULL) {
		KASSERT(VOP_ISLOCKED(dvp));
		dnode = VP_TO_TMPFS_DIR(dvp);
		if (dnode->tn_links == 0)
			return ENOENT;
		if (vap->va_type == VDIR) {
			/* Check for maximum links limit. */
			if (dnode->tn_links == LINK_MAX)
				return EMLINK;
			KASSERT(dnode->tn_links < LINK_MAX);
		}
	} else
		dnode = NULL;

	node = tmpfs_node_get(tmp);
	if (node == NULL)
		return ENOSPC;

	/* Initially, no references and no associations. */
	node->tn_links = 0;
	node->tn_vnode = NULL;
	node->tn_holdcount = 0;
	node->tn_dirent_hint = NULL;

	/*
	 * XXX Where the pool is backed by a map larger than (4GB *
	 * sizeof(*node)), this may produce duplicate inode numbers
	 * for applications that do not understand 64-bit ino_t.
	 */
	node->tn_id = (ino_t)((uintptr_t)node / sizeof(*node));
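	/*
	 * Worked example for the XXX above (illustrative numbers, not from
	 * the original source): with sizeof(*node) == 256, two nodes whose
	 * addresses differ by 4GB * 256 get ids that differ by exactly
	 * 2^32, which a 32-bit ino_t consumer truncates to the same value.
	 */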
	/*
	 * Make sure the generation number is not zero.
	 * tmpfs_inactive() uses generation zero to mark dead nodes.
	 */
	do {
		node->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32();
	} while (node->tn_gen == 0);

	/* Generic initialization. */
	KASSERT((int)vap->va_type != VNOVAL);
	node->tn_type = vap->va_type;
	node->tn_size = 0;
	node->tn_flags = 0;
	node->tn_lockf = NULL;

	vfs_timestamp(&node->tn_atime);
	node->tn_birthtime = node->tn_atime;
	node->tn_ctime = node->tn_atime;
	node->tn_mtime = node->tn_atime;

	if (dvp == NULL) {
		KASSERT(vap->va_uid != VNOVAL && vap->va_gid != VNOVAL);
		node->tn_uid = vap->va_uid;
		node->tn_gid = vap->va_gid;
		vp->v_vflag |= VV_ROOT;
	} else {
		KASSERT(dnode != NULL);
		node->tn_uid = kauth_cred_geteuid(cred);
		node->tn_gid = dnode->tn_gid;
	}
	KASSERT(vap->va_mode != VNOVAL);
	node->tn_mode = vap->va_mode;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
	case VCHR:
		/* Character/block special device. */
		KASSERT(vap->va_rdev != VNOVAL);
		node->tn_spec.tn_dev.tn_rdev = vap->va_rdev;
		break;
	case VDIR:
		/* Directory. */
		TAILQ_INIT(&node->tn_spec.tn_dir.tn_dir);
		node->tn_spec.tn_dir.tn_parent = NULL;
		node->tn_spec.tn_dir.tn_seq_arena = NULL;
		node->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
		node->tn_spec.tn_dir.tn_readdir_lastp = NULL;

		/* Extra link count for the virtual '.' entry. */
		node->tn_links++;
		break;
	case VFIFO:
	case VSOCK:
		break;
	case VLNK:
		node->tn_size = 0;
		node->tn_spec.tn_lnk.tn_link = NULL;
		break;
	case VREG:
		/* Regular file.  Create an underlying UVM object. */
		node->tn_spec.tn_reg.tn_aobj =
		    uao_create(INT64_MAX - PAGE_SIZE, 0);
		node->tn_spec.tn_reg.tn_aobj_pages = 0;
		break;
	default:
		panic("bad node type %d", node->tn_type);
		break;
	}

	tmpfs_init_vnode(vp, node);

	mutex_enter(&tmp->tm_lock);
	LIST_INSERT_HEAD(&tmp->tm_nodes, node, tn_entries);
	mutex_exit(&tmp->tm_lock);

	*key_len = sizeof(vp->v_data);
	*new_key = &vp->v_data;

	return 0;
}

/*
 * tmpfs_free_node: remove the inode from a list in the mount point and
 * destroy the inode structures.
 */
void
tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
{
	size_t objsz;
	uint32_t hold;

	mutex_enter(&tmp->tm_lock);
	hold = atomic_or_32_nv(&node->tn_holdcount, TMPFS_NODE_RECLAIMED);
	/* Defer destruction to last thread holding this node. */
	if (hold != TMPFS_NODE_RECLAIMED) {
		mutex_exit(&tmp->tm_lock);
		return;
	}
	LIST_REMOVE(node, tn_entries);
	mutex_exit(&tmp->tm_lock);

	switch (node->tn_type) {
	case VLNK:
		if (node->tn_size > 0) {
			tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
			    node->tn_size);
		}
		break;
	case VREG:
		/*
		 * Calculate the size of the inode data, decrease the
		 * used-memory counter, and destroy the underlying UVM
		 * object (if any).
		 */
		objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
		if (objsz != 0) {
			tmpfs_mem_decr(tmp, objsz);
		}
		if (node->tn_spec.tn_reg.tn_aobj != NULL) {
			uao_detach(node->tn_spec.tn_reg.tn_aobj);
		}
		break;
	case VDIR:
		KASSERT(node->tn_size == 0);
		KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL);
		KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
		    node == tmp->tm_root);
		break;
	default:
		break;
	}
	KASSERT(node->tn_vnode == NULL);
	KASSERT(node->tn_links == 0);

	tmpfs_node_put(tmp, node);
}
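
/*
 * Sketch of the deferred-destruction protocol around tn_holdcount (an
 * illustration inferred from the code above; the actual release path
 * lives elsewhere, e.g. in the reclaim code): a thread keeping the node
 * alive without a vnode reference takes a hold, and the last holder
 * frees a node that was reclaimed in the meantime.
 *
 *	atomic_inc_32(&node->tn_holdcount);
 *	... use the node without holding a vnode reference ...
 *	if (atomic_dec_32_nv(&node->tn_holdcount) == TMPFS_NODE_RECLAIMED)
 *		tmpfs_free_node(tmp, node);
 */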

/*
 * tmpfs_construct_node: allocate a new file of the specified type and
 * add it into the parent directory.
 *
 * => Credentials of the caller are used.
 */
int
tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
    struct componentname *cnp, char *target)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
	tmpfs_dirent_t *de, *wde;
	char *slink = NULL;
	int ssize = 0;
	int error;

	/* Allocate symlink target. */
	if (target != NULL) {
		KASSERT(vap->va_type == VLNK);
		ssize = strlen(target);
		KASSERT(ssize < MAXPATHLEN);
		if (ssize > 0) {
			slink = tmpfs_strname_alloc(tmp, ssize);
			if (slink == NULL)
				return ENOSPC;
			memcpy(slink, target, ssize);
		}
	}

	/* Allocate a directory entry that points to the new file. */
	error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
	if (error) {
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		return error;
	}

	/* Allocate a vnode that represents the new file. */
	error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, vpp);
	if (error) {
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		tmpfs_free_dirent(tmp, de);
		return error;
	}
	error = vn_lock(*vpp, LK_EXCLUSIVE);
	if (error) {
		vrele(*vpp);
		*vpp = NULL;
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		tmpfs_free_dirent(tmp, de);
		return error;
	}

	node = VP_TO_TMPFS_NODE(*vpp);

	if (slink != NULL) {
		node->tn_spec.tn_lnk.tn_link = slink;
		node->tn_size = ssize;
	}

	/* Remove whiteout before adding the new entry. */
	if (cnp->cn_flags & ISWHITEOUT) {
		wde = tmpfs_dir_lookup(dnode, cnp);
		KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT);
		tmpfs_dir_detach(dnode, wde);
		tmpfs_free_dirent(tmp, wde);
	}

	/* Associate inode and attach the entry into the directory. */
	tmpfs_dir_attach(dnode, de, node);

	/* Make node opaque if requested. */
	if (cnp->cn_flags & ISWHITEOUT)
		node->tn_flags |= UF_OPAQUE;

	/* Update the parent's timestamps. */
	tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME);

	VOP_UNLOCK(*vpp);

	cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
	return 0;
}
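
/*
 * Usage sketch (an assumption about the call sites in tmpfs_vnops.c,
 * shown for illustration): the create-style VOPs are expected to reduce
 * to calls such as
 *
 *	return tmpfs_construct_node(dvp, vpp, vap, cnp, NULL);	  (create)
 *	return tmpfs_construct_node(dvp, vpp, vap, cnp, target); (symlink)
 */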

/*
 * tmpfs_alloc_dirent: allocates a new directory entry for the inode.
 * The directory entry contains a path name component.
 */
int
tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
    tmpfs_dirent_t **de)
{
	tmpfs_dirent_t *nde;

	nde = tmpfs_dirent_get(tmp);
	if (nde == NULL)
		return ENOSPC;

	nde->td_name = tmpfs_strname_alloc(tmp, len);
	if (nde->td_name == NULL) {
		tmpfs_dirent_put(tmp, nde);
		return ENOSPC;
	}
	nde->td_namelen = len;
	memcpy(nde->td_name, name, len);
	nde->td_seq = TMPFS_DIRSEQ_NONE;
	nde->td_node = NULL; /* for asserts */

	*de = nde;
	return 0;
}

/*
 * tmpfs_free_dirent: free a directory entry.
 */
void
tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
{
	KASSERT(de->td_node == NULL);
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
	tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
	tmpfs_dirent_put(tmp, de);
}

/*
 * tmpfs_dir_attach: associate a directory entry with the specified inode,
 * and attach the entry into the directory, specified by vnode.
 *
 * => Increases link count on the associated node.
 * => Increases link count on the directory node if our node is VDIR.
 * => It is the caller's responsibility to check for the LINK_MAX limit.
 * => Triggers kqueue events here.
 */
void
tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
{
	vnode_t *dvp = dnode->tn_vnode;
	int events = NOTE_WRITE;

	KASSERT(dvp != NULL);
	KASSERT(VOP_ISLOCKED(dvp));

	/* Get a new sequence number. */
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
	de->td_seq = tmpfs_dir_getseq(dnode, de);

	/* Associate directory entry and the inode. */
	de->td_node = node;
	if (node != TMPFS_NODE_WHITEOUT) {
		KASSERT(node->tn_links < LINK_MAX);
		node->tn_links++;

		/* Save the hint (might overwrite). */
		node->tn_dirent_hint = de;
	} else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) {
		/* Flag that there are whiteout entries. */
		atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT);
	}

	/* Insert the entry to the directory (parent of inode). */
	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
	dnode->tn_size += sizeof(tmpfs_dirent_t);
	uvm_vnp_setsize(dvp, dnode->tn_size);

	if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) {
		/* Set parent. */
		KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
		node->tn_spec.tn_dir.tn_parent = dnode;

		/* Increase the link count of parent. */
		KASSERT(dnode->tn_links < LINK_MAX);
		dnode->tn_links++;
		events |= NOTE_LINK;

		TMPFS_VALIDATE_DIR(node);
	}
	VN_KNOTE(dvp, events);
}

/*
 * tmpfs_dir_detach: disassociate directory entry and its inode,
 * and detach the entry from the directory, specified by vnode.
 *
 * => Decreases link count on the associated node.
 * => Decreases the link count on directory node, if our node is VDIR.
 * => Triggers kqueue events here.
 *
 * => Note: dvp and vp may be NULL only if called by tmpfs_unmount().
 */
void
tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	tmpfs_node_t *node = de->td_node;
	vnode_t *vp, *dvp = dnode->tn_vnode;
	int events = NOTE_WRITE;

	KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));

	if (__predict_true(node != TMPFS_NODE_WHITEOUT)) {
		/* Deassociate the inode and entry. */
		node->tn_dirent_hint = NULL;

		KASSERT(node->tn_links > 0);
		node->tn_links--;

		if ((vp = node->tn_vnode) != NULL) {
			KASSERT(VOP_ISLOCKED(vp));
			VN_KNOTE(vp, node->tn_links ? NOTE_LINK : NOTE_DELETE);
		}

		/* If directory - decrease the link count of parent. */
		if (node->tn_type == VDIR) {
			KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
			node->tn_spec.tn_dir.tn_parent = NULL;

			KASSERT(dnode->tn_links > 0);
			dnode->tn_links--;
			events |= NOTE_LINK;
		}
	}
	de->td_node = NULL;

	/* Remove the entry from the directory. */
	if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
		dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
	}
	TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
	dnode->tn_size -= sizeof(tmpfs_dirent_t);
	tmpfs_dir_putseq(dnode, de);

	if (dvp) {
		uvm_vnp_setsize(dvp, dnode->tn_size);
		VN_KNOTE(dvp, events);
	}
}

/*
 * tmpfs_dir_lookup: find a directory entry in the specified inode.
 *
 * Note that the . and .. components are not allowed as they do not
 * physically exist within directories.
 */
tmpfs_dirent_t *
tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
{
	const char *name = cnp->cn_nameptr;
	const uint16_t nlen = cnp->cn_namelen;
	tmpfs_dirent_t *de;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	KASSERT(nlen != 1 || !(name[0] == '.'));
	KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
	TMPFS_VALIDATE_DIR(node);

	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		if (de->td_namelen != nlen)
			continue;
		if (memcmp(de->td_name, name, nlen) != 0)
			continue;
		break;
	}
	return de;
}

/*
 * tmpfs_dir_cached: get a cached directory entry if it is valid.  Used to
 * avoid an unnecessary tmpfs_dir_lookup().
 *
 * => The vnode must be locked.
 */
tmpfs_dirent_t *
tmpfs_dir_cached(tmpfs_node_t *node)
{
	tmpfs_dirent_t *de = node->tn_dirent_hint;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));

	if (de == NULL) {
		return NULL;
	}
	KASSERT(de->td_node == node);

	/*
	 * Directories always have a valid hint.  For files, check if there
	 * are any hard links.  If there are, the hint might be invalid.
	 */
	return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
}

/*
 * tmpfs_dir_getseq: get a per-directory sequence number for the entry.
 *
 * => Shall not be larger than 2^31 for linux32 compatibility.
 */
uint32_t
tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	uint32_t seq = de->td_seq;
	vmem_t *seq_arena;
	vmem_addr_t off;
	int error __diagused;

	TMPFS_VALIDATE_DIR(dnode);

	if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
		/* Already set. */
		KASSERT(seq >= TMPFS_DIRSEQ_START);
		return seq;
	}

	/*
	 * The "." and ".." and the end-of-directory have reserved numbers.
	 * The other sequence numbers are allocated as follows:
	 *
	 * - The first half of the 2^31 range is assigned incrementally.
	 *
	 * - If that range is exceeded, then the second half of the 2^31
	 *   range is used, but managed by vmem(9).
	 */

	seq = dnode->tn_spec.tn_dir.tn_next_seq;
	KASSERT(seq >= TMPFS_DIRSEQ_START);

	if (__predict_true(seq < TMPFS_DIRSEQ_END)) {
		/* First half: just increment and return. */
		dnode->tn_spec.tn_dir.tn_next_seq++;
		return seq;
	}

	/*
	 * First half exceeded, use the second half.  May need to create
	 * a vmem(9) arena for the directory first.
	 */
	if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) {
		seq_arena = vmem_create("tmpfscoo", 0,
		    TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0,
		    VM_SLEEP, IPL_NONE);
		dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena;
		KASSERT(seq_arena != NULL);
	}
	error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off);
	KASSERT(error == 0);

	KASSERT(off < TMPFS_DIRSEQ_END);
	seq = off | TMPFS_DIRSEQ_END;
	return seq;
}
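
/*
 * Worked example of the sequence encoding (illustrative, derived from
 * the code above): entries first receive tn_next_seq, tn_next_seq + 1,
 * and so on.  Once the counter reaches TMPFS_DIRSEQ_END, a vmem(9)
 * offset "off" is allocated instead and encoded as (off |
 * TMPFS_DIRSEQ_END), so the high bit marks second-half numbers and
 * tmpfs_dir_putseq() below can mask it off before vmem_free().
 */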

static void
tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
	vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena;
	uint32_t seq = de->td_seq;

	TMPFS_VALIDATE_DIR(dnode);

	if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) {
		/* First half (or no sequence number set yet). */
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
	} else {
		/* Second half. */
		KASSERT(seq_arena != NULL);
		KASSERT(seq >= TMPFS_DIRSEQ_END);
		seq &= ~TMPFS_DIRSEQ_END;
		vmem_free(seq_arena, seq, 1);
	}
	de->td_seq = TMPFS_DIRSEQ_NONE;

	/* Empty?  We can reset. */
	if (seq_arena && dnode->tn_size == 0) {
		dnode->tn_spec.tn_dir.tn_seq_arena = NULL;
		dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
		vmem_destroy(seq_arena);
	}
}

/*
 * tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
 */
tmpfs_dirent_t *
tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
{
	tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;

	TMPFS_VALIDATE_DIR(node);

	/*
	 * First, check the cache.  If it does not match, perform a lookup.
	 */
	if (de && de->td_seq == seq) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		return de;
	}
	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		if (de->td_seq == seq)
			return de;
	}
	return NULL;
}

/*
 * tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
 * dot meta entries, that is, "." or "..".  Copies the entry to the UIO
 * space.
 */
static int
tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
{
	tmpfs_dirent_t *de;
	off_t next = 0;
	int error;

	switch (uio->uio_offset) {
	case TMPFS_DIRSEQ_DOT:
		dp->d_fileno = node->tn_id;
		strlcpy(dp->d_name, ".", sizeof(dp->d_name));
		next = TMPFS_DIRSEQ_DOTDOT;
		break;
	case TMPFS_DIRSEQ_DOTDOT:
		dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
		strlcpy(dp->d_name, "..", sizeof(dp->d_name));
		de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
		next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
		break;
	default:
		KASSERT(false);
	}
	dp->d_type = DT_DIR;
	dp->d_namlen = strlen(dp->d_name);
	dp->d_reclen = _DIRENT_SIZE(dp);

	if (dp->d_reclen > uio->uio_resid) {
		return EJUSTRETURN;
	}
	if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
		return error;
	}

	uio->uio_offset = next;
	return error;
}

/*
 * tmpfs_dir_getdents: helper function for tmpfs_readdir.
 *
 * => Returns as many directory entries as can fit in the uio space.
 * => The read starts at uio->uio_offset.
 */
int
tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp)
{
	tmpfs_dirent_t *de;
	struct dirent dent;
	int error = 0;

	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	TMPFS_VALIDATE_DIR(node);

	/*
	 * First check for the "." and ".." cases.
	 * Note: tmpfs_dir_getdotents() will "seek" for us.
	 */
	memset(&dent, 0, sizeof(dent));

	if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
			goto done;
		}
		(*cntp)++;
	}
	if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
			goto done;
		}
		(*cntp)++;
	}

	/* Done if we reached the end. */
	if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
		goto done;
	}

	/* Locate the directory entry at the given sequence number. */
	de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
	if (de == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Read as many entries as possible; i.e., until we reach the end
	 * of the directory or we exhaust the UIO space.
	 */
	do {
		if (de->td_node == TMPFS_NODE_WHITEOUT) {
			dent.d_fileno = 1;
			dent.d_type = DT_WHT;
		} else {
			dent.d_fileno = de->td_node->tn_id;
			dent.d_type = vtype2dt(de->td_node->tn_type);
		}
		dent.d_namlen = de->td_namelen;
		KASSERT(de->td_namelen < sizeof(dent.d_name));
		memcpy(dent.d_name, de->td_name, de->td_namelen);
		dent.d_name[de->td_namelen] = '\0';
		dent.d_reclen = _DIRENT_SIZE(&dent);

		if (dent.d_reclen > uio->uio_resid) {
			/* Exhausted UIO space. */
			error = EJUSTRETURN;
			break;
		}

		/* Copy out the directory entry and continue. */
		error = uiomove(&dent, dent.d_reclen, uio);
		if (error) {
			break;
		}
		(*cntp)++;
		de = TAILQ_NEXT(de, td_entries);

	} while (uio->uio_resid > 0 && de);

	/* Cache the last entry or clear and mark EOF. */
	uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
	node->tn_spec.tn_dir.tn_readdir_lastp = de;
done:
	tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME);

	if (error == EJUSTRETURN) {
		/* Exhausted UIO space - just return. */
		error = 0;
	}
	KASSERT(error >= 0);
	return error;
}
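
/*
 * Offset progression sketch (illustrative, following the code above): a
 * sequence of reads on a tmpfs directory walks uio_offset through
 *
 *	TMPFS_DIRSEQ_DOT -> TMPFS_DIRSEQ_DOTDOT -> seq(entry 0) ->
 *	seq(entry 1) -> ... -> TMPFS_DIRSEQ_EOF
 *
 * where seq() is assigned by tmpfs_dir_getseq().  A partially read
 * directory resumes via the cached tn_readdir_lastp entry when its
 * sequence number matches the offset.
 */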

/*
 * tmpfs_reg_resize: resize the underlying UVM object associated with the
 * specified regular file.
 */
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
	size_t newpages, oldpages;
	off_t oldsize;

	KASSERT(vp->v_type == VREG);
	KASSERT(newsize >= 0);

	oldsize = node->tn_size;
	oldpages = round_page(oldsize) >> PAGE_SHIFT;
	newpages = round_page(newsize) >> PAGE_SHIFT;
	KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);

	if (newsize == oldsize) {
		return 0;
	}

	if (newpages > oldpages) {
		/* Increase the used-memory counter if getting extra pages. */
		if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
			return ENOSPC;
		}
	} else if (newsize < oldsize) {
		size_t zerolen;

		zerolen = MIN(round_page(newsize), node->tn_size) - newsize;
		ubc_zerorange(uobj, newsize, zerolen, UBC_UNMAP_FLAG(vp));
	}

	node->tn_spec.tn_reg.tn_aobj_pages = newpages;
	node->tn_size = newsize;
	uvm_vnp_setsize(vp, newsize);

	/*
	 * Free "backing store".
	 */
	if (newpages < oldpages) {
		rw_enter(uobj->vmobjlock, RW_WRITER);
		uao_dropswap_range(uobj, newpages, oldpages);
		rw_exit(uobj->vmobjlock);

		/* Decrease the used-memory counter. */
		tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
	}
	if (newsize > oldsize) {
		VN_KNOTE(vp, NOTE_EXTEND);
	}
	return 0;
}
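
/*
 * Worked example of the accounting above (illustrative, assuming
 * PAGE_SIZE == 4096): growing a file from 1000 to 10000 bytes gives
 * oldpages == 1 and newpages == 3, so tmpfs_mem_incr() charges
 * 2 << PAGE_SHIFT == 8192 bytes.  Shrinking back to 1000 bytes zeroes
 * the 3096-byte tail of the remaining page via ubc_zerorange() and
 * returns the two pages through uao_dropswap_range()/tmpfs_mem_decr().
 */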

/*
 * tmpfs_chflags: change flags of the given vnode.
 */
int
tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
	int error;
	bool changing_sysflags = false;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/*
	 * If the new flags have non-user flags that are different than
	 * those on the node, we need special permission to change them.
	 */
	if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
		action |= KAUTH_VNODE_WRITE_SYSFLAGS;
		changing_sysflags = true;
	}

	/*
	 * Indicate that this node's flags have system attributes in them if
	 * that's the case.
	 */
	if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
		action |= KAUTH_VNODE_HAS_SYSFLAGS;
	}

	error = kauth_authorize_vnode(cred, action, vp, NULL,
	    genfs_can_chflags(cred, vp->v_type, node->tn_uid,
	    changing_sysflags));
	if (error)
		return error;

	/*
	 * Set the flags. If we're not setting non-user flags, be careful not
	 * to overwrite them.
	 *
	 * XXX: Can't we always assign here? if the system flags are different,
	 *      the code above should catch attempts to change them without
	 *      proper permissions, and if we're here it means it's okay to
	 *      change them...
	 */
	if (!changing_sysflags) {
		/* Clear all user-settable flags and re-set them. */
		node->tn_flags &= SF_SETTABLE;
		node->tn_flags |= (flags & UF_SETTABLE);
	} else {
		node->tn_flags = flags;
	}
	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_chmod: change access mode on the given vnode.
 */
int
tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
	    NULL, genfs_can_chmod(vp->v_type, cred, node->tn_uid,
	    node->tn_gid, mode));
	if (error) {
		return error;
	}
	node->tn_mode = (mode & ALLPERMS);
	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
	VN_KNOTE(vp, NOTE_ATTRIB);
	cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
	return 0;
}

/*
 * tmpfs_chown: change ownership of the given vnode.
 *
 * => At least one of uid or gid must be different than VNOVAL.
 * => Attribute is unchanged for VNOVAL case.
 */
int
tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Assign default values if they are unknown. */
	KASSERT(uid != VNOVAL || gid != VNOVAL);
	if (uid == VNOVAL) {
		uid = node->tn_uid;
	}
	if (gid == VNOVAL) {
		gid = node->tn_gid;
	}

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
	    NULL, genfs_can_chown(cred, node->tn_uid, node->tn_gid, uid,
	    gid));
	if (error) {
		return error;
	}
	node->tn_uid = uid;
	node->tn_gid = gid;
	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
	VN_KNOTE(vp, NOTE_ATTRIB);
	cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
	return 0;
}

/*
 * tmpfs_chsize: change size of the given vnode.
 */
int
tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	const off_t length = size;
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Decide whether this is a valid operation based on the file type. */
	switch (vp->v_type) {
	case VDIR:
		return EISDIR;
	case VREG:
		if (vp->v_mount->mnt_flag & MNT_RDONLY) {
			return EROFS;
		}
		break;
	case VBLK:
	case VCHR:
	case VFIFO:
		/*
		 * Allow modifications of special files even if the file
		 * system is mounted read-only (we are not modifying the
		 * files themselves, but the objects they represent).
		 */
		return 0;
	default:
		return EOPNOTSUPP;
	}

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND)) {
		return EPERM;
	}

	if (length < 0) {
		return EINVAL;
	}

	/* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. */
	if (node->tn_size != length &&
	    (error = tmpfs_reg_resize(vp, length)) != 0) {
		return error;
	}
	tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME);
	return 0;
}

/*
 * tmpfs_chtimes: change access and modification times for vnode.
 */
int
tmpfs_chtimes(vnode_t *vp, const struct timespec *atime,
    const struct timespec *mtime, const struct timespec *btime,
    int vaflags, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	int error;

	KASSERT(VOP_ISLOCKED(vp));

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Immutable or append-only files cannot be modified, either. */
	if (node->tn_flags & (IMMUTABLE | APPEND))
		return EPERM;

	error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
	    genfs_can_chtimes(vp, vaflags, node->tn_uid, cred));
	if (error)
		return error;

	if (atime->tv_sec != VNOVAL) {
		node->tn_atime = *atime;
	}
	if (mtime->tv_sec != VNOVAL) {
		node->tn_mtime = *mtime;
	}
	if (btime->tv_sec != VNOVAL) {
		node->tn_birthtime = *btime;
	}
	VN_KNOTE(vp, NOTE_ATTRIB);
	return 0;
}

/*
 * tmpfs_update: update the timestamps as indicated by the flags.
 */
void
tmpfs_update(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	struct timespec nowtm;

	if (tflags == 0) {
		return;
	}
	vfs_timestamp(&nowtm);

	if (tflags & TMPFS_UPDATE_ATIME) {
		node->tn_atime = nowtm;
	}
	if (tflags & TMPFS_UPDATE_MTIME) {
		node->tn_mtime = nowtm;
	}
	if (tflags & TMPFS_UPDATE_CTIME) {
		node->tn_ctime = nowtm;
	}
}
1208