1 /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2005 The NetBSD Foundation, Inc. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to The NetBSD Foundation 10 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code 11 * 2005 program. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * Efficient memory file system supporting functions. 37 */ 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/dirent.h> 42 #include <sys/fnv_hash.h> 43 #include <sys/lock.h> 44 #include <sys/limits.h> 45 #include <sys/mount.h> 46 #include <sys/namei.h> 47 #include <sys/priv.h> 48 #include <sys/proc.h> 49 #include <sys/random.h> 50 #include <sys/refcount.h> 51 #include <sys/rwlock.h> 52 #include <sys/smr.h> 53 #include <sys/stat.h> 54 #include <sys/sysctl.h> 55 #include <sys/user.h> 56 #include <sys/vnode.h> 57 #include <sys/vmmeter.h> 58 59 #include <vm/vm.h> 60 #include <vm/vm_param.h> 61 #include <vm/vm_object.h> 62 #include <vm/vm_page.h> 63 #include <vm/vm_pageout.h> 64 #include <vm/vm_pager.h> 65 #include <vm/vm_extern.h> 66 #include <vm/swap_pager.h> 67 #include <vm/uma.h> 68 69 #include <fs/tmpfs/tmpfs.h> 70 #include <fs/tmpfs/tmpfs_fifoops.h> 71 #include <fs/tmpfs/tmpfs_vnops.h> 72 73 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 74 "tmpfs file system"); 75 76 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; 77 static long tmpfs_pages_avail_init; 78 static int tmpfs_mem_percent = TMPFS_MEM_PERCENT; 79 static void tmpfs_set_reserve_from_percent(void); 80 81 MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); 82 static uma_zone_t tmpfs_node_pool; 83 VFS_SMR_DECLARE; 84 85 int tmpfs_pager_type = -1; 86 87 static vm_object_t 88 tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 89 vm_ooffset_t offset, struct ucred *cred) 90 { 91 vm_object_t object; 92 93 MPASS(handle == NULL); 94 MPASS(offset == 0); 95 object = vm_object_allocate_dyn(tmpfs_pager_type, size, 96 OBJ_COLORED | OBJ_SWAP); 97 if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { 98 vm_object_deallocate(object); 99 object = NULL; 100 } 101 return (object); 102 } 103 104 /* 105 * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. 106 * 107 * This allows for periodic mtime updates while only scanning vnodes which are 108 * plausibly dirty, see tmpfs_update_mtime_lazy. 109 */ 110 static void 111 tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, 112 vm_offset_t new) 113 { 114 struct vnode *vp; 115 116 VM_OBJECT_ASSERT_WLOCKED(object); 117 118 vp = VM_TO_TMPFS_VP(object); 119 120 /* 121 * Forced unmount? 122 */ 123 if (vp == NULL) { 124 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 125 ("object %p with OBJ_TMPFS_VREF but without vnode", 126 object)); 127 VM_OBJECT_WUNLOCK(object); 128 return; 129 } 130 131 if (old == 0) { 132 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 133 ("object without writable mappings has a reference")); 134 VNPASS(vp->v_usecount > 0, vp); 135 } else { 136 VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, 137 ("object with writable mappings does not " 138 "have a reference")); 139 } 140 141 if (old == new) { 142 VM_OBJECT_WUNLOCK(object); 143 return; 144 } 145 146 if (new == 0) { 147 vm_object_clear_flag(object, OBJ_TMPFS_VREF); 148 VM_OBJECT_WUNLOCK(object); 149 vrele(vp); 150 } else { 151 if ((object->flags & OBJ_TMPFS_VREF) == 0) { 152 vref(vp); 153 vlazy(vp); 154 vm_object_set_flag(object, OBJ_TMPFS_VREF); 155 } 156 VM_OBJECT_WUNLOCK(object); 157 } 158 } 159 160 static void 161 tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, 162 vm_offset_t end) 163 { 164 vm_offset_t new, old; 165 166 VM_OBJECT_WLOCK(object); 167 KASSERT((object->flags & OBJ_ANON) == 0, 168 ("%s: object %p with OBJ_ANON", __func__, object)); 169 old = object->un_pager.swp.writemappings; 170 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 171 new = object->un_pager.swp.writemappings; 172 tmpfs_pager_writecount_recalc(object, old, new); 173 VM_OBJECT_ASSERT_UNLOCKED(object); 174 } 175 176 static void 177 tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, 178 vm_offset_t end) 179 { 180 vm_offset_t new, old; 181 182 VM_OBJECT_WLOCK(object); 183 KASSERT((object->flags & OBJ_ANON) == 0, 184 ("%s: object %p with OBJ_ANON", __func__, object)); 185 old = object->un_pager.swp.writemappings; 186 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 187 new = object->un_pager.swp.writemappings; 188 tmpfs_pager_writecount_recalc(object, old, new); 189 VM_OBJECT_ASSERT_UNLOCKED(object); 190 } 191 192 static void 193 tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) 194 { 195 struct vnode *vp; 196 197 /* 198 * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type 199 * type. In this case there is no v_writecount to adjust. 200 */ 201 if (vp_heldp != NULL) 202 VM_OBJECT_RLOCK(object); 203 else 204 VM_OBJECT_ASSERT_LOCKED(object); 205 if ((object->flags & OBJ_TMPFS) != 0) { 206 vp = VM_TO_TMPFS_VP(object); 207 if (vp != NULL) { 208 *vpp = vp; 209 if (vp_heldp != NULL) { 210 vhold(vp); 211 *vp_heldp = true; 212 } 213 } 214 } 215 if (vp_heldp != NULL) 216 VM_OBJECT_RUNLOCK(object); 217 } 218 219 static void 220 tmpfs_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) 221 { 222 struct tmpfs_node *node; 223 struct tmpfs_mount *tm; 224 vm_size_t c; 225 226 swap_pager_freespace(obj, start, size, &c); 227 if ((obj->flags & OBJ_TMPFS) == 0 || c == 0) 228 return; 229 230 node = obj->un_pager.swp.swp_priv; 231 MPASS(node->tn_type == VREG); 232 tm = node->tn_reg.tn_tmp; 233 234 KASSERT(tm->tm_pages_used >= c, 235 ("tmpfs tm %p pages %jd free %jd", tm, 236 (uintmax_t)tm->tm_pages_used, (uintmax_t)c)); 237 atomic_add_long(&tm->tm_pages_used, -c); 238 KASSERT(node->tn_reg.tn_pages >= c, 239 ("tmpfs node %p pages %jd free %jd", node, 240 (uintmax_t)node->tn_reg.tn_pages, (uintmax_t)c)); 241 node->tn_reg.tn_pages -= c; 242 } 243 244 static void 245 tmpfs_page_inserted(vm_object_t obj, vm_page_t m) 246 { 247 struct tmpfs_node *node; 248 struct tmpfs_mount *tm; 249 250 if ((obj->flags & OBJ_TMPFS) == 0) 251 return; 252 253 node = obj->un_pager.swp.swp_priv; 254 MPASS(node->tn_type == VREG); 255 tm = node->tn_reg.tn_tmp; 256 257 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 258 atomic_add_long(&tm->tm_pages_used, 1); 259 node->tn_reg.tn_pages += 1; 260 } 261 } 262 263 static void 264 tmpfs_page_removed(vm_object_t obj, vm_page_t m) 265 { 266 struct tmpfs_node *node; 267 struct tmpfs_mount *tm; 268 269 if ((obj->flags & OBJ_TMPFS) == 0) 270 return; 271 272 node = obj->un_pager.swp.swp_priv; 273 MPASS(node->tn_type == VREG); 274 tm = node->tn_reg.tn_tmp; 275 276 if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { 277 KASSERT(tm->tm_pages_used >= 1, 278 ("tmpfs tm %p pages %jd free 1", tm, 279 (uintmax_t)tm->tm_pages_used)); 280 atomic_add_long(&tm->tm_pages_used, -1); 281 KASSERT(node->tn_reg.tn_pages >= 1, 282 ("tmpfs node %p pages %jd free 1", node, 283 (uintmax_t)node->tn_reg.tn_pages)); 284 node->tn_reg.tn_pages -= 1; 285 } 286 } 287 288 static boolean_t 289 tmpfs_can_alloc_page(vm_object_t obj, vm_pindex_t pindex) 290 { 291 struct tmpfs_mount *tm; 292 293 tm = VM_TO_TMPFS_MP(obj); 294 if (tm == NULL || vm_pager_has_page(obj, pindex, NULL, NULL) || 295 tm->tm_pages_max == 0) 296 return (true); 297 if (tm->tm_pages_max == ULONG_MAX) 298 return (tmpfs_mem_avail() >= 1); 299 return (tm->tm_pages_max > atomic_load_long(&tm->tm_pages_used)); 300 } 301 302 struct pagerops tmpfs_pager_ops = { 303 .pgo_kvme_type = KVME_TYPE_VNODE, 304 .pgo_alloc = tmpfs_pager_alloc, 305 .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, 306 .pgo_update_writecount = tmpfs_pager_update_writecount, 307 .pgo_release_writecount = tmpfs_pager_release_writecount, 308 .pgo_mightbedirty = vm_object_mightbedirty_, 309 .pgo_getvp = tmpfs_pager_getvp, 310 .pgo_freespace = tmpfs_pager_freespace, 311 .pgo_page_inserted = tmpfs_page_inserted, 312 .pgo_page_removed = tmpfs_page_removed, 313 .pgo_can_alloc_page = tmpfs_can_alloc_page, 314 }; 315 316 static int 317 tmpfs_node_ctor(void *mem, int size, void *arg, int flags) 318 { 319 struct tmpfs_node *node; 320 321 node = mem; 322 node->tn_gen++; 323 node->tn_size = 0; 324 node->tn_status = 0; 325 node->tn_accessed = false; 326 node->tn_flags = 0; 327 node->tn_links = 0; 328 node->tn_vnode = NULL; 329 node->tn_vpstate = 0; 330 return (0); 331 } 332 333 static void 334 tmpfs_node_dtor(void *mem, int size, void *arg) 335 { 336 struct tmpfs_node *node; 337 338 node = mem; 339 node->tn_type = VNON; 340 } 341 342 static int 343 tmpfs_node_init(void *mem, int size, int flags) 344 { 345 struct tmpfs_node *node; 346 347 node = mem; 348 node->tn_id = 0; 349 mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); 350 node->tn_gen = arc4random(); 351 return (0); 352 } 353 354 static void 355 tmpfs_node_fini(void *mem, int size) 356 { 357 struct tmpfs_node *node; 358 359 node = mem; 360 mtx_destroy(&node->tn_interlock); 361 } 362 363 int 364 tmpfs_subr_init(void) 365 { 366 tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, 367 OBJT_SWAP); 368 if (tmpfs_pager_type == -1) 369 return (EINVAL); 370 tmpfs_node_pool = uma_zcreate("TMPFS node", 371 sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, 372 tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); 373 VFS_SMR_ZONE_SET(tmpfs_node_pool); 374 375 tmpfs_pages_avail_init = tmpfs_mem_avail(); 376 tmpfs_set_reserve_from_percent(); 377 return (0); 378 } 379 380 void 381 tmpfs_subr_uninit(void) 382 { 383 if (tmpfs_pager_type != -1) 384 vm_pager_free_dyn_type(tmpfs_pager_type); 385 tmpfs_pager_type = -1; 386 uma_zdestroy(tmpfs_node_pool); 387 } 388 389 static int 390 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) 391 { 392 int error; 393 long pages, bytes; 394 395 pages = *(long *)arg1; 396 bytes = pages * PAGE_SIZE; 397 398 error = sysctl_handle_long(oidp, &bytes, 0, req); 399 if (error || !req->newptr) 400 return (error); 401 402 pages = bytes / PAGE_SIZE; 403 if (pages < TMPFS_PAGES_MINRESERVED) 404 return (EINVAL); 405 406 *(long *)arg1 = pages; 407 return (0); 408 } 409 410 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, 411 CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_pages_reserved, 0, 412 sysctl_mem_reserved, "L", 413 "Amount of available memory and swap below which tmpfs growth stops"); 414 415 static int 416 sysctl_mem_percent(SYSCTL_HANDLER_ARGS) 417 { 418 int error, percent; 419 420 percent = *(int *)arg1; 421 error = sysctl_handle_int(oidp, &percent, 0, req); 422 if (error || !req->newptr) 423 return (error); 424 425 if ((unsigned) percent > 100) 426 return (EINVAL); 427 428 *(long *)arg1 = percent; 429 tmpfs_set_reserve_from_percent(); 430 return (0); 431 } 432 433 static void 434 tmpfs_set_reserve_from_percent(void) 435 { 436 size_t reserved; 437 438 reserved = tmpfs_pages_avail_init * (100 - tmpfs_mem_percent) / 100; 439 tmpfs_pages_reserved = max(reserved, TMPFS_PAGES_MINRESERVED); 440 } 441 442 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_percent, 443 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &tmpfs_mem_percent, 0, 444 sysctl_mem_percent, "I", 445 "Percent of available memory that can be used if no size limit"); 446 447 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, 448 struct tmpfs_dirent *b); 449 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 450 451 size_t 452 tmpfs_mem_avail(void) 453 { 454 size_t avail; 455 long reserved; 456 457 avail = swap_pager_avail + vm_free_count(); 458 reserved = atomic_load_long(&tmpfs_pages_reserved); 459 if (__predict_false(avail < reserved)) 460 return (0); 461 return (avail - reserved); 462 } 463 464 size_t 465 tmpfs_pages_used(struct tmpfs_mount *tmp) 466 { 467 const size_t node_size = sizeof(struct tmpfs_node) + 468 sizeof(struct tmpfs_dirent); 469 size_t meta_pages; 470 471 meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, 472 PAGE_SIZE); 473 return (meta_pages + tmp->tm_pages_used); 474 } 475 476 bool 477 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) 478 { 479 if (tmpfs_mem_avail() < req_pages) 480 return (false); 481 482 if (tmp->tm_pages_max != ULONG_MAX && 483 tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) 484 return (false); 485 486 return (true); 487 } 488 489 static int 490 tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, 491 int end, boolean_t ignerr) 492 { 493 vm_page_t m; 494 int rv, error; 495 496 VM_OBJECT_ASSERT_WLOCKED(object); 497 KASSERT(base >= 0, ("%s: base %d", __func__, base)); 498 KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 499 end)); 500 error = 0; 501 502 retry: 503 m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); 504 if (m != NULL) { 505 MPASS(vm_page_all_valid(m)); 506 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 507 m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | 508 VM_ALLOC_WAITFAIL); 509 if (m == NULL) 510 goto retry; 511 vm_object_pip_add(object, 1); 512 VM_OBJECT_WUNLOCK(object); 513 rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 514 VM_OBJECT_WLOCK(object); 515 vm_object_pip_wakeup(object); 516 if (rv == VM_PAGER_OK) { 517 /* 518 * Since the page was not resident, and therefore not 519 * recently accessed, immediately enqueue it for 520 * asynchronous laundering. The current operation is 521 * not regarded as an access. 522 */ 523 vm_page_launder(m); 524 } else { 525 vm_page_free(m); 526 m = NULL; 527 if (!ignerr) 528 error = EIO; 529 } 530 } 531 if (m != NULL) { 532 pmap_zero_page_area(m, base, end - base); 533 vm_page_set_dirty(m); 534 vm_page_xunbusy(m); 535 } 536 537 return (error); 538 } 539 540 void 541 tmpfs_ref_node(struct tmpfs_node *node) 542 { 543 #ifdef INVARIANTS 544 u_int old; 545 546 old = 547 #endif 548 refcount_acquire(&node->tn_refcount); 549 #ifdef INVARIANTS 550 KASSERT(old > 0, ("node %p zero refcount", node)); 551 #endif 552 } 553 554 /* 555 * Allocates a new node of type 'type' inside the 'tmp' mount point, with 556 * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', 557 * using the credentials of the process 'p'. 558 * 559 * If the node type is set to 'VDIR', then the parent parameter must point 560 * to the parent directory of the node being created. It may only be NULL 561 * while allocating the root node. 562 * 563 * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter 564 * specifies the device the node represents. 565 * 566 * If the node type is set to 'VLNK', then the parameter target specifies 567 * the file name of the target file for the symbolic link that is being 568 * created. 569 * 570 * Note that new nodes are retrieved from the available list if it has 571 * items or, if it is empty, from the node pool as long as there is enough 572 * space to create them. 573 * 574 * Returns zero on success or an appropriate error code on failure. 575 */ 576 int 577 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, __enum_uint8(vtype) type, 578 uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, 579 const char *target, dev_t rdev, struct tmpfs_node **node) 580 { 581 struct tmpfs_node *nnode; 582 char *symlink; 583 char symlink_smr; 584 585 /* If the root directory of the 'tmp' file system is not yet 586 * allocated, this must be the request to do it. */ 587 MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); 588 589 MPASS((type == VLNK) ^ (target == NULL)); 590 MPASS((type == VBLK || type == VCHR) ^ (rdev == VNOVAL)); 591 592 if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) 593 return (ENOSPC); 594 if (!tmpfs_pages_check_avail(tmp, 1)) 595 return (ENOSPC); 596 597 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 598 /* 599 * When a new tmpfs node is created for fully 600 * constructed mount point, there must be a parent 601 * node, which vnode is locked exclusively. As 602 * consequence, if the unmount is executing in 603 * parallel, vflush() cannot reclaim the parent vnode. 604 * Due to this, the check for MNTK_UNMOUNT flag is not 605 * racy: if we did not see MNTK_UNMOUNT flag, then tmp 606 * cannot be destroyed until node construction is 607 * finished and the parent vnode unlocked. 608 * 609 * Tmpfs does not need to instantiate new nodes during 610 * unmount. 611 */ 612 return (EBUSY); 613 } 614 if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) 615 return (EROFS); 616 617 nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); 618 619 /* Generic initialization. */ 620 nnode->tn_type = type; 621 vfs_timestamp(&nnode->tn_atime); 622 nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = 623 nnode->tn_atime; 624 nnode->tn_uid = uid; 625 nnode->tn_gid = gid; 626 nnode->tn_mode = mode; 627 nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); 628 nnode->tn_refcount = 1; 629 LIST_INIT(&nnode->tn_extattrs); 630 631 /* Type-specific initialization. */ 632 switch (nnode->tn_type) { 633 case VBLK: 634 case VCHR: 635 nnode->tn_rdev = rdev; 636 break; 637 638 case VDIR: 639 RB_INIT(&nnode->tn_dir.tn_dirhead); 640 LIST_INIT(&nnode->tn_dir.tn_dupindex); 641 MPASS(parent != nnode); 642 MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); 643 nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; 644 nnode->tn_dir.tn_readdir_lastn = 0; 645 nnode->tn_dir.tn_readdir_lastp = NULL; 646 nnode->tn_links++; 647 TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); 648 nnode->tn_dir.tn_parent->tn_links++; 649 TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); 650 break; 651 652 case VFIFO: 653 /* FALLTHROUGH */ 654 case VSOCK: 655 break; 656 657 case VLNK: 658 MPASS(strlen(target) < MAXPATHLEN); 659 nnode->tn_size = strlen(target); 660 661 symlink = NULL; 662 if (!tmp->tm_nonc) { 663 symlink = cache_symlink_alloc(nnode->tn_size + 1, 664 M_WAITOK); 665 symlink_smr = true; 666 } 667 if (symlink == NULL) { 668 symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, 669 M_WAITOK); 670 symlink_smr = false; 671 } 672 memcpy(symlink, target, nnode->tn_size + 1); 673 674 /* 675 * Allow safe symlink resolving for lockless lookup. 676 * tmpfs_fplookup_symlink references this comment. 677 * 678 * 1. nnode is not yet visible to the world 679 * 2. both tn_link_target and tn_link_smr get populated 680 * 3. release fence publishes their content 681 * 4. tn_link_target content is immutable until node 682 * destruction, where the pointer gets set to NULL 683 * 5. tn_link_smr is never changed once set 684 * 685 * As a result it is sufficient to issue load consume 686 * on the node pointer to also get the above content 687 * in a stable manner. Worst case tn_link_smr flag 688 * may be set to true despite being stale, while the 689 * target buffer is already cleared out. 690 */ 691 atomic_store_ptr(&nnode->tn_link_target, symlink); 692 atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); 693 atomic_thread_fence_rel(); 694 break; 695 696 case VREG: 697 nnode->tn_reg.tn_aobj = 698 vm_pager_allocate(tmpfs_pager_type, NULL, 0, 699 VM_PROT_DEFAULT, 0, 700 NULL /* XXXKIB - tmpfs needs swap reservation */); 701 nnode->tn_reg.tn_aobj->un_pager.swp.swp_priv = nnode; 702 vm_object_set_flag(nnode->tn_reg.tn_aobj, OBJ_TMPFS); 703 nnode->tn_reg.tn_tmp = tmp; 704 nnode->tn_reg.tn_pages = 0; 705 break; 706 707 default: 708 panic("tmpfs_alloc_node: type %p %d", nnode, 709 (int)nnode->tn_type); 710 } 711 712 TMPFS_LOCK(tmp); 713 LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); 714 nnode->tn_attached = true; 715 tmp->tm_nodes_inuse++; 716 tmp->tm_refcount++; 717 TMPFS_UNLOCK(tmp); 718 719 *node = nnode; 720 return (0); 721 } 722 723 /* 724 * Destroys the node pointed to by node from the file system 'tmp'. 725 * If the node references a directory, no entries are allowed. 726 */ 727 void 728 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) 729 { 730 if (refcount_release_if_not_last(&node->tn_refcount)) 731 return; 732 733 TMPFS_LOCK(tmp); 734 TMPFS_NODE_LOCK(node); 735 if (!tmpfs_free_node_locked(tmp, node, false)) { 736 TMPFS_NODE_UNLOCK(node); 737 TMPFS_UNLOCK(tmp); 738 } 739 } 740 741 bool 742 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, 743 bool detach) 744 { 745 struct tmpfs_extattr *ea; 746 vm_object_t uobj; 747 char *symlink; 748 bool last; 749 750 TMPFS_MP_ASSERT_LOCKED(tmp); 751 TMPFS_NODE_ASSERT_LOCKED(node); 752 753 last = refcount_release(&node->tn_refcount); 754 if (node->tn_attached && (detach || last)) { 755 MPASS(tmp->tm_nodes_inuse > 0); 756 tmp->tm_nodes_inuse--; 757 LIST_REMOVE(node, tn_entries); 758 node->tn_attached = false; 759 } 760 if (!last) 761 return (false); 762 763 TMPFS_NODE_UNLOCK(node); 764 765 #ifdef INVARIANTS 766 MPASS(node->tn_vnode == NULL); 767 MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); 768 769 /* 770 * Make sure this is a node type we can deal with. Everything 771 * is explicitly enumerated without the 'default' clause so 772 * the compiler can throw an error in case a new type is 773 * added. 774 */ 775 switch (node->tn_type) { 776 case VBLK: 777 case VCHR: 778 case VDIR: 779 case VFIFO: 780 case VSOCK: 781 case VLNK: 782 case VREG: 783 break; 784 case VNON: 785 case VBAD: 786 case VMARKER: 787 panic("%s: bad type %d for node %p", __func__, 788 (int)node->tn_type, node); 789 } 790 #endif 791 792 while ((ea = LIST_FIRST(&node->tn_extattrs)) != NULL) { 793 LIST_REMOVE(ea, ea_extattrs); 794 tmpfs_extattr_free(ea); 795 } 796 797 switch (node->tn_type) { 798 case VREG: 799 uobj = node->tn_reg.tn_aobj; 800 node->tn_reg.tn_aobj = NULL; 801 if (uobj != NULL) { 802 VM_OBJECT_WLOCK(uobj); 803 KASSERT((uobj->flags & OBJ_TMPFS) != 0, 804 ("tmpfs node %p uobj %p not tmpfs", node, uobj)); 805 vm_object_clear_flag(uobj, OBJ_TMPFS); 806 KASSERT(tmp->tm_pages_used >= node->tn_reg.tn_pages, 807 ("tmpfs tmp %p node %p pages %jd free %jd", tmp, 808 node, (uintmax_t)tmp->tm_pages_used, 809 (uintmax_t)node->tn_reg.tn_pages)); 810 atomic_add_long(&tmp->tm_pages_used, 811 -node->tn_reg.tn_pages); 812 VM_OBJECT_WUNLOCK(uobj); 813 } 814 tmpfs_free_tmp(tmp); 815 816 /* 817 * vm_object_deallocate() must not be called while 818 * owning tm_allnode_lock, because deallocate might 819 * sleep. Call it after tmpfs_free_tmp() does the 820 * unlock. 821 */ 822 if (uobj != NULL) 823 vm_object_deallocate(uobj); 824 825 break; 826 case VLNK: 827 tmpfs_free_tmp(tmp); 828 829 symlink = node->tn_link_target; 830 atomic_store_ptr(&node->tn_link_target, NULL); 831 if (atomic_load_char(&node->tn_link_smr)) { 832 cache_symlink_free(symlink, node->tn_size + 1); 833 } else { 834 free(symlink, M_TMPFSNAME); 835 } 836 break; 837 default: 838 tmpfs_free_tmp(tmp); 839 break; 840 } 841 842 uma_zfree_smr(tmpfs_node_pool, node); 843 return (true); 844 } 845 846 static __inline uint32_t 847 tmpfs_dirent_hash(const char *name, u_int len) 848 { 849 uint32_t hash; 850 851 hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; 852 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP 853 hash &= 0xf; 854 #endif 855 if (hash < TMPFS_DIRCOOKIE_MIN) 856 hash += TMPFS_DIRCOOKIE_MIN; 857 858 return (hash); 859 } 860 861 static __inline off_t 862 tmpfs_dirent_cookie(struct tmpfs_dirent *de) 863 { 864 if (de == NULL) 865 return (TMPFS_DIRCOOKIE_EOF); 866 867 MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); 868 869 return (de->td_cookie); 870 } 871 872 static __inline boolean_t 873 tmpfs_dirent_dup(struct tmpfs_dirent *de) 874 { 875 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); 876 } 877 878 static __inline boolean_t 879 tmpfs_dirent_duphead(struct tmpfs_dirent *de) 880 { 881 return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); 882 } 883 884 void 885 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) 886 { 887 de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); 888 memcpy(de->ud.td_name, name, namelen); 889 de->td_namelen = namelen; 890 } 891 892 /* 893 * Allocates a new directory entry for the node node with a name of name. 894 * The new directory entry is returned in *de. 895 * 896 * The link count of node is increased by one to reflect the new object 897 * referencing it. 898 * 899 * Returns zero on success or an appropriate error code on failure. 900 */ 901 int 902 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, 903 const char *name, u_int len, struct tmpfs_dirent **de) 904 { 905 struct tmpfs_dirent *nde; 906 907 nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK); 908 nde->td_node = node; 909 if (name != NULL) { 910 nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); 911 tmpfs_dirent_init(nde, name, len); 912 } else 913 nde->td_namelen = 0; 914 if (node != NULL) 915 node->tn_links++; 916 917 *de = nde; 918 919 return (0); 920 } 921 922 /* 923 * Frees a directory entry. It is the caller's responsibility to destroy 924 * the node referenced by it if needed. 925 * 926 * The link count of node is decreased by one to reflect the removal of an 927 * object that referenced it. This only happens if 'node_exists' is true; 928 * otherwise the function will not access the node referred to by the 929 * directory entry, as it may already have been released from the outside. 930 */ 931 void 932 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) 933 { 934 struct tmpfs_node *node; 935 936 node = de->td_node; 937 if (node != NULL) { 938 MPASS(node->tn_links > 0); 939 node->tn_links--; 940 } 941 if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) 942 free(de->ud.td_name, M_TMPFSNAME); 943 free(de, M_TMPFSDIR); 944 } 945 946 void 947 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) 948 { 949 bool want_vrele; 950 951 ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); 952 if (vp->v_type != VREG || obj == NULL) 953 return; 954 955 VM_OBJECT_WLOCK(obj); 956 VI_LOCK(vp); 957 /* 958 * May be going through forced unmount. 959 */ 960 want_vrele = false; 961 if ((obj->flags & OBJ_TMPFS_VREF) != 0) { 962 vm_object_clear_flag(obj, OBJ_TMPFS_VREF); 963 want_vrele = true; 964 } 965 966 if (vp->v_writecount < 0) 967 vp->v_writecount = 0; 968 VI_UNLOCK(vp); 969 VM_OBJECT_WUNLOCK(obj); 970 if (want_vrele) { 971 vrele(vp); 972 } 973 } 974 975 /* 976 * Allocates a new vnode for the node node or returns a new reference to 977 * an existing one if the node had already a vnode referencing it. The 978 * resulting locked vnode is returned in *vpp. 979 * 980 * Returns zero on success or an appropriate error code on failure. 981 */ 982 int 983 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, 984 struct vnode **vpp) 985 { 986 struct vnode *vp; 987 enum vgetstate vs; 988 struct tmpfs_mount *tm; 989 vm_object_t object; 990 int error; 991 992 error = 0; 993 tm = VFS_TO_TMPFS(mp); 994 TMPFS_NODE_LOCK(node); 995 tmpfs_ref_node(node); 996 loop: 997 TMPFS_NODE_ASSERT_LOCKED(node); 998 if ((vp = node->tn_vnode) != NULL) { 999 MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); 1000 if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || 1001 (VN_IS_DOOMED(vp) && 1002 (lkflag & LK_NOWAIT) != 0)) { 1003 TMPFS_NODE_UNLOCK(node); 1004 error = ENOENT; 1005 vp = NULL; 1006 goto out; 1007 } 1008 if (VN_IS_DOOMED(vp)) { 1009 node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; 1010 while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { 1011 msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 1012 0, "tmpfsE", 0); 1013 } 1014 goto loop; 1015 } 1016 vs = vget_prep(vp); 1017 TMPFS_NODE_UNLOCK(node); 1018 error = vget_finish(vp, lkflag, vs); 1019 if (error == ENOENT) { 1020 TMPFS_NODE_LOCK(node); 1021 goto loop; 1022 } 1023 if (error != 0) { 1024 vp = NULL; 1025 goto out; 1026 } 1027 1028 /* 1029 * Make sure the vnode is still there after 1030 * getting the interlock to avoid racing a free. 1031 */ 1032 if (node->tn_vnode != vp) { 1033 vput(vp); 1034 TMPFS_NODE_LOCK(node); 1035 goto loop; 1036 } 1037 1038 goto out; 1039 } 1040 1041 if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || 1042 (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { 1043 TMPFS_NODE_UNLOCK(node); 1044 error = ENOENT; 1045 vp = NULL; 1046 goto out; 1047 } 1048 1049 /* 1050 * otherwise lock the vp list while we call getnewvnode 1051 * since that can block. 1052 */ 1053 if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { 1054 node->tn_vpstate |= TMPFS_VNODE_WANT; 1055 error = msleep((caddr_t) &node->tn_vpstate, 1056 TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); 1057 if (error != 0) 1058 goto out; 1059 goto loop; 1060 } else 1061 node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; 1062 1063 TMPFS_NODE_UNLOCK(node); 1064 1065 /* Get a new vnode and associate it with our node. */ 1066 error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? 1067 &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); 1068 if (error != 0) 1069 goto unlock; 1070 MPASS(vp != NULL); 1071 1072 /* lkflag is ignored, the lock is exclusive */ 1073 (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1074 1075 vp->v_data = node; 1076 vp->v_type = node->tn_type; 1077 1078 /* Type-specific initialization. */ 1079 switch (node->tn_type) { 1080 case VBLK: 1081 /* FALLTHROUGH */ 1082 case VCHR: 1083 /* FALLTHROUGH */ 1084 case VLNK: 1085 /* FALLTHROUGH */ 1086 case VSOCK: 1087 break; 1088 case VFIFO: 1089 vp->v_op = &tmpfs_fifoop_entries; 1090 break; 1091 case VREG: 1092 object = node->tn_reg.tn_aobj; 1093 VM_OBJECT_WLOCK(object); 1094 KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, 1095 ("%s: object %p with OBJ_TMPFS_VREF but without vnode", 1096 __func__, object)); 1097 VI_LOCK(vp); 1098 KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); 1099 vp->v_object = object; 1100 vn_irflag_set_locked(vp, (tm->tm_pgread ? VIRF_PGREAD : 0) | 1101 VIRF_TEXT_REF); 1102 VI_UNLOCK(vp); 1103 VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, 1104 ("leaked OBJ_TMPFS_VREF")); 1105 if (object->un_pager.swp.writemappings > 0) { 1106 vrefact(vp); 1107 vlazy(vp); 1108 vm_object_set_flag(object, OBJ_TMPFS_VREF); 1109 } 1110 VM_OBJECT_WUNLOCK(object); 1111 break; 1112 case VDIR: 1113 MPASS(node->tn_dir.tn_parent != NULL); 1114 if (node->tn_dir.tn_parent == node) 1115 vp->v_vflag |= VV_ROOT; 1116 break; 1117 1118 default: 1119 panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); 1120 } 1121 if (vp->v_type != VFIFO) 1122 VN_LOCK_ASHARE(vp); 1123 1124 error = insmntque1(vp, mp); 1125 if (error != 0) { 1126 /* Need to clear v_object for insmntque failure. */ 1127 tmpfs_destroy_vobject(vp, vp->v_object); 1128 vp->v_object = NULL; 1129 vp->v_data = NULL; 1130 vp->v_op = &dead_vnodeops; 1131 vgone(vp); 1132 vput(vp); 1133 vp = NULL; 1134 } else { 1135 vn_set_state(vp, VSTATE_CONSTRUCTED); 1136 } 1137 1138 unlock: 1139 TMPFS_NODE_LOCK(node); 1140 1141 MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); 1142 node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; 1143 node->tn_vnode = vp; 1144 1145 if (node->tn_vpstate & TMPFS_VNODE_WANT) { 1146 node->tn_vpstate &= ~TMPFS_VNODE_WANT; 1147 TMPFS_NODE_UNLOCK(node); 1148 wakeup((caddr_t) &node->tn_vpstate); 1149 } else 1150 TMPFS_NODE_UNLOCK(node); 1151 1152 out: 1153 if (error == 0) { 1154 *vpp = vp; 1155 1156 #ifdef INVARIANTS 1157 MPASS(*vpp != NULL); 1158 ASSERT_VOP_LOCKED(*vpp, __func__); 1159 TMPFS_NODE_LOCK(node); 1160 MPASS(*vpp == node->tn_vnode); 1161 TMPFS_NODE_UNLOCK(node); 1162 #endif 1163 } 1164 tmpfs_free_node(tm, node); 1165 1166 return (error); 1167 } 1168 1169 /* 1170 * Destroys the association between the vnode vp and the node it 1171 * references. 1172 */ 1173 void 1174 tmpfs_free_vp(struct vnode *vp) 1175 { 1176 struct tmpfs_node *node; 1177 1178 node = VP_TO_TMPFS_NODE(vp); 1179 1180 TMPFS_NODE_ASSERT_LOCKED(node); 1181 node->tn_vnode = NULL; 1182 if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) 1183 wakeup(&node->tn_vnode); 1184 node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; 1185 vp->v_data = NULL; 1186 } 1187 1188 /* 1189 * Allocates a new file of type 'type' and adds it to the parent directory 1190 * 'dvp'; this addition is done using the component name given in 'cnp'. 1191 * The ownership of the new file is automatically assigned based on the 1192 * credentials of the caller (through 'cnp'), the group is set based on 1193 * the parent directory and the mode is determined from the 'vap' argument. 1194 * If successful, *vpp holds a vnode to the newly created file and zero 1195 * is returned. Otherwise *vpp is NULL and the function returns an 1196 * appropriate error code. 1197 */ 1198 int 1199 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, 1200 struct componentname *cnp, const char *target) 1201 { 1202 int error; 1203 struct tmpfs_dirent *de; 1204 struct tmpfs_mount *tmp; 1205 struct tmpfs_node *dnode; 1206 struct tmpfs_node *node; 1207 struct tmpfs_node *parent; 1208 1209 ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); 1210 1211 tmp = VFS_TO_TMPFS(dvp->v_mount); 1212 dnode = VP_TO_TMPFS_DIR(dvp); 1213 *vpp = NULL; 1214 1215 /* If the entry we are creating is a directory, we cannot overflow 1216 * the number of links of its parent, because it will get a new 1217 * link. */ 1218 if (vap->va_type == VDIR) { 1219 /* Ensure that we do not overflow the maximum number of links 1220 * imposed by the system. */ 1221 MPASS(dnode->tn_links <= TMPFS_LINK_MAX); 1222 if (dnode->tn_links == TMPFS_LINK_MAX) { 1223 return (EMLINK); 1224 } 1225 1226 parent = dnode; 1227 MPASS(parent != NULL); 1228 } else 1229 parent = NULL; 1230 1231 /* Allocate a node that represents the new file. */ 1232 error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, 1233 cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, 1234 target, vap->va_rdev, &node); 1235 if (error != 0) 1236 return (error); 1237 1238 /* Allocate a directory entry that points to the new file. */ 1239 error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, 1240 &de); 1241 if (error != 0) { 1242 tmpfs_free_node(tmp, node); 1243 return (error); 1244 } 1245 1246 /* Allocate a vnode for the new file. */ 1247 error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); 1248 if (error != 0) { 1249 tmpfs_free_dirent(tmp, de); 1250 tmpfs_free_node(tmp, node); 1251 return (error); 1252 } 1253 1254 /* Now that all required items are allocated, we can proceed to 1255 * insert the new node into the directory, an operation that 1256 * cannot fail. */ 1257 if (cnp->cn_flags & ISWHITEOUT) 1258 tmpfs_dir_whiteout_remove(dvp, cnp); 1259 tmpfs_dir_attach(dvp, de); 1260 return (0); 1261 } 1262 1263 struct tmpfs_dirent * 1264 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1265 { 1266 struct tmpfs_dirent *de; 1267 1268 de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); 1269 dc->tdc_tree = de; 1270 if (de != NULL && tmpfs_dirent_duphead(de)) 1271 de = LIST_FIRST(&de->ud.td_duphead); 1272 dc->tdc_current = de; 1273 1274 return (dc->tdc_current); 1275 } 1276 1277 struct tmpfs_dirent * 1278 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) 1279 { 1280 struct tmpfs_dirent *de; 1281 1282 MPASS(dc->tdc_tree != NULL); 1283 if (tmpfs_dirent_dup(dc->tdc_current)) { 1284 dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); 1285 if (dc->tdc_current != NULL) 1286 return (dc->tdc_current); 1287 } 1288 dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, 1289 &dnode->tn_dir.tn_dirhead, dc->tdc_tree); 1290 if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { 1291 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1292 MPASS(dc->tdc_current != NULL); 1293 } 1294 1295 return (dc->tdc_current); 1296 } 1297 1298 /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ 1299 static struct tmpfs_dirent * 1300 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) 1301 { 1302 struct tmpfs_dirent *de, dekey; 1303 1304 dekey.td_hash = hash; 1305 de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); 1306 return (de); 1307 } 1308 1309 /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ 1310 static struct tmpfs_dirent * 1311 tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, 1312 struct tmpfs_dir_cursor *dc) 1313 { 1314 struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; 1315 struct tmpfs_dirent *de, dekey; 1316 1317 MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); 1318 1319 if (cookie == node->tn_dir.tn_readdir_lastn && 1320 (de = node->tn_dir.tn_readdir_lastp) != NULL) { 1321 /* Protect against possible race, tn_readdir_last[pn] 1322 * may be updated with only shared vnode lock held. */ 1323 if (cookie == tmpfs_dirent_cookie(de)) 1324 goto out; 1325 } 1326 1327 if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { 1328 LIST_FOREACH(de, &node->tn_dir.tn_dupindex, 1329 uh.td_dup.index_entries) { 1330 MPASS(tmpfs_dirent_dup(de)); 1331 if (de->td_cookie == cookie) 1332 goto out; 1333 /* dupindex list is sorted. */ 1334 if (de->td_cookie < cookie) { 1335 de = NULL; 1336 goto out; 1337 } 1338 } 1339 MPASS(de == NULL); 1340 goto out; 1341 } 1342 1343 if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { 1344 de = NULL; 1345 } else { 1346 dekey.td_hash = cookie; 1347 /* Recover if direntry for cookie was removed */ 1348 de = RB_NFIND(tmpfs_dir, dirhead, &dekey); 1349 } 1350 dc->tdc_tree = de; 1351 dc->tdc_current = de; 1352 if (de != NULL && tmpfs_dirent_duphead(de)) { 1353 dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); 1354 MPASS(dc->tdc_current != NULL); 1355 } 1356 return (dc->tdc_current); 1357 1358 out: 1359 dc->tdc_tree = de; 1360 dc->tdc_current = de; 1361 if (de != NULL && tmpfs_dirent_dup(de)) 1362 dc->tdc_tree = tmpfs_dir_xlookup_hash(node, 1363 de->td_hash); 1364 return (dc->tdc_current); 1365 } 1366 1367 /* 1368 * Looks for a directory entry in the directory represented by node. 1369 * 'cnp' describes the name of the entry to look for. Note that the . 1370 * and .. components are not allowed as they do not physically exist 1371 * within directories. 1372 * 1373 * Returns a pointer to the entry when found, otherwise NULL. 1374 */ 1375 struct tmpfs_dirent * 1376 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, 1377 struct componentname *cnp) 1378 { 1379 struct tmpfs_dir_duphead *duphead; 1380 struct tmpfs_dirent *de; 1381 uint32_t hash; 1382 1383 MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); 1384 MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && 1385 cnp->cn_nameptr[1] == '.'))); 1386 TMPFS_VALIDATE_DIR(node); 1387 1388 hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); 1389 de = tmpfs_dir_xlookup_hash(node, hash); 1390 if (de != NULL && tmpfs_dirent_duphead(de)) { 1391 duphead = &de->ud.td_duphead; 1392 LIST_FOREACH(de, duphead, uh.td_dup.entries) { 1393 if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1394 cnp->cn_namelen)) 1395 break; 1396 } 1397 } else if (de != NULL) { 1398 if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, 1399 cnp->cn_namelen)) 1400 de = NULL; 1401 } 1402 if (de != NULL && f != NULL && de->td_node != f) 1403 de = NULL; 1404 1405 return (de); 1406 } 1407 1408 /* 1409 * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex 1410 * list, allocate new cookie value. 1411 */ 1412 static void 1413 tmpfs_dir_attach_dup(struct tmpfs_node *dnode, 1414 struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) 1415 { 1416 struct tmpfs_dir_duphead *dupindex; 1417 struct tmpfs_dirent *de, *pde; 1418 1419 dupindex = &dnode->tn_dir.tn_dupindex; 1420 de = LIST_FIRST(dupindex); 1421 if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { 1422 if (de == NULL) 1423 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1424 else 1425 nde->td_cookie = de->td_cookie + 1; 1426 MPASS(tmpfs_dirent_dup(nde)); 1427 LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); 1428 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1429 return; 1430 } 1431 1432 /* 1433 * Cookie numbers are near exhaustion. Scan dupindex list for unused 1434 * numbers. dupindex list is sorted in descending order. Keep it so 1435 * after inserting nde. 1436 */ 1437 while (1) { 1438 pde = de; 1439 de = LIST_NEXT(de, uh.td_dup.index_entries); 1440 if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { 1441 /* 1442 * Last element of the index doesn't have minimal cookie 1443 * value, use it. 1444 */ 1445 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; 1446 LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); 1447 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1448 return; 1449 } else if (de == NULL) { 1450 /* 1451 * We are so lucky have 2^30 hash duplicates in single 1452 * directory :) Return largest possible cookie value. 1453 * It should be fine except possible issues with 1454 * VOP_READDIR restart. 1455 */ 1456 nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; 1457 LIST_INSERT_HEAD(dupindex, nde, 1458 uh.td_dup.index_entries); 1459 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1460 return; 1461 } 1462 if (de->td_cookie + 1 == pde->td_cookie || 1463 de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) 1464 continue; /* No hole or invalid cookie. */ 1465 nde->td_cookie = de->td_cookie + 1; 1466 MPASS(tmpfs_dirent_dup(nde)); 1467 MPASS(pde->td_cookie > nde->td_cookie); 1468 MPASS(nde->td_cookie > de->td_cookie); 1469 LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); 1470 LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); 1471 return; 1472 } 1473 } 1474 1475 /* 1476 * Attaches the directory entry de to the directory represented by vp. 1477 * Note that this does not change the link count of the node pointed by 1478 * the directory entry, as this is done by tmpfs_alloc_dirent. 1479 */ 1480 void 1481 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) 1482 { 1483 struct tmpfs_node *dnode; 1484 struct tmpfs_dirent *xde, *nde; 1485 1486 ASSERT_VOP_ELOCKED(vp, __func__); 1487 MPASS(de->td_namelen > 0); 1488 MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); 1489 MPASS(de->td_cookie == de->td_hash); 1490 1491 dnode = VP_TO_TMPFS_DIR(vp); 1492 dnode->tn_dir.tn_readdir_lastn = 0; 1493 dnode->tn_dir.tn_readdir_lastp = NULL; 1494 1495 MPASS(!tmpfs_dirent_dup(de)); 1496 xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1497 if (xde != NULL && tmpfs_dirent_duphead(xde)) 1498 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1499 else if (xde != NULL) { 1500 /* 1501 * Allocate new duphead. Swap xde with duphead to avoid 1502 * adding/removing elements with the same hash. 1503 */ 1504 MPASS(!tmpfs_dirent_dup(xde)); 1505 tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, 1506 &nde); 1507 /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ 1508 memcpy(nde, xde, sizeof(*xde)); 1509 xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; 1510 LIST_INIT(&xde->ud.td_duphead); 1511 xde->td_namelen = 0; 1512 xde->td_node = NULL; 1513 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); 1514 tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); 1515 } 1516 dnode->tn_size += sizeof(struct tmpfs_dirent); 1517 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1518 dnode->tn_accessed = true; 1519 tmpfs_update(vp); 1520 } 1521 1522 /* 1523 * Detaches the directory entry de from the directory represented by vp. 1524 * Note that this does not change the link count of the node pointed by 1525 * the directory entry, as this is done by tmpfs_free_dirent. 1526 */ 1527 void 1528 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) 1529 { 1530 struct tmpfs_mount *tmp; 1531 struct tmpfs_dir *head; 1532 struct tmpfs_node *dnode; 1533 struct tmpfs_dirent *xde; 1534 1535 ASSERT_VOP_ELOCKED(vp, __func__); 1536 1537 dnode = VP_TO_TMPFS_DIR(vp); 1538 head = &dnode->tn_dir.tn_dirhead; 1539 dnode->tn_dir.tn_readdir_lastn = 0; 1540 dnode->tn_dir.tn_readdir_lastp = NULL; 1541 1542 if (tmpfs_dirent_dup(de)) { 1543 /* Remove duphead if de was last entry. */ 1544 if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { 1545 xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); 1546 MPASS(tmpfs_dirent_duphead(xde)); 1547 } else 1548 xde = NULL; 1549 LIST_REMOVE(de, uh.td_dup.entries); 1550 LIST_REMOVE(de, uh.td_dup.index_entries); 1551 if (xde != NULL) { 1552 if (LIST_EMPTY(&xde->ud.td_duphead)) { 1553 RB_REMOVE(tmpfs_dir, head, xde); 1554 tmp = VFS_TO_TMPFS(vp->v_mount); 1555 MPASS(xde->td_node == NULL); 1556 tmpfs_free_dirent(tmp, xde); 1557 } 1558 } 1559 de->td_cookie = de->td_hash; 1560 } else 1561 RB_REMOVE(tmpfs_dir, head, de); 1562 1563 dnode->tn_size -= sizeof(struct tmpfs_dirent); 1564 dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 1565 dnode->tn_accessed = true; 1566 tmpfs_update(vp); 1567 } 1568 1569 void 1570 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) 1571 { 1572 struct tmpfs_dirent *de, *dde, *nde; 1573 1574 RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { 1575 RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); 1576 /* Node may already be destroyed. */ 1577 de->td_node = NULL; 1578 if (tmpfs_dirent_duphead(de)) { 1579 while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { 1580 LIST_REMOVE(dde, uh.td_dup.entries); 1581 dde->td_node = NULL; 1582 tmpfs_free_dirent(tmp, dde); 1583 } 1584 } 1585 tmpfs_free_dirent(tmp, de); 1586 } 1587 } 1588 1589 /* 1590 * Helper function for tmpfs_readdir. Creates a '.' entry for the given 1591 * directory and returns it in the uio space. The function returns 0 1592 * on success, -1 if there was not enough space in the uio structure to 1593 * hold the directory entry or an appropriate error code if another 1594 * error happens. 1595 */ 1596 static int 1597 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1598 struct uio *uio) 1599 { 1600 int error; 1601 struct dirent dent; 1602 1603 TMPFS_VALIDATE_DIR(node); 1604 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); 1605 1606 dent.d_fileno = node->tn_id; 1607 dent.d_off = TMPFS_DIRCOOKIE_DOTDOT; 1608 dent.d_type = DT_DIR; 1609 dent.d_namlen = 1; 1610 dent.d_name[0] = '.'; 1611 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1612 dirent_terminate(&dent); 1613 1614 if (dent.d_reclen > uio->uio_resid) 1615 error = EJUSTRETURN; 1616 else 1617 error = uiomove(&dent, dent.d_reclen, uio); 1618 1619 tmpfs_set_accessed(tm, node); 1620 1621 return (error); 1622 } 1623 1624 /* 1625 * Helper function for tmpfs_readdir. Creates a '..' entry for the given 1626 * directory and returns it in the uio space. The function returns 0 1627 * on success, -1 if there was not enough space in the uio structure to 1628 * hold the directory entry or an appropriate error code if another 1629 * error happens. 1630 */ 1631 static int 1632 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, 1633 struct uio *uio, off_t next) 1634 { 1635 struct tmpfs_node *parent; 1636 struct dirent dent; 1637 int error; 1638 1639 TMPFS_VALIDATE_DIR(node); 1640 MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); 1641 1642 /* 1643 * Return ENOENT if the current node is already removed. 1644 */ 1645 TMPFS_ASSERT_LOCKED(node); 1646 parent = node->tn_dir.tn_parent; 1647 if (parent == NULL) 1648 return (ENOENT); 1649 1650 dent.d_fileno = parent->tn_id; 1651 dent.d_off = next; 1652 dent.d_type = DT_DIR; 1653 dent.d_namlen = 2; 1654 dent.d_name[0] = '.'; 1655 dent.d_name[1] = '.'; 1656 dent.d_reclen = GENERIC_DIRSIZ(&dent); 1657 dirent_terminate(&dent); 1658 1659 if (dent.d_reclen > uio->uio_resid) 1660 error = EJUSTRETURN; 1661 else 1662 error = uiomove(&dent, dent.d_reclen, uio); 1663 1664 tmpfs_set_accessed(tm, node); 1665 1666 return (error); 1667 } 1668 1669 /* 1670 * Helper function for tmpfs_readdir. Returns as much directory entries 1671 * as can fit in the uio space. The read starts at uio->uio_offset. 1672 * The function returns 0 on success, -1 if there was not enough space 1673 * in the uio structure to hold the directory entry or an appropriate 1674 * error code if another error happens. 1675 */ 1676 int 1677 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, 1678 struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies) 1679 { 1680 struct tmpfs_dir_cursor dc; 1681 struct tmpfs_dirent *de, *nde; 1682 off_t off; 1683 int error; 1684 1685 TMPFS_VALIDATE_DIR(node); 1686 1687 off = 0; 1688 1689 /* 1690 * Lookup the node from the current offset. The starting offset of 1691 * 0 will lookup both '.' and '..', and then the first real entry, 1692 * or EOF if there are none. Then find all entries for the dir that 1693 * fit into the buffer. Once no more entries are found (de == NULL), 1694 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next 1695 * call to return 0. 1696 */ 1697 switch (uio->uio_offset) { 1698 case TMPFS_DIRCOOKIE_DOT: 1699 error = tmpfs_dir_getdotdent(tm, node, uio); 1700 if (error != 0) 1701 return (error); 1702 uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT; 1703 if (cookies != NULL) 1704 cookies[(*ncookies)++] = off; 1705 /* FALLTHROUGH */ 1706 case TMPFS_DIRCOOKIE_DOTDOT: 1707 de = tmpfs_dir_first(node, &dc); 1708 off = tmpfs_dirent_cookie(de); 1709 error = tmpfs_dir_getdotdotdent(tm, node, uio, off); 1710 if (error != 0) 1711 return (error); 1712 uio->uio_offset = off; 1713 if (cookies != NULL) 1714 cookies[(*ncookies)++] = off; 1715 /* EOF. */ 1716 if (de == NULL) 1717 return (0); 1718 break; 1719 case TMPFS_DIRCOOKIE_EOF: 1720 return (0); 1721 default: 1722 de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); 1723 if (de == NULL) 1724 return (EINVAL); 1725 if (cookies != NULL) 1726 off = tmpfs_dirent_cookie(de); 1727 } 1728 1729 /* 1730 * Read as much entries as possible; i.e., until we reach the end of the 1731 * directory or we exhaust uio space. 1732 */ 1733 do { 1734 struct dirent d; 1735 1736 /* 1737 * Create a dirent structure representing the current tmpfs_node 1738 * and fill it. 1739 */ 1740 if (de->td_node == NULL) { 1741 d.d_fileno = 1; 1742 d.d_type = DT_WHT; 1743 } else { 1744 d.d_fileno = de->td_node->tn_id; 1745 switch (de->td_node->tn_type) { 1746 case VBLK: 1747 d.d_type = DT_BLK; 1748 break; 1749 1750 case VCHR: 1751 d.d_type = DT_CHR; 1752 break; 1753 1754 case VDIR: 1755 d.d_type = DT_DIR; 1756 break; 1757 1758 case VFIFO: 1759 d.d_type = DT_FIFO; 1760 break; 1761 1762 case VLNK: 1763 d.d_type = DT_LNK; 1764 break; 1765 1766 case VREG: 1767 d.d_type = DT_REG; 1768 break; 1769 1770 case VSOCK: 1771 d.d_type = DT_SOCK; 1772 break; 1773 1774 default: 1775 panic("tmpfs_dir_getdents: type %p %d", 1776 de->td_node, (int)de->td_node->tn_type); 1777 } 1778 } 1779 d.d_namlen = de->td_namelen; 1780 MPASS(de->td_namelen < sizeof(d.d_name)); 1781 (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); 1782 d.d_reclen = GENERIC_DIRSIZ(&d); 1783 1784 /* 1785 * Stop reading if the directory entry we are treating is bigger 1786 * than the amount of data that can be returned. 1787 */ 1788 if (d.d_reclen > uio->uio_resid) { 1789 error = EJUSTRETURN; 1790 break; 1791 } 1792 1793 nde = tmpfs_dir_next(node, &dc); 1794 d.d_off = tmpfs_dirent_cookie(nde); 1795 dirent_terminate(&d); 1796 1797 /* 1798 * Copy the new dirent structure into the output buffer and 1799 * advance pointers. 1800 */ 1801 error = uiomove(&d, d.d_reclen, uio); 1802 if (error == 0) { 1803 de = nde; 1804 if (cookies != NULL) { 1805 off = tmpfs_dirent_cookie(de); 1806 MPASS(*ncookies < maxcookies); 1807 cookies[(*ncookies)++] = off; 1808 } 1809 } 1810 } while (error == 0 && uio->uio_resid > 0 && de != NULL); 1811 1812 /* Skip setting off when using cookies as it is already done above. */ 1813 if (cookies == NULL) 1814 off = tmpfs_dirent_cookie(de); 1815 1816 /* Update the offset and cache. */ 1817 uio->uio_offset = off; 1818 node->tn_dir.tn_readdir_lastn = off; 1819 node->tn_dir.tn_readdir_lastp = de; 1820 1821 tmpfs_set_accessed(tm, node); 1822 return (error); 1823 } 1824 1825 int 1826 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) 1827 { 1828 struct tmpfs_dirent *de; 1829 int error; 1830 1831 error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, 1832 cnp->cn_nameptr, cnp->cn_namelen, &de); 1833 if (error != 0) 1834 return (error); 1835 tmpfs_dir_attach(dvp, de); 1836 return (0); 1837 } 1838 1839 void 1840 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) 1841 { 1842 struct tmpfs_dirent *de; 1843 1844 de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); 1845 MPASS(de != NULL && de->td_node == NULL); 1846 tmpfs_dir_detach(dvp, de); 1847 tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); 1848 } 1849 1850 /* 1851 * Resizes the aobj associated with the regular file pointed to by 'vp' to the 1852 * size 'newsize'. 'vp' must point to a vnode that represents a regular file. 1853 * 'newsize' must be positive. 1854 * 1855 * Returns zero on success or an appropriate error code on failure. 1856 */ 1857 int 1858 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) 1859 { 1860 struct tmpfs_node *node; 1861 vm_object_t uobj; 1862 vm_pindex_t idx, newpages, oldpages; 1863 off_t oldsize; 1864 int base, error; 1865 1866 MPASS(vp->v_type == VREG); 1867 MPASS(newsize >= 0); 1868 1869 node = VP_TO_TMPFS_NODE(vp); 1870 uobj = node->tn_reg.tn_aobj; 1871 1872 /* 1873 * Convert the old and new sizes to the number of pages needed to 1874 * store them. It may happen that we do not need to do anything 1875 * because the last allocated page can accommodate the change on 1876 * its own. 1877 */ 1878 oldsize = node->tn_size; 1879 oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); 1880 MPASS(oldpages == uobj->size); 1881 newpages = OFF_TO_IDX(newsize + PAGE_MASK); 1882 1883 if (__predict_true(newpages == oldpages && newsize >= oldsize)) { 1884 node->tn_size = newsize; 1885 return (0); 1886 } 1887 1888 VM_OBJECT_WLOCK(uobj); 1889 if (newsize < oldsize) { 1890 /* 1891 * Zero the truncated part of the last page. 1892 */ 1893 base = newsize & PAGE_MASK; 1894 if (base != 0) { 1895 idx = OFF_TO_IDX(newsize); 1896 error = tmpfs_partial_page_invalidate(uobj, idx, base, 1897 PAGE_SIZE, ignerr); 1898 if (error != 0) { 1899 VM_OBJECT_WUNLOCK(uobj); 1900 return (error); 1901 } 1902 } 1903 1904 /* 1905 * Release any swap space and free any whole pages. 1906 */ 1907 if (newpages < oldpages) 1908 vm_object_page_remove(uobj, newpages, 0, 0); 1909 } 1910 uobj->size = newpages; 1911 VM_OBJECT_WUNLOCK(uobj); 1912 1913 node->tn_size = newsize; 1914 return (0); 1915 } 1916 1917 /* 1918 * Punch hole in the aobj associated with the regular file pointed to by 'vp'. 1919 * Requests completely beyond the end-of-file are converted to no-op. 1920 * 1921 * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on 1922 * failure. 1923 */ 1924 int 1925 tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) 1926 { 1927 struct tmpfs_node *node; 1928 vm_object_t object; 1929 vm_pindex_t pistart, pi, piend; 1930 int startofs, endofs, end; 1931 off_t off, len; 1932 int error; 1933 1934 KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", 1935 __func__)); 1936 node = VP_TO_TMPFS_NODE(vp); 1937 KASSERT(node->tn_type == VREG, ("%s: node is not regular file", 1938 __func__)); 1939 object = node->tn_reg.tn_aobj; 1940 off = *offset; 1941 len = omin(node->tn_size - off, *length); 1942 startofs = off & PAGE_MASK; 1943 endofs = (off + len) & PAGE_MASK; 1944 pistart = OFF_TO_IDX(off); 1945 piend = OFF_TO_IDX(off + len); 1946 pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); 1947 error = 0; 1948 1949 /* Handle the case when offset is on or beyond file size. */ 1950 if (len <= 0) { 1951 *length = 0; 1952 return (0); 1953 } 1954 1955 VM_OBJECT_WLOCK(object); 1956 1957 /* 1958 * If there is a partial page at the beginning of the hole-punching 1959 * request, fill the partial page with zeroes. 1960 */ 1961 if (startofs != 0) { 1962 end = pistart != piend ? PAGE_SIZE : endofs; 1963 error = tmpfs_partial_page_invalidate(object, pistart, startofs, 1964 end, FALSE); 1965 if (error != 0) 1966 goto out; 1967 off += end - startofs; 1968 len -= end - startofs; 1969 } 1970 1971 /* 1972 * Toss away the full pages in the affected area. 1973 */ 1974 if (pi < piend) { 1975 vm_object_page_remove(object, pi, piend, 0); 1976 off += IDX_TO_OFF(piend - pi); 1977 len -= IDX_TO_OFF(piend - pi); 1978 } 1979 1980 /* 1981 * If there is a partial page at the end of the hole-punching request, 1982 * fill the partial page with zeroes. 1983 */ 1984 if (endofs != 0 && pistart != piend) { 1985 error = tmpfs_partial_page_invalidate(object, piend, 0, endofs, 1986 FALSE); 1987 if (error != 0) 1988 goto out; 1989 off += endofs; 1990 len -= endofs; 1991 } 1992 1993 out: 1994 VM_OBJECT_WUNLOCK(object); 1995 *offset = off; 1996 *length = len; 1997 return (error); 1998 } 1999 2000 void 2001 tmpfs_check_mtime(struct vnode *vp) 2002 { 2003 struct tmpfs_node *node; 2004 struct vm_object *obj; 2005 2006 ASSERT_VOP_ELOCKED(vp, "check_mtime"); 2007 if (vp->v_type != VREG) 2008 return; 2009 obj = vp->v_object; 2010 KASSERT(obj->type == tmpfs_pager_type && 2011 (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) == 2012 (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj")); 2013 /* unlocked read */ 2014 if (obj->generation != obj->cleangeneration) { 2015 VM_OBJECT_WLOCK(obj); 2016 if (obj->generation != obj->cleangeneration) { 2017 obj->cleangeneration = obj->generation; 2018 node = VP_TO_TMPFS_NODE(vp); 2019 node->tn_status |= TMPFS_NODE_MODIFIED | 2020 TMPFS_NODE_CHANGED; 2021 } 2022 VM_OBJECT_WUNLOCK(obj); 2023 } 2024 } 2025 2026 /* 2027 * Change flags of the given vnode. 2028 * Caller should execute tmpfs_update on vp after a successful execution. 2029 * The vnode must be locked on entry and remain locked on exit. 2030 */ 2031 int 2032 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, 2033 struct thread *td) 2034 { 2035 int error; 2036 struct tmpfs_node *node; 2037 2038 ASSERT_VOP_ELOCKED(vp, "chflags"); 2039 2040 node = VP_TO_TMPFS_NODE(vp); 2041 2042 if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | 2043 UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | 2044 UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | 2045 UF_SPARSE | UF_SYSTEM)) != 0) 2046 return (EOPNOTSUPP); 2047 2048 /* Disallow this operation if the file system is mounted read-only. */ 2049 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2050 return (EROFS); 2051 2052 /* 2053 * Callers may only modify the file flags on objects they 2054 * have VADMIN rights for. 2055 */ 2056 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2057 return (error); 2058 /* 2059 * Unprivileged processes are not permitted to unset system 2060 * flags, or modify flags if any system flags are set. 2061 */ 2062 if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { 2063 if (node->tn_flags & 2064 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { 2065 error = securelevel_gt(cred, 0); 2066 if (error) 2067 return (error); 2068 } 2069 } else { 2070 if (node->tn_flags & 2071 (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || 2072 ((flags ^ node->tn_flags) & SF_SETTABLE)) 2073 return (EPERM); 2074 } 2075 node->tn_flags = flags; 2076 node->tn_status |= TMPFS_NODE_CHANGED; 2077 2078 ASSERT_VOP_ELOCKED(vp, "chflags2"); 2079 2080 return (0); 2081 } 2082 2083 /* 2084 * Change access mode on the given vnode. 2085 * Caller should execute tmpfs_update on vp after a successful execution. 2086 * The vnode must be locked on entry and remain locked on exit. 2087 */ 2088 int 2089 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, 2090 struct thread *td) 2091 { 2092 int error; 2093 struct tmpfs_node *node; 2094 mode_t newmode; 2095 2096 ASSERT_VOP_ELOCKED(vp, "chmod"); 2097 ASSERT_VOP_IN_SEQC(vp); 2098 2099 node = VP_TO_TMPFS_NODE(vp); 2100 2101 /* Disallow this operation if the file system is mounted read-only. */ 2102 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2103 return (EROFS); 2104 2105 /* Immutable or append-only files cannot be modified, either. */ 2106 if (node->tn_flags & (IMMUTABLE | APPEND)) 2107 return (EPERM); 2108 2109 /* 2110 * To modify the permissions on a file, must possess VADMIN 2111 * for that file. 2112 */ 2113 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2114 return (error); 2115 2116 /* 2117 * Privileged processes may set the sticky bit on non-directories, 2118 * as well as set the setgid bit on a file with a group that the 2119 * process is not a member of. 2120 */ 2121 if (vp->v_type != VDIR && (mode & S_ISTXT)) { 2122 if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) 2123 return (EFTYPE); 2124 } 2125 if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { 2126 error = priv_check_cred(cred, PRIV_VFS_SETGID); 2127 if (error) 2128 return (error); 2129 } 2130 2131 newmode = node->tn_mode & ~ALLPERMS; 2132 newmode |= mode & ALLPERMS; 2133 atomic_store_short(&node->tn_mode, newmode); 2134 2135 node->tn_status |= TMPFS_NODE_CHANGED; 2136 2137 ASSERT_VOP_ELOCKED(vp, "chmod2"); 2138 2139 return (0); 2140 } 2141 2142 /* 2143 * Change ownership of the given vnode. At least one of uid or gid must 2144 * be different than VNOVAL. If one is set to that value, the attribute 2145 * is unchanged. 2146 * Caller should execute tmpfs_update on vp after a successful execution. 2147 * The vnode must be locked on entry and remain locked on exit. 2148 */ 2149 int 2150 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, 2151 struct thread *td) 2152 { 2153 int error; 2154 struct tmpfs_node *node; 2155 uid_t ouid; 2156 gid_t ogid; 2157 mode_t newmode; 2158 2159 ASSERT_VOP_ELOCKED(vp, "chown"); 2160 ASSERT_VOP_IN_SEQC(vp); 2161 2162 node = VP_TO_TMPFS_NODE(vp); 2163 2164 /* Assign default values if they are unknown. */ 2165 MPASS(uid != VNOVAL || gid != VNOVAL); 2166 if (uid == VNOVAL) 2167 uid = node->tn_uid; 2168 if (gid == VNOVAL) 2169 gid = node->tn_gid; 2170 MPASS(uid != VNOVAL && gid != VNOVAL); 2171 2172 /* Disallow this operation if the file system is mounted read-only. */ 2173 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2174 return (EROFS); 2175 2176 /* Immutable or append-only files cannot be modified, either. */ 2177 if (node->tn_flags & (IMMUTABLE | APPEND)) 2178 return (EPERM); 2179 2180 /* 2181 * To modify the ownership of a file, must possess VADMIN for that 2182 * file. 2183 */ 2184 if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) 2185 return (error); 2186 2187 /* 2188 * To change the owner of a file, or change the group of a file to a 2189 * group of which we are not a member, the caller must have 2190 * privilege. 2191 */ 2192 if ((uid != node->tn_uid || 2193 (gid != node->tn_gid && !groupmember(gid, cred))) && 2194 (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) 2195 return (error); 2196 2197 ogid = node->tn_gid; 2198 ouid = node->tn_uid; 2199 2200 node->tn_uid = uid; 2201 node->tn_gid = gid; 2202 2203 node->tn_status |= TMPFS_NODE_CHANGED; 2204 2205 if ((node->tn_mode & (S_ISUID | S_ISGID)) != 0 && 2206 (ouid != uid || ogid != gid)) { 2207 if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { 2208 newmode = node->tn_mode & ~(S_ISUID | S_ISGID); 2209 atomic_store_short(&node->tn_mode, newmode); 2210 } 2211 } 2212 2213 ASSERT_VOP_ELOCKED(vp, "chown2"); 2214 2215 return (0); 2216 } 2217 2218 /* 2219 * Change size of the given vnode. 2220 * Caller should execute tmpfs_update on vp after a successful execution. 2221 * The vnode must be locked on entry and remain locked on exit. 2222 */ 2223 int 2224 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, 2225 struct thread *td) 2226 { 2227 int error; 2228 struct tmpfs_node *node; 2229 2230 ASSERT_VOP_ELOCKED(vp, "chsize"); 2231 2232 node = VP_TO_TMPFS_NODE(vp); 2233 2234 /* Decide whether this is a valid operation based on the file type. */ 2235 error = 0; 2236 switch (vp->v_type) { 2237 case VDIR: 2238 return (EISDIR); 2239 2240 case VREG: 2241 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2242 return (EROFS); 2243 break; 2244 2245 case VBLK: 2246 /* FALLTHROUGH */ 2247 case VCHR: 2248 /* FALLTHROUGH */ 2249 case VFIFO: 2250 /* 2251 * Allow modifications of special files even if in the file 2252 * system is mounted read-only (we are not modifying the 2253 * files themselves, but the objects they represent). 2254 */ 2255 return (0); 2256 2257 default: 2258 /* Anything else is unsupported. */ 2259 return (EOPNOTSUPP); 2260 } 2261 2262 /* Immutable or append-only files cannot be modified, either. */ 2263 if (node->tn_flags & (IMMUTABLE | APPEND)) 2264 return (EPERM); 2265 2266 error = vn_rlimit_trunc(size, td); 2267 if (error != 0) 2268 return (error); 2269 2270 error = tmpfs_truncate(vp, size); 2271 /* 2272 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents 2273 * for us, as will update tn_status; no need to do that here. 2274 */ 2275 2276 ASSERT_VOP_ELOCKED(vp, "chsize2"); 2277 2278 return (error); 2279 } 2280 2281 /* 2282 * Change access and modification times of the given vnode. 2283 * Caller should execute tmpfs_update on vp after a successful execution. 2284 * The vnode must be locked on entry and remain locked on exit. 2285 */ 2286 int 2287 tmpfs_chtimes(struct vnode *vp, struct vattr *vap, 2288 struct ucred *cred, struct thread *td) 2289 { 2290 int error; 2291 struct tmpfs_node *node; 2292 2293 ASSERT_VOP_ELOCKED(vp, "chtimes"); 2294 2295 node = VP_TO_TMPFS_NODE(vp); 2296 2297 /* Disallow this operation if the file system is mounted read-only. */ 2298 if (vp->v_mount->mnt_flag & MNT_RDONLY) 2299 return (EROFS); 2300 2301 /* Immutable or append-only files cannot be modified, either. */ 2302 if (node->tn_flags & (IMMUTABLE | APPEND)) 2303 return (EPERM); 2304 2305 error = vn_utimes_perm(vp, vap, cred, td); 2306 if (error != 0) 2307 return (error); 2308 2309 if (vap->va_atime.tv_sec != VNOVAL) 2310 node->tn_accessed = true; 2311 if (vap->va_mtime.tv_sec != VNOVAL) 2312 node->tn_status |= TMPFS_NODE_MODIFIED; 2313 if (vap->va_birthtime.tv_sec != VNOVAL) 2314 node->tn_status |= TMPFS_NODE_MODIFIED; 2315 tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); 2316 if (vap->va_birthtime.tv_sec != VNOVAL) 2317 node->tn_birthtime = vap->va_birthtime; 2318 ASSERT_VOP_ELOCKED(vp, "chtimes2"); 2319 2320 return (0); 2321 } 2322 2323 void 2324 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) 2325 { 2326 2327 if ((node->tn_status & status) == status || tm->tm_ronly) 2328 return; 2329 TMPFS_NODE_LOCK(node); 2330 node->tn_status |= status; 2331 TMPFS_NODE_UNLOCK(node); 2332 } 2333 2334 void 2335 tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) 2336 { 2337 if (node->tn_accessed || tm->tm_ronly) 2338 return; 2339 atomic_store_8(&node->tn_accessed, true); 2340 } 2341 2342 /* Sync timestamps */ 2343 void 2344 tmpfs_itimes(struct vnode *vp, const struct timespec *acc, 2345 const struct timespec *mod) 2346 { 2347 struct tmpfs_node *node; 2348 struct timespec now; 2349 2350 ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); 2351 node = VP_TO_TMPFS_NODE(vp); 2352 2353 if (!node->tn_accessed && 2354 (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) 2355 return; 2356 2357 vfs_timestamp(&now); 2358 TMPFS_NODE_LOCK(node); 2359 if (node->tn_accessed) { 2360 if (acc == NULL) 2361 acc = &now; 2362 node->tn_atime = *acc; 2363 } 2364 if (node->tn_status & TMPFS_NODE_MODIFIED) { 2365 if (mod == NULL) 2366 mod = &now; 2367 node->tn_mtime = *mod; 2368 } 2369 if (node->tn_status & TMPFS_NODE_CHANGED) 2370 node->tn_ctime = now; 2371 node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); 2372 node->tn_accessed = false; 2373 TMPFS_NODE_UNLOCK(node); 2374 2375 /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ 2376 random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); 2377 } 2378 2379 int 2380 tmpfs_truncate(struct vnode *vp, off_t length) 2381 { 2382 struct tmpfs_node *node; 2383 int error; 2384 2385 if (length < 0) 2386 return (EINVAL); 2387 if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) 2388 return (EFBIG); 2389 2390 node = VP_TO_TMPFS_NODE(vp); 2391 error = node->tn_size == length ? 0 : tmpfs_reg_resize(vp, length, 2392 FALSE); 2393 if (error == 0) 2394 node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; 2395 tmpfs_update(vp); 2396 2397 return (error); 2398 } 2399 2400 static __inline int 2401 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) 2402 { 2403 if (a->td_hash > b->td_hash) 2404 return (1); 2405 else if (a->td_hash < b->td_hash) 2406 return (-1); 2407 return (0); 2408 } 2409 2410 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); 2411