/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/sunddi.h>
#include <sys/random.h>
#include <sys/policy.h>
#include <sys/condvar.h>
#include <sys/callb.h>
#include <sys/smp.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>

#include <sys/ccompat.h>

/*
 * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
 * of names after deciding which is the appropriate lookup interface.
 */
static int
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
    matchtype_t mt, uint64_t *zoid)
{
	int error;

	if (zfsvfs->z_norm) {

		/*
		 * In the non-mixed case we only expect there would ever
		 * be one match, but we need to use the normalizing lookup.
		 */
		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
		    zoid, mt, NULL, 0, NULL);
	} else {
		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
	}
	*zoid = ZFS_DIRENT_OBJ(*zoid);

	return (error);
}

/*
 * Look up a directory entry under a locked vnode.
 * dvp being locked gives us a guarantee that there are no concurrent
 * modifications of the directory and, thus, if a node can be found in
 * the directory, then it must not be unlinked.
 *
 * Input arguments:
 *	dzp	- znode for directory
 *	name	- name of entry to lock
 *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
 *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
 *		  ZXATTR: we want dzp's xattr directory
 *
 * Output arguments:
 *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
 *
 * Return value: 0 on success or errno on failure.
 *
 * NOTE: Always checks for, and rejects, '.' and '..'.
 */
int
zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	znode_t *zp;
	matchtype_t mt = 0;
	uint64_t zoid;
	int error = 0;

	if (zfsvfs->z_replay == B_FALSE)
		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);

	*zpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 */
	if (name[0] == '.' &&
	    (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
		return (SET_ERROR(EEXIST));

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
	 * affect how we perform zap lookups.
	 *
	 * When matching we may need to normalize & change case according to
	 * FS settings.
	 *
	 * Note that a normalized match is necessary for a case insensitive
	 * filesystem when the lookup request is not exact because
	 * normalization can fold case independent of normalizing code point
	 * sequences.
	 *
	 * See the table above zfs_dropname().
	 */
	if (zfsvfs->z_norm != 0) {
		mt = MT_NORMALIZE;

		/*
		 * Determine if the match needs to honor the case specified in
		 * lookup, and if so keep track of that so that during
		 * normalization we don't fold case.
		 */
		if (zfsvfs->z_case == ZFS_CASE_MIXED) {
			mt |= MT_MATCH_CASE;
		}
	}

	/*
	 * Only look in or update the DNLC if we are looking for the
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name.
	 *
	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
	 * because in that case MT_EXACT and MT_FIRST should produce exactly
	 * the same result.
	 */

	if (dzp->z_unlinked && !(flag & ZXATTR))
		return (ENOENT);
	if (flag & ZXATTR) {
		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
		    sizeof (zoid));
		if (error == 0)
			error = (zoid == 0 ? ENOENT : 0);
	} else {
		error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
	}
	if (error) {
		if (error != ENOENT || (flag & ZEXISTS)) {
			return (error);
		}
	} else {
		if (flag & ZNEW) {
			return (SET_ERROR(EEXIST));
		}
		error = zfs_zget(zfsvfs, zoid, &zp);
		if (error)
			return (error);
		ASSERT(!zp->z_unlinked);
		*zpp = zp;
	}

	return (0);
}
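
/*
 * Illustrative sketch of how the flag argument above is typically used by a
 * caller; the locals (dzp, name, zp) are hypothetical and not taken from
 * this file:
 *
 *	// Pre-removal lookup: fail with ENOENT unless "name" exists.
 *	error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
 *
 *	// Pre-creation check: fail with EEXIST if "name" already exists.
 *	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 *
 *	// Fetch dzp's extended attribute directory (an empty name is
 *	// passed, as zfs_get_xattrdir() below does).
 *	error = zfs_dirent_lookup(dzp, "", &zp, ZXATTR);
 */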

static int
zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	znode_t *zp;
	uint64_t parent;
	int error;

#ifdef ZFS_DEBUG
	if (zfsvfs->z_replay == B_FALSE)
		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
#endif
	if (dzp->z_unlinked)
		return (ENOENT);

	if ((error = sa_lookup(dzp->z_sa_hdl,
	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
		return (error);

	error = zfs_zget(zfsvfs, parent, &zp);
	if (error == 0)
		*zpp = zp;
	return (error);
}

int
zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
{
	zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
	znode_t *zp = NULL;
	int error = 0;

#ifdef ZFS_DEBUG
	if (zfsvfs->z_replay == B_FALSE)
		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
#endif
	if (dzp->z_unlinked)
		return (SET_ERROR(ENOENT));

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		*zpp = dzp;
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		error = zfs_dd_lookup(dzp, &zp);
		if (error == 0)
			*zpp = zp;
	} else {
		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
		if (error == 0) {
			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
			*zpp = zp;
		}
	}
	return (error);
}

/*
 * unlinked Set (formerly known as the "delete queue") Error Handling
 *
 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * unlinked set, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the nlink list is using a fat zap (ie. has more than
 * 2000 entries), we *may* not pre-read a block that's needed.
 * Therefore it is remotely possible for some of the assertions
 * regarding the unlinked set below to fail due to i/o error.  On a
 * nondebug system, this will result in the space being leaked.
 */
void
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp->z_unlinked);
	ASSERT3U(zp->z_links, ==, 0);

	VERIFY0(zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
}

/*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	zap_cursor_t zc;
	zap_attribute_t *zap;
	dmu_object_info_t doi;
	znode_t *zp;
	dmu_tx_t *tx;
	int error;

	/*
	 * Iterate over the contents of the unlinked set.
	 */
	zap = zap_attribute_alloc();
	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
	    zap_cursor_retrieve(&zc, zap) == 0;
	    zap_cursor_advance(&zc)) {

		/*
		 * See what kind of object we have in the list.
		 */

		error = dmu_object_info(zfsvfs->z_os,
		    zap->za_first_integer, &doi);
		if (error != 0)
			continue;

		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
		/*
		 * We need to re-mark these list entries for deletion,
		 * so we pull them back into core and set zp->z_unlinked.
		 */
		error = zfs_zget(zfsvfs, zap->za_first_integer, &zp);

		/*
		 * We may pick up znodes that are already marked for deletion.
		 * This could happen during the purge of an extended attribute
		 * directory.  All we need to do is skip over them, since they
		 * are already in the system marked z_unlinked.
		 */
		if (error != 0)
			continue;

		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);

		/*
		 * Due to changes in zfs_rmnode we need to make sure the
		 * link count is set to zero here.
		 */
		if (zp->z_links != 0) {
			tx = dmu_tx_create(zfsvfs->z_os);
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error != 0) {
				dmu_tx_abort(tx);
				vput(ZTOV(zp));
				continue;
			}
			zp->z_links = 0;
			VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &zp->z_links, sizeof (zp->z_links), tx));
			dmu_tx_commit(tx);
		}

		zp->z_unlinked = B_TRUE;
		vput(ZTOV(zp));
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
}
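
/*
 * Note on when zfs_unlinked_drain() runs: it has no callers in this file.
 * Per the comments in zfs_rmnode() below, entries left behind here are
 * processed when the file system is remounted, and zfs_rmnode() also
 * schedules zfsvfs->z_unlinked_drain_task on zfsvfs_taskq when it defers an
 * extended attribute directory to the unlinked set.
 */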

/*
 * Delete the entire contents of a directory.  Return a count
 * of the number of entries that could not be deleted.  If we encounter
 * an error, return a count of at least one so that the directory stays
 * in the unlinked set.
 *
 * NOTE: this function assumes that the directory is inactive,
 *	so there is no need to lock its entries before deletion.
 *	Also, it assumes the directory contents are *only* regular
 *	files.
 */
static int
zfs_purgedir(znode_t *dzp)
{
	zap_cursor_t zc;
	zap_attribute_t *zap;
	znode_t *xzp;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	int skipped = 0;
	int error;

	zap = zap_attribute_alloc();
	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
	    (error = zap_cursor_retrieve(&zc, zap)) == 0;
	    zap_cursor_advance(&zc)) {
		error = zfs_zget(zfsvfs,
		    ZFS_DIRENT_OBJ(zap->za_first_integer), &xzp);
		if (error) {
			skipped += 1;
			continue;
		}

		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
		ASSERT((ZTOV(xzp)->v_type == VREG) ||
		    (ZTOV(xzp)->v_type == VLNK));

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap->za_name);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		/* Is this really needed ? */
		zfs_sa_upgrade_txholds(tx, xzp);
		dmu_tx_mark_netfree(tx);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			vput(ZTOV(xzp));
			skipped += 1;
			continue;
		}

		error = zfs_link_destroy(dzp, zap->za_name, xzp, tx, 0, NULL);
		if (error)
			skipped += 1;
		dmu_tx_commit(tx);

		vput(ZTOV(xzp));
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
	if (error != ENOENT)
		skipped += 1;
	return (skipped);
}

extern taskq_t *zfsvfs_taskq;

void
zfs_rmnode(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;
	uint64_t z_id = zp->z_id;
	uint64_t acl_obj;
	uint64_t xattr_obj;
	uint64_t count;
	int error;

	ASSERT3U(zp->z_links, ==, 0);
	if (zfsvfs->z_replay == B_FALSE)
		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);

	/*
	 * If this is an attribute directory, purge its contents.
	 */
	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
	    (zp->z_pflags & ZFS_XATTR)) {
		if (zfs_purgedir(zp) != 0) {
			/*
			 * Not enough space to delete some xattrs.
			 * Leave it in the unlinked set.
			 */
			ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
			zfs_znode_dmu_fini(zp);
			zfs_znode_free(zp);
			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
			return;
		}
	} else {
		/*
		 * Free up all the data in the file.  We don't do this for
		 * XATTR directories because we need truncate and remove to be
		 * in the same tx, like in zfs_znode_delete().  Otherwise, if
		 * we crash here we'll end up with an inconsistent truncated
		 * zap object in the delete queue.  Note a truncated file is
		 * harmless since it only contains user data.
		 */
		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
		if (error) {
			/*
			 * Not enough space or we were interrupted by unmount.
			 * Leave the file in the unlinked set.
			 */
			ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
			zfs_znode_dmu_fini(zp);
			zfs_znode_free(zp);
			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
			return;
		}
	}

	/*
	 * If the file has extended attributes, we're going to unlink
	 * the xattr dir.
	 */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error)
		xattr_obj = 0;

	acl_obj = zfs_external_acl(zp);

	/*
	 * Set up the final transaction.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	if (xattr_obj)
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
	if (acl_obj)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/*
		 * Not enough space to delete the file.  Leave it in the
		 * unlinked set, leaking it until the fs is remounted (at
		 * which point we'll call zfs_unlinked_drain() to process it).
		 */
		dmu_tx_abort(tx);
		ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
		zfs_znode_dmu_fini(zp);
		zfs_znode_free(zp);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		return;
	}

	/*
	 * FreeBSD's implementation of zfs_zget requires a vnode to back it.
	 * This means that we could end up calling into getnewvnode while
	 * calling zfs_rmnode as a result of a prior call to getnewvnode
	 * trying to clear vnodes out of the cache.  If this repeats we can
	 * recurse enough that we overflow our stack.  To avoid this, we
	 * avoid calling zfs_zget on the xattr znode and instead simply add
	 * it to the unlinked set and schedule a call to zfs_unlinked_drain.
	 */
	if (xattr_obj) {
		/* Add extended attribute directory to the unlinked set. */
		VERIFY3U(0, ==,
		    zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
	}

	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	/* Remove this znode from the unlinked set */
	VERIFY3U(0, ==,
	    zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
	}

	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);

	zfs_znode_delete(zp, tx);
	zfs_znode_free(zp);

	dmu_tx_commit(tx);

	if (xattr_obj) {
		/*
		 * We're using the FreeBSD taskqueue API here instead of
		 * the Solaris taskq API since the FreeBSD API allows for a
		 * task to be enqueued multiple times but executed once.
		 */
		taskqueue_enqueue(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);
	}
}

static uint64_t
zfs_dirent(znode_t *zp, uint64_t mode)
{
	uint64_t de = zp->z_id;

	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
		de |= IFTODT(mode) << 60;
	return (de);
}
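
/*
 * Worked example of the encoding above (illustrative values): for a file
 * system whose ZPL version records dirent types, an entry pointing at object
 * number 0x1234 of type directory stores IFTODT(S_IFDIR) == DT_DIR == 4 in
 * the top four bits, i.e. the value (4ULL << 60) | 0x1234.  Readers such as
 * zfs_match_find() and zfs_purgedir() recover the object number with
 * ZFS_DIRENT_OBJ(), which masks the type bits back off.
 */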

/*
 * Link zp into dzp.  Can only fail if zp has been unlinked.
 */
int
zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
    int flag)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	uint64_t value;
	int zp_is_dir = (vp->v_type == VDIR);
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	int count = 0;
	int error;

	if (zfsvfs->z_replay == B_FALSE) {
		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
	}
	if (zp_is_dir) {
		if (dzp->z_links >= ZFS_LINK_MAX)
			return (SET_ERROR(EMLINK));
	}
	if (!(flag & ZRENAMING)) {
		if (zp->z_unlinked) {	/* no new links to unlinked zp */
			ASSERT(!(flag & (ZNEW | ZEXISTS)));
			return (SET_ERROR(ENOENT));
		}
		if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
			return (SET_ERROR(EMLINK));
		}
		zp->z_links++;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
		    &zp->z_links, sizeof (zp->z_links));

	} else {
		ASSERT(!zp->z_unlinked);
	}
	value = zfs_dirent(zp, zp->z_mode);
	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
	    8, 1, &value, tx);

	/*
	 * zap_add could fail to add the entry if it exceeds the capacity of
	 * the leaf-block and zap_leaf_split() failed to help.
	 * The caller of this routine is responsible for failing the
	 * transaction, which will roll back the SA updates done above.
	 */
	if (error != 0) {
		if (!(flag & ZRENAMING) && !(flag & ZNEW))
			zp->z_links--;
		return (error);
	}

	/*
	 * If we added a long name, activate the SPA_FEATURE_LONGNAME feature.
	 */
	if (strlen(name) >= ZAP_MAXNAMELEN) {
		dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
		ds->ds_feature_activation[SPA_FEATURE_LONGNAME] =
		    (void *)B_TRUE;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &dzp->z_id, sizeof (dzp->z_id));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (!(flag & ZNEW)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
		    ctime);
	}
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	dzp->z_size++;
	dzp->z_links += zp_is_dir;
	count = 0;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &dzp->z_links, sizeof (dzp->z_links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);
	return (0);
}

/*
 * The match type in the code for this function should conform to:
 *
 * ------------------------------------------------------------------------
 * fs type  | z_norm      | lookup type | match type
 * ---------|-------------|-------------|----------------------------------
 * CS !norm | 0           |           0 | 0 (exact)
 * CS  norm | formX       |           0 | MT_NORMALIZE
 * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
 * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
 * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
 * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
 * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
 * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
 * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
 * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
 *
 * Abbreviations:
 * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
 * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
 * formX = unicode normalization form set on fs creation
 */
static int
zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
    int flag)
{
	int error;

	if (zp->z_zfsvfs->z_norm) {
		matchtype_t mt = MT_NORMALIZE;

		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
			mt |= MT_MATCH_CASE;
		}

		error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
		    name, mt, tx);
	} else {
		error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
	}

	return (error);
}

/*
 * Unlink zp from dzp, and mark zp for deletion if this was the last link.
 * Can fail if zp is a mount point (EBUSY) or a non-empty directory
 * (ENOTEMPTY).
 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 * and it's the caller's job to do it.
 */
int
zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
    int flag, boolean_t *unlinkedp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	int zp_is_dir = (vp->v_type == VDIR);
	boolean_t unlinked = B_FALSE;
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	int count = 0;
	int error;

	if (zfsvfs->z_replay == B_FALSE) {
		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
	}
	if (!(flag & ZRENAMING)) {

		if (zp_is_dir && !zfs_dirempty(zp))
			return (SET_ERROR(ENOTEMPTY));

		/*
		 * If we get here, we are going to try to remove the object.
		 * First try removing the name from the directory; if that
		 * fails, return the error.
		 */
		error = zfs_dropname(dzp, name, zp, tx, flag);
		if (error != 0) {
			return (error);
		}

		if (zp->z_links <= zp_is_dir) {
			zfs_panic_recover("zfs: link count on vnode %p is %u, "
			    "should be at least %u", zp->z_vnode,
			    (int)zp->z_links,
			    zp_is_dir + 1);
			zp->z_links = zp_is_dir + 1;
		}
		if (--zp->z_links == zp_is_dir) {
			zp->z_unlinked = B_TRUE;
			zp->z_links = 0;
			unlinked = B_TRUE;
		} else {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
			    NULL, &ctime, sizeof (ctime));
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
			    ctime);
		}
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
		    NULL, &zp->z_links, sizeof (zp->z_links));
		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		count = 0;
		ASSERT0(error);
	} else {
		ASSERT(!zp->z_unlinked);
		error = zfs_dropname(dzp, name, zp, tx, flag);
		if (error != 0)
			return (error);
	}

	dzp->z_size--;			/* one dirent removed */
	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
	    NULL, &dzp->z_links, sizeof (dzp->z_links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
	    NULL, ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
	    NULL, mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	if (unlinkedp != NULL)
		*unlinkedp = unlinked;
	else if (unlinked)
		zfs_unlinked_add(zp, tx);

	return (0);
}
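
/*
 * Sketch of the 'unlinkedp' contract above (hypothetical caller, illustrative
 * only): when a non-NULL pointer is passed, the caller is expected to finish
 * the deferred work itself, mirroring what the NULL case does internally:
 *
 *	boolean_t unlinked;
 *
 *	error = zfs_link_destroy(dzp, name, zp, tx, 0, &unlinked);
 *	...					// other updates in the same tx
 *	if (error == 0 && unlinked)
 *		zfs_unlinked_add(zp, tx);
 */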

/*
 * Indicate whether the directory is empty.
 */
boolean_t
zfs_dirempty(znode_t *dzp)
{
	return (dzp->z_size == 2);
}

int
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_t *xzp;
	dmu_tx_t *tx;
	int error;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	uint64_t parent __maybe_unused;

	*xvpp = NULL;

	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
	    &acl_ids, NULL)) != 0)
		return (error);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
		zfs_acl_ids_free(&acl_ids);
		return (SET_ERROR(EDQUOT));
	}

	getnewvnode_reserve();

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		return (error);
	}
	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	ASSERT0(sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent,
	    sizeof (parent)));
	ASSERT3U(parent, ==, zp->z_id);

	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
	    sizeof (xzp->z_id), tx));

	zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	*xvpp = xzp;

	return (0);
}

/*
 * Return a znode for the extended attribute directory for zp.
 * ** If the directory does not already exist, it is created **
 *
 *	IN:	zp	- znode to obtain attribute directory from
 *		cr	- credentials of caller
 *		flags	- flags from the VOP_LOOKUP call
 *
 *	OUT:	xzpp	- pointer to extended attribute znode
 *
 *	RETURN:	0 on success
 *		error number on failure
 */
int
zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_t *xzp;
	vattr_t va;
	int error;
top:
	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
	if (error)
		return (error);

	if (xzp != NULL) {
		*xzpp = xzp;
		return (0);
	}

	if (!(flags & CREATE_XATTR_DIR))
		return (SET_ERROR(ENOATTR));

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		return (SET_ERROR(EROFS));
	}

	/*
	 * The ability to 'create' files in an attribute
	 * directory comes from the write_xattr permission on the base file.
	 *
	 * The ability to 'search' an attribute directory requires
	 * read_xattr permission on the base file.
	 *
	 * Once in a directory the ability to read/write attributes
	 * is controlled by the permissions on the attribute file.
	 */
	va.va_mask = AT_MODE | AT_UID | AT_GID;
	va.va_type = VDIR;
	va.va_mode = S_IFDIR | S_ISVTX | 0777;
	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);

	error = zfs_make_xattrdir(zp, &va, xzpp, cr);

	if (error == ERESTART) {
		/* NB: we already did dmu_tx_wait() if necessary */
		goto top;
	}
	if (error == 0)
		VOP_UNLOCK(ZTOV(*xzpp));

	return (error);
}
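
/*
 * Usage sketch for zfs_get_xattrdir() (hypothetical caller; the locals and
 * error handling are illustrative only).  Pass CREATE_XATTR_DIR when the
 * caller may create the directory on demand; without it, a missing xattr
 * directory yields ENOATTR.  On success the vnode backing *xzpp has already
 * been unlocked above:
 *
 *	znode_t *xzp;
 *
 *	error = zfs_get_xattrdir(zp, &xzp, cr, CREATE_XATTR_DIR);
 *	if (error == 0) {
 *		... use ZTOV(xzp) ...
 *	}
 */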

/*
 * Decide whether it is okay to remove within a sticky directory.
 *
 * In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if:
 *
 *	you own the directory,
 *	you own the entry,
 *	the entry is a plain file and you have write access,
 *	or you are privileged (checked in secpolicy...).
 *
 * The function returns 0 if remove access is granted.
 */
int
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
	uid_t uid;
	uid_t downer;
	uid_t fowner;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;

	if (zdp->z_zfsvfs->z_replay)
		return (0);

	if ((zdp->z_mode & S_ISVTX) == 0)
		return (0);

	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);

	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
	    (ZTOV(zp)->v_type == VREG &&
	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL) == 0))
		return (0);
	else
		return (secpolicy_vnode_remove(ZTOV(zp), cr));
}