/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/atomic.h>
#include <sys/namei.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
#include <sys/extdirent.h>

#ifdef __FreeBSD__
#include <sys/kidmap.h>
#include <sys/bio.h>
#include <vm/vm_param.h>
#endif

#ifdef __NetBSD__
#include <dev/mm.h>
#include <miscfs/fifofs/fifo.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <uvm/uvm_extern.h>
#include <sys/fstrans.h>
#include <sys/malloc.h>

uint_t zfs_putpage_key;
#endif
/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, avoiding races, with ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT().  This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx, then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *	dmu_tx_assign().  This is critical because we don't want to block
 *	while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing
 *	to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
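
/*
 * Illustrative userland sketch (not part of this file's interfaces):
 * the hole/data support above is normally reached via lseek(2), whose
 * SEEK_HOLE and SEEK_DATA whence values map onto the _FIO_SEEK_*
 * ioctls handled in zfs_ioctl() below.  Assuming an open descriptor
 * fd on a sparse file:
 *
 *	off_t data = lseek(fd, (off_t)0, SEEK_DATA);	// first data byte
 *	off_t hole = lseek(fd, data, SEEK_HOLE);	// end of that run
 *
 * Alternating the two calls walks every allocated extent; a return of
 * -1 with errno set to ENXIO means the search ran past end-of-file,
 * matching the ENXIO returns in zfs_holey() above.
 */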

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking them
		 * out is necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

#ifdef __FreeBSD__
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to
	 * DEV_BSIZE aligned boundaries, if the range is not aligned.  As a
	 * result a DEV_BSIZE subrange with partially dirty data may get
	 * marked as clean.  It may happen that all DEV_BSIZE subranges are
	 * marked clean and thus the whole page would be considered clean
	 * despite having some dirty data.  For this reason we should shrink
	 * the range to DEV_BSIZE aligned boundaries before calling
	 * vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}
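
/*
 * A worked example of the DEV_BSIZE trimming above (a sketch assuming
 * DEV_BSIZE == 512): for off = 100 and nbytes = 1000 the written range
 * is [100, 1100).  rounddown2(1100, 512) gives end = 1024 and
 * roundup2(100, 512) gives off = 512, leaving nbytes = 512.  Only the
 * fully-covered subrange [512, 1024) is passed to
 * vm_page_clear_dirty(); the partially written 512-byte subranges at
 * either end keep their dirty bits.
 */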

static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
    int segflg, dmu_tx_t *tx)
{
	vm_object_t obj;
	struct sf_buf *sf;
	caddr_t va;
	int off;

	ASSERT(segflg != UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		int nbytes = imin(PAGESIZE - off, len);

		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
			zfs_vmobject_wunlock(obj);

			va = zfs_map_page(pp, &sf);
			(void) dmu_read(os, oid, start + off, nbytes,
			    va + off, DMU_READ_PREFETCH);
			zfs_unmap_page(sf);

			zfs_vmobject_wlock(obj);
			page_unbusy(pp);
		}
		len -= nbytes;
		off = 0;
	}
	vm_object_pip_wakeupn(obj, 0);
	zfs_vmobject_wunlock(obj);
}

/*
 * Read with UIO_NOCOPY flag means that sendfile(2) requests
 * ZFS to populate a range of page cache pages with data.
 *
 * NOTE: this function could be optimized to pre-allocate
 * all pages in advance, drain exclusive busy on all of them,
 * map them into contiguous KVA region and populate them
 * in one single dmu_read() call.
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}
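
/*
 * The batching suggested in the NOTE above might look roughly like the
 * following (an untested sketch; the flow is illustrative only, not an
 * existing interface):
 *
 *	count = howmany(len, PAGESIZE);
 *	// grab all count pages sbusied, e.g. with vm_page_grab() per index
 *	// map them into one contiguous KVA range as va
 *	error = dmu_read(os, zp->z_id, start, len, va, DMU_READ_PREFETCH);
 *	// unmap, mark each page valid, activate it, and sunbusy it
 *
 * A single dmu_read() would then replace the per-page reads in the
 * loop above, at the cost of keeping more pages busy at once.
 */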

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		otherwise we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	vm_object_t obj;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_hold(vp, start)) {
			struct sf_buf *sf;
			caddr_t va;

			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
#ifdef illumos
			error = uiomove(va + off, bytes, UIO_READ, uio);
#else
			error = vn_io_fault_uiomove(va + off, bytes, uio);
#endif
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			page_unhold(pp);
		} else {
			zfs_vmobject_wunlock(obj);
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
			zfs_vmobject_wlock(obj);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}
#endif /* __FreeBSD__ */

#ifdef __NetBSD__

caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	vaddr_t va;
	int flags;

#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
		return (caddr_t)va;
#endif

	flags = UVMPAGER_MAPIN_WAITOK |
	    (rw == S_READ ? UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ);
	va = uvm_pagermapin(&pp, 1, flags);
	return (caddr_t)va;
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{

#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
	vaddr_t va;

	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
		return;
#endif
	uvm_pagermapout((vaddr_t)addr, 1);
}

static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	struct uvm_object *uobj = &vp->v_uobj;
	krwlock_t *rw = uobj->vmobjlock;
	int64_t start;
	caddr_t va;
	size_t len = nbytes;
	int off;
	int error = 0;
	int npages, found;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;

	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		pp = NULL;
		npages = 1;
		rw_enter(rw, RW_WRITER);
		found = uvn_findpages(uobj, start, &npages, &pp, NULL,
		    UFP_NOALLOC);
		rw_exit(rw);

		/* XXXNETBSD shouldn't access userspace with the page busy */
		if (found) {
			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			rw_enter(rw, RW_WRITER);
			uvm_page_unbusy(&pp, 1);
			rw_exit(rw);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}

		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
    int segflg, dmu_tx_t *tx)
{
	struct uvm_object *uobj = &vp->v_uobj;
	krwlock_t *rw = uobj->vmobjlock;
	caddr_t va;
	int off, status;

	ASSERT(vp->v_mount != NULL);

	rw_enter(rw, RW_WRITER);

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		int nbytes = MIN(PAGESIZE - off, len);
		int npages, found;

		pp = NULL;
		npages = 1;
		found = uvn_findpages(uobj, start, &npages, &pp, NULL,
		    UFP_NOALLOC);
		if (found) {
			/*
			 * We're about to zap the page's contents and don't
			 * care about any existing modifications.  We must
			 * keep track of any new modifications past this
			 * point.  Clear the modified bit in the pmap, and
			 * if the page is marked dirty revert to tracking
			 * the modified bit.
			 */
			switch (uvm_pagegetdirty(pp)) {
			case UVM_PAGE_STATUS_DIRTY:
				/* Does pmap_clear_modify(). */
				uvm_pagemarkdirty(pp, UVM_PAGE_STATUS_UNKNOWN);
				break;
			case UVM_PAGE_STATUS_UNKNOWN:
				pmap_clear_modify(pp);
				break;
			case UVM_PAGE_STATUS_CLEAN:
				/* Nothing to do. */
				break;
			}
			rw_exit(rw);

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start + off, nbytes,
			    va + off, DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);

			rw_enter(rw, RW_WRITER);
			uvm_page_unbusy(&pp, 1);
		}
		len -= nbytes;
		off = 0;
	}
	rw_exit(rw);
}
#endif /* __NetBSD__ */

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
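
/*
 * A short worked example of how this tunable bounds each chunk (see the
 * P2PHASE() arithmetic in zfs_read() below): with the default 1MB chunk
 * size, a 3MB read starting at offset 1.5MB is issued as chunks of
 * 512KB, 1MB, 1MB and 512KB -- the first chunk is trimmed so that every
 * subsequent chunk starts on a chunk-aligned offset.
 */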

/*
 * Read bytes from specified file into supplied buffer.
 *
 * IN:	vp	- vnode of file to be read from.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	ssize_t	n, nbytes;
	int error = 0;
	rl_t *rl;
	xuio_t *xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
 *	ct	- caller context (NFS/CIFS fem monitor only)
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	rlim64_t limit = MAXOFFSET_T;
	ssize_t	start_resid = uio->uio_resid;
	ssize_t	tx_bytes;
	uint64_t end_size;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t	*zilog;
	offset_t woff;
	ssize_t	n, nbytes;
	rl_t *rl;
	int max_blksz = zfsvfs->z_max_blksz;
	int error = 0;
	arc_buf_t *abuf;
	iovec_t *aiov = NULL;
	xuio_t *xuio = NULL;
	int i_iov = 0;
	int iovcnt = uio->uio_iovcnt;
	iovec_t *iovp = uio->uio_iov;
	int write_eof;
	int count = 0;
	sa_bulk_attr_t bulk[4];
	uint64_t mtime[2], ctime[2];
	int segflg;

#ifdef __NetBSD__
	segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ?
	    UIO_SYSSPACE : UIO_USERSPACE;
#else
	segflg = uio->uio_segflg;
#endif

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g.
	 * snapshots), our callers might not be able to detect properly that
	 * we are read-only, so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef illumos
	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

#ifdef illumos
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}
#endif
#ifdef __FreeBSD__
	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}
#endif
#ifdef __NetBSD__
	/* XXXNETBSD we might need vn_rlimit_fsize() too here eventually */
#endif

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
#if defined(illumos) || defined(__NetBSD__)
			size_t cbytes;
#endif

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
#if defined(illumos) || defined(__NetBSD__)
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
#endif
#ifdef __FreeBSD__
			ssize_t resid = uio->uio_resid;

			error = vn_io_fault_uiomove(abuf->b_data, max_blksz,
			    uio);
			if (error != 0) {
				uio->uio_offset -= resid - uio->uio_resid;
				uio->uio_resid = resid;
				dmu_return_arcbuf(abuf);
				break;
			}
#endif
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked, we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
#if defined(illumos) || defined(__NetBSD__)
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
#endif
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
#ifdef __NetBSD__
			cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid,
			    true);
#endif
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
#ifdef illumos
			ASSERT(error == 0);
#else
			ASSERT(error == 0 || error == EFAULT);
#endif
		}
		/*
		 * If we are replaying and eof is non-zero, then force
		 * the file size to the specified eof.  Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		if (error == 0)
			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		else
			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

#ifdef illumos
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget_cleaner(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_CLEANER(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure that when it's
		 * written out and its checksum is being calculated
		 * no one can change the data.  We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
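
/*
 * A worked example of the block-alignment loop in the indirect path
 * above (a sketch assuming zp->z_blksz == 8192): for offset == 70000,
 * blkoff = P2PHASE(70000, 8192) == 4464, so the range lock is taken at
 * offset 65536 for 8192 bytes -- the whole block containing the write.
 * If another thread grew z_blksz before the lock was acquired, the
 * re-check against zp->z_blksz fails and the loop retries with the new
 * block size.
 */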

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

#ifdef __FreeBSD__
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relocking for the "." case could leave us with
			 * a reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the
		 * vnodes.  All other filesystems seem to ignore this problem,
		 * so we do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and
		 *   retrying if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;

	/* fast path (should be redundant with vfs namecache) */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}


	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			struct componentname cn;
			vnode_t *zfsctl_vp;
			int ltype;

			ZFS_EXIT(zfsvfs);
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
			    &zfsctl_vp);
			if (error == 0) {
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = cnp->cn_nameiop;
				cn.cn_flags = cnp->cn_flags;
				cn.cn_lkflags = cnp->cn_lkflags;
				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
				vput(zfsctl_vp);
			}
			vn_lock(dvp, ltype | LK_RETRY);
			return (error);
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		ZFS_EXIT(zfsvfs);
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			return (SET_ERROR(ENOTSUP));
		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
		return (error);
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		vput(ZTOV(zp));
	}

out:
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}
#endif /* __FreeBSD__ */
1913
1914 #ifdef __NetBSD__
1915 /*
1916 * If vnode is for a device return a specfs vnode instead.
1917 */
1918 static int
specvp_check(vnode_t ** vpp,cred_t * cr)1919 specvp_check(vnode_t **vpp, cred_t *cr)
1920 {
1921 int error = 0;
1922
1923 if (IS_DEVVP(*vpp)) {
1924 struct vnode *svp;
1925
1926 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1927 VN_RELE(*vpp);
1928 if (svp == NULL)
1929 error = ENOSYS;
1930 *vpp = svp;
1931 }
1932 return (error);
1933 }
1934
1935 /*
1936 * Lookup an entry in a directory, or an extended attribute directory.
1937 * If it exists, return a held vnode reference for it.
1938 *
1939 * IN: dvp - vnode of directory to search.
1940 * nm - name of entry to lookup.
1941 * pnp - full pathname to lookup [UNUSED].
1942 * flags - LOOKUP_XATTR set if looking for an attribute.
1943 * rdir - root directory vnode [UNUSED].
1944 * cr - credentials of caller.
1945 * ct - caller context
1946 * direntflags - directory lookup flags
1947 * realpnp - returned pathname.
1948 *
1949 * OUT: vpp - vnode of located entry, NULL if not found.
1950 *
1951 * RETURN: 0 if success
1952 * error code if failure
1953 *
1954 * Timestamps:
1955 * NA
1956 */
1957 /* ARGSUSED */
1958 static int
zfs_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,int flags,struct componentname * cnp,int nameiop,cred_t * cr)1959 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, int flags,
1960 struct componentname *cnp, int nameiop, cred_t *cr)
1961 {
1962 znode_t *zdp = VTOZ(dvp);
1963 znode_t *zp;
1964 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1965 int error = 0;
1966
1967 /* fast path */
1968 if (!(flags & LOOKUP_XATTR)) {
1969 if (dvp->v_type != VDIR) {
1970 return (ENOTDIR);
1971 } else if (zdp->z_sa_hdl == NULL) {
1972 return (SET_ERROR(EIO));
1973 }
1974
1975 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1976 error = zfs_fastaccesschk_execute(zdp, cr);
1977 if (!error) {
1978 *vpp = dvp;
1979 VN_HOLD(*vpp);
1980 return (0);
1981 }
1982 return (error);
1983 } else {
1984 vnode_t *tvp = dnlc_lookup(dvp, nm);
1985
1986 if (tvp) {
1987 error = zfs_fastaccesschk_execute(zdp, cr);
1988 if (error) {
1989 VN_RELE(tvp);
1990 return (error);
1991 }
1992 if (tvp == DNLC_NO_VNODE) {
1993 VN_RELE(tvp);
1994 return (ENOENT);
1995 } else {
1996 *vpp = tvp;
1997 return (specvp_check(vpp, cr));
1998 }
1999 }
2000 }
2001 }
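	/* The fast path missed; fall through to the full, locked lookup. */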
2002
2003 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
2004
2005 ZFS_ENTER(zfsvfs);
2006 ZFS_VERIFY_ZP(zdp);
2007
2008 *vpp = NULL;
2009
2010 if (flags & LOOKUP_XATTR) {
2011 #ifdef TODO
2012 /*
2013 * If the xattr property is off, refuse the lookup request.
2014 */
2015 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
2016 ZFS_EXIT(zfsvfs);
2017 return (EINVAL);
2018 }
2019 #endif
2020
2021 /*
2022 	 * We don't allow recursive attributes.
2023 * Maybe someday we will.
2024 */
2025 if (zdp->z_pflags & ZFS_XATTR) {
2026 ZFS_EXIT(zfsvfs);
2027 return (EINVAL);
2028 }
2029
2030 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
2031 ZFS_EXIT(zfsvfs);
2032 return (error);
2033 }
2034
2035 /*
2036 * Do we have permission to get into attribute directory?
2037 */
2038 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
2039 B_FALSE, cr)) {
2040 VN_RELE(*vpp);
2041 *vpp = NULL;
2042 }
2043
2044 ZFS_EXIT(zfsvfs);
2045 return (error);
2046 }
2047
2048 if (dvp->v_type != VDIR) {
2049 ZFS_EXIT(zfsvfs);
2050 return (ENOTDIR);
2051 }
2052
2053 /*
2054 * Check accessibility of directory.
2055 */
2056 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
2057 ZFS_EXIT(zfsvfs);
2058 return (error);
2059 }
2060
2061 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
2062 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2063 ZFS_EXIT(zfsvfs);
2064 return (EILSEQ);
2065 }
2066
2067 /*
2068 * First handle the special cases.
2069 */
2070 if ((cnp->cn_flags & ISDOTDOT) != 0) {
2071 /*
2072 * If we are a snapshot mounted under .zfs, return
2073 * the vp for the snapshot directory.
2074 */
2075 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
2076 ZFS_EXIT(zfsvfs);
2077 error = zfsctl_snapshot(zfsvfs->z_parent, vpp);
2078
2079 return (error);
2080 }
2081 }
2082 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
2083 ZFS_EXIT(zfsvfs);
2084 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
2085 return (SET_ERROR(ENOTSUP));
2086 error = zfsctl_root(zfsvfs, vpp);
2087 return (error);
2088 }
2089
2090 error = zfs_dirlook(zdp, nm, &zp);
2091 if (error == 0) {
2092 *vpp = ZTOV(zp);
2093 error = specvp_check(vpp, cr);
2094 }
2095
2096 ZFS_EXIT(zfsvfs);
2097 return (error);
2098 }
2099 #endif
2100
2101 /*
2102 * Attempt to create a new entry in a directory. If the entry
2103 * already exists, truncate the file if permissible, else return
2104 * an error. Return the vp of the created or trunc'd file.
2105 *
2106 * IN: dvp - vnode of directory to put new file entry in.
2107 * name - name of new file entry.
2108 * vap - attributes of new file.
2109 * excl - flag indicating exclusive or non-exclusive mode.
2110 * mode - mode to open file with.
2111 * cr - credentials of caller.
2112 * flag - large file flag [UNUSED].
2113 * ct - caller context
2114 * vsecp - ACL to be set
2115 *
2116 * OUT: vpp - vnode of created or trunc'd entry.
2117 *
2118 * RETURN: 0 on success, error code on failure.
2119 *
2120 * Timestamps:
2121 * dvp - ctime|mtime updated if new entry created
2122 * vp - ctime|mtime always, atime if new
2123 */
2124
2125 /* ARGSUSED */
2126 static int
2127 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
2128 vnode_t **vpp, cred_t *cr, kthread_t *td)
2129 {
2130 znode_t *zp, *dzp = VTOZ(dvp);
2131 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2132 zilog_t *zilog;
2133 objset_t *os;
2134 dmu_tx_t *tx;
2135 int error;
2136 ksid_t *ksid;
2137 uid_t uid;
2138 gid_t gid = crgetgid(cr);
2139 zfs_acl_ids_t acl_ids;
2140 boolean_t fuid_dirtied;
2141 void *vsecp = NULL;
2142 int flag = 0;
2143 uint64_t txtype;
2144
2145 /*
2146 * If we have an ephemeral id, ACL, or XVATTR then
2147 	 * make sure the file system is at the proper version.
2148 */
2149
2150 ksid = crgetsid(cr, KSID_OWNER);
2151 if (ksid)
2152 uid = ksid_getid(ksid);
2153 else
2154 uid = crgetuid(cr);
2155
2156 if (zfsvfs->z_use_fuids == B_FALSE &&
2157 (vsecp || (vap->va_mask & AT_XVATTR) ||
2158 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2159 return (SET_ERROR(EINVAL));
2160
2161 ZFS_ENTER(zfsvfs);
2162 ZFS_VERIFY_ZP(dzp);
2163 os = zfsvfs->z_os;
2164 zilog = zfsvfs->z_log;
2165
2166 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
2167 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2168 ZFS_EXIT(zfsvfs);
2169 return (SET_ERROR(EILSEQ));
2170 }
2171
2172 if (vap->va_mask & AT_XVATTR) {
2173 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2174 crgetuid(cr), cr, vap->va_type)) != 0) {
2175 ZFS_EXIT(zfsvfs);
2176 return (error);
2177 }
2178 }
2179
2180 *vpp = NULL;
2181
2182 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
2183 vap->va_mode &= ~S_ISVTX;
2184
2185 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
2186 if (error) {
2187 ZFS_EXIT(zfsvfs);
2188 return (error);
2189 }
2190 ASSERT3P(zp, ==, NULL);
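	/*
	 * ZNEW asks zfs_dirent_lookup() to fail if the name already
	 * exists, so on success zp must still be NULL and we are free
	 * to create the entry.
	 */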
2191
2192 /*
2193 * Create a new file object and update the directory
2194 * to reference it.
2195 */
2196 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
2197 goto out;
2198 }
2199
2200 /*
2201 * We only support the creation of regular files in
2202 * extended attribute directories.
2203 */
2204
2205 if ((dzp->z_pflags & ZFS_XATTR) &&
2206 (vap->va_type != VREG)) {
2207 error = SET_ERROR(EINVAL);
2208 goto out;
2209 }
2210
2211 if ((error = zfs_acl_ids_create(dzp, 0, vap,
2212 cr, vsecp, &acl_ids)) != 0)
2213 goto out;
2214
2215 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2216 zfs_acl_ids_free(&acl_ids);
2217 error = SET_ERROR(EDQUOT);
2218 goto out;
2219 }
2220
2221 getnewvnode_reserve(1);
2222
2223 tx = dmu_tx_create(os);
2224
2225 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2226 ZFS_SA_BASE_ATTR_SIZE);
2227
2228 fuid_dirtied = zfsvfs->z_fuid_dirty;
2229 if (fuid_dirtied)
2230 zfs_fuid_txhold(zfsvfs, tx);
2231 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2232 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
2233 if (!zfsvfs->z_use_sa &&
2234 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2235 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2236 0, acl_ids.z_aclp->z_acl_bytes);
2237 }
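	/*
	 * The holds above cover everything this create can dirty: the
	 * new object's SA create, the FUID tables if they are dirty,
	 * the parent's ZAP entry and SA, and a separate write when the
	 * ACL is too large for the bonus buffer.
	 */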
2238 error = dmu_tx_assign(tx, TXG_WAIT);
2239 if (error) {
2240 zfs_acl_ids_free(&acl_ids);
2241 dmu_tx_abort(tx);
2242 getnewvnode_drop_reserve();
2243 ZFS_EXIT(zfsvfs);
2244 return (error);
2245 }
2246 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2247
2248 if (fuid_dirtied)
2249 zfs_fuid_sync(zfsvfs, tx);
2250
2251 (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
2252 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
2253 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
2254 vsecp, acl_ids.z_fuidp, vap);
2255 zfs_acl_ids_free(&acl_ids);
2256 dmu_tx_commit(tx);
2257
2258 getnewvnode_drop_reserve();
2259
2260 out:
2261 if (error == 0) {
2262 *vpp = ZTOV(zp);
2263 }
2264
2265 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2266 zil_commit(zilog, 0);
2267
2268 ZFS_EXIT(zfsvfs);
2269 return (error);
2270 }
2271
2272 /*
2273 * Remove an entry from a directory.
2274 *
2275 * IN: dvp - vnode of directory to remove entry from.
2276 * name - name of entry to remove.
2277 * cr - credentials of caller.
2278 * ct - caller context
2279 * flags - case flags
2280 *
2281 * RETURN: 0 on success, error code on failure.
2282 *
2283 * Timestamps:
2284 * dvp - ctime|mtime
2285 * vp - ctime (if nlink > 0)
2286 */
2287
2288 /*ARGSUSED*/
2289 static int
2290 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2291 {
2292 znode_t *dzp = VTOZ(dvp);
2293 znode_t *zp = VTOZ(vp);
2294 znode_t *xzp;
2295 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2296 zilog_t *zilog;
2297 uint64_t acl_obj, xattr_obj;
2298 uint64_t obj = 0;
2299 dmu_tx_t *tx;
2300 boolean_t unlinked, toobig = FALSE;
2301 uint64_t txtype;
2302 int error;
2303
2304 ZFS_ENTER(zfsvfs);
2305 ZFS_VERIFY_ZP(dzp);
2306 ZFS_VERIFY_ZP(zp);
2307 zilog = zfsvfs->z_log;
2308 zp = VTOZ(vp);
2309
2310 xattr_obj = 0;
2311 xzp = NULL;
2312
2313 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2314 goto out;
2315 }
2316
2317 /*
2318 * Need to use rmdir for removing directories.
2319 */
2320 if (vp->v_type == VDIR) {
2321 error = SET_ERROR(EPERM);
2322 goto out;
2323 }
2324
2325 vnevent_remove(vp, dvp, name, ct);
2326
2327 obj = zp->z_id;
2328
2329 /* are there any extended attributes? */
2330 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2331 &xattr_obj, sizeof (xattr_obj));
2332 if (error == 0 && xattr_obj) {
2333 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
2334 ASSERT0(error);
2335 }
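	/*
	 * If the file has an extended attribute directory, hold its
	 * znode as well: removing the last link may free the xattr
	 * tree along with the file itself.
	 */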
2336
2337 /*
2338 * We may delete the znode now, or we may put it in the unlinked set;
2339 * it depends on whether we're the last link, and on whether there are
2340 * other holds on the vnode. So we dmu_tx_hold() the right things to
2341 * allow for either case.
2342 */
2343 tx = dmu_tx_create(zfsvfs->z_os);
2344 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2345 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2346 zfs_sa_upgrade_txholds(tx, zp);
2347 zfs_sa_upgrade_txholds(tx, dzp);
2348
2349 if (xzp) {
2350 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2351 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
2352 }
2353
2354 /* charge as an update -- would be nice not to charge at all */
2355 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2356
2357 /*
2358 * Mark this transaction as typically resulting in a net free of space
2359 */
2360 dmu_tx_mark_netfree(tx);
2361
2362 error = dmu_tx_assign(tx, TXG_WAIT);
2363 if (error) {
2364 dmu_tx_abort(tx);
2365 ZFS_EXIT(zfsvfs);
2366 return (error);
2367 }
2368
2369 /*
2370 * Remove the directory entry.
2371 */
2372 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2373
2374 if (error) {
2375 dmu_tx_commit(tx);
2376 goto out;
2377 }
2378
2379 if (unlinked) {
2380 zfs_unlinked_add(zp, tx);
2381 vp->v_vflag |= VV_NOSYNC;
2382 }
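	/*
	 * If that was the last link, the znode was parked on the
	 * unlinked set above for deferred reclaim (the vnode may still
	 * be held); VV_NOSYNC spares the syncer any further effort on
	 * data that is about to be freed.
	 */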
2383
2384 txtype = TX_REMOVE;
2385 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2386
2387 dmu_tx_commit(tx);
2388 out:
2389
2390 if (xzp)
2391 vrele(ZTOV(xzp));
2392
2393 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2394 zil_commit(zilog, 0);
2395
2396 ZFS_EXIT(zfsvfs);
2397 return (error);
2398 }
2399
2400 /*
2401 * Create a new directory and insert it into dvp using the name
2402 * provided. Return a pointer to the inserted directory.
2403 *
2404 * IN: dvp - vnode of directory to add subdir to.
2405 * dirname - name of new directory.
2406 * vap - attributes of new directory.
2407 * cr - credentials of caller.
2408 * ct - caller context
2409 * flags - case flags
2410 * vsecp - ACL to be set
2411 *
2412 * OUT: vpp - vnode of created directory.
2413 *
2414 * RETURN: 0 on success, error code on failure.
2415 *
2416 * Timestamps:
2417 * dvp - ctime|mtime updated
2418 * vp - ctime|mtime|atime updated
2419 */
2420 /*ARGSUSED*/
2421 static int
2422 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2423 {
2424 znode_t *zp, *dzp = VTOZ(dvp);
2425 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2426 zilog_t *zilog;
2427 uint64_t txtype;
2428 dmu_tx_t *tx;
2429 int error;
2430 ksid_t *ksid;
2431 uid_t uid;
2432 gid_t gid = crgetgid(cr);
2433 zfs_acl_ids_t acl_ids;
2434 boolean_t fuid_dirtied;
2435
2436 ASSERT(vap->va_type == VDIR);
2437
2438 /*
2439 * If we have an ephemeral id, ACL, or XVATTR then
2440 	 * make sure the file system is at the proper version.
2441 */
2442
2443 ksid = crgetsid(cr, KSID_OWNER);
2444 if (ksid)
2445 uid = ksid_getid(ksid);
2446 else
2447 uid = crgetuid(cr);
2448 if (zfsvfs->z_use_fuids == B_FALSE &&
2449 ((vap->va_mask & AT_XVATTR) ||
2450 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2451 return (SET_ERROR(EINVAL));
2452
2453 ZFS_ENTER(zfsvfs);
2454 ZFS_VERIFY_ZP(dzp);
2455 zilog = zfsvfs->z_log;
2456
2457 if (dzp->z_pflags & ZFS_XATTR) {
2458 ZFS_EXIT(zfsvfs);
2459 return (SET_ERROR(EINVAL));
2460 }
2461
2462 if (zfsvfs->z_utf8 && u8_validate(dirname,
2463 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2464 ZFS_EXIT(zfsvfs);
2465 return (SET_ERROR(EILSEQ));
2466 }
2467
2468 if (vap->va_mask & AT_XVATTR) {
2469 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2470 crgetuid(cr), cr, vap->va_type)) != 0) {
2471 ZFS_EXIT(zfsvfs);
2472 return (error);
2473 }
2474 }
2475
2476 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2477 NULL, &acl_ids)) != 0) {
2478 ZFS_EXIT(zfsvfs);
2479 return (error);
2480 }
2481
2482 /*
2483 * First make sure the new directory doesn't exist.
2484 *
2485 * Existence is checked first to make sure we don't return
2486 * EACCES instead of EEXIST which can cause some applications
2487 * to fail.
2488 */
2489 *vpp = NULL;
2490
2491 if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2492 zfs_acl_ids_free(&acl_ids);
2493 ZFS_EXIT(zfsvfs);
2494 return (error);
2495 }
2496 ASSERT3P(zp, ==, NULL);
2497
2498 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2499 zfs_acl_ids_free(&acl_ids);
2500 ZFS_EXIT(zfsvfs);
2501 return (error);
2502 }
2503
2504 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2505 zfs_acl_ids_free(&acl_ids);
2506 ZFS_EXIT(zfsvfs);
2507 return (SET_ERROR(EDQUOT));
2508 }
2509
2510 /*
2511 * Add a new entry to the directory.
2512 */
2513 getnewvnode_reserve(1);
2514 tx = dmu_tx_create(zfsvfs->z_os);
2515 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2516 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2517 fuid_dirtied = zfsvfs->z_fuid_dirty;
2518 if (fuid_dirtied)
2519 zfs_fuid_txhold(zfsvfs, tx);
2520 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2521 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2522 acl_ids.z_aclp->z_acl_bytes);
2523 }
2524
2525 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2526 ZFS_SA_BASE_ATTR_SIZE);
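	/*
	 * Note the extra ZAP hold on DMU_NEW_OBJECT above: unlike a
	 * plain file, a new directory gets its own ZAP object to hold
	 * its entries.
	 */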
2527
2528 error = dmu_tx_assign(tx, TXG_WAIT);
2529 if (error) {
2530 zfs_acl_ids_free(&acl_ids);
2531 dmu_tx_abort(tx);
2532 getnewvnode_drop_reserve();
2533 ZFS_EXIT(zfsvfs);
2534 return (error);
2535 }
2536
2537 /*
2538 * Create new node.
2539 */
2540 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2541
2542 if (fuid_dirtied)
2543 zfs_fuid_sync(zfsvfs, tx);
2544
2545 /*
2546 * Now put new name in parent dir.
2547 */
2548 (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2549
2550 *vpp = ZTOV(zp);
2551
2552 txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2553 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2554 acl_ids.z_fuidp, vap);
2555
2556 zfs_acl_ids_free(&acl_ids);
2557
2558 dmu_tx_commit(tx);
2559
2560 getnewvnode_drop_reserve();
2561
2562 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2563 zil_commit(zilog, 0);
2564
2565 ZFS_EXIT(zfsvfs);
2566 return (0);
2567 }
2568
2569 /*
2570 * Remove a directory subdir entry. If the current working
2571 * directory is the same as the subdir to be removed, the
2572 * remove will fail.
2573 *
2574 * IN: dvp - vnode of directory to remove from.
2575 * name - name of directory to be removed.
2576 * cwd - vnode of current working directory.
2577 * cr - credentials of caller.
2578 * ct - caller context
2579 * flags - case flags
2580 *
2581 * RETURN: 0 on success, error code on failure.
2582 *
2583 * Timestamps:
2584 * dvp - ctime|mtime updated
2585 */
2586 /*ARGSUSED*/
2587 static int
2588 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2589 {
2590 znode_t *dzp = VTOZ(dvp);
2591 znode_t *zp = VTOZ(vp);
2592 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2593 zilog_t *zilog;
2594 dmu_tx_t *tx;
2595 int error;
2596
2597 ZFS_ENTER(zfsvfs);
2598 ZFS_VERIFY_ZP(dzp);
2599 ZFS_VERIFY_ZP(zp);
2600 zilog = zfsvfs->z_log;
2601
2602
2603 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2604 goto out;
2605 }
2606
2607 if (vp->v_type != VDIR) {
2608 error = SET_ERROR(ENOTDIR);
2609 goto out;
2610 }
2611
2612 vnevent_rmdir(vp, dvp, name, ct);
2613
2614 tx = dmu_tx_create(zfsvfs->z_os);
2615 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2616 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2617 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2618 zfs_sa_upgrade_txholds(tx, zp);
2619 zfs_sa_upgrade_txholds(tx, dzp);
2620 dmu_tx_mark_netfree(tx);
2621 error = dmu_tx_assign(tx, TXG_WAIT);
2622 if (error) {
2623 dmu_tx_abort(tx);
2624 ZFS_EXIT(zfsvfs);
2625 return (error);
2626 }
2627
2628 cache_purge(dvp);
2629
2630 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2631
2632 if (error == 0) {
2633 uint64_t txtype = TX_RMDIR;
2634 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2635 }
2636
2637 dmu_tx_commit(tx);
2638
2639 cache_purge(vp);
2640 out:
2641 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2642 zil_commit(zilog, 0);
2643
2644 ZFS_EXIT(zfsvfs);
2645 return (error);
2646 }
2647
2648 /*
2649 * Read as many directory entries as will fit into the provided
2650 * buffer from the given directory cursor position (specified in
2651 * the uio structure).
2652 *
2653 * IN: vp - vnode of directory to read.
2654 * uio - structure supplying read location, range info,
2655 * and return buffer.
2656 * cr - credentials of caller.
2657 * ct - caller context
2658 * flags - case flags
2659 *
2660 * OUT: uio - updated offset and range, buffer filled.
2661 * eofp - set to true if end-of-file detected.
2662 *
2663 * RETURN: 0 on success, error code on failure.
2664 *
2665 * Timestamps:
2666 * vp - atime updated
2667 *
2668  * Note that the low 4 bits of the cookie returned by the ZAP are always zero.
2669 * This allows us to use the low range for "special" directory entries:
2670 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2671 * we use the offset 2 for the '.zfs' directory.
2672 */
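/*
 * Since real ZAP cookies have their low bits clear, an offset of 3 or
 * less can only name one of the synthetic entries above; that is why
 * the cursor setup below treats offset <= 3 as "start from the top".
 */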
2673 /* ARGSUSED */
2674 static int
2675 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies)
2676 {
2677 znode_t *zp = VTOZ(vp);
2678 iovec_t *iovp;
2679 edirent_t *eodp;
2680 dirent64_t *odp;
2681 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2682 objset_t *os;
2683 caddr_t outbuf;
2684 size_t bufsize;
2685 zap_cursor_t zc;
2686 zap_attribute_t zap;
2687 uint_t bytes_wanted;
2688 uint64_t offset; /* must be unsigned; checks for < 1 */
2689 uint64_t parent;
2690 int local_eof;
2691 int outcount;
2692 int error;
2693 uint8_t prefetch;
2694 boolean_t check_sysattrs;
2695 uint8_t type;
2696 int ncooks = 0;
2697 off_t *cooks = NULL;
2698 int flags = 0;
2699 #ifdef __FreeBSD__
2700 boolean_t user = uio->uio_segflg != UIO_SYSSPACE;
2701 #endif
2702 #ifdef __NetBSD__
2703 boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace);
2704 #endif
2705
2706 ZFS_ENTER(zfsvfs);
2707 ZFS_VERIFY_ZP(zp);
2708
2709 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2710 &parent, sizeof (parent))) != 0) {
2711 ZFS_EXIT(zfsvfs);
2712 return (error);
2713 }
2714
2715 /*
2716 * If we are not given an eof variable,
2717 * use a local one.
2718 */
2719 if (eofp == NULL)
2720 eofp = &local_eof;
2721
2722 /*
2723 * Check for valid iov_len.
2724 */
2725 if (uio->uio_iov->iov_len <= 0) {
2726 ZFS_EXIT(zfsvfs);
2727 return (SET_ERROR(EINVAL));
2728 }
2729
2730 /*
2731 * Quit if directory has been removed (posix)
2732 */
2733 if ((*eofp = zp->z_unlinked) != 0) {
2734 ZFS_EXIT(zfsvfs);
2735 return (0);
2736 }
2737
2738 error = 0;
2739 os = zfsvfs->z_os;
2740 offset = uio->uio_loffset;
2741 prefetch = zp->z_zn_prefetch;
2742
2743 /*
2744 * Initialize the iterator cursor.
2745 */
2746 if (offset <= 3) {
2747 /*
2748 * Start iteration from the beginning of the directory.
2749 */
2750 zap_cursor_init(&zc, os, zp->z_id);
2751 } else {
2752 /*
2753 * The offset is a serialized cursor.
2754 */
2755 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2756 }
2757
2758 /*
2759 * Get space to change directory entries into fs independent format.
2760 */
2761 iovp = uio->uio_iov;
2762 bytes_wanted = iovp->iov_len;
2763 if (user || uio->uio_iovcnt != 1) {
2764 bufsize = bytes_wanted;
2765 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2766 odp = (struct dirent64 *)outbuf;
2767 } else {
2768 bufsize = bytes_wanted;
2769 outbuf = NULL;
2770 odp = (struct dirent64 *)iovp->iov_base;
2771 }
2772 eodp = (struct edirent *)odp;
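	/*
	 * A single kernel iovec can be filled in place; a user-space or
	 * multi-iovec request is staged in outbuf and copied out with
	 * uiomove() once the entries have been assembled.
	 */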
2773
2774 if (ncookies != NULL) {
2775 /*
2776 * Minimum entry size is dirent size and 1 byte for a file name.
2777 */
2778 #ifdef __FreeBSD__
2779 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2780 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2781 #endif
2782 #ifdef __NetBSD__
2783 ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
2784 cooks = malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK);
2785 #endif
2786 *cookies = cooks;
2787 *ncookies = ncooks;
2788 }
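	/*
	 * One cookie is emitted per returned entry: the serialized
	 * cursor position at which a later readdir should resume. The
	 * array is sized for the worst case and trimmed afterwards.
	 */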
2789
2790 /*
2791 * If this VFS supports the system attribute view interface; and
2792 * we're looking at an extended attribute directory; and we care
2793 * about normalization conflicts on this vfs; then we must check
2794 * for normalization conflicts with the sysattr name space.
2795 */
2796 #ifdef TODO
2797 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2798 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2799 (flags & V_RDDIR_ENTFLAGS);
2800 #else
2801 check_sysattrs = 0;
2802 #endif
2803
2804 /*
2805 * Transform to file-system independent format
2806 */
2807 outcount = 0;
2808 while (outcount < bytes_wanted) {
2809 ino64_t objnum;
2810 ushort_t reclen;
2811 off64_t *next = NULL;
2812
2813 /*
2814 * Special case `.', `..', and `.zfs'.
2815 */
2816 if (offset == 0) {
2817 (void) strcpy(zap.za_name, ".");
2818 zap.za_normalization_conflict = 0;
2819 objnum = zp->z_id;
2820 type = DT_DIR;
2821 } else if (offset == 1) {
2822 (void) strcpy(zap.za_name, "..");
2823 zap.za_normalization_conflict = 0;
2824 objnum = parent;
2825 type = DT_DIR;
2826 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2827 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2828 zap.za_normalization_conflict = 0;
2829 objnum = ZFSCTL_INO_ROOT;
2830 type = DT_DIR;
2831 } else {
2832 /*
2833 * Grab next entry.
2834 */
2835 if (error = zap_cursor_retrieve(&zc, &zap)) {
2836 if ((*eofp = (error == ENOENT)) != 0)
2837 break;
2838 else
2839 goto update;
2840 }
2841
2842 if (zap.za_integer_length != 8 ||
2843 zap.za_num_integers != 1) {
2844 cmn_err(CE_WARN, "zap_readdir: bad directory "
2845 "entry, obj = %lld, offset = %lld\n",
2846 (u_longlong_t)zp->z_id,
2847 (u_longlong_t)offset);
2848 error = SET_ERROR(ENXIO);
2849 goto update;
2850 }
2851
2852 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2853 			/*
2854 			 * The object type is encoded in the high bits of the
2855 			 * ZAP integer (MacOS X extracts it the same way):
2856 			 */
2857 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2858
2859 if (check_sysattrs && !zap.za_normalization_conflict) {
2860 #ifdef TODO
2861 zap.za_normalization_conflict =
2862 xattr_sysattr_casechk(zap.za_name);
2863 #else
2864 panic("%s:%u: TODO", __func__, __LINE__);
2865 #endif
2866 }
2867 }
2868
2869 if (flags & V_RDDIR_ACCFILTER) {
2870 /*
2871 * If we have no access at all, don't include
2872 * this entry in the returned information
2873 */
2874 znode_t *ezp;
2875 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2876 goto skip_entry;
2877 if (!zfs_has_access(ezp, cr)) {
2878 vrele(ZTOV(ezp));
2879 goto skip_entry;
2880 }
2881 vrele(ZTOV(ezp));
2882 }
2883
2884 if (flags & V_RDDIR_ENTFLAGS)
2885 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2886 else
2887 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2888
2889 /*
2890 * Will this entry fit in the buffer?
2891 */
2892 if (outcount + reclen > bufsize) {
2893 /*
2894 * Did we manage to fit anything in the buffer?
2895 */
2896 if (!outcount) {
2897 error = SET_ERROR(EINVAL);
2898 goto update;
2899 }
2900 break;
2901 }
2902 if (flags & V_RDDIR_ENTFLAGS) {
2903 /*
2904 * Add extended flag entry:
2905 */
2906 eodp->ed_ino = objnum;
2907 eodp->ed_reclen = reclen;
2908 /* NOTE: ed_off is the offset for the *next* entry */
2909 next = &(eodp->ed_off);
2910 eodp->ed_eflags = zap.za_normalization_conflict ?
2911 ED_CASE_CONFLICT : 0;
2912 (void) strncpy(eodp->ed_name, zap.za_name,
2913 EDIRENT_NAMELEN(reclen));
2914 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2915 } else {
2916 /*
2917 * Add normal entry:
2918 */
2919 odp->d_ino = objnum;
2920 odp->d_reclen = reclen;
2921 odp->d_namlen = strlen(zap.za_name);
2922 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2923 odp->d_type = type;
2924 odp = (dirent64_t *)((intptr_t)odp + reclen);
2925 }
2926 outcount += reclen;
2927
2928 ASSERT(outcount <= bufsize);
2929
2930 /* Prefetch znode */
2931 if (prefetch)
2932 dmu_prefetch(os, objnum, 0, 0, 0,
2933 ZIO_PRIORITY_SYNC_READ);
2934
2935 skip_entry:
2936 /*
2937 * Move to the next entry, fill in the previous offset.
2938 */
2939 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2940 zap_cursor_advance(&zc);
2941 offset = zap_cursor_serialize(&zc);
2942 } else {
2943 offset += 1;
2944 }
2945
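		/*
		 * The cookie recorded for this entry is the offset of
		 * the *next* one, since the cursor has already been
		 * advanced; this mirrors the ed_off convention above.
		 */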
2946 if (cooks != NULL) {
2947 *cooks++ = offset;
2948 ncooks--;
2949 #ifdef __FreeBSD__
2950 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2951 #endif
2952 #ifdef __NetBSD__
2953 KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks);
2954 #endif
2955 }
2956 }
2957 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2958
2959 /* Subtract unused cookies */
2960 if (ncookies != NULL)
2961 *ncookies -= ncooks;
2962
2963 if (!user && uio->uio_iovcnt == 1) {
2964 iovp->iov_base += outcount;
2965 iovp->iov_len -= outcount;
2966 uio->uio_resid -= outcount;
2967 } else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) {
2968 /*
2969 * Reset the pointer.
2970 */
2971 offset = uio->uio_loffset;
2972 }
2973
2974 update:
2975 zap_cursor_fini(&zc);
2976 if (user || uio->uio_iovcnt != 1)
2977 kmem_free(outbuf, bufsize);
2978
2979 if (error == ENOENT)
2980 error = 0;
2981
2982 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2983
2984 uio->uio_loffset = offset;
2985 ZFS_EXIT(zfsvfs);
2986 if (error != 0 && cookies != NULL) {
2987 #ifdef __FreeBSD__
2988 free(*cookies, M_TEMP);
2989 #endif
2990 #ifdef __NetBSD__
2991 kmem_free(*cookies, ncooks * sizeof(off_t));
2992 #endif
2993 *cookies = NULL;
2994 *ncookies = 0;
2995 }
2996 return (error);
2997 }
2998
2999 ulong_t zfs_fsync_sync_cnt = 4;
3000
3001 static int
3002 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
3003 {
3004 znode_t *zp = VTOZ(vp);
3005 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3006
3007 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
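	/*
	 * Leave a per-thread hint; other ZPL paths (historically the
	 * page-out code) may consult zfs_fsyncer_key to bias writes
	 * towards synchronous behavior for threads that fsync often.
	 */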
3008
3009 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
3010 ZFS_ENTER(zfsvfs);
3011 ZFS_VERIFY_ZP(zp);
3012
3013 #ifdef __NetBSD__
3014 if (!zp->z_unlinked)
3015 #endif
3016 zil_commit(zfsvfs->z_log, zp->z_id);
3017 ZFS_EXIT(zfsvfs);
3018 }
3019 return (0);
3020 }
3021
3022
3023 /*
3024 * Get the requested file attributes and place them in the provided
3025 * vattr structure.
3026 *
3027 * IN: vp - vnode of file.
3028 * vap - va_mask identifies requested attributes.
3029 * If AT_XVATTR set, then optional attrs are requested
3030 * flags - ATTR_NOACLCHECK (CIFS server context)
3031 * cr - credentials of caller.
3032 * ct - caller context
3033 *
3034 * OUT: vap - attribute values.
3035 *
3036 * RETURN: 0 (always succeeds).
3037 */
3038 /* ARGSUSED */
3039 static int
3040 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3041 caller_context_t *ct)
3042 {
3043 znode_t *zp = VTOZ(vp);
3044 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3045 int error = 0;
3046 uint32_t blksize;
3047 u_longlong_t nblocks;
3048 uint64_t links;
3049 uint64_t mtime[2], ctime[2], crtime[2], rdev;
3050 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
3051 xoptattr_t *xoap = NULL;
3052 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3053 sa_bulk_attr_t bulk[4];
3054 int count = 0;
3055
3056 ZFS_ENTER(zfsvfs);
3057 ZFS_VERIFY_ZP(zp);
3058
3059 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
3060
3061 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3062 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3063 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
3064 if (vp->v_type == VBLK || vp->v_type == VCHR)
3065 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
3066 &rdev, 8);
3067
3068 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
3069 ZFS_EXIT(zfsvfs);
3070 return (error);
3071 }
3072
3073 /*
3074 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
3075 * Also, if we are the owner don't bother, since owner should
3076 * always be allowed to read basic attributes of file.
3077 */
3078 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
3079 (vap->va_uid != crgetuid(cr))) {
3080 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
3081 skipaclchk, cr)) {
3082 ZFS_EXIT(zfsvfs);
3083 return (error);
3084 }
3085 }
3086
3087 /*
3088 * Return all attributes. It's cheaper to provide the answer
3089 * than to determine whether we were asked the question.
3090 */
3091
3092 vap->va_type = IFTOVT(zp->z_mode);
3093 vap->va_mode = zp->z_mode & ~S_IFMT;
3094 #ifdef illumos
3095 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
3096 #endif
3097 #ifdef __FreeBSD__
3098 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
3099 vap->va_nodeid = zp->z_id;
3100 #endif
3101 #ifdef __NetBSD__
3102 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid;
3103 vap->va_nodeid = zp->z_id;
3104 /*
3105 * If we are a snapshot mounted under .zfs, return
3106 * the object id of the snapshot to make getcwd happy.
3107 */
3108 if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
3109 vnode_t *cvp = vp->v_mount->mnt_vnodecovered;
3110
3111 if (cvp && zfsctl_is_node(cvp))
3112 vap->va_nodeid = dmu_objset_id(zfsvfs->z_os);
3113 }
3114 #endif
3115 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
3116 links = zp->z_links + 1;
3117 else
3118 links = zp->z_links;
3119 /* XXX NetBSD: use LINK_MAX when that value matches 32-bit nlink_t */
3120 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
3121 vap->va_size = zp->z_size;
3122 #ifdef illumos
3123 vap->va_rdev = vp->v_rdev;
3124 #else
3125 if (vp->v_type == VBLK || vp->v_type == VCHR)
3126 vap->va_rdev = zfs_cmpldev(rdev);
3127 #endif
3128 vap->va_seq = zp->z_seq;
3129 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
3130 vap->va_filerev = zp->z_seq;
3131
3132 /*
3133 * Add in any requested optional attributes and the create time.
3134 * Also set the corresponding bits in the returned attribute bitmap.
3135 */
3136 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
3137 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
3138 xoap->xoa_archive =
3139 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
3140 XVA_SET_RTN(xvap, XAT_ARCHIVE);
3141 }
3142
3143 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
3144 xoap->xoa_readonly =
3145 ((zp->z_pflags & ZFS_READONLY) != 0);
3146 XVA_SET_RTN(xvap, XAT_READONLY);
3147 }
3148
3149 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
3150 xoap->xoa_system =
3151 ((zp->z_pflags & ZFS_SYSTEM) != 0);
3152 XVA_SET_RTN(xvap, XAT_SYSTEM);
3153 }
3154
3155 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
3156 xoap->xoa_hidden =
3157 ((zp->z_pflags & ZFS_HIDDEN) != 0);
3158 XVA_SET_RTN(xvap, XAT_HIDDEN);
3159 }
3160
3161 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3162 xoap->xoa_nounlink =
3163 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
3164 XVA_SET_RTN(xvap, XAT_NOUNLINK);
3165 }
3166
3167 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3168 xoap->xoa_immutable =
3169 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
3170 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
3171 }
3172
3173 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3174 xoap->xoa_appendonly =
3175 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
3176 XVA_SET_RTN(xvap, XAT_APPENDONLY);
3177 }
3178
3179 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3180 xoap->xoa_nodump =
3181 ((zp->z_pflags & ZFS_NODUMP) != 0);
3182 XVA_SET_RTN(xvap, XAT_NODUMP);
3183 }
3184
3185 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
3186 xoap->xoa_opaque =
3187 ((zp->z_pflags & ZFS_OPAQUE) != 0);
3188 XVA_SET_RTN(xvap, XAT_OPAQUE);
3189 }
3190
3191 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3192 xoap->xoa_av_quarantined =
3193 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
3194 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
3195 }
3196
3197 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3198 xoap->xoa_av_modified =
3199 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
3200 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
3201 }
3202
3203 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
3204 vp->v_type == VREG) {
3205 zfs_sa_get_scanstamp(zp, xvap);
3206 }
3207
3208 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3209 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
3210 XVA_SET_RTN(xvap, XAT_REPARSE);
3211 }
3212 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
3213 xoap->xoa_generation = zp->z_gen;
3214 XVA_SET_RTN(xvap, XAT_GEN);
3215 }
3216
3217 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
3218 xoap->xoa_offline =
3219 ((zp->z_pflags & ZFS_OFFLINE) != 0);
3220 XVA_SET_RTN(xvap, XAT_OFFLINE);
3221 }
3222
3223 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
3224 xoap->xoa_sparse =
3225 ((zp->z_pflags & ZFS_SPARSE) != 0);
3226 XVA_SET_RTN(xvap, XAT_SPARSE);
3227 }
3228 }
3229
3230 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
3231 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
3232 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
3233 ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
3234
3235
3236 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
3237 vap->va_blksize = blksize;
3238 vap->va_bytes = nblocks << 9; /* nblocks * 512 */
3239
3240 if (zp->z_blksz == 0) {
3241 /*
3242 * Block size hasn't been set; suggest maximal I/O transfers.
3243 */
3244 vap->va_blksize = zfsvfs->z_max_blksz;
3245 }
3246
3247 ZFS_EXIT(zfsvfs);
3248 return (0);
3249 }
3250
3251 /*
3252 * Set the file attributes to the values contained in the
3253 * vattr structure.
3254 *
3255 * IN: vp - vnode of file to be modified.
3256 * vap - new attribute values.
3257 * If AT_XVATTR set, then optional attrs are being set
3258 * flags - ATTR_UTIME set if non-default time values provided.
3259 * - ATTR_NOACLCHECK (CIFS context only).
3260 * cr - credentials of caller.
3261 * ct - caller context
3262 *
3263 * RETURN: 0 on success, error code on failure.
3264 *
3265 * Timestamps:
3266 * vp - ctime updated, mtime updated if size changed.
3267 */
3268 /* ARGSUSED */
3269 static int
3270 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3271 caller_context_t *ct)
3272 {
3273 znode_t *zp = VTOZ(vp);
3274 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3275 zilog_t *zilog;
3276 dmu_tx_t *tx;
3277 vattr_t oldva;
3278 xvattr_t tmpxvattr;
3279 uint_t mask = vap->va_mask;
3280 uint_t saved_mask = 0;
3281 uint64_t saved_mode;
3282 int trim_mask = 0;
3283 uint64_t new_mode;
3284 uint64_t new_uid, new_gid;
3285 uint64_t xattr_obj;
3286 uint64_t mtime[2], ctime[2];
3287 znode_t *attrzp;
3288 int need_policy = FALSE;
3289 int err, err2;
3290 zfs_fuid_info_t *fuidp = NULL;
3291 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
3292 xoptattr_t *xoap;
3293 zfs_acl_t *aclp;
3294 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3295 boolean_t fuid_dirtied = B_FALSE;
3296 sa_bulk_attr_t bulk[7], xattr_bulk[7];
3297 int count = 0, xattr_count = 0;
3298
3299 if (mask == 0)
3300 return (0);
3301
3302 if (mask & AT_NOSET)
3303 return (SET_ERROR(EINVAL));
3304
3305 ZFS_ENTER(zfsvfs);
3306 ZFS_VERIFY_ZP(zp);
3307
3308 zilog = zfsvfs->z_log;
3309
3310 /*
3311 * Make sure that if we have ephemeral uid/gid or xvattr specified
3312 	 * that the file system is at the proper version level.
3313 */
3314
3315 if (zfsvfs->z_use_fuids == B_FALSE &&
3316 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3317 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3318 (mask & AT_XVATTR))) {
3319 ZFS_EXIT(zfsvfs);
3320 return (SET_ERROR(EINVAL));
3321 }
3322
3323 if (mask & AT_SIZE && vp->v_type == VDIR) {
3324 ZFS_EXIT(zfsvfs);
3325 return (SET_ERROR(EISDIR));
3326 }
3327
3328 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3329 ZFS_EXIT(zfsvfs);
3330 return (SET_ERROR(EINVAL));
3331 }
3332
3333 /*
3334 * If this is an xvattr_t, then get a pointer to the structure of
3335 * optional attributes. If this is NULL, then we have a vattr_t.
3336 */
3337 xoap = xva_getxoptattr(xvap);
3338
3339 xva_init(&tmpxvattr);
3340
3341 /*
3342 * Immutable files can only alter immutable bit and atime
3343 */
3344 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3345 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3346 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3347 ZFS_EXIT(zfsvfs);
3348 return (SET_ERROR(EPERM));
3349 }
3350
3351 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3352 ZFS_EXIT(zfsvfs);
3353 return (SET_ERROR(EPERM));
3354 }
3355
3356 /*
3357 	 * Verify that the timestamps don't overflow 32 bits.
3358 * ZFS can handle large timestamps, but 32bit syscalls can't
3359 * handle times greater than 2039. This check should be removed
3360 * once large timestamps are fully supported.
3361 */
3362 if (mask & (AT_ATIME | AT_MTIME)) {
3363 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3364 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3365 ZFS_EXIT(zfsvfs);
3366 return (SET_ERROR(EOVERFLOW));
3367 }
3368 }
3369 if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
3370 TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
3371 ZFS_EXIT(zfsvfs);
3372 return (SET_ERROR(EOVERFLOW));
3373 }
3374
3375 attrzp = NULL;
3376 aclp = NULL;
3377
3378 /* Can this be moved to before the top label? */
3379 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3380 ZFS_EXIT(zfsvfs);
3381 return (SET_ERROR(EROFS));
3382 }
3383
3384 /*
3385 * First validate permissions
3386 */
3387
3388 if (mask & AT_SIZE) {
3389 /*
3390 * XXX - Note, we are not providing any open
3391 * mode flags here (like FNDELAY), so we may
3392 * block if there are locks present... this
3393 * should be addressed in openat().
3394 */
3395 /* XXX - would it be OK to generate a log record here? */
3396 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3397 if (err) {
3398 ZFS_EXIT(zfsvfs);
3399 return (err);
3400 }
3401 }
3402
3403 if (mask & (AT_ATIME|AT_MTIME) ||
3404 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3405 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3406 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3407 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3408 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3409 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3410 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3411 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3412 skipaclchk, cr);
3413 }
3414
3415 if (mask & (AT_UID|AT_GID)) {
3416 int idmask = (mask & (AT_UID|AT_GID));
3417 int take_owner;
3418 int take_group;
3419
3420 /*
3421 * NOTE: even if a new mode is being set,
3422 * we may clear S_ISUID/S_ISGID bits.
3423 */
3424
3425 if (!(mask & AT_MODE))
3426 vap->va_mode = zp->z_mode;
3427
3428 /*
3429 * Take ownership or chgrp to group we are a member of
3430 */
3431
3432 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3433 take_group = (mask & AT_GID) &&
3434 zfs_groupmember(zfsvfs, vap->va_gid, cr);
3435
3436 /*
3437 * If both AT_UID and AT_GID are set then take_owner and
3438 * take_group must both be set in order to allow taking
3439 * ownership.
3440 *
3441 * Otherwise, send the check through secpolicy_vnode_setattr()
3442 *
3443 */
3444
3445 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3446 ((idmask == AT_UID) && take_owner) ||
3447 ((idmask == AT_GID) && take_group)) {
3448 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3449 skipaclchk, cr) == 0) {
3450 /*
3451 * Remove setuid/setgid for non-privileged users
3452 */
3453 secpolicy_setid_clear(vap, vp, cr);
3454 trim_mask = (mask & (AT_UID|AT_GID));
3455 } else {
3456 need_policy = TRUE;
3457 }
3458 } else {
3459 need_policy = TRUE;
3460 }
3461 }
3462
3463 oldva.va_mode = zp->z_mode;
3464 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3465 if (mask & AT_XVATTR) {
3466 /*
3467 * Update xvattr mask to include only those attributes
3468 * that are actually changing.
3469 *
3470 * the bits will be restored prior to actually setting
3471 * the attributes so the caller thinks they were set.
3472 */
3473 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3474 if (xoap->xoa_appendonly !=
3475 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3476 need_policy = TRUE;
3477 } else {
3478 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3479 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3480 }
3481 }
3482
3483 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3484 if (xoap->xoa_nounlink !=
3485 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3486 need_policy = TRUE;
3487 } else {
3488 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3489 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3490 }
3491 }
3492
3493 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3494 if (xoap->xoa_immutable !=
3495 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3496 need_policy = TRUE;
3497 } else {
3498 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3499 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3500 }
3501 }
3502
3503 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3504 if (xoap->xoa_nodump !=
3505 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3506 #if 0
3507 /*
3508 * XXXSB - zfs_netbsd_setattr()
3509 * has already checked if this
3510 * request is authorised, and our
3511 * secpolicy_xvattr() doesn't check
3512 * kauth chflags. Fix this when we
3513 * migrate to openzfs.
3514 */
3515 need_policy = TRUE;
3516 #endif
3517 } else {
3518 XVA_CLR_REQ(xvap, XAT_NODUMP);
3519 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3520 }
3521 }
3522
3523 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3524 if (xoap->xoa_av_modified !=
3525 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3526 need_policy = TRUE;
3527 } else {
3528 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3529 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3530 }
3531 }
3532
3533 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3534 if ((vp->v_type != VREG &&
3535 xoap->xoa_av_quarantined) ||
3536 xoap->xoa_av_quarantined !=
3537 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3538 need_policy = TRUE;
3539 } else {
3540 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3541 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3542 }
3543 }
3544
3545 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3546 ZFS_EXIT(zfsvfs);
3547 return (SET_ERROR(EPERM));
3548 }
3549
3550 if (need_policy == FALSE &&
3551 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3552 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3553 need_policy = TRUE;
3554 }
3555 }
3556
3557 if (mask & AT_MODE) {
3558 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3559 err = secpolicy_setid_setsticky_clear(vp, vap,
3560 &oldva, cr);
3561 if (err) {
3562 ZFS_EXIT(zfsvfs);
3563 return (err);
3564 }
3565 trim_mask |= AT_MODE;
3566 } else {
3567 need_policy = TRUE;
3568 }
3569 }
3570
3571 if (need_policy) {
3572 /*
3573 * If trim_mask is set then take ownership
3574 * has been granted or write_acl is present and user
3575 * has the ability to modify mode. In that case remove
3576 * UID|GID and or MODE from mask so that
3577 * secpolicy_vnode_setattr() doesn't revoke it.
3578 */
3579
3580 if (trim_mask) {
3581 saved_mask = vap->va_mask;
3582 vap->va_mask &= ~trim_mask;
3583 if (trim_mask & AT_MODE) {
3584 /*
3585 * Save the mode, as secpolicy_vnode_setattr()
3586 * will overwrite it with ova.va_mode.
3587 */
3588 saved_mode = vap->va_mode;
3589 }
3590 }
3591 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3592 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3593 if (err) {
3594 ZFS_EXIT(zfsvfs);
3595 return (err);
3596 }
3597
3598 if (trim_mask) {
3599 vap->va_mask |= saved_mask;
3600 if (trim_mask & AT_MODE) {
3601 /*
3602 * Recover the mode after
3603 * secpolicy_vnode_setattr().
3604 */
3605 vap->va_mode = saved_mode;
3606 }
3607 }
3608 }
3609
3610 /*
3611 	 * secpolicy_vnode_setattr() or the take-ownership path above
3612 	 * may have changed va_mask.
3613 */
3614 mask = vap->va_mask;
3615
3616 if ((mask & (AT_UID | AT_GID))) {
3617 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3618 &xattr_obj, sizeof (xattr_obj));
3619
3620 if (err == 0 && xattr_obj) {
3621 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3622 if (err == 0) {
3623 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3624 if (err != 0)
3625 vrele(ZTOV(attrzp));
3626 }
3627 if (err)
3628 goto out2;
3629 }
3630 if (mask & AT_UID) {
3631 new_uid = zfs_fuid_create(zfsvfs,
3632 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3633 if (new_uid != zp->z_uid &&
3634 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3635 if (attrzp)
3636 vput(ZTOV(attrzp));
3637 err = SET_ERROR(EDQUOT);
3638 goto out2;
3639 }
3640 }
3641
3642 if (mask & AT_GID) {
3643 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3644 cr, ZFS_GROUP, &fuidp);
3645 if (new_gid != zp->z_gid &&
3646 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3647 if (attrzp)
3648 vput(ZTOV(attrzp));
3649 err = SET_ERROR(EDQUOT);
3650 goto out2;
3651 }
3652 }
3653 }
3654 tx = dmu_tx_create(zfsvfs->z_os);
3655
3656 if (mask & AT_MODE) {
3657 uint64_t pmode = zp->z_mode;
3658 uint64_t acl_obj;
3659 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3660
3661 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3662 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3663 err = SET_ERROR(EPERM);
3664 goto out;
3665 }
3666
3667 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3668 goto out;
3669
3670 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3671 /*
3672 * Are we upgrading ACL from old V0 format
3673 * to V1 format?
3674 */
3675 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3676 zfs_znode_acl_version(zp) ==
3677 ZFS_ACL_VERSION_INITIAL) {
3678 dmu_tx_hold_free(tx, acl_obj, 0,
3679 DMU_OBJECT_END);
3680 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3681 0, aclp->z_acl_bytes);
3682 } else {
3683 dmu_tx_hold_write(tx, acl_obj, 0,
3684 aclp->z_acl_bytes);
3685 }
3686 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3687 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3688 0, aclp->z_acl_bytes);
3689 }
3690 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3691 } else {
3692 if ((mask & AT_XVATTR) &&
3693 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3694 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3695 else
3696 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3697 }
3698
3699 if (attrzp) {
3700 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3701 }
3702
3703 fuid_dirtied = zfsvfs->z_fuid_dirty;
3704 if (fuid_dirtied)
3705 zfs_fuid_txhold(zfsvfs, tx);
3706
3707 zfs_sa_upgrade_txholds(tx, zp);
3708
3709 err = dmu_tx_assign(tx, TXG_WAIT);
3710 if (err)
3711 goto out;
3712
3713 count = 0;
3714 /*
3715 * Set each attribute requested.
3716 * We group settings according to the locks they need to acquire.
3717 *
3718 * Note: you cannot set ctime directly, although it will be
3719 * updated as a side-effect of calling this function.
3720 */
3721
3722 if (mask & (AT_UID|AT_GID|AT_MODE))
3723 mutex_enter(&zp->z_acl_lock);
3724
3725 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3726 &zp->z_pflags, sizeof (zp->z_pflags));
3727
3728 if (attrzp) {
3729 if (mask & (AT_UID|AT_GID|AT_MODE))
3730 mutex_enter(&attrzp->z_acl_lock);
3731 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3732 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3733 sizeof (attrzp->z_pflags));
3734 }
3735
3736 if (mask & (AT_UID|AT_GID)) {
3737
3738 if (mask & AT_UID) {
3739 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3740 &new_uid, sizeof (new_uid));
3741 zp->z_uid = new_uid;
3742 if (attrzp) {
3743 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3744 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3745 sizeof (new_uid));
3746 attrzp->z_uid = new_uid;
3747 }
3748 }
3749
3750 if (mask & AT_GID) {
3751 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3752 NULL, &new_gid, sizeof (new_gid));
3753 zp->z_gid = new_gid;
3754 if (attrzp) {
3755 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3756 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3757 sizeof (new_gid));
3758 attrzp->z_gid = new_gid;
3759 }
3760 }
3761 if (!(mask & AT_MODE)) {
3762 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3763 NULL, &new_mode, sizeof (new_mode));
3764 new_mode = zp->z_mode;
3765 }
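		/*
		 * SA_ADD_BULK_ATTR() records only the address of
		 * new_mode; sa_bulk_update() reads the value later, so
		 * assigning new_mode after registering it is safe.
		 */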
3766 err = zfs_acl_chown_setattr(zp);
3767 ASSERT(err == 0);
3768 if (attrzp) {
3769 err = zfs_acl_chown_setattr(attrzp);
3770 ASSERT(err == 0);
3771 }
3772 }
3773
3774 if (mask & AT_MODE) {
3775 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3776 &new_mode, sizeof (new_mode));
3777 zp->z_mode = new_mode;
3778 ASSERT3U((uintptr_t)aclp, !=, 0);
3779 err = zfs_aclset_common(zp, aclp, cr, tx);
3780 ASSERT0(err);
3781 if (zp->z_acl_cached)
3782 zfs_acl_free(zp->z_acl_cached);
3783 zp->z_acl_cached = aclp;
3784 aclp = NULL;
3785 }
3786
3787
3788 if (mask & AT_ATIME) {
3789 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3790 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3791 &zp->z_atime, sizeof (zp->z_atime));
3792 }
3793
3794 if (mask & AT_MTIME) {
3795 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3796 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3797 mtime, sizeof (mtime));
3798 }
3799
3800 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3801 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3802 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3803 NULL, mtime, sizeof (mtime));
3804 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3805 &ctime, sizeof (ctime));
3806 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3807 B_TRUE);
3808 } else if (mask != 0) {
3809 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3810 &ctime, sizeof (ctime));
3811 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3812 B_TRUE);
3813 if (attrzp) {
3814 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3815 SA_ZPL_CTIME(zfsvfs), NULL,
3816 &ctime, sizeof (ctime));
3817 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3818 mtime, ctime, B_TRUE);
3819 }
3820 }
3821 /*
3822 * Do this after setting timestamps to prevent timestamp
3823 * update from toggling bit
3824 */
3825
3826 if (xoap && (mask & AT_XVATTR)) {
3827
3828 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3829 xoap->xoa_createtime = vap->va_birthtime;
3830 /*
3831 		 * Restore the trimmed-off masks
3832 		 * so that the return masks can be set for the caller.
3833 */
3834
3835 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3836 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3837 }
3838 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3839 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3840 }
3841 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3842 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3843 }
3844 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3845 XVA_SET_REQ(xvap, XAT_NODUMP);
3846 }
3847 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3848 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3849 }
3850 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3851 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3852 }
3853
3854 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3855 ASSERT(vp->v_type == VREG);
3856
3857 zfs_xvattr_set(zp, xvap, tx);
3858 }
3859
3860 if (fuid_dirtied)
3861 zfs_fuid_sync(zfsvfs, tx);
3862
3863 if (mask != 0)
3864 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3865
3866 if (mask & (AT_UID|AT_GID|AT_MODE))
3867 mutex_exit(&zp->z_acl_lock);
3868
3869 if (attrzp) {
3870 if (mask & (AT_UID|AT_GID|AT_MODE))
3871 mutex_exit(&attrzp->z_acl_lock);
3872 }
3873 out:
3874 if (err == 0 && attrzp) {
3875 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3876 xattr_count, tx);
3877 ASSERT(err2 == 0);
3878 }
3879
3880 if (attrzp)
3881 vput(ZTOV(attrzp));
3882
3883 if (aclp)
3884 zfs_acl_free(aclp);
3885
3886 if (fuidp) {
3887 zfs_fuid_info_free(fuidp);
3888 fuidp = NULL;
3889 }
3890
3891 if (err) {
3892 dmu_tx_abort(tx);
3893 } else {
3894 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3895 dmu_tx_commit(tx);
3896 }
3897
3898 out2:
3899 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3900 zil_commit(zilog, 0);
3901
3902 ZFS_EXIT(zfsvfs);
3903 return (err);
3904 }
3905
3906 /*
3907 * We acquire all but fdvp locks using non-blocking acquisitions. If we
3908 * fail to acquire any lock in the path we will drop all held locks,
3909 * acquire the new lock in a blocking fashion, and then release it and
3910 * restart the rename. This acquire/release step ensures that we do not
3911 * spin on a lock waiting for release. On error release all vnode locks
3912 * and decrement references the way tmpfs_rename() would do.
3913 */
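/*
 * Lock order on each pass: sdvp (blocking), then tdvp, then the
 * re-resolved svp, then tvp, the last three taken with LK_NOWAIT.
 */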
3914 static int
3915 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3916 struct vnode *tdvp, struct vnode **tvpp,
3917 const struct componentname *scnp, const struct componentname *tcnp)
3918 {
3919 zfsvfs_t *zfsvfs;
3920 struct vnode *nvp, *svp, *tvp;
3921 znode_t *sdzp, *tdzp, *szp, *tzp;
3922 #ifdef __FreeBSD__
3923 const char *snm = scnp->cn_nameptr;
3924 const char *tnm = tcnp->cn_nameptr;
3925 #endif
3926 #ifdef __NetBSD__
3927 char *snm, *tnm;
3928 #endif
3929 int error;
3930
3931 #ifdef __FreeBSD__
3932 VOP_UNLOCK(tdvp, 0);
3933 if (*tvpp != NULL && *tvpp != tdvp)
3934 VOP_UNLOCK(*tvpp, 0);
3935 #endif
3936
3937 relock:
3938 error = vn_lock(sdvp, LK_EXCLUSIVE);
3939 if (error)
3940 goto out;
3941 sdzp = VTOZ(sdvp);
3942
3943 #ifdef __NetBSD__
3944 if (tdvp == sdvp) {
3945 } else {
3946 #endif
3947 error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3948 if (error != 0) {
3949 VOP_UNLOCK(sdvp, 0);
3950 if (error != EBUSY)
3951 goto out;
3952 error = vn_lock(tdvp, LK_EXCLUSIVE);
3953 if (error)
3954 goto out;
3955 VOP_UNLOCK(tdvp, 0);
3956 goto relock;
3957 }
3958 #ifdef __NetBSD__
3959 } /* end if (tdvp == sdvp) */
3960 #endif
3961
3962 tdzp = VTOZ(tdvp);
3963
3964 /*
3965 * Before using sdzp and tdzp we must ensure that they are live.
3966 * As a porting legacy from illumos we have two things to worry
3967 * about. One is typical for FreeBSD and it is that the vnode is
3968 * not reclaimed (doomed). The other is that the znode is live.
3969 * The current code can invalidate the znode without acquiring the
3970 * corresponding vnode lock if the object represented by the znode
3971 * and vnode is no longer valid after a rollback or receive operation.
3972 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3973 * that protects the znodes from the invalidation.
3974 */
3975 zfsvfs = sdzp->z_zfsvfs;
3976 ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3977 ZFS_ENTER(zfsvfs);
3978
3979 /*
3980 	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
3981 	 * bypassing the cleanup code, in the case of an error.
3982 */
3983 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3984 ZFS_EXIT(zfsvfs);
3985 VOP_UNLOCK(sdvp, 0);
3986 #ifdef __NetBSD__
3987 if (tdvp != sdvp)
3988 #endif
3989 VOP_UNLOCK(tdvp, 0);
3990 error = SET_ERROR(EIO);
3991 goto out;
3992 }
3993
3994 /*
3995 * Re-resolve svp to be certain it still exists and fetch the
3996 * correct vnode.
3997 */
3998 #ifdef __NetBSD__
3999 /* ZFS wants a null-terminated name. */
4000 snm = PNBUF_GET();
4001 strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4002 #endif
4003 error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
4004 #ifdef __NetBSD__
4005 PNBUF_PUT(snm);
4006 #endif
4007 if (error != 0) {
4008 /* Source entry invalid or not there. */
4009 ZFS_EXIT(zfsvfs);
4010 VOP_UNLOCK(sdvp, 0);
4011 #ifdef __NetBSD__
4012 if (tdvp != sdvp)
4013 #endif
4014 VOP_UNLOCK(tdvp, 0);
4015 if ((scnp->cn_flags & ISDOTDOT) != 0 ||
4016 (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
4017 error = SET_ERROR(EINVAL);
4018 goto out;
4019 }
4020 svp = ZTOV(szp);
4021
4022 /*
4023 * Re-resolve tvp, if it disappeared we just carry on.
4024 */
4025 #ifdef __NetBSD__
4026 /* ZFS wants a null-terminated name. */
4027 tnm = PNBUF_GET();
4028 strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4029 #endif
4030 error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
4031 #ifdef __NetBSD__
4032 PNBUF_PUT(tnm);
4033 #endif
4034 if (error != 0) {
4035 ZFS_EXIT(zfsvfs);
4036 VOP_UNLOCK(sdvp, 0);
4037 #ifdef __NetBSD__
4038 if (tdvp != sdvp)
4039 #endif
4040 VOP_UNLOCK(tdvp, 0);
4041 vrele(svp);
4042 if ((tcnp->cn_flags & ISDOTDOT) != 0)
4043 error = SET_ERROR(EINVAL);
4044 goto out;
4045 }
4046 if (tzp != NULL)
4047 tvp = ZTOV(tzp);
4048 else
4049 tvp = NULL;
4050
4051 /*
4052 * At present the vnode locks must be acquired before z_teardown_lock,
4053 * although it would be more logical to use the opposite order.
4054 */
4055 ZFS_EXIT(zfsvfs);
4056
4057 /*
4058 * Now try acquire locks on svp and tvp.
4059 */
4060 nvp = svp;
4061 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4062 if (error != 0) {
4063 VOP_UNLOCK(sdvp, 0);
4064 #ifdef __NetBSD__
4065 if (tdvp != sdvp)
4066 #endif
4067 VOP_UNLOCK(tdvp, 0);
4068 if (tvp != NULL)
4069 vrele(tvp);
4070 if (error != EBUSY) {
4071 vrele(nvp);
4072 goto out;
4073 }
4074 error = vn_lock(nvp, LK_EXCLUSIVE);
4075 if (error != 0) {
4076 vrele(nvp);
4077 goto out;
4078 }
4079 VOP_UNLOCK(nvp, 0);
4080 /*
4081 * Concurrent rename race.
4082 * XXX ?
4083 */
4084 if (nvp == tdvp) {
4085 vrele(nvp);
4086 error = SET_ERROR(EINVAL);
4087 goto out;
4088 }
4089 #ifdef __NetBSD__
4090 if (*svpp != NULL)
4091 #endif
4092 vrele(*svpp);
4093 *svpp = nvp;
4094 goto relock;
4095 }
4096 #ifdef __NetBSD__
4097 if (*svpp != NULL)
4098 #endif
4099 vrele(*svpp);
4100 *svpp = nvp;
4101
4102 if (*tvpp != NULL)
4103 vrele(*tvpp);
4104 *tvpp = NULL;
4105 if (tvp != NULL) {
4106 nvp = tvp;
4107
4108 #ifdef __NetBSD__
4109 if (tvp == svp || tvp == sdvp) {
4110 } else {
4111 #endif
4112 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4113 if (error != 0) {
4114 VOP_UNLOCK(sdvp, 0);
4115 #ifdef __NetBSD__
4116 if (tdvp != sdvp)
4117 #endif
4118 VOP_UNLOCK(tdvp, 0);
4119 #ifdef __NetBSD__
4120 if (*svpp != tdvp)
4121 #endif
4122 VOP_UNLOCK(*svpp, 0);
4123 if (error != EBUSY) {
4124 vrele(nvp);
4125 goto out;
4126 }
4127 error = vn_lock(nvp, LK_EXCLUSIVE);
4128 if (error != 0) {
4129 vrele(nvp);
4130 goto out;
4131 }
4132 vput(nvp);
4133 goto relock;
4134 }
4135 #ifdef __NetBSD__
4136 } /* end if (tvp == svp || tvp == sdvp) */
4137 #endif
4138
4139 *tvpp = nvp;
4140 }
4141
4142 KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE);
4143 KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE);
4144 KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4145 KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE);
4146
4147 return (0);
4148
4149 out:
4150 return (error);
4151 }
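
/*
 * A minimal sketch (not part of the build) of the deadlock-avoidance
 * pattern zfs_rename_relock() uses above: every lock beyond the first
 * is taken with LK_NOWAIT, and on EBUSY we wait for the contended lock
 * once while holding nothing, then restart from "relock".  Here a and
 * b stand for any two of the four vnodes:
 *
 *	for (;;) {
 *		vn_lock(a, LK_EXCLUSIVE);
 *		if (vn_lock(b, LK_EXCLUSIVE | LK_NOWAIT) == 0)
 *			break;			// both locks held
 *		VOP_UNLOCK(a, 0);
 *		vn_lock(b, LK_EXCLUSIVE);	// sleep without holding a
 *		VOP_UNLOCK(b, 0);
 *		// retry; the world may have changed while we slept
 *	}
 */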
4152
4153 /*
4154 * Note that we must use VRELE_ASYNC in this function as it walks
4155 * up the directory tree and vrele may need to acquire an exclusive
4156  * lock if the last reference to a vnode is dropped.
4157 */
4158 static int
4159 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
4160 {
4161 zfsvfs_t *zfsvfs;
4162 znode_t *zp, *zp1;
4163 uint64_t parent;
4164 int error;
4165
4166 zfsvfs = tdzp->z_zfsvfs;
4167 if (tdzp == szp)
4168 return (SET_ERROR(EINVAL));
4169 if (tdzp == sdzp)
4170 return (0);
4171 if (tdzp->z_id == zfsvfs->z_root)
4172 return (0);
4173 zp = tdzp;
4174 for (;;) {
4175 ASSERT(!zp->z_unlinked);
4176 if ((error = sa_lookup(zp->z_sa_hdl,
4177 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
4178 break;
4179
4180 if (parent == szp->z_id) {
4181 error = SET_ERROR(EINVAL);
4182 break;
4183 }
4184 if (parent == zfsvfs->z_root)
4185 break;
4186 if (parent == sdzp->z_id)
4187 break;
4188
4189 error = zfs_zget(zfsvfs, parent, &zp1);
4190 if (error != 0)
4191 break;
4192
4193 if (zp != tdzp)
4194 VN_RELE_ASYNC(ZTOV(zp),
4195 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4196 zp = zp1;
4197 }
4198
4199 if (error == ENOTDIR)
4200 panic("checkpath: .. not a directory\n");
4201 if (zp != tdzp)
4202 VN_RELE_ASYNC(ZTOV(zp),
4203 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4204 return (error);
4205 }
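
/*
 * Example of the ancestry walk above (hypothetical paths): for
 * "mv /usr/a/b /usr/a/b/c/d", szp is "b" and tdzp is "c".  The walk
 * starts at "c", finds that its parent is "b" == szp->z_id, and
 * returns EINVAL before ever reaching "a" or the root; this is what
 * rejects a rename of a directory into its own subtree.
 */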
4206
4207 /*
4208 * Move an entry from the provided source directory to the target
4209 * directory. Change the entry name as indicated.
4210 *
4211 * IN: sdvp - Source directory containing the "old entry".
4212 * snm - Old entry name.
4213 * tdvp - Target directory to contain the "new entry".
4214 * tnm - New entry name.
4215 * cr - credentials of caller.
4216 * ct - caller context
4217 * flags - case flags
4218 *
4219 * RETURN: 0 on success, error code on failure.
4220 *
4221 * Timestamps:
4222 * sdvp,tdvp - ctime|mtime updated
4223 */
4224 /*ARGSUSED*/
4225 static int
4226 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
4227 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
4228 cred_t *cr)
4229 {
4230 zfsvfs_t *zfsvfs;
4231 znode_t *sdzp, *tdzp, *szp, *tzp;
4232 zilog_t *zilog = NULL;
4233 dmu_tx_t *tx;
4234 #ifdef __FreeBSD__
4235 char *snm = __UNCONST(scnp->cn_nameptr);
4236 char *tnm = __UNCONST(tcnp->cn_nameptr);
4237 #endif
4238 #ifdef __NetBSD__
4239 char *snm, *tnm;
4240 #endif
4241 int error = 0;
4242
4243 /* Reject renames across filesystems. */
4244 if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) ||
4245 ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
4246 error = SET_ERROR(EXDEV);
4247 goto out;
4248 }
4249
4250 if (zfsctl_is_node(tdvp)) {
4251 error = SET_ERROR(EXDEV);
4252 goto out;
4253 }
4254
4255 /*
4256 * Lock all four vnodes to ensure safety and semantics of renaming.
4257 */
4258 error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
4259 if (error != 0) {
4260 /* no vnodes are locked in the case of error here */
4261 return (error);
4262 }
4263
4264 tdzp = VTOZ(tdvp);
4265 sdzp = VTOZ(sdvp);
4266 zfsvfs = tdzp->z_zfsvfs;
4267 zilog = zfsvfs->z_log;
4268 #ifdef __NetBSD__
4269 /* ZFS wants a null-terminated name. */
4270 snm = PNBUF_GET();
4271 strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4272 tnm = PNBUF_GET();
4273 strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4274 #endif
4275
4276 /*
4277 * After we re-enter ZFS_ENTER() we will have to revalidate all
4278 * znodes involved.
4279 */
4280 ZFS_ENTER(zfsvfs);
4281
4282 if (zfsvfs->z_utf8 && u8_validate(tnm,
4283 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4284 error = SET_ERROR(EILSEQ);
4285 goto unlockout;
4286 }
4287
4288 #ifndef __NetBSD__
4289 /* If source and target are the same file, there is nothing to do. */
4290 if ((*svpp) == (*tvpp)) {
4291 error = 0;
4292 goto unlockout;
4293 }
4294 #endif
4295
4296 if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
4297 ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
4298 (*tvpp)->v_mountedhere != NULL)) {
4299 error = SET_ERROR(EXDEV);
4300 goto unlockout;
4301 }
4302
4303 /*
4304 	 * We cannot use ZFS_VERIFY_ZP() here because it could return
4305 	 * directly, bypassing the cleanup code, in the case of an error.
4306 */
4307 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
4308 error = SET_ERROR(EIO);
4309 goto unlockout;
4310 }
4311
4312 szp = VTOZ(*svpp);
4313 tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
4314 if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
4315 error = SET_ERROR(EIO);
4316 goto unlockout;
4317 }
4318
4319 /*
4320 * This is to prevent the creation of links into attribute space
4321 	 * by renaming a linked file into/out of an attribute directory.
4322 * See the comment in zfs_link() for why this is considered bad.
4323 */
4324 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
4325 error = SET_ERROR(EINVAL);
4326 goto unlockout;
4327 }
4328
4329 /*
4330 * Must have write access at the source to remove the old entry
4331 * and write access at the target to create the new entry.
4332 * Note that if target and source are the same, this can be
4333 * done in a single check.
4334 */
4335 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
4336 goto unlockout;
4337
4338 if ((*svpp)->v_type == VDIR) {
4339 /*
4340 * Avoid ".", "..", and aliases of "." for obvious reasons.
4341 */
4342 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
4343 sdzp == szp ||
4344 (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
4345 error = SET_ERROR(EINVAL);
4346 goto unlockout;
4347 }
4348
4349 /*
4350 * Check to make sure rename is valid.
4351 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
4352 */
4353 if (error = zfs_rename_check(szp, sdzp, tdzp))
4354 goto unlockout;
4355 }
4356
4357 /*
4358 * Does target exist?
4359 */
4360 if (tzp) {
4361 /*
4362 * Source and target must be the same type.
4363 */
4364 if ((*svpp)->v_type == VDIR) {
4365 if ((*tvpp)->v_type != VDIR) {
4366 error = SET_ERROR(ENOTDIR);
4367 goto unlockout;
4368 } else {
4369 cache_purge(tdvp);
4370 if (sdvp != tdvp)
4371 cache_purge(sdvp);
4372 }
4373 } else {
4374 if ((*tvpp)->v_type == VDIR) {
4375 error = SET_ERROR(EISDIR);
4376 goto unlockout;
4377 }
4378 }
4379
4380 /*
4381 * POSIX dictates that when the source and target
4382 * entries refer to the same file object, rename
4383 * must do nothing and exit without error.
4384 */
4385 #ifndef __NetBSD__
4386 /*
4387 * But on NetBSD we have a different system call to do
4388 * this, posix_rename, which sorta kinda handles this
4389 * case (modulo races), and our tests expect BSD
4390 * semantics for rename, so we'll do that until we can
4391 * push the choice between BSD and POSIX semantics into
4392 * the VOP_RENAME protocol as a flag.
4393 */
4394 if (szp->z_id == tzp->z_id) {
4395 error = 0;
4396 goto unlockout;
4397 }
4398 #endif
4399 }
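	/*
	 * Example of the BSD behavior chosen here (hypothetical names):
	 * if "a" and "b" are hard links to the same file, rename("a", "b")
	 * is a no-op under POSIX, while the code below removes the source
	 * entry "a" and leaves only "b".
	 */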
4400
4401 vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
4402 if (tzp)
4403 vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
4404
4405 /*
4406 	 * Notify the target directory if it is not the same as
4407 	 * the source directory.
4408 */
4409 if (tdvp != sdvp) {
4410 vnevent_rename_dest_dir(tdvp, ct);
4411 }
4412
4413 tx = dmu_tx_create(zfsvfs->z_os);
4414 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4415 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4416 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4417 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4418 if (sdzp != tdzp) {
4419 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4420 zfs_sa_upgrade_txholds(tx, tdzp);
4421 }
4422 if (tzp) {
4423 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4424 zfs_sa_upgrade_txholds(tx, tzp);
4425 }
4426
4427 zfs_sa_upgrade_txholds(tx, szp);
4428 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4429 error = dmu_tx_assign(tx, TXG_WAIT);
4430 if (error) {
4431 dmu_tx_abort(tx);
4432 goto unlockout;
4433 }
4434
4435
4436 if (tzp && (tzp->z_id != szp->z_id))
4437 /* Attempt to remove the existing target */
4438 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
4439
4440 if (error == 0) {
4441 if (!tzp || (tzp->z_id != szp->z_id))
4442 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
4443 if (error == 0) {
4444 szp->z_pflags |= ZFS_AV_MODIFIED;
4445
4446 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4447 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4448 ASSERT0(error);
4449
4450 error = zfs_link_destroy(sdzp, snm, szp, tx,
4451 /* Kludge for BSD rename semantics. */
4452 tzp && tzp->z_id == szp->z_id ? 0: ZRENAMING, NULL);
4453 if (error == 0) {
4454 zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
4455 snm, tdzp, tnm, szp);
4456
4457 /*
4458 * Update path information for the target vnode
4459 */
4460 vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
4461 } else {
4462 /*
4463 * At this point, we have successfully created
4464 * the target name, but have failed to remove
4465 * the source name. Since the create was done
4466 * with the ZRENAMING flag, there are
4467 * complications; for one, the link count is
4468 * wrong. The easiest way to deal with this
4469 * is to remove the newly created target, and
4470 * return the original error. This must
4471 * succeed; fortunately, it is very unlikely to
4472 * fail, since we just created it.
4473 */
4474 VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
4475 ZRENAMING, NULL), ==, 0);
4476 }
4477 }
4478 if (error == 0) {
4479 cache_purge(*svpp);
4480 if (*tvpp != NULL)
4481 cache_purge(*tvpp);
4482 cache_purge_negative(tdvp);
4483 #ifdef __NetBSD__
4484 if (*svpp == *tvpp) {
4485 VN_KNOTE(sdvp, NOTE_WRITE);
4486 VN_KNOTE(*svpp, (szp->z_links == 0 ?
4487 NOTE_DELETE : NOTE_LINK));
4488 } else {
4489 genfs_rename_knote(sdvp, *svpp, tdvp, *tvpp,
4490 tzp != NULL ? tzp->z_links : 0);
4491 }
4492 #endif
4493 }
4494 }
4495
4496 dmu_tx_commit(tx);
4497
4498 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4499 zil_commit(zilog, 0);
4500
4501 unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
4502 ZFS_EXIT(zfsvfs);
4503
4504 VOP_UNLOCK(*svpp, 0);
4505 VOP_UNLOCK(sdvp, 0);
4506 #ifdef __NetBSD__
4507 PNBUF_PUT(snm);
4508 PNBUF_PUT(tnm);
4509 #endif
4510
4511 if (*tvpp != sdvp && *tvpp != *svpp)
4512 if (*tvpp != NULL)
4513 VOP_UNLOCK(*tvpp, 0);
4514 if (tdvp != sdvp && tdvp != *svpp)
4515 if (tdvp != *tvpp)
4516 VOP_UNLOCK(tdvp, 0);
4517
4518 out:
4519 return (error);
4520 }
4521
4522 /*
4523 * Insert the indicated symbolic reference entry into the directory.
4524 *
4525 * IN: dvp - Directory to contain new symbolic link.
4526 * link - Name for new symlink entry.
4527 * vap - Attributes of new entry.
4528 * cr - credentials of caller.
4529 * ct - caller context
4530 * flags - case flags
4531 *
4532 * RETURN: 0 on success, error code on failure.
4533 *
4534 * Timestamps:
4535 * dvp - ctime|mtime updated
4536 */
4537 /*ARGSUSED*/
4538 static int
4539 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4540 cred_t *cr, kthread_t *td)
4541 {
4542 znode_t *zp, *dzp = VTOZ(dvp);
4543 dmu_tx_t *tx;
4544 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
4545 zilog_t *zilog;
4546 uint64_t len = strlen(link);
4547 int error;
4548 zfs_acl_ids_t acl_ids;
4549 boolean_t fuid_dirtied;
4550 uint64_t txtype = TX_SYMLINK;
4551 int flags = 0;
4552
4553 ASSERT(vap->va_type == VLNK);
4554
4555 ZFS_ENTER(zfsvfs);
4556 ZFS_VERIFY_ZP(dzp);
4557 zilog = zfsvfs->z_log;
4558
4559 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4560 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4561 ZFS_EXIT(zfsvfs);
4562 return (SET_ERROR(EILSEQ));
4563 }
4564
4565 if (len > MAXPATHLEN) {
4566 ZFS_EXIT(zfsvfs);
4567 return (SET_ERROR(ENAMETOOLONG));
4568 }
4569
4570 if ((error = zfs_acl_ids_create(dzp, 0,
4571 vap, cr, NULL, &acl_ids)) != 0) {
4572 ZFS_EXIT(zfsvfs);
4573 return (error);
4574 }
4575
4576 /*
4577 * Attempt to lock directory; fail if entry already exists.
4578 */
4579 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4580 if (error) {
4581 zfs_acl_ids_free(&acl_ids);
4582 ZFS_EXIT(zfsvfs);
4583 return (error);
4584 }
4585
4586 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4587 zfs_acl_ids_free(&acl_ids);
4588 ZFS_EXIT(zfsvfs);
4589 return (error);
4590 }
4591
4592 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4593 zfs_acl_ids_free(&acl_ids);
4594 ZFS_EXIT(zfsvfs);
4595 return (SET_ERROR(EDQUOT));
4596 }
4597
4598 getnewvnode_reserve(1);
4599 tx = dmu_tx_create(zfsvfs->z_os);
4600 fuid_dirtied = zfsvfs->z_fuid_dirty;
4601 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4602 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4603 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4604 ZFS_SA_BASE_ATTR_SIZE + len);
4605 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4606 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4607 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4608 acl_ids.z_aclp->z_acl_bytes);
4609 }
4610 if (fuid_dirtied)
4611 zfs_fuid_txhold(zfsvfs, tx);
4612 error = dmu_tx_assign(tx, TXG_WAIT);
4613 if (error) {
4614 zfs_acl_ids_free(&acl_ids);
4615 dmu_tx_abort(tx);
4616 getnewvnode_drop_reserve();
4617 ZFS_EXIT(zfsvfs);
4618 return (error);
4619 }
4620
4621 /*
4622 * Create a new object for the symlink.
4623 	 * for version 4 ZPL datasets the symlink will be an SA attribute
4624 */
4625 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4626
4627 if (fuid_dirtied)
4628 zfs_fuid_sync(zfsvfs, tx);
4629
4630 if (zp->z_is_sa)
4631 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4632 link, len, tx);
4633 else
4634 zfs_sa_symlink(zp, link, len, tx);
4635
4636 zp->z_size = len;
4637 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4638 &zp->z_size, sizeof (zp->z_size), tx);
4639 /*
4640 * Insert the new object into the directory.
4641 */
4642 (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4643
4644 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4645 *vpp = ZTOV(zp);
4646
4647 zfs_acl_ids_free(&acl_ids);
4648
4649 dmu_tx_commit(tx);
4650
4651 getnewvnode_drop_reserve();
4652
4653 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4654 zil_commit(zilog, 0);
4655
4656 ZFS_EXIT(zfsvfs);
4657 return (error);
4658 }
4659
4660 /*
4661 * Return, in the buffer contained in the provided uio structure,
4662 * the symbolic path referred to by vp.
4663 *
4664 * IN: vp - vnode of symbolic link.
4665 * uio - structure to contain the link path.
4666 * cr - credentials of caller.
4667 * ct - caller context
4668 *
4669 * OUT: uio - structure containing the link path.
4670 *
4671 * RETURN: 0 on success, error code on failure.
4672 *
4673 * Timestamps:
4674 * vp - atime updated
4675 */
4676 /* ARGSUSED */
4677 static int
4678 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4679 {
4680 znode_t *zp = VTOZ(vp);
4681 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4682 int error;
4683
4684 ZFS_ENTER(zfsvfs);
4685 ZFS_VERIFY_ZP(zp);
4686
4687 if (zp->z_is_sa)
4688 error = sa_lookup_uio(zp->z_sa_hdl,
4689 SA_ZPL_SYMLINK(zfsvfs), uio);
4690 else
4691 error = zfs_sa_readlink(zp, uio);
4692
4693 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4694
4695 ZFS_EXIT(zfsvfs);
4696 return (error);
4697 }
4698
4699 /*
4700 * Insert a new entry into directory tdvp referencing svp.
4701 *
4702 * IN: tdvp - Directory to contain new entry.
4703 * svp - vnode of new entry.
4704 * name - name of new entry.
4705 * cr - credentials of caller.
4706 * ct - caller context
4707 *
4708 * RETURN: 0 on success, error code on failure.
4709 *
4710 * Timestamps:
4711 * tdvp - ctime|mtime updated
4712 * svp - ctime updated
4713 */
4714 /* ARGSUSED */
4715 static int
4716 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4717 caller_context_t *ct, int flags)
4718 {
4719 znode_t *dzp = VTOZ(tdvp);
4720 znode_t *tzp, *szp;
4721 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
4722 zilog_t *zilog;
4723 dmu_tx_t *tx;
4724 int error;
4725 uint64_t parent;
4726 uid_t owner;
4727
4728 ASSERT(tdvp->v_type == VDIR);
4729
4730 ZFS_ENTER(zfsvfs);
4731 ZFS_VERIFY_ZP(dzp);
4732 zilog = zfsvfs->z_log;
4733
4734 /*
4735 * POSIX dictates that we return EPERM here.
4736 * Better choices include ENOTSUP or EISDIR.
4737 */
4738 if (svp->v_type == VDIR) {
4739 ZFS_EXIT(zfsvfs);
4740 return (SET_ERROR(EPERM));
4741 }
4742
4743 szp = VTOZ(svp);
4744 ZFS_VERIFY_ZP(szp);
4745
4746 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4747 ZFS_EXIT(zfsvfs);
4748 return (SET_ERROR(EPERM));
4749 }
4750
4751 /* Prevent links to .zfs/shares files */
4752
4753 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4754 &parent, sizeof (uint64_t))) != 0) {
4755 ZFS_EXIT(zfsvfs);
4756 return (error);
4757 }
4758 if (parent == zfsvfs->z_shares_dir) {
4759 ZFS_EXIT(zfsvfs);
4760 return (SET_ERROR(EPERM));
4761 }
4762
4763 if (zfsvfs->z_utf8 && u8_validate(name,
4764 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4765 ZFS_EXIT(zfsvfs);
4766 return (SET_ERROR(EILSEQ));
4767 }
4768
4769 /*
4770 * We do not support links between attributes and non-attributes
4771 * because of the potential security risk of creating links
4772 * into "normal" file space in order to circumvent restrictions
4773 * imposed in attribute space.
4774 */
4775 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4776 ZFS_EXIT(zfsvfs);
4777 return (SET_ERROR(EINVAL));
4778 }
4779
4780
4781 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4782 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4783 ZFS_EXIT(zfsvfs);
4784 return (SET_ERROR(EPERM));
4785 }
4786
4787 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4788 ZFS_EXIT(zfsvfs);
4789 return (error);
4790 }
4791
4792 /*
4793 * Attempt to lock directory; fail if entry already exists.
4794 */
4795 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4796 if (error) {
4797 ZFS_EXIT(zfsvfs);
4798 return (error);
4799 }
4800
4801 tx = dmu_tx_create(zfsvfs->z_os);
4802 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4803 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4804 zfs_sa_upgrade_txholds(tx, szp);
4805 zfs_sa_upgrade_txholds(tx, dzp);
4806 error = dmu_tx_assign(tx, TXG_WAIT);
4807 if (error) {
4808 dmu_tx_abort(tx);
4809 ZFS_EXIT(zfsvfs);
4810 return (error);
4811 }
4812
4813 error = zfs_link_create(dzp, name, szp, tx, 0);
4814
4815 if (error == 0) {
4816 uint64_t txtype = TX_LINK;
4817 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4818 }
4819
4820 dmu_tx_commit(tx);
4821
4822 if (error == 0) {
4823 vnevent_link(svp, ct);
4824 }
4825
4826 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4827 zil_commit(zilog, 0);
4828
4829 ZFS_EXIT(zfsvfs);
4830 return (error);
4831 }
4832
4833
4834 /*ARGSUSED*/
4835 void
4836 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4837 {
4838 znode_t *zp = VTOZ(vp);
4839 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4840 int error;
4841
4842 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4843 if (zp->z_sa_hdl == NULL) {
4844 /*
4845 * The fs has been unmounted, or we did a
4846 * suspend/resume and this file no longer exists.
4847 */
4848 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4849 vrecycle(vp);
4850 return;
4851 }
4852
4853 if (zp->z_unlinked) {
4854 /*
4855 * Fast path to recycle a vnode of a removed file.
4856 */
4857 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4858 vrecycle(vp);
4859 return;
4860 }
4861
4862 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4863 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4864
4865 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4866 zfs_sa_upgrade_txholds(tx, zp);
4867 error = dmu_tx_assign(tx, TXG_WAIT);
4868 if (error) {
4869 dmu_tx_abort(tx);
4870 } else {
4871 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4872 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4873 zp->z_atime_dirty = 0;
4874 dmu_tx_commit(tx);
4875 }
4876 }
4877 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4878 }
4879
4880
4881 #ifdef __FreeBSD__
4882 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4883 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4884 #endif
4885
4886 /*ARGSUSED*/
4887 static int
4888 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4889 {
4890 znode_t *zp = VTOZ(vp);
4891 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4892 uint32_t gen;
4893 uint64_t gen64;
4894 uint64_t object = zp->z_id;
4895 zfid_short_t *zfid;
4896 int size, i, error;
4897
4898 ZFS_ENTER(zfsvfs);
4899 ZFS_VERIFY_ZP(zp);
4900
4901 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4902 &gen64, sizeof (uint64_t))) != 0) {
4903 ZFS_EXIT(zfsvfs);
4904 return (error);
4905 }
4906
4907 gen = (uint32_t)gen64;
4908
4909 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4910
4911 #ifdef illumos
4912 if (fidp->fid_len < size) {
4913 fidp->fid_len = size;
4914 ZFS_EXIT(zfsvfs);
4915 return (SET_ERROR(ENOSPC));
4916 }
4917 #else
4918 fidp->fid_len = size;
4919 #endif
4920
4921 zfid = (zfid_short_t *)fidp;
4922
4923 zfid->zf_len = size;
4924
4925 for (i = 0; i < sizeof (zfid->zf_object); i++)
4926 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4927
4928 /* Must have a non-zero generation number to distinguish from .zfs */
4929 if (gen == 0)
4930 gen = 1;
4931 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4932 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4933
4934 if (size == LONG_FID_LEN) {
4935 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4936 zfid_long_t *zlfid;
4937
4938 zlfid = (zfid_long_t *)fidp;
4939
4940 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4941 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4942
4943 /* XXX - this should be the generation number for the objset */
4944 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4945 zlfid->zf_setgen[i] = 0;
4946 }
4947
4948 ZFS_EXIT(zfsvfs);
4949 return (0);
4950 }
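
/*
 * Sketch of how a consumer could decode the file ID built above
 * (zfid_decode is a hypothetical helper, not part of this file): the
 * object and generation numbers are stored least-significant byte
 * first, so they are reassembled from the high byte down.
 *
 *	static uint64_t
 *	zfid_decode(const uint8_t *buf, size_t n)
 *	{
 *		uint64_t v = 0;
 *
 *		while (n-- > 0)
 *			v = (v << 8) | buf[n];
 *		return (v);
 *	}
 *
 *	object = zfid_decode(zfid->zf_object, sizeof (zfid->zf_object));
 *	gen    = zfid_decode(zfid->zf_gen, sizeof (zfid->zf_gen));
 */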
4951
4952 static int
4953 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4954 caller_context_t *ct)
4955 {
4956 znode_t *zp, *xzp;
4957 zfsvfs_t *zfsvfs;
4958 int error;
4959
4960 switch (cmd) {
4961 case _PC_LINK_MAX:
4962 *valp = INT_MAX;
4963 return (0);
4964
4965 case _PC_FILESIZEBITS:
4966 *valp = 64;
4967 return (0);
4968 #ifdef illumos
4969 case _PC_XATTR_EXISTS:
4970 zp = VTOZ(vp);
4971 zfsvfs = zp->z_zfsvfs;
4972 ZFS_ENTER(zfsvfs);
4973 ZFS_VERIFY_ZP(zp);
4974 *valp = 0;
4975 error = zfs_dirent_lookup(zp, "", &xzp,
4976 ZXATTR | ZEXISTS | ZSHARED);
4977 if (error == 0) {
4978 if (!zfs_dirempty(xzp))
4979 *valp = 1;
4980 vrele(ZTOV(xzp));
4981 } else if (error == ENOENT) {
4982 /*
4983 * If there aren't extended attributes, it's the
4984 * same as having zero of them.
4985 */
4986 error = 0;
4987 }
4988 ZFS_EXIT(zfsvfs);
4989 return (error);
4990
4991 case _PC_SATTR_ENABLED:
4992 case _PC_SATTR_EXISTS:
4993 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4994 (vp->v_type == VREG || vp->v_type == VDIR);
4995 return (0);
4996
4997 case _PC_ACCESS_FILTERING:
4998 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4999 vp->v_type == VDIR;
5000 return (0);
5001
5002 case _PC_ACL_ENABLED:
5003 *valp = _ACL_ACE_ENABLED;
5004 return (0);
5005 #endif /* illumos */
5006 case _PC_MIN_HOLE_SIZE:
5007 *valp = (int)SPA_MINBLOCKSIZE;
5008 return (0);
5009 #ifdef illumos
5010 case _PC_TIMESTAMP_RESOLUTION:
5011 /* nanosecond timestamp resolution */
5012 *valp = 1L;
5013 return (0);
5014 #endif
5015 case _PC_ACL_EXTENDED:
5016 *valp = 0;
5017 return (0);
5018
5019 #ifndef __NetBSD__
5020 case _PC_ACL_NFS4:
5021 *valp = 1;
5022 return (0);
5023
5024 case _PC_ACL_PATH_MAX:
5025 *valp = ACL_MAX_ENTRIES;
5026 return (0);
5027 #endif
5028
5029 default:
5030 return (EOPNOTSUPP);
5031 }
5032 }
5033
5034 /*ARGSUSED*/
5035 static int
5036 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5037 caller_context_t *ct)
5038 {
5039 znode_t *zp = VTOZ(vp);
5040 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5041 int error;
5042 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5043
5044 ZFS_ENTER(zfsvfs);
5045 ZFS_VERIFY_ZP(zp);
5046 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5047 ZFS_EXIT(zfsvfs);
5048
5049 return (error);
5050 }
5051
5052 /*ARGSUSED*/
5053 int
5054 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5055 caller_context_t *ct)
5056 {
5057 znode_t *zp = VTOZ(vp);
5058 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5059 int error;
5060 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5061 zilog_t *zilog = zfsvfs->z_log;
5062
5063 ZFS_ENTER(zfsvfs);
5064 ZFS_VERIFY_ZP(zp);
5065
5066 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5067
5068 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5069 zil_commit(zilog, 0);
5070
5071 ZFS_EXIT(zfsvfs);
5072 return (error);
5073 }
5074
5075 static int
5076 ioflags(int ioflags)
5077 {
5078 int flags = 0;
5079
5080 if (ioflags & IO_APPEND)
5081 flags |= FAPPEND;
5082 if (ioflags & IO_NDELAY)
5083 flags |= FNONBLOCK;
5084 if (ioflags & IO_SYNC)
5085 flags |= (FSYNC | FDSYNC | FRSYNC);
5086
5087 return (flags);
5088 }
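
/*
 * For example, a VOP_WRITE issued with IO_APPEND | IO_SYNC reaches
 * zfs_write() as FAPPEND | FSYNC | FDSYNC | FRSYNC, so both the
 * append-mode and the synchronous-write semantics survive the
 * translation from kernel ioflags to illumos-style file flags.
 */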
5089
5090 #ifdef __NetBSD__
5091
5092 static int
5093 zfs_netbsd_open(void *v)
5094 {
5095 struct vop_open_args *ap = v;
5096
5097 return (zfs_open(&ap->a_vp, ap->a_mode, ap->a_cred, NULL));
5098 }
5099
5100 static int
5101 zfs_netbsd_close(void *v)
5102 {
5103 struct vop_close_args *ap = v;
5104
5105 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
5106 }
5107
5108 static int
5109 zfs_netbsd_ioctl(void *v)
5110 {
5111 struct vop_ioctl_args *ap = v;
5112
5113 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5114 ap->a_fflag, ap->a_cred, NULL, NULL));
5115 }
5116
5117
5118 static int
5119 zfs_netbsd_read(void *v)
5120 {
5121 struct vop_read_args *ap = v;
5122 vnode_t *vp = ap->a_vp;
5123 znode_t *zp = VTOZ(vp);
5124
5125 switch (vp->v_type) {
5126 case VBLK:
5127 case VCHR:
5128 ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5129 return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap));
5130 case VFIFO:
5131 ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5132 return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap));
5133 }
5134
5135 return (zfs_read(vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
5136 }
5137
5138 static int
5139 zfs_netbsd_write(void *v)
5140 {
5141 struct vop_write_args *ap = v;
5142 vnode_t *vp = ap->a_vp;
5143 znode_t *zp = VTOZ(vp);
5144 struct uio *uio = ap->a_uio;
5145 off_t osize = zp->z_size;
5146 int error, resid;
5147
5148 switch (vp->v_type) {
5149 case VBLK:
5150 case VCHR:
5151 GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5152 return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap));
5153 case VFIFO:
5154 GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5155 return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap));
5156 }
5157
5158 resid = uio->uio_resid;
5159 error = zfs_write(vp, uio, ioflags(ap->a_ioflag), ap->a_cred, NULL);
5160
5161 return error;
5162 }
5163
5164 static int
5165 zfs_netbsd_access(void *v)
5166 {
5167 struct vop_access_args /* {
5168 struct vnode *a_vp;
5169 accmode_t a_accmode;
5170 kauth_cred_t a_cred;
5171 } */ *ap = v;
5172 vnode_t *vp = ap->a_vp;
5173 znode_t *zp = VTOZ(vp);
5174 accmode_t accmode;
5175 kauth_cred_t cred = ap->a_cred;
5176 int error = 0;
5177
5178 /*
5179 	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
5180 */
5181 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5182 if (accmode != 0)
5183 error = zfs_access(vp, accmode, 0, cred, NULL);
5184
5185 /*
5186 * VADMIN has to be handled by kauth_authorize_vnode().
5187 */
5188 if (error == 0) {
5189 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5190 if (accmode != 0) {
5191 error = kauth_authorize_vnode(cred,
5192 KAUTH_ACCESS_ACTION(accmode, vp->v_type,
5193 zp->z_mode & ALLPERMS), vp, NULL,
5194 genfs_can_access(vp, cred, zp->z_uid,
5195 zp->z_gid, zp->z_mode & ALLPERMS, NULL, accmode));
5196 }
5197 }
5198
5199 /*
5200 * For VEXEC, ensure that at least one execute bit is set for
5201 * non-directories.
5202 */
5203 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5204 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5205 error = EACCES;
5206 }
5207
5208 /* We expect EACCES as common error. */
5209 if (error == EPERM)
5210 error = EACCES;
5211
5212 return error;
5213 }
5214
5215 static int
5216 zfs_netbsd_lookup(void *v)
5217 {
5218 struct vop_lookup_v2_args /* {
5219 struct vnode *a_dvp;
5220 struct vnode **a_vpp;
5221 struct componentname *a_cnp;
5222 } */ *ap = v;
5223 struct vnode *dvp = ap->a_dvp;
5224 struct vnode **vpp = ap->a_vpp;
5225 struct componentname *cnp = ap->a_cnp;
5226 char *nm, short_nm[31];
5227 int error;
5228 int iswhiteout;
5229
5230 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5231
5232 *vpp = NULL;
5233
5234 /*
5235 * Do an access check before the cache lookup. zfs_lookup does
5236 * an access check too, but it's too scary to contemplate
5237 * injecting our namecache stuff into zfs internals.
5238 *
5239 * XXX Is this the correct access check?
5240 */
5241 if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
5242 goto out;
5243
5244 /*
5245 * Check the namecache before entering zfs_lookup.
5246 * cache_lookup does the locking dance for us.
5247 */
5248 if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
5249 cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
5250 if (iswhiteout) {
5251 cnp->cn_flags |= ISWHITEOUT;
5252 }
5253 return *vpp == NULL ? ENOENT : 0;
5254 }
5255
5256 /*
5257 * zfs_lookup wants a null-terminated component name, but namei
5258 * gives us a pointer into the full pathname.
5259 */
5260 ASSERT(cnp->cn_namelen < PATH_MAX - 1);
5261 if (cnp->cn_namelen + 1 > sizeof(short_nm))
5262 nm = PNBUF_GET();
5263 else
5264 nm = short_nm;
5265 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5266
5267 error = zfs_lookup(dvp, nm, vpp, 0, cnp, cnp->cn_nameiop, cnp->cn_cred);
5268
5269 if (nm != short_nm)
5270 PNBUF_PUT(nm);
5271
5272 /*
5273 * Translate errors to match our namei insanity. Also, if the
5274 * caller wants to create an entry here, it's apparently our
5275 * responsibility as lookup to make sure that's permissible.
5276 * Go figure.
5277 */
5278 if (cnp->cn_flags & ISLASTCN) {
5279 switch (cnp->cn_nameiop) {
5280 case CREATE:
5281 case RENAME:
5282 if (error == ENOENT) {
5283 error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5284 if (error)
5285 break;
5286 error = EJUSTRETURN;
5287 break;
5288 }
5289 break;
5290 case DELETE:
5291 if (error == 0) {
5292 error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5293 if (error) {
5294 VN_RELE(*vpp);
5295 *vpp = NULL;
5296 }
5297 }
5298 break;
5299 }
5300 }
5301
5302 if (error) {
5303 KASSERT(*vpp == NULL);
5304 goto out;
5305 }
5306 KASSERT(*vpp != NULL);
5307
5308 if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
5309 KASSERT(!(cnp->cn_flags & ISDOTDOT));
5310 KASSERT(dvp == *vpp);
5311 } else if ((cnp->cn_namelen == 2) &&
5312 (cnp->cn_nameptr[0] == '.') &&
5313 (cnp->cn_nameptr[1] == '.')) {
5314 KASSERT(cnp->cn_flags & ISDOTDOT);
5315 } else {
5316 KASSERT(!(cnp->cn_flags & ISDOTDOT));
5317 }
5318
5319 out:
5320 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5321
5322 /*
5323 * Insert name into cache if appropriate.
5324 */
5325
5326 if (error == 0 || (error == ENOENT && cnp->cn_nameiop != CREATE))
5327 cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
5328 cnp->cn_flags);
5329
5330 return (error);
5331 }
5332
5333 static int
5334 zfs_netbsd_create(void *v)
5335 {
5336 struct vop_create_v3_args /* {
5337 struct vnode *a_dvp;
5338 struct vnode **a_vpp;
5339 struct componentname *a_cnp;
5340 struct vattr *a_vap;
5341 } */ *ap = v;
5342 struct vnode *dvp = ap->a_dvp;
5343 struct vnode **vpp = ap->a_vpp;
5344 struct componentname *cnp = ap->a_cnp;
5345 struct vattr *vap = ap->a_vap;
5346 char *nm;
5347 int mode;
5348 int error;
5349
5350 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5351
5352 vattr_init_mask(vap);
5353 mode = vap->va_mode & ALLPERMS;
5354
5355 /* ZFS wants a null-terminated name. */
5356 nm = PNBUF_GET();
5357 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5358
5359 /* XXX !EXCL is wrong here... */
5360 error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5361
5362 PNBUF_PUT(nm);
5363
5364 KASSERT((error == 0) == (*vpp != NULL));
5365 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5366 if (*vpp != NULL)
5367 VOP_UNLOCK(*vpp, 0);
5368
5369 return (error);
5370 }
5371
5372 static int
5373 zfs_netbsd_mknod(void *v)
5374 {
5375 struct vop_mknod_v3_args /* {
5376 struct vnode *a_dvp;
5377 struct vnode **a_vpp;
5378 struct componentname *a_cnp;
5379 struct vattr *a_vap;
5380 } */ *ap = v;
5381 struct vnode *dvp = ap->a_dvp;
5382 struct vnode **vpp = ap->a_vpp;
5383 struct componentname *cnp = ap->a_cnp;
5384 struct vattr *vap = ap->a_vap;
5385 char *nm;
5386 int mode;
5387 int error;
5388
5389 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5390
5391 vattr_init_mask(vap);
5392 mode = vap->va_mode & ALLPERMS;
5393
5394 /* ZFS wants a null-terminated name. */
5395 nm = PNBUF_GET();
5396 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5397
5398 /* XXX !EXCL is wrong here... */
5399 error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5400
5401 PNBUF_PUT(nm);
5402
5403 KASSERT((error == 0) == (*vpp != NULL));
5404 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5405 if (*vpp != NULL)
5406 VOP_UNLOCK(*vpp, 0);
5407
5408 return (error);
5409 }
5410
5411 static int
5412 zfs_netbsd_remove(void *v)
5413 {
5414 struct vop_remove_v3_args /* {
5415 struct vnode *a_dvp;
5416 struct vnode *a_vp;
5417 struct componentname *a_cnp;
5418 nlink_t ctx_vp_new_nlink;
5419 } */ *ap = v;
5420 struct vnode *dvp = ap->a_dvp;
5421 struct vnode *vp = ap->a_vp;
5422 struct componentname *cnp = ap->a_cnp;
5423 char *nm;
5424 int error;
5425
5426 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5427 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5428
5429 /* ZFS wants a null-terminated name. */
5430 nm = PNBUF_GET();
5431 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5432
5433 error = zfs_remove(dvp, vp, nm, cnp->cn_cred);
5434
5435 /*
5436 	 * XXX Should update ctx_vp_new_nlink, but for now the
5437 	 * XXX kevent sent on "vp" matches historical behavior.
5438 */
5439
5440 PNBUF_PUT(nm);
5441 vput(vp);
5442 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5443 return (error);
5444 }
5445
5446 static int
5447 zfs_netbsd_mkdir(void *v)
5448 {
5449 struct vop_mkdir_v3_args /* {
5450 struct vnode *a_dvp;
5451 struct vnode **a_vpp;
5452 struct componentname *a_cnp;
5453 struct vattr *a_vap;
5454 } */ *ap = v;
5455 struct vnode *dvp = ap->a_dvp;
5456 struct vnode **vpp = ap->a_vpp;
5457 struct componentname *cnp = ap->a_cnp;
5458 struct vattr *vap = ap->a_vap;
5459 char *nm;
5460 int error;
5461
5462 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5463
5464 vattr_init_mask(vap);
5465
5466 /* ZFS wants a null-terminated name. */
5467 nm = PNBUF_GET();
5468 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5469
5470 error = zfs_mkdir(dvp, nm, vap, vpp, cnp->cn_cred);
5471
5472 PNBUF_PUT(nm);
5473
5474 KASSERT((error == 0) == (*vpp != NULL));
5475 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5476 if (*vpp != NULL)
5477 VOP_UNLOCK(*vpp, 0);
5478
5479 return (error);
5480 }
5481
5482 static int
5483 zfs_netbsd_rmdir(void *v)
5484 {
5485 struct vop_rmdir_v2_args /* {
5486 struct vnode *a_dvp;
5487 struct vnode *a_vp;
5488 struct componentname *a_cnp;
5489 } */ *ap = v;
5490 struct vnode *dvp = ap->a_dvp;
5491 struct vnode *vp = ap->a_vp;
5492 struct componentname *cnp = ap->a_cnp;
5493 char *nm;
5494 int error;
5495
5496 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5497 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5498
5499 /* ZFS wants a null-terminated name. */
5500 nm = PNBUF_GET();
5501 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5502
5503 error = zfs_rmdir(dvp, vp, nm, cnp->cn_cred);
5504
5505 PNBUF_PUT(nm);
5506 vput(vp);
5507 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5508 return error;
5509 }
5510
5511 static int
5512 zfs_netbsd_readdir(void *v)
5513 {
5514 struct vop_readdir_args *ap = v;
5515
5516 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5517 ap->a_ncookies, ap->a_cookies));
5518 }
5519
5520 static int
5521 zfs_netbsd_fsync(void *v)
5522 {
5523 struct vop_fsync_args *ap = v;
5524
5525 return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5526 }
5527
5528 static int
5529 zfs_spec_fsync(void *v)
5530 {
5531 struct vop_fsync_args *ap = v;
5532 int error;
5533
5534 error = spec_fsync(v);
5535 if (error)
5536 return error;
5537
5538 return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5539 }
5540
5541 static int
5542 zfs_netbsd_getattr(void *v)
5543 {
5544 struct vop_getattr_args *ap = v;
5545 vattr_t *vap = ap->a_vap;
5546 xvattr_t xvap;
5547 u_long fflags = 0;
5548 int error;
5549
5550 xva_init(&xvap);
5551 xvap.xva_vattr = *vap;
5552 xvap.xva_vattr.va_mask |= AT_XVATTR;
5553
5554 /* Convert chflags into ZFS-type flags. */
5555 	/* XXX: what about SF_SETTABLE? */
5556 XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5557 XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5558 XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5559 XVA_SET_REQ(&xvap, XAT_NODUMP);
5560 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5561 if (error != 0)
5562 return (error);
5563
5564 /* Convert ZFS xattr into chflags. */
5565 #define FLAG_CHECK(fflag, xflag, xfield) do { \
5566 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
5567 fflags |= (fflag); \
5568 } while (0)
5569 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5570 xvap.xva_xoptattrs.xoa_immutable);
5571 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5572 xvap.xva_xoptattrs.xoa_appendonly);
5573 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5574 xvap.xva_xoptattrs.xoa_nounlink);
5575 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5576 xvap.xva_xoptattrs.xoa_nodump);
5577 #undef FLAG_CHECK
5578 *vap = xvap.xva_vattr;
5579 vap->va_flags = fflags;
5580 return (0);
5581 }
5582
5583 static int
5584 zfs_netbsd_setattr(void *v)
5585 {
5586 struct vop_setattr_args *ap = v;
5587 vnode_t *vp = ap->a_vp;
5588 vattr_t *vap = ap->a_vap;
5589 cred_t *cred = ap->a_cred;
5590 znode_t *zp = VTOZ(vp);
5591 xvattr_t xvap;
5592 kauth_action_t action;
5593 u_long fflags, sfflags = 0;
5594 uint64_t zflags;
5595 int error, flags = 0;
5596 bool changing_sysflags;
5597
5598 vattr_init_mask(vap);
5599 vap->va_mask &= ~AT_NOSET;
5600 if (ISSET(vap->va_vaflags, VA_UTIMES_NULL))
5601 flags |= ATTR_UTIME;
5602
5603 xva_init(&xvap);
5604 xvap.xva_vattr = *vap;
5605
5606 zflags = VTOZ(vp)->z_pflags;
5607
5608 /* Ignore size changes on device nodes. */
5609 if (vp->v_type == VBLK || vp->v_type == VCHR)
5610 xvap.xva_vattr.va_mask &= ~AT_SIZE;
5611 if (vap->va_flags != VNOVAL) {
5612 int error;
5613
5614 fflags = vap->va_flags;
5615 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5616 return (EOPNOTSUPP);
5617
5618 #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
5619 if (((fflags & (fflag)) && !(zflags & (zflag))) || \
5620 ((zflags & (zflag)) && !(fflags & (fflag)))) { \
5621 XVA_SET_REQ(&xvap, (xflag)); \
5622 (xfield) = ((fflags & (fflag)) != 0); \
5623 if (((fflag) & SF_SETTABLE) != 0) \
5624 sfflags |= (fflag); \
5625 } \
5626 } while (0)
5627 /* Convert chflags into ZFS-type flags. */
5628 		/* XXX: what about SF_SETTABLE? */
5629 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5630 xvap.xva_xoptattrs.xoa_immutable);
5631 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5632 xvap.xva_xoptattrs.xoa_appendonly);
5633 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5634 xvap.xva_xoptattrs.xoa_nounlink);
5635 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5636 xvap.xva_xoptattrs.xoa_nodump);
5637 #undef FLAG_CHANGE
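		/*
		 * Example: chflags(2) adding SF_IMMUTABLE to a file whose
		 * znode lacks ZFS_IMMUTABLE takes the FLAG_CHANGE branch:
		 * XAT_IMMUTABLE is requested, xoa_immutable is set to 1,
		 * and because SF_IMMUTABLE is within SF_SETTABLE it is
		 * added to sfflags, which in turn selects the stricter
		 * KAUTH_VNODE_WRITE_SYSFLAGS authorization below.
		 */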
5638
5639 action = KAUTH_VNODE_WRITE_FLAGS;
5640 changing_sysflags = false;
5641
5642 if (zflags & (ZFS_IMMUTABLE|ZFS_APPENDONLY|ZFS_NOUNLINK)) {
5643 action |= KAUTH_VNODE_HAS_SYSFLAGS;
5644 }
5645 if (sfflags != 0) {
5646 action |= KAUTH_VNODE_WRITE_SYSFLAGS;
5647 changing_sysflags = true;
5648 }
5649
5650 error = kauth_authorize_vnode(cred, action, vp, NULL,
5651 genfs_can_chflags(vp, cred, zp->z_uid, changing_sysflags));
5652 if (error)
5653 return error;
5654 }
5655
5656 if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
5657 vap->va_birthtime.tv_sec != VNOVAL) {
5658 error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp,
5659 NULL, genfs_can_chtimes(vp, cred, zp->z_uid,
5660 vap->va_vaflags));
5661 if (error)
5662 return error;
5663 }
5664
5665 error = zfs_setattr(vp, (vattr_t *)&xvap, flags, cred, NULL);
5666 if (error)
5667 return error;
5668
5669 cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid, true);
5670
5671 return error;
5672 }
5673
5674 static int
5675 zfs_netbsd_rename(void *v)
5676 {
5677 struct vop_rename_args /* {
5678 struct vnode *a_fdvp;
5679 struct vnode *a_fvp;
5680 struct componentname *a_fcnp;
5681 struct vnode *a_tdvp;
5682 struct vnode *a_tvp;
5683 struct componentname *a_tcnp;
5684 } */ *ap = v;
5685 vnode_t *fdvp = ap->a_fdvp;
5686 vnode_t *fvp = ap->a_fvp;
5687 struct componentname *fcnp = ap->a_fcnp;
5688 vnode_t *tdvp = ap->a_tdvp;
5689 vnode_t *tvp = ap->a_tvp;
5690 struct componentname *tcnp = ap->a_tcnp;
5691 kauth_cred_t cred;
5692 int error;
5693
5694 KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
5695 KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
5696 KASSERT(fdvp->v_type == VDIR);
5697 KASSERT(tdvp->v_type == VDIR);
5698
5699 cred = fcnp->cn_cred;
5700
5701 /*
5702 * XXX Want a better equality test. `tcnp->cn_cred == cred'
5703 * hoses p2k because puffs transmits the creds separately and
5704 * allocates distinct but equivalent structures for them.
5705 */
5706 KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
5707
5708 /*
5709 * Drop the insane locks.
5710 */
5711 VOP_UNLOCK(tdvp, 0);
5712 if (tvp != NULL && tvp != tdvp)
5713 VOP_UNLOCK(tvp, 0);
5714
5715 /*
5716 * Release the source and target nodes; zfs_rename will look
5717 * them up again once the locking situation is sane.
5718 */
5719 VN_RELE(fvp);
5720 if (tvp != NULL)
5721 VN_RELE(tvp);
5722 fvp = NULL;
5723 tvp = NULL;
5724
5725 /*
5726 * Do the rename ZFSly.
5727 */
5728 error = zfs_rename(fdvp, &fvp, fcnp, tdvp, &tvp, tcnp, cred);
5729
5730 /*
5731 * Release the directories now too, because the VOP_RENAME
5732 * protocol is insane.
5733 */
5734
5735 VN_RELE(fdvp);
5736 VN_RELE(tdvp);
5737 if (fvp != NULL)
5738 VN_RELE(fvp);
5739 if (tvp != NULL)
5740 VN_RELE(tvp);
5741
5742 return (error);
5743 }
5744
5745 static int
5746 zfs_netbsd_symlink(void *v)
5747 {
5748 struct vop_symlink_v3_args /* {
5749 struct vnode *a_dvp;
5750 struct vnode **a_vpp;
5751 struct componentname *a_cnp;
5752 struct vattr *a_vap;
5753 char *a_target;
5754 } */ *ap = v;
5755 struct vnode *dvp = ap->a_dvp;
5756 struct vnode **vpp = ap->a_vpp;
5757 struct componentname *cnp = ap->a_cnp;
5758 struct vattr *vap = ap->a_vap;
5759 char *target = ap->a_target;
5760 char *nm;
5761 int error;
5762
5763 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5764
5765 	vap->va_type = VLNK;	/* NetBSD: the syscall only sets va_mode. */
5766 vattr_init_mask(vap);
5767
5768 /* ZFS wants a null-terminated name. */
5769 nm = PNBUF_GET();
5770 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5771
5772 error = zfs_symlink(dvp, vpp, nm, vap, target, cnp->cn_cred, 0);
5773
5774 PNBUF_PUT(nm);
5775 KASSERT((error == 0) == (*vpp != NULL));
5776 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5777 if (*vpp != NULL)
5778 VOP_UNLOCK(*vpp, 0);
5779
5780 return (error);
5781 }
5782
5783 static int
5784 zfs_netbsd_readlink(void *v)
5785 {
5786 struct vop_readlink_args *ap = v;
5787
5788 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5789 }
5790
5791 static int
5792 zfs_netbsd_link(void *v)
5793 {
5794 struct vop_link_v2_args /* {
5795 struct vnode *a_dvp;
5796 struct vnode *a_vp;
5797 struct componentname *a_cnp;
5798 } */ *ap = v;
5799 struct vnode *dvp = ap->a_dvp;
5800 struct vnode *vp = ap->a_vp;
5801 struct componentname *cnp = ap->a_cnp;
5802 char *nm;
5803 int error;
5804
5805 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5806
5807 /* ZFS wants a null-terminated name. */
5808 nm = PNBUF_GET();
5809 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5810
5811 if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
5812 /* XXX: No ABORTOP? */
5813 PNBUF_PUT(nm);
5814 return error;
5815 }
5816 error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp,
5817 dvp, 0);
5818 if (error)
5819 goto out;
5820 error = zfs_link(dvp, vp, nm, cnp->cn_cred,
5821 NULL, 0);
5822
5823 out:
5824 PNBUF_PUT(nm);
5825 VOP_UNLOCK(vp, 0);
5826 return error;
5827 }
5828
5829 static int
5830 zfs_netbsd_inactive(void *v)
5831 {
5832 struct vop_inactive_v2_args *ap = v;
5833 vnode_t *vp = ap->a_vp;
5834 znode_t *zp = VTOZ(vp);
5835
5836 /*
5837 * NetBSD: nothing to do here, other than indicate if the
5838 	 * vnode should be reclaimed.  No need to lock; if we race,
5839 	 * vrele() will call us again.
5840 */
5841 *ap->a_recycle = (zp->z_unlinked != 0);
5842
5843 return (0);
5844 }
5845
5846 static int
5847 zfs_netbsd_reclaim(void *v)
5848 {
5849 struct vop_reclaim_v2_args /* {
5850 struct vnode *a_vp;
5851 } */ *ap = v;
5852 struct vnode *vp = ap->a_vp;
5853 znode_t *zp;
5854 zfsvfs_t *zfsvfs;
5855 int error;
5856
5857 VOP_UNLOCK(vp, 0);
5858 zp = VTOZ(vp);
5859 zfsvfs = zp->z_zfsvfs;
5860
5861 KASSERTMSG(!vn_has_cached_data(vp), "vp %p", vp);
5862
5863 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5864
5865 /*
5866 * Process a deferred atime update.
5867 */
5868 if (zp->z_atime_dirty && zp->z_unlinked == 0 && zp->z_sa_hdl != NULL) {
5869 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
5870
5871 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5872 zfs_sa_upgrade_txholds(tx, zp);
5873 error = dmu_tx_assign(tx, TXG_WAIT);
5874 if (error) {
5875 dmu_tx_abort(tx);
5876 } else {
5877 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
5878 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
5879 zp->z_atime_dirty = 0;
5880 dmu_tx_commit(tx);
5881 }
5882 }
5883
5884 /*
5885 * Operation zfs_znode.c::zfs_zget_cleaner() depends on this
5886 * zil_commit() as a barrier to guarantee the znode cannot
5887 * get freed before its log entries are resolved.
5888 */
5889 if (zfsvfs->z_log)
5890 zil_commit(zfsvfs->z_log, zp->z_id);
5891
5892 if (zp->z_sa_hdl == NULL)
5893 zfs_znode_free(zp);
5894 else
5895 zfs_zinactive(zp);
5896 rw_exit(&zfsvfs->z_teardown_inactive_lock);
5897 return 0;
5898 }
5899
5900 static int
5901 zfs_netbsd_fid(void *v)
5902 {
5903 struct vop_fid_args *ap = v;
5904
5905 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5906 }
5907
5908 static int
5909 zfs_netbsd_pathconf(void *v)
5910 {
5911 struct vop_pathconf_args *ap = v;
5912 ulong_t val;
5913 int error;
5914
5915 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL);
5916 if (error == 0)
5917 *ap->a_retval = val;
5918 else if (error == EOPNOTSUPP) {
5919 switch (ap->a_name) {
5920 case _PC_NAME_MAX:
5921 *ap->a_retval = NAME_MAX;
5922 return (0);
5923 case _PC_PATH_MAX:
5924 *ap->a_retval = PATH_MAX;
5925 return (0);
5926 case _PC_LINK_MAX:
5927 *ap->a_retval = LINK_MAX;
5928 return (0);
5929 case _PC_MAX_CANON:
5930 *ap->a_retval = MAX_CANON;
5931 return (0);
5932 case _PC_MAX_INPUT:
5933 *ap->a_retval = MAX_INPUT;
5934 return (0);
5935 case _PC_PIPE_BUF:
5936 *ap->a_retval = PIPE_BUF;
5937 return (0);
5938 case _PC_CHOWN_RESTRICTED:
5939 *ap->a_retval = 1;
5940 return (0);
5941 case _PC_NO_TRUNC:
5942 *ap->a_retval = 1;
5943 return (0);
5944 case _PC_VDISABLE:
5945 *ap->a_retval = _POSIX_VDISABLE;
5946 return (0);
5947 default:
5948 return (EINVAL);
5949 }
5950 /* NOTREACHED */
5951 }
5952 return (error);
5953 }
5954
5955 static int
5956 zfs_netbsd_advlock(void *v)
5957 {
5958 struct vop_advlock_args /* {
5959 struct vnode *a_vp;
5960 void *a_id;
5961 int a_op;
5962 struct flock *a_fl;
5963 int a_flags;
5964 } */ *ap = v;
5965 struct vnode *vp;
5966 struct znode *zp;
5967 struct zfsvfs *zfsvfs;
5968 int error;
5969
5970 vp = ap->a_vp;
5971 zp = VTOZ(vp);
5972 zfsvfs = zp->z_zfsvfs;
5973
5974 ZFS_ENTER(zfsvfs);
5975 ZFS_VERIFY_ZP(zp);
5976 error = lf_advlock(ap, &zp->z_lockf, zp->z_size);
5977 ZFS_EXIT(zfsvfs);
5978
5979 return error;
5980 }
5981
5982 static int
5983 zfs_netbsd_getpages(void *v)
5984 {
5985 struct vop_getpages_args /* {
5986 struct vnode *a_vp;
5987 voff_t a_offset;
5988 struct vm_page **a_m;
5989 int *a_count;
5990 int a_centeridx;
5991 vm_prot_t a_access_type;
5992 int a_advice;
5993 int a_flags;
5994 } */ * const ap = v;
5995
5996 vnode_t *const vp = ap->a_vp;
5997 const int flags = ap->a_flags;
5998 const bool async = (flags & PGO_SYNCIO) == 0;
5999 const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
6000
6001 struct uvm_object * const uobj = &vp->v_uobj;
6002 krwlock_t * const rw = uobj->vmobjlock;
6003 znode_t *zp = VTOZ(vp);
6004 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6005 vfs_t *mp;
6006 struct vm_page *pg;
6007 caddr_t va;
6008 int npages = *ap->a_count, found, err = 0;
6009
6010 if (flags & PGO_LOCKED) {
6011 uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL,
6012 UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
6013 (memwrite ? UFP_NORDONLY : 0));
6014 KASSERT(npages == *ap->a_count);
6015 if (memwrite) {
6016 KASSERT(rw_write_held(uobj->vmobjlock));
6017 for (int i = 0; i < npages; i++) {
6018 pg = ap->a_m[i];
6019 if (pg == NULL || pg == PGO_DONTCARE) {
6020 continue;
6021 }
6022 if (uvm_pagegetdirty(pg) ==
6023 UVM_PAGE_STATUS_CLEAN) {
6024 uvm_pagemarkdirty(pg,
6025 UVM_PAGE_STATUS_UNKNOWN);
6026 }
6027 }
6028 }
6029 return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
6030 }
6031 rw_exit(rw);
6032
6033 if (async) {
6034 return 0;
6035 }
6036
6037 mp = vp->v_mount;
6038 fstrans_start(mp);
6039 if (vp->v_mount != mp) {
6040 fstrans_done(mp);
6041 return ENOENT;
6042 }
6043 ZFS_ENTER(zfsvfs);
6044 ZFS_VERIFY_ZP(zp);
6045
6046 rw_enter(rw, RW_WRITER);
6047 if (ap->a_offset + (npages << PAGE_SHIFT) > round_page(vp->v_size)) {
6048 rw_exit(rw);
6049 ZFS_EXIT(zfsvfs);
6050 fstrans_done(mp);
6051 return EINVAL;
6052 }
6053 uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL, UFP_ALL);
6054 KASSERT(npages == *ap->a_count);
6055
6056 for (int i = 0; i < npages; i++) {
6057 pg = ap->a_m[i];
6058 if (pg->flags & PG_FAKE) {
6059 voff_t offset = pg->offset;
6060 KASSERT(pg->offset == ap->a_offset + (i << PAGE_SHIFT));
6061 rw_exit(rw);
6062
6063 va = zfs_map_page(pg, S_WRITE);
6064 err = dmu_read(zfsvfs->z_os, zp->z_id, offset,
6065 PAGE_SIZE, va, DMU_READ_PREFETCH);
6066 zfs_unmap_page(pg, va);
6067
6068 if (err != 0) {
6069 uvm_aio_aiodone_pages(ap->a_m, npages, false, err);
6070 memset(ap->a_m, 0, sizeof(ap->a_m[0]) *
6071 npages);
6072 break;
6073 }
6074 rw_enter(rw, RW_WRITER);
6075 pg->flags &= ~(PG_FAKE);
6076 }
6077
6078 if (memwrite && uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
6079 /* For write faults, start dirtiness tracking. */
6080 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
6081 }
6082 }
6083 rw_exit(rw);
6084
6085 ZFS_EXIT(zfsvfs);
6086 fstrans_done(mp);
6087
6088 return (err);
6089 }
6090
6091 static int
6092 zfs_putapage(vnode_t *vp, page_t **pp, int count, int flags)
6093 {
6094 znode_t *zp = VTOZ(vp);
6095 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6096 dmu_tx_t *tx;
6097 voff_t off, koff;
6098 voff_t len, klen;
6099 int err;
6100
6101 bool *cleanedp;
6102 struct uvm_object *uobj = &vp->v_uobj;
6103 krwlock_t *rw = uobj->vmobjlock;
6104
6105 if (zp->z_sa_hdl == NULL) {
6106 err = 0;
6107 goto out;
6108 }
6109
6110 /*
6111 * Calculate the length and assert that no whole pages are past EOF.
6112 * This check is equivalent to "off + len <= round_page(zp->z_size)",
6113 * with gyrations to avoid signed integer overflow.
6114 */
6115
6116 off = pp[0]->offset;
6117 len = count * PAGESIZE;
6118 KASSERT(off <= zp->z_size);
6119 KASSERT(len <= round_page(zp->z_size));
6120 KASSERT(off <= round_page(zp->z_size) - len);
6121
6122 /*
6123 * If EOF is within the last page, reduce len to avoid writing past
6124 * the file size in the ZFS buffer. Assert that
6125 * "off + len <= zp->z_size", again avoiding signed integer overflow.
6126 */
6127
6128 if (len > zp->z_size - off) {
6129 len = zp->z_size - off;
6130 }
6131 KASSERT(len <= zp->z_size);
6132 KASSERT(off <= zp->z_size - len);
6133
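	/*
	 * Worked example of the clamping above (hypothetical numbers):
	 * with z_size = 10000, PAGESIZE = 4096 and a single page at
	 * off = 8192, len starts as 4096; since 4096 > 10000 - 8192 =
	 * 1808, len is reduced to 1808 and the DMU write below ends
	 * exactly at EOF.
	 */
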
6134 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
6135 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
6136 err = SET_ERROR(EDQUOT);
6137 goto out;
6138 }
6139 tx = dmu_tx_create(zfsvfs->z_os);
6140 dmu_tx_hold_write(tx, zp->z_id, off, len);
6141
6142 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
6143 zfs_sa_upgrade_txholds(tx, zp);
6144 err = dmu_tx_assign(tx, TXG_WAIT);
6145 if (err != 0) {
6146 dmu_tx_abort(tx);
6147 goto out;
6148 }
6149
6150 if (zp->z_blksz <= PAGESIZE) {
6151 KASSERTMSG(count == 1, "vp %p pp %p count %d", vp, pp, count);
6152 caddr_t va = zfs_map_page(*pp, S_READ);
6153 ASSERT3U(len, <=, PAGESIZE);
6154 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
6155 zfs_unmap_page(*pp, va);
6156 } else {
6157 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
6158 }
	cleanedp = tsd_get(zfs_putpage_key);
	*cleanedp = true;

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
	}
	dmu_tx_commit(tx);

out:
	uvm_aio_aiodone_pages(pp, count, true, err);
	return (err);
}

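/*
 * genfs gop_markupdate hook: stamp mtime/ctime when the file is modified
 * through the page cache (e.g. stores into an mmap(2) mapping).
 */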
static void
zfs_netbsd_gop_markupdate(vnode_t *vp, int flags)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	sa_bulk_attr_t bulk[2];
	uint64_t mtime[2], ctime[2];
	int count = 0, err;

	KASSERT(flags == GOP_UPDATE_MODIFIED);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return;
	}
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
	/*
	 * Push the freshly generated timestamps out through the SA layer;
	 * without this the bulk attributes built above are never written.
	 */
	err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(err);
	dmu_tx_commit(tx);
}

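/*
 * VOP_PUTPAGES entry point.  Wraps genfs_putpages(), adding (for cleaning
 * flushes) fstrans protection, the teardown lock, and a ZFS range lock
 * covering the window being written back.
 */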
static int
zfs_netbsd_putpages(void *v)
{
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		voff_t a_offlo;
		voff_t a_offhi;
		int a_flags;
	} */ * const ap = v;

	struct vnode *vp = ap->a_vp;
	voff_t offlo = ap->a_offlo;
	voff_t offhi = ap->a_offhi;
	int flags = ap->a_flags;

	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	rl_t *rl = NULL;
	uint64_t len;
	int error;
	bool cleaned = false;

	bool async = (flags & PGO_SYNCIO) == 0;
	bool cleaning = (flags & PGO_CLEANIT) != 0;

	if (cleaning) {
		ASSERT((offlo & PAGE_MASK) == 0 && (offhi & PAGE_MASK) == 0);
		ASSERT(offlo < offhi || offhi == 0);
		if (offhi == 0)
			len = UINT64_MAX;
		else
			len = offhi - offlo;
		rw_exit(vp->v_uobj.vmobjlock);
		if (curlwp == uvm.pagedaemon_lwp) {
			error = fstrans_start_nowait(vp->v_mount);
			if (error)
				return error;
		} else {
			vfs_t *mp = vp->v_mount;
			fstrans_start(mp);
			if (vp->v_mount != mp) {
				fstrans_done(mp);
				ASSERT(!vn_has_cached_data(vp));
				return 0;
			}
		}
		/*
		 * Cannot use ZFS_ENTER() here, as it returns with an error
		 * if z_unmounted is set.  The next statement is equivalent.
		 */
		rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
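		/*
		 * (ZFS_ENTER() is essentially this rrm_enter() plus a
		 * z_unmounted check that bails out with EIO; only the lock
		 * is wanted here so pages can still be flushed while the
		 * file system is being torn down.)
		 */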

		rl = zfs_range_lock(zp, offlo, len, RL_WRITER);
		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
		tsd_set(zfs_putpage_key, &cleaned);
	}
	error = genfs_putpages(v);
	if (cleaning) {
		tsd_set(zfs_putpage_key, NULL);
		zfs_range_unlock(rl);

		/*
		 * Only zil_commit() if we cleaned something.  This avoids
		 * deadlock if we're called from zfs_netbsd_setsize().
		 */

		if (cleaned &&
		    (!async || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
			zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
		fstrans_done(vp->v_mount);
	}
	return error;
}

/*
 * Restrict the putpages range to the ZFS block containing the offset.
 */
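/*
 * For example (assuming 4 KB pages), with a 128 KB block size and
 * off = 200 KB, rounddown2() yields *lop = 128 KB and *hip = 256 KB,
 * so one pass flushes exactly the surrounding block.
 */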
static void
zfs_netbsd_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
	znode_t *zp = VTOZ(vp);

	*lop = trunc_page(rounddown2(off, zp->z_blksz));
	*hip = round_page(*lop + zp->z_blksz);
}

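/*
 * Note a change of file size: update UVM's notion of the size and, if the
 * new EOF falls inside a cached page, zero that page's tail.
 */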
void
zfs_netbsd_setsize(vnode_t *vp, off_t size)
{
	struct uvm_object *uobj = &vp->v_uobj;
	krwlock_t *rw = uobj->vmobjlock;
	page_t *pg;
	int count, pgoff;
	caddr_t va;
	off_t tsize;

	uvm_vnp_setsize(vp, size);
	if (!vn_has_cached_data(vp))
		return;

	tsize = trunc_page(size);
	if (tsize == size)
		return;

	/*
	 * If there's a partial page, we need to zero the tail.
	 */
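	/*
	 * Otherwise the stale bytes cached beyond the new EOF could leak
	 * back into the file if it is later extended, or be observed
	 * through an existing mapping of that page.
	 */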

	rw_enter(rw, RW_WRITER);
	count = 1;
	pg = NULL;
	if (uvn_findpages(uobj, tsize, &count, &pg, NULL, UFP_NOALLOC)) {
		va = zfs_map_page(pg, S_WRITE);
		pgoff = size - tsize;
		memset(va + pgoff, 0, PAGESIZE - pgoff);
		zfs_unmap_page(pg, va);
		uvm_page_unbusy(&pg, 1);
	}

	rw_exit(rw);
}

static int
zfs_netbsd_print(void *v)
{
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap = v;
	vnode_t *vp;
	znode_t *zp;

	vp = ap->a_vp;
	zp = VTOZ(vp);

	printf("\tino %" PRIu64 " size %" PRIu64 "\n",
	    zp->z_id, zp->z_size);
	return 0;
}

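/*
 * genfs hooks: gop_write writes pages back to the DMU, gop_markupdate
 * stamps times for pages dirtied through mappings, and gop_putrange
 * bounds each write-back pass to a single ZFS block.
 */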
const struct genfs_ops zfs_genfsops = {
	.gop_write = zfs_putapage,
	.gop_markupdate = zfs_netbsd_gop_markupdate,
	.gop_putrange = zfs_netbsd_gop_putrange,
};

int (**zfs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_parsepath_desc, genfs_parsepath },
	{ &vop_lookup_desc, zfs_netbsd_lookup },
	{ &vop_create_desc, zfs_netbsd_create },
	{ &vop_mknod_desc, zfs_netbsd_mknod },
	{ &vop_open_desc, zfs_netbsd_open },
	{ &vop_close_desc, zfs_netbsd_close },
	{ &vop_access_desc, zfs_netbsd_access },
	{ &vop_accessx_desc, genfs_accessx },
	{ &vop_getattr_desc, zfs_netbsd_getattr },
	{ &vop_setattr_desc, zfs_netbsd_setattr },
	{ &vop_read_desc, zfs_netbsd_read },
	{ &vop_write_desc, zfs_netbsd_write },
	{ &vop_ioctl_desc, zfs_netbsd_ioctl },
	{ &vop_poll_desc, genfs_poll },
	{ &vop_kqfilter_desc, genfs_kqfilter },
	{ &vop_revoke_desc, genfs_revoke },
	{ &vop_fsync_desc, zfs_netbsd_fsync },
	{ &vop_remove_desc, zfs_netbsd_remove },
	{ &vop_link_desc, zfs_netbsd_link },
	{ &vop_lock_desc, genfs_lock },
	{ &vop_unlock_desc, genfs_unlock },
	{ &vop_rename_desc, zfs_netbsd_rename },
	{ &vop_mkdir_desc, zfs_netbsd_mkdir },
	{ &vop_rmdir_desc, zfs_netbsd_rmdir },
	{ &vop_symlink_desc, zfs_netbsd_symlink },
	{ &vop_readdir_desc, zfs_netbsd_readdir },
	{ &vop_readlink_desc, zfs_netbsd_readlink },
	{ &vop_inactive_desc, zfs_netbsd_inactive },
	{ &vop_reclaim_desc, zfs_netbsd_reclaim },
	{ &vop_pathconf_desc, zfs_netbsd_pathconf },
	{ &vop_seek_desc, genfs_seek },
	{ &vop_getpages_desc, zfs_netbsd_getpages },
	{ &vop_putpages_desc, zfs_netbsd_putpages },
	{ &vop_mmap_desc, genfs_mmap },
	{ &vop_islocked_desc, genfs_islocked },
	{ &vop_advlock_desc, zfs_netbsd_advlock },
	{ &vop_print_desc, zfs_netbsd_print },
	{ &vop_fcntl_desc, genfs_fcntl },
	{ NULL, NULL }
};

const struct vnodeopv_desc zfs_vnodeop_opv_desc =
	{ &zfs_vnodeop_p, zfs_vnodeop_entries };

int (**zfs_specop_p)(void *);
const struct vnodeopv_entry_desc zfs_specop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	GENFS_SPECOP_ENTRIES,
	{ &vop_close_desc, spec_close },
	{ &vop_access_desc, zfs_netbsd_access },
	{ &vop_accessx_desc, genfs_accessx },
	{ &vop_getattr_desc, zfs_netbsd_getattr },
	{ &vop_setattr_desc, zfs_netbsd_setattr },
	{ &vop_read_desc, zfs_netbsd_read },
	{ &vop_write_desc, zfs_netbsd_write },
	{ &vop_fsync_desc, zfs_spec_fsync },
	{ &vop_lock_desc, genfs_lock },
	{ &vop_unlock_desc, genfs_unlock },
	{ &vop_inactive_desc, zfs_netbsd_inactive },
	{ &vop_reclaim_desc, zfs_netbsd_reclaim },
	{ &vop_islocked_desc, genfs_islocked },
	{ &vop_bwrite_desc, vn_bwrite },
	{ &vop_print_desc, zfs_netbsd_print },
	{ &vop_fcntl_desc, genfs_fcntl },
	{ NULL, NULL }
};

const struct vnodeopv_desc zfs_specop_opv_desc =
	{ &zfs_specop_p, zfs_specop_entries };

int (**zfs_fifoop_p)(void *);
const struct vnodeopv_entry_desc zfs_fifoop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	GENFS_FIFOOP_ENTRIES,
	{ &vop_close_desc, vn_fifo_bypass },
	{ &vop_access_desc, zfs_netbsd_access },
	{ &vop_accessx_desc, genfs_accessx },
	{ &vop_getattr_desc, zfs_netbsd_getattr },
	{ &vop_setattr_desc, zfs_netbsd_setattr },
	{ &vop_read_desc, zfs_netbsd_read },
	{ &vop_write_desc, zfs_netbsd_write },
	{ &vop_fsync_desc, zfs_netbsd_fsync },
	{ &vop_lock_desc, genfs_lock },
	{ &vop_unlock_desc, genfs_unlock },
	{ &vop_inactive_desc, zfs_netbsd_inactive },
	{ &vop_reclaim_desc, zfs_netbsd_reclaim },
	{ &vop_islocked_desc, genfs_islocked },
	{ &vop_bwrite_desc, vn_bwrite },
	{ &vop_strategy_desc, vn_fifo_bypass },
	{ &vop_print_desc, zfs_netbsd_print },
	{ &vop_fcntl_desc, genfs_fcntl },
	{ NULL, NULL }
};

const struct vnodeopv_desc zfs_fifoop_opv_desc =
	{ &zfs_fifoop_p, zfs_fifoop_entries };

#endif /* __NetBSD__ */