/*	$NetBSD: union_subr.c,v 1.51 2011/10/18 09:22:53 hannken Exp $	*/

/*
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 */

/*
 * Copyright (c) 1994 Jan-Simon Pendry
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.51 2011/10/18 09:22:53 hannken Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>

#include <uvm/uvm_extern.h>

#include <fs/union/union.h>
#include <miscfs/specfs/specdev.h>

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((unsigned long) (u)) + ((unsigned long) l)) >> 8) & (NHASH-1))
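
/*
 * Editorial sketch (not compiled): with NHASH == 32 the macro above
 * reduces, for two vnode addresses u and l, to
 *
 *	((((unsigned long)u + (unsigned long)l) >> 8) & 31)
 *
 * i.e. the two addresses are summed, the low-order alignment bits are
 * shifted away, and the mask picks one of the 32 chain heads.  The
 * mask form is why NHASH must stay a power of two.
 */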

static LIST_HEAD(unhead, union_node) unhead[NHASH];
static kmutex_t unheadlock[NHASH];

void union_updatevp(struct union_node *, struct vnode *, struct vnode *);
static int union_do_lookup(struct vnode *, struct componentname *,
    kauth_cred_t, const char *, u_long);
int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
static void union_dircache_r(struct vnode *, struct vnode ***, int *);
struct vnode *union_dircache(struct vnode *, struct lwp *);

void
union_init(void)
{
	int i;

	for (i = 0; i < NHASH; i++) {
		LIST_INIT(&unhead[i]);
		mutex_init(&unheadlock[i], MUTEX_DEFAULT, IPL_NONE);
	}
}

/*
 * Free global unionfs resources.
 */
void
union_done(void)
{
	int i;

	for (i = 0; i < NHASH; i++)
		mutex_destroy(&unheadlock[i]);

	/* Make sure to unset the readdir hook. */
	vn_union_readdir_hook = NULL;
}

void
union_updatevp(struct union_node *un, struct vnode *uppervp,
	struct vnode *lowervp)
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash)
		mutex_enter(&unheadlock[lhash]);

	mutex_enter(&unheadlock[uhash]);

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		mutex_exit(&unheadlock[ohash]);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
			if (un->un_dirvp) {
				vrele(un->un_dirvp);
				un->un_dirvp = NULLVP;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);

		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
		/* Update union vnode interlock. */
		if (uppervp != NULL) {
			mutex_obj_hold(uppervp->v_interlock);
			uvm_obj_setlock(&UNIONTOV(un)->v_uobj,
			    uppervp->v_interlock);
		}
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	mutex_exit(&unheadlock[nhash]);
}

void
union_newlower(struct union_node *un, struct vnode *lowervp)
{

	union_updatevp(un, un->un_uppervp, lowervp);
}

void
union_newupper(struct union_node *un, struct vnode *uppervp)
{

	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then call back to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG) {
		uvm_vnp_setsize(vp, 0);
		return;
	}

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
		printf("union: %s size now %qd\n",
		    uppersz != VNOVAL ? "upper" : "lower", sz);
#endif
		uvm_vnp_setsize(vp, sz);
	}
}
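
/*
 * Usage sketch (editorial, not compiled): a caller that has just
 * fetched attributes from one layer is expected to feed the fresh
 * size back through union_newsize(), passing VNOVAL for the layer
 * it did not consult.  For example, after a getattr of the upper
 * vnode (the helper name is hypothetical):
 */
#if 0
static void
example_sync_upper_size(struct vnode *unionvp, struct vnode *uppervp,
	kauth_cred_t cred)
{
	struct vattr va;

	if (VOP_GETATTR(uppervp, &va, cred) == 0)
		union_newsize(unionvp, va.va_size, VNOVAL);
}
#endif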

/*
 * allocate a union_node/vnode pair.  the vnode is
 * referenced and locked.  the new vnode is returned
 * via (vpp).  (mp) is the mountpoint of the union filesystem,
 * (dvp) is the parent directory where the upper layer object
 * should exist (but doesn't) and (cnp) is the componentname
 * information which is partially copied to allow the upper
 * layer object to be created at a later time.  (uppervp)
 * and (lowervp) reference the upper and lower layer objects
 * being mapped.  either, but not both, can be nil.
 * if supplied, (uppervp) is locked.
 * the references are either maintained in the newly allocated
 * union_node object, or they are vrele'd.
 *
 * all union_nodes are maintained on hash chains.
 * new nodes are only allocated when they cannot
 * be found on these chains.  entries on the chains are
 * removed when the vfs reclaim entry is called.
 *
 * a lock is kept for each hash chain.  this is
 * needed because the getnewvnode() function can block
 * waiting for a vnode to become free, in which case there
 * may be more than one lwp trying to get the same
 * vnode.  the chain lock cannot be held across the call to
 * getnewvnode, so the chain is re-checked afterwards and the
 * fresh vnode is discarded if another lwp won the race.
 *
 * if an entry is found on the list, then call vget() to
 * take a reference.  this is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */
int
union_allocvp(
	struct vnode **vpp,
	struct mount *mp,
	struct vnode *undvp,		/* parent union vnode */
	struct vnode *dvp,		/* may be null */
	struct componentname *cnp,	/* may be null */
	struct vnode *uppervp,		/* may be null */
	struct vnode *lowervp,		/* may be null */
	int docache)
{
	int error;
	struct vattr va;
	struct union_node *un = NULL, *un1;
	struct vnode *vp, *xlowervp = NULLVP;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	voff_t uppersz, lowersz;
	dev_t rdev;
	int hash = 0;
	int vflag, iflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		xlowervp = lowervp;
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	iflag = VI_LAYER;
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				vref(lowervp);
		}
		iflag = 0;
		vflag = VV_ROOT;
	}

loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		mutex_enter(&unheadlock[hash]);

		for (un = unhead[hash].lh_first; un != 0;
					un = un->un_cache.le_next) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				vp = UNIONTOV(un);
				mutex_enter(vp->v_interlock);
				if (vget(vp, 0)) {
					mutex_exit(&unheadlock[hash]);
					goto loop;
				}
				break;
			}
		}

		mutex_exit(&unheadlock[hash]);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.
		 * uppervp is locked, though un->un_uppervp
		 * may not be.  this doesn't break the locking
		 * hierarchy since in the case that un->un_uppervp
		 * is not yet locked it will be vrele'd and replaced
		 * with uppervp.
		 */

		if ((dvp != NULLVP) && (uppervp == dvp)) {
			/*
			 * Access ``.'', so (un) will already
			 * be locked.  Since this process has
			 * the lock on (uppervp) no other
			 * process can hold the lock on (un).
			 */
			KASSERT((un->un_flags & UN_LOCKED) != 0);
			KASSERT(curlwp == NULL || un->un_lwp == NULL ||
			    un->un_lwp == curlwp);
		} else {
			if (un->un_flags & UN_LOCKED) {
				vrele(UNIONTOV(un));
				un->un_flags |= UN_WANTED;
				(void) tsleep(&un->un_flags, PINOD,
				    "unionalloc", 0);
				goto loop;
			}
			un->un_flags |= UN_LOCKED;

			un->un_lwp = curlwp;
		}

		/*
		 * At this point, the union_node is locked,
		 * un->un_uppervp may not be locked, and uppervp
		 * is locked or nil.
		 */

		/*
		 * Save information about the upper layer.
		 */
		if (uppervp != un->un_uppervp) {
			union_newupper(un, uppervp);
		} else if (uppervp) {
			vrele(uppervp);
		}

		if (un->un_uppervp) {
			un->un_flags |= UN_ULOCK;
			un->un_flags &= ~UN_KLOCK;
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_hash = cnp->cn_hash;
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				memcpy(un->un_path, cnp->cn_nameptr,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
				vref(dvp);
				un->un_dirvp = dvp;
			}
		} else if (lowervp) {
			vrele(lowervp);
		}
		*vpp = UNIONTOV(un);
		return (0);
	}

	uppersz = lowersz = VNOVAL;
	if (uppervp != NULLVP)
		if (VOP_GETATTR(uppervp, &va, FSCRED) == 0)
			uppersz = va.va_size;
	if (lowervp != NULLVP) {
		vn_lock(lowervp, LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(lowervp, &va, FSCRED);
		VOP_UNLOCK(lowervp);
		if (error == 0)
			lowersz = va.va_size;
	}
	hash = UNION_HASH(uppervp, lowervp);

	/*
	 * Get a new vnode and share the lock with upper layer vnode,
	 * unless layers are inverted.
	 */
	vnode_t *svp = (uppervp != NULLVP) ? uppervp : lowervp;
	error = getnewvnode(VT_UNION, mp, union_vnodeop_p,
	    svp->v_interlock, vpp);
	if (error) {
		if (uppervp) {
			if (dvp == uppervp)
				vrele(uppervp);
			else
				vput(uppervp);
		}
		if (lowervp)
			vrele(lowervp);

		goto out;
	}

	if (docache) {
		mutex_enter(&unheadlock[hash]);
		LIST_FOREACH(un1, &unhead[hash], un_cache) {
			if (un1->un_lowervp == lowervp &&
			    un1->un_uppervp == uppervp &&
			    UNIONTOV(un1)->v_mount == mp) {
				/*
				 * Another thread beat us, push back freshly
				 * allocated vnode and retry.
				 */
				mutex_exit(&unheadlock[hash]);
				ungetnewvnode(*vpp);
				goto loop;
			}
		}
	}

	(*vpp)->v_data = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);

	(*vpp)->v_vflag |= vflag;
	(*vpp)->v_iflag |= iflag;
	rdev = NODEV;
	if (uppervp) {
		(*vpp)->v_type = uppervp->v_type;
		if (uppervp->v_type == VCHR || uppervp->v_type == VBLK)
			rdev = uppervp->v_rdev;
	} else {
		(*vpp)->v_type = lowervp->v_type;
		if (lowervp->v_type == VCHR || lowervp->v_type == VBLK)
			rdev = lowervp->v_rdev;
	}
	if (rdev != NODEV)
		spec_node_init(*vpp, rdev);

	un = VTOUNION(*vpp);
	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_lowervp = lowervp;
	un->un_pvp = undvp;
	if (undvp != NULLVP)
		vref(undvp);
	un->un_dircache = 0;
	un->un_openl = 0;
	un->un_flags = UN_LOCKED;

	un->un_uppersz = VNOVAL;
	un->un_lowersz = VNOVAL;
	union_newsize(*vpp, uppersz, lowersz);

	if (un->un_uppervp)
		un->un_flags |= UN_ULOCK;
	un->un_lwp = curlwp;
	if (dvp && cnp && (lowervp != NULLVP)) {
		un->un_hash = cnp->cn_hash;
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
		vref(dvp);
		un->un_dirvp = dvp;
	} else {
		un->un_hash = 0;
		un->un_path = 0;
		un->un_dirvp = 0;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	if (xlowervp)
		vrele(xlowervp);

out:
	if (docache)
		mutex_exit(&unheadlock[hash]);

	return (error);
}
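
/*
 * Usage sketch (editorial, not compiled): a lookup-style caller that
 * resolved a name in both layers wraps the pair in a union vnode.
 * The layer references are donated to union_allocvp(), which either
 * revives a cached node or allocates a new one (hypothetical helper):
 */
#if 0
static int
example_wrap_layers(struct mount *mp, struct vnode *undvp, struct vnode *dvp,
	struct componentname *cnp, struct vnode *uppervp,
	struct vnode *lowervp, struct vnode **vpp)
{

	/* uppervp locked or NULLVP; lowervp referenced or NULLVP */
	return union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp, 1);
}
#endif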

int
union_freevp(struct vnode *vp)
{
	int hash;
	struct union_node *un = VTOUNION(vp);

	hash = UNION_HASH(un->un_uppervp, un->un_lowervp);

	mutex_enter(&unheadlock[hash]);
	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
	mutex_exit(&unheadlock[hash]);

	if (un->un_pvp != NULLVP)
		vrele(un->un_pvp);
	if (un->un_uppervp != NULLVP)
		vrele(un->un_uppervp);
	if (un->un_lowervp != NULLVP)
		vrele(un->un_lowervp);
	if (un->un_dirvp != NULLVP)
		vrele(un->un_dirvp);
	if (un->un_path)
		free(un->un_path, M_TEMP);

	free(vp->v_data, M_TEMP);
	vp->v_data = NULL;

	return (0);
}

/*
 * copyfile.  copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  both (fvp)
 * and (tvp) are locked on entry and exit.
 */
int
union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
	struct lwp *l)
{
	char *tbuf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * allocate a buffer of size MAXBSIZE.
	 * loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * give up at the first sign of trouble.
	 */

	uio.uio_offset = 0;
	UIO_SETUP_SYSSPACE(&uio);

	VOP_UNLOCK(fvp);			/* XXX */
	vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
	VOP_UNLOCK(tvp);			/* XXX */
	vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */

	tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = tbuf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;
		error = VOP_READ(fvp, &uio, 0, cred);

		if (error == 0) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = tbuf;
			iov.iov_len = MAXBSIZE - uio.uio_resid;
			uio.uio_offset = offset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if (uio.uio_resid == 0)
				break;

			do {
				error = VOP_WRITE(tvp, &uio, 0, cred);
			} while ((uio.uio_resid > 0) && (error == 0));
		}

	} while (error == 0);

	free(tbuf, M_TEMP);
	return (error);
}

/*
 * (un) is assumed to be locked on entry and remains
 * locked on exit.
 */
int
union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
	struct lwp *l)
{
	int error;
	struct vnode *lvp, *uvp;
	struct vattr lvattr, uvattr;

	error = union_vn_create(&uvp, un, l);
	if (error)
		return (error);

	/* at this point, uppervp is locked */
	union_newupper(un, uvp);
	un->un_flags |= UN_ULOCK;

	lvp = un->un_lowervp;

	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);

		error = VOP_GETATTR(lvp, &lvattr, cred);
		if (error == 0)
			error = VOP_OPEN(lvp, FREAD, cred);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, l);
			(void) VOP_CLOSE(lvp, FREAD, cred);
		}
		if (error == 0) {
			/* Copy permissions up too */
			vattr_null(&uvattr);
			uvattr.va_mode = lvattr.va_mode;
			uvattr.va_flags = lvattr.va_flags;
			error = VOP_SETATTR(uvp, &uvattr, cred);
		}
		VOP_UNLOCK(lvp);
#ifdef UNION_DIAGNOSTIC
		if (error == 0)
			uprintf("union: copied up %s\n", un->un_path);
#endif

	}
	union_vn_close(uvp, FWRITE, cred, l);

	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred);
			(void) VOP_OPEN(uvp, FREAD, cred);
		}
		un->un_openl = 0;
		VOP_UNLOCK(lvp);
	}

	return (error);
}
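
/*
 * Usage sketch (editorial, not compiled): operations that are about
 * to modify a regular file which so far exists only in the lower
 * layer promote it first, roughly like this (hypothetical helper):
 */
#if 0
static int
example_promote_for_write(struct union_node *un, kauth_cred_t cred,
	struct lwp *l)
{

	if (un->un_uppervp == NULLVP && un->un_lowervp != NULLVP &&
	    un->un_lowervp->v_type == VREG)
		return union_copyup(un, 1 /* docopy */, cred, l);
	return 0;
}
#endif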

/*
 * Prepare the creation of a new node in the upper layer.
 *
 * (dvp) is the directory in which to create the new node.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 * (cred, path, hash) are credentials, path and its hash to fill (cnp).
 */
static int
union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
    const char *path, u_long hash)
{
	int error;
	const char *cp;
	struct vnode *vp;

	cnp->cn_nameiop = CREATE;
	cnp->cn_flags = LOCKPARENT | ISLASTCN;
	cnp->cn_cred = cred;
	cnp->cn_nameptr = path;
	cnp->cn_namelen = strlen(path);
	if (hash == 0) {
		cp = NULL;
		cnp->cn_hash = namei_hash(cnp->cn_nameptr, &cp);
		KASSERT(*cp == 0);
	} else {
		cnp->cn_hash = hash;
	}

	error = VOP_LOOKUP(dvp, &vp, cnp);

	if (error == 0) {
		KASSERT(vp != NULL);
		VOP_ABORTOP(dvp, cnp);
		if (dvp != vp)
			vput(vp);
		else
			vrele(vp);
		error = EEXIST;
	} else if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * it is unlocked on entry and exit.
 * (cnp) is the componentname to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked.
 *
 * N.B. We still attempt to create shadow directories even if the union
 * is mounted read-only, which is a little nonintuitive.
 */
int
union_mkshadow(struct union_mount *um, struct vnode *dvp,
	struct componentname *cnp, struct vnode **vpp)
{
	int error;
	struct vattr va;
	struct componentname cn;
	char *pnbuf;

	if (cnp->cn_namelen + 1 > MAXPATHLEN)
		return ENAMETOOLONG;
	pnbuf = PNBUF_GET();
	memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
	pnbuf[cnp->cn_namelen] = '\0';

	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);

	error = union_do_lookup(dvp, &cn,
	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf, 0);
	if (error) {
		VOP_UNLOCK(dvp);
		PNBUF_PUT(pnbuf);
		return error;
	}

	/*
	 * policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	vattr_null(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	vref(dvp);
	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	PNBUF_PUT(pnbuf);
	return error;
}
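
/*
 * Usage sketch (editorial, not compiled): when a directory visible in
 * the lower layer has no upper counterpart yet, lookup code can
 * materialise one before files beneath it are copied up.  Note the
 * contract above: (dvp) is handed over unlocked (hypothetical helper):
 */
#if 0
static int
example_make_shadow(struct union_mount *um, struct vnode *upperdvp,
	struct componentname *cnp, struct vnode **vpp)
{

	/* upperdvp unlocked here; *vpp comes back locked on success */
	return union_mkshadow(um, upperdvp, cnp, vpp);
}
#endif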

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 * (un) holds the path and its hash to be created.
 */
int
union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
	struct componentname *cnp, struct union_node *un)
{
	int error;
	struct componentname cn;

	error = union_do_lookup(dvp, &cn,
	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
	    un->un_path, un->un_hash);
	if (error)
		return error;

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	return error;
}

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  this function is similar
 * in spirit to calling vn_open but it avoids calling namei().
 * the problem with calling namei is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas union_do_lookup is told where to start.
 */
int
union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
{
	struct vnode *vp;
	kauth_cred_t cred = l->l_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
	struct componentname cn;

	*vpp = NULLVP;

	vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);

	error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
	    un->un_path, un->un_hash);
	if (error) {
		VOP_UNLOCK(un->un_dirvp);
		return error;
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	vattr_null(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	vref(un->un_dirvp);
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (error)
		return error;

	error = VOP_OPEN(vp, fmode, cred);
	if (error) {
		vput(vp);
		return error;
	}

	vp->v_writecount++;
	*vpp = vp;
	return 0;
}

int
union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred));
}

void
union_removed_upper(struct union_node *un)
{
	int hash;

#if 1
	/*
	 * We do not set the uppervp to NULLVP here, because lowervp
	 * may also be NULLVP, so this routine would end up creating
	 * a bogus union node with no upper or lower VP (that causes
	 * pain in many places that assume at least one VP exists).
	 * Since we've removed this node from the cache hash chains,
	 * it won't be found again.  When all current holders
	 * release it, union_inactive() will vgone() it.
	 */
	union_diruncache(un);
#else
	union_newupper(un, NULLVP);
#endif

	hash = UNION_HASH(un->un_uppervp, un->un_lowervp);

	mutex_enter(&unheadlock[hash]);
	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
	mutex_exit(&unheadlock[hash]);

	if (un->un_flags & UN_ULOCK) {
		un->un_flags &= ~UN_ULOCK;
		VOP_UNLOCK(un->un_uppervp);
	}
}

#if 0
struct vnode *
union_lowervp(struct vnode *vp)
{
	struct union_node *un = VTOUNION(vp);

	if ((un->un_lowervp != NULLVP) &&
	    (vp->v_type == un->un_lowervp->v_type)) {
		if (vget(un->un_lowervp, 0) == 0)
			return (un->un_lowervp);
	}

	return (NULLVP);
}
#endif

/*
 * determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(struct union_node *un, kauth_cred_t cred)
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}
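
/*
 * Usage sketch (editorial, not compiled): remove and rmdir paths
 * consult union_dowhiteout() and, when it answers yes, request that
 * the upper layer leave a whiteout behind so the lower object stays
 * hidden (hypothetical helper):
 */
#if 0
static void
example_flag_whiteout(struct union_node *un, struct componentname *cnp)
{

	if (union_dowhiteout(un, cnp->cn_cred))
		cnp->cn_flags |= DOWHITEOUT;
}
#endif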

static void
union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			vref(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}

struct vnode *
union_dircache(struct vnode *vp, struct lwp *l)
{
	int cnt;
	struct vnode *nvp = NULLVP;
	struct vnode **vpp;
	struct vnode **dircache;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == 0) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = (struct vnode **)
				malloc(cnt * sizeof(struct vnode *),
					M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		VTOUNION(vp)->un_dircache = dircache;
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	vref(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, *vpp, NULLVP, 0);
	if (!error) {
		VTOUNION(vp)->un_dircache = 0;
		VTOUNION(nvp)->un_dircache = dircache;
	}

out:
	VOP_UNLOCK(vp);
	return (nvp);
}

void
union_diruncache(struct union_node *un)
{
	struct vnode **vpp;

	if (un->un_dircache != 0) {
		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(un->un_dircache, M_TEMP);
		un->un_dircache = 0;
	}
}

/*
 * Check whether the node may be rmdir'ed: unless the upper directory
 * is opaque (which hides the lower layer entirely), the lower
 * directory must be empty apart from whiteouts and whited-out entries.
 */
int
union_check_rmdir(struct union_node *un, kauth_cred_t cred)
{
	int dirlen, eofflag, error;
	char *dirbuf;
	struct vattr va;
	struct vnode *tvp;
	struct dirent *dp, *edp;
	struct componentname cn;
	struct iovec aiov;
	struct uio auio;

	KASSERT(un->un_uppervp != NULL);

	/* Check upper for being opaque. */
	KASSERT(VOP_ISLOCKED(un->un_uppervp));
	error = VOP_GETATTR(un->un_uppervp, &va, cred);
	if (error || (va.va_flags & OPAQUE))
		return error;

	if (un->un_lowervp == NULL)
		return 0;

	/* Check lower for being empty. */
	vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
	error = VOP_GETATTR(un->un_lowervp, &va, cred);
	if (error) {
		VOP_UNLOCK(un->un_lowervp);
		return error;
	}
	dirlen = va.va_blocksize;
	dirbuf = kmem_alloc(dirlen, KM_SLEEP);
	if (dirbuf == NULL) {
		VOP_UNLOCK(un->un_lowervp);
		return ENOMEM;
	}
	/* error = 0; */
	eofflag = 0;
	auio.uio_offset = 0;
	do {
		aiov.iov_len = dirlen;
		aiov.iov_base = dirbuf;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = aiov.iov_len;
		auio.uio_rw = UIO_READ;
		UIO_SETUP_SYSSPACE(&auio);
		error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
		    NULL, NULL);
		if (error)
			break;
		edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
		for (dp = (struct dirent *)dirbuf;
		    error == 0 && dp < edp;
		    dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
			if (dp->d_reclen == 0) {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_type == DT_WHT ||
			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
				continue;
			/* Check for presence in the upper layer. */
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | RDONLY;
			cn.cn_cred = cred;
			cn.cn_nameptr = dp->d_name;
			cn.cn_namelen = dp->d_namlen;
			cn.cn_hash = 0;
			error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
			if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
				error = 0;
				continue;
			}
			if (error == 0)
				vput(tvp);
			error = ENOTEMPTY;
		}
	} while (error == 0 && !eofflag);
	kmem_free(dirbuf, dirlen);
	VOP_UNLOCK(un->un_lowervp);

	return error;
}

/*
 * This hook is called from vn_readdir() to switch to the lower
 * directory after the upper directory has been read.
 */
int
union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
{
	struct vnode *vp = *vpp, *lvp;
	struct vattr va;
	int error;

	if (vp->v_op != union_vnodeop_p)
		return (0);

	/*
	 * If the directory is opaque,
	 * then don't show lower entries
	 */
	error = VOP_GETATTR(vp, &va, fp->f_cred);
	if (error || (va.va_flags & OPAQUE))
		return error;

	if ((lvp = union_dircache(vp, l)) == NULLVP)
		return (0);

	error = VOP_OPEN(lvp, FREAD, fp->f_cred);
	if (error) {
		vput(lvp);
		return (error);
	}
	VOP_UNLOCK(lvp);
	fp->f_data = lvp;
	fp->f_offset = 0;
	error = vn_close(vp, FREAD, fp->f_cred);
	if (error)
		return (error);
	*vpp = lvp;
	return (0);
}
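
/*
 * Registration sketch (editorial, not compiled): the hook above only
 * fires while vn_union_readdir_hook points at it; mount code is
 * expected to set the pointer when a union mount appears, and
 * union_done() clears it again:
 */
#if 0
static void
example_enable_readdir_hook(void)
{

	vn_union_readdir_hook = union_readdirhook;
}
#endif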
1245