/*	$NetBSD: union_subr.c,v 1.2 2003/03/17 09:11:30 jdolecek Exp $	*/

/*
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.2 2003/03/17 09:11:30 jdolecek Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/stat.h>

#include <uvm/uvm_extern.h>

#include <fs/union/union.h>

#ifdef DIAGNOSTIC
#include <sys/proc.h>
#endif

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32
/* hash the pair of vnode pointers into an unsigned bucket index */
#define UNION_HASH(u, l) \
	(((((unsigned long) (u)) + ((unsigned long) (l))) >> 8) & (NHASH-1))
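
/*
 * Example (illustrative only): with NHASH == 32, hypothetical vnode
 * addresses u == 0xc1a2e400 and l == 0xc1a2e600 hash to
 *
 *	((0xc1a2e400 + 0xc1a2e600) >> 8) & 31 == 10
 *
 * The >> 8 discards the low-order bits, which vary little between
 * kernel-allocated vnodes.
 */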

static LIST_HEAD(unhead, union_node) unhead[NHASH];
static int unvplock[NHASH];

static int union_list_lock __P((int));
static void union_list_unlock __P((int));
void union_updatevp __P((struct union_node *, struct vnode *, struct vnode *));
static int union_relookup __P((struct union_mount *, struct vnode *,
			       struct vnode **, struct componentname *,
			       struct componentname *, const char *, int));
int union_vn_close __P((struct vnode *, int, struct ucred *, struct proc *));
static void union_dircache_r __P((struct vnode *, struct vnode ***, int *));
struct vnode *union_dircache __P((struct vnode *, struct proc *));

void
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	memset((caddr_t) unvplock, 0, sizeof(unvplock));
}

/*
 * Free global unionfs resources.
 */
void
union_done()
{

	/* Make sure to unset the readdir hook. */
	vn_union_readdir_hook = NULL;
}

static int
union_list_lock(ix)
	int ix;
{

	if (unvplock[ix] & UN_LOCKED) {
		unvplock[ix] |= UN_WANTED;
		(void) tsleep(&unvplock[ix], PINOD, "unionlk", 0);
		return (1);
	}

	unvplock[ix] |= UN_LOCKED;

	return (0);
}

static void
union_list_unlock(ix)
	int ix;
{

	unvplock[ix] &= ~UN_LOCKED;

	if (unvplock[ix] & UN_WANTED) {
		unvplock[ix] &= ~UN_WANTED;
		wakeup((caddr_t) &unvplock[ix]);
	}
}
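
/*
 * Caller-side sketch of the locking protocol above: a non-zero return
 * from union_list_lock() means the caller slept, so the lock must be
 * retried (and any state re-examined).  The routines below all use
 * this idiom:
 *
 *	while (union_list_lock(hash))
 *		continue;
 *	... examine or modify unhead[hash] ...
 *	union_list_unlock(hash);
 */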

void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash)
		while (union_list_lock(lhash))
			continue;

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
			if (un->un_dirvp) {
				vrele(un->un_dirvp);
				un->un_dirvp = NULLVP;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);

		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{

	union_updatevp(un, un->un_uppervp, lowervp);
}

void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{

	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, call back to the VM layer,
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
		printf("union: %s size now %qd\n",
		    uppersz != VNOVAL ? "upper" : "lower", sz);
#endif
		uvm_vnp_setsize(vp, sz);
	}
}
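
/*
 * Illustrative only: a caller that has just changed the upper layer's
 * size (e.g. in a setattr or write path) would report it as
 *
 *	union_newsize(vp, vap->va_size, VNOVAL);
 *
 * passing VNOVAL for whichever layer's size is unknown or unchanged.
 */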

/*
 * allocate a union_node/vnode pair.  the vnode is
 * referenced and locked.  the new vnode is returned
 * via (vpp).  (mp) is the mountpoint of the union filesystem,
 * (dvp) is the parent directory where the upper layer object
 * should exist (but doesn't) and (cnp) is the componentname
 * information which is partially copied to allow the upper
 * layer object to be created at a later time.  (uppervp)
 * and (lowervp) reference the upper and lower layer objects
 * being mapped.  either, but not both, can be nil.
 * if supplied, (uppervp) is locked.
 * the reference is either maintained in the new union_node
 * object which is allocated, or they are vrele'd.
 *
 * all union_nodes are maintained on a singly-linked
 * list.  new nodes are only allocated when they cannot
 * be found on this list.  entries on the list are
 * removed when the vfs reclaim entry is called.
 *
 * a single lock is kept for the entire list.  this is
 * needed because the getnewvnode() function can block
 * waiting for a vnode to become free, in which case there
 * may be more than one process trying to get the same
 * vnode.  this lock is only taken if we are going to
 * call getnewvnode, since the kernel itself is single-threaded.
 *
 * if an entry is found on the list, then call vget() to
 * take a reference.  this is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */
int
union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *undvp;		/* parent union vnode */
	struct vnode *dvp;		/* may be null */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = NULL;
	struct vnode *xlowervp = NULLVP;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	int hash = 0;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		xlowervp = lowervp;
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = VLAYER;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VROOT;
	}

loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;

		for (un = unhead[hash].lh_first; un != 0;
					un = un->un_cache.le_next) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.
		 * uppervp is locked, though un->un_uppervp
		 * may not be.  this doesn't break the locking
		 * hierarchy since in the case that un->un_uppervp
		 * is not yet locked it will be vrele'd and replaced
		 * with uppervp.
		 */

		if ((dvp != NULLVP) && (uppervp == dvp)) {
			/*
			 * Access ``.'', so (un) will already
			 * be locked.  Since this process has
			 * the lock on (uppervp) no other
			 * process can hold the lock on (un).
			 */
#ifdef DIAGNOSTIC
			if ((un->un_flags & UN_LOCKED) == 0)
				panic("union: . not locked");
			else if (curproc && un->un_pid != curproc->p_pid &&
				    un->un_pid > -1 && curproc->p_pid > -1)
				panic("union: allocvp not lock owner");
#endif
		} else {
			if (un->un_flags & UN_LOCKED) {
				vrele(UNIONTOV(un));
				un->un_flags |= UN_WANTED;
				(void) tsleep(&un->un_flags, PINOD,
				    "unionalloc", 0);
				goto loop;
			}
			un->un_flags |= UN_LOCKED;

#ifdef DIAGNOSTIC
			if (curproc)
				un->un_pid = curproc->p_pid;
			else
				un->un_pid = -1;
#endif
		}

		/*
		 * At this point, the union_node is locked,
		 * un->un_uppervp may not be locked, and uppervp
		 * is locked or nil.
		 */

		/*
		 * Save information about the upper layer.
		 */
		if (uppervp != un->un_uppervp) {
			union_newupper(un, uppervp);
		} else if (uppervp) {
			vrele(uppervp);
		}

		if (un->un_uppervp) {
			un->un_flags |= UN_ULOCK;
			un->un_flags &= ~UN_KLOCK;
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_hash = cnp->cn_hash;
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				memcpy(un->un_path, cnp->cn_nameptr,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
				VREF(dvp);
				un->un_dirvp = dvp;
			}
		} else if (lowervp) {
			vrele(lowervp);
		}
		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * otherwise lock the vp list while we call getnewvnode
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp);
	if (error) {
		if (uppervp) {
			if (dvp == uppervp)
				vrele(uppervp);
			else
				vput(uppervp);
		}
		if (lowervp)
			vrele(lowervp);

		goto out;
	}

	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	(*vpp)->v_flag |= vflag;
	(*vpp)->v_vnlock = NULL;	/* Make upper layers call VOP_LOCK */
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;
	un = VTOUNION(*vpp);
	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_pvp = undvp;
	if (undvp != NULLVP)
		VREF(undvp);
	un->un_dircache = 0;
	un->un_openl = 0;
	un->un_flags = UN_LOCKED;
	if (un->un_uppervp)
		un->un_flags |= UN_ULOCK;
#ifdef DIAGNOSTIC
	if (curproc)
		un->un_pid = curproc->p_pid;
	else
		un->un_pid = -1;
#endif
	if (cnp && (lowervp != NULLVP)) {
		un->un_hash = cnp->cn_hash;
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
		VREF(dvp);
		un->un_dirvp = dvp;
	} else {
		un->un_hash = 0;
		un->un_path = 0;
		un->un_dirvp = 0;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	if (xlowervp)
		vrele(xlowervp);

out:
	if (docache)
		union_list_unlock(hash);

	return (error);
}
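
/*
 * Illustrative only: a lookup-style caller (the hypothetical names
 * upperdvp/uppervp/lowervp stand for whatever the per-layer lookups
 * produced) would map the pair to a union vnode roughly as
 *
 *	error = union_allocvp(&vp, mp, dvp, upperdvp, cnp,
 *	    uppervp, lowervp, 1);
 *
 * The uppervp/lowervp references are consumed: they are either stored
 * in the union_node or vrele'd, as described above.
 */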

int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP)
		vrele(un->un_pvp);
	if (un->un_uppervp != NULLVP)
		vrele(un->un_uppervp);
	if (un->un_lowervp != NULLVP)
		vrele(un->un_lowervp);
	if (un->un_dirvp != NULLVP)
		vrele(un->un_dirvp);
	if (un->un_path)
		free(un->un_path, M_TEMP);

	FREE(vp->v_data, M_TEMP);
	vp->v_data = 0;

	return (0);
}

/*
 * copyfile.  copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  both (fvp)
 * and (tvp) are locked on entry and exit.
 */
int
union_copyfile(fvp, tvp, cred, p)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct proc *p;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * allocate a buffer of size MAXBSIZE.
	 * loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * give up at the first sign of trouble.
	 */

	uio.uio_procp = p;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_UNLOCK(fvp, 0);			/* XXX */
	VOP_LEASE(fvp, p, cred, LEASE_READ);
	vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
	VOP_UNLOCK(tvp, 0);			/* XXX */
	VOP_LEASE(tvp, p, cred, LEASE_WRITE);
	vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;
		error = VOP_READ(fvp, &uio, 0, cred);

		if (error == 0) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf;
			iov.iov_len = MAXBSIZE - uio.uio_resid;
			uio.uio_offset = offset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			/* a zero-length read means EOF on (fvp) */
			if (uio.uio_resid == 0)
				break;

			do {
				error = VOP_WRITE(tvp, &uio, 0, cred);
			} while ((uio.uio_resid > 0) && (error == 0));
		}

	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}

/*
 * (un) is assumed to be locked on entry and remains
 * locked on exit.
 */
int
union_copyup(un, docopy, cred, p)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct proc *p;
{
	int error;
	struct vnode *lvp, *uvp;
	struct vattr lvattr, uvattr;

	error = union_vn_create(&uvp, un, p);
	if (error)
		return (error);

	/* at this point, uppervp is locked */
	union_newupper(un, uvp);
	un->un_flags |= UN_ULOCK;

	lvp = un->un_lowervp;

	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);

		error = VOP_GETATTR(lvp, &lvattr, cred, p);
		if (error == 0)
			error = VOP_OPEN(lvp, FREAD, cred, p);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, p);
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
		}
		if (error == 0) {
			/* Copy permissions up too */
			VATTR_NULL(&uvattr);
			uvattr.va_mode = lvattr.va_mode;
			uvattr.va_flags = lvattr.va_flags;
			error = VOP_SETATTR(uvp, &uvattr, cred, p);
		}
		VOP_UNLOCK(lvp, 0);
#ifdef UNION_DIAGNOSTIC
		if (error == 0)
			uprintf("union: copied up %s\n", un->un_path);
#endif

	}
	union_vn_close(uvp, FWRITE, cred, p);

	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
			(void) VOP_OPEN(uvp, FREAD, cred, p);
		}
		un->un_openl = 0;
		VOP_UNLOCK(lvp, 0);
	}

	return (error);
}
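
/*
 * Illustrative only: an open-for-write path might trigger the copyup
 * with something like
 *
 *	if (un->un_uppervp == NULLVP)
 *		error = union_copyup(un, (mode & FWRITE) != 0, cred, p);
 *
 * Passing docopy == 0 creates the shadow file without copying the
 * lower file's contents.
 */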

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	const char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT, some of the work done
	 * by namei, some of the work done by lookup and some of
	 * the work done by VOP_LOOKUP when given a CREATE flag.
	 * Conclusion: Horrible.
	 *
	 * The pathname buffer will be PNBUF_PUT'd by VOP_MKDIR.
	 */
	cn->cn_namelen = pathlen;
	if ((cn->cn_namelen + 1) > MAXPATHLEN)
		return (ENAMETOOLONG);
	cn->cn_pnbuf = PNBUF_GET();
	memcpy(cn->cn_pnbuf, path, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
	cn->cn_proc = cnp->cn_proc;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_hash = cnp->cn_hash;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	error = relookup(dvp, vpp, cn);
	if (!error)
		vrele(dvp);
	else {
		PNBUF_PUT(cn->cn_pnbuf);
		cn->cn_pnbuf = 0;
	}

	return (error);
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * it is unlocked on entry and exit.
 * (cnp) is the componentname to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked.
 *
 * N.B. We still attempt to create shadow directories even if the union
 * is mounted read-only, which is a little nonintuitive.
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct proc *p = cnp->cn_proc;
	struct componentname cn;

	error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen);
	if (error)
		return (error);

	if (*vpp) {
		VOP_ABORTOP(dvp, &cn);
		VOP_UNLOCK(dvp, 0);
		vrele(*vpp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	return (error);
}
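
/*
 * Illustrative only: lookup code that found a directory in the lower
 * layer but no matching upper directory would create one with
 *
 *	error = union_mkshadow(um, upperdvp, cnp, &uppervp);
 *
 * (upperdvp being the hypothetical upper-layer parent), so that later
 * copyups have somewhere to land.
 */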

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct proc *p = cnp->cn_proc;
	struct vnode *wvp;
	struct componentname cn;

	VOP_UNLOCK(dvp, 0);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
		return (error);
	}

	if (wvp) {
		VOP_ABORTOP(dvp, &cn);
		vrele(dvp);
		vrele(wvp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, p->p_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (error)
		VOP_ABORTOP(dvp, &cn);

	vrele(dvp);

	return (error);
}
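
/*
 * Illustrative only: a remove/rmdir path that must mask a lower-layer
 * object would call this with the saved component name, e.g.
 *
 *	error = union_mkwhiteout(um, upperdvp, cnp, un->un_path);
 *
 * (upperdvp again being the hypothetical upper-layer parent).
 */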

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  this function is similar
 * in spirit to calling vn_open but it avoids calling namei().
 * the problem with calling namei is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas relookup is told where to start.
 */
int
union_vn_create(vpp, un, p)
	struct vnode **vpp;
	struct union_node *un;
	struct proc *p;
{
	struct vnode *vp;
	struct ucred *cred = p->p_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode = UN_FILEMODE & ~p->p_cwdi->cwdi_cmask;
	struct componentname cn;

	*vpp = NULLVP;

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	if ((cn.cn_namelen + 1) > MAXPATHLEN)
		return (ENAMETOOLONG);
	cn.cn_pnbuf = PNBUF_GET();
	memcpy(cn.cn_pnbuf, un->un_path, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
	cn.cn_proc = p;
	cn.cn_cred = p->p_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_hash = un->un_hash;
	cn.cn_consume = 0;

	VREF(un->un_dirvp);
	if ((error = relookup(un->un_dirvp, &vp, &cn)) != 0) {
		PNBUF_PUT(cn.cn_pnbuf);	/* as in union_relookup() above */
		return (error);
	}
	vrele(un->un_dirvp);

	if (vp) {
		VOP_ABORTOP(un->un_dirvp, &cn);
		if (un->un_dirvp == vp)
			vrele(un->un_dirvp);
		else
			vput(un->un_dirvp);
		vrele(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, p, cred, LEASE_WRITE);
	if ((error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap)) != 0)
		return (error);

	if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0) {
		vput(vp);
		return (error);
	}

	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

int
union_vn_close(vp, fmode, cred, p)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct proc *p;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, p));
}

void
union_removed_upper(un)
	struct union_node *un;
{
#if 1
	/*
	 * We do not set the uppervp to NULLVP here, because lowervp
	 * may also be NULLVP, so this routine would end up creating
	 * a bogus union node with no upper or lower VP (that causes
	 * pain in many places that assume at least one VP exists).
	 * Since we've removed this node from the cache hash chains,
	 * it won't be found again.  When all current holders
	 * release it, union_inactive() will vgone() it.
	 */
	union_diruncache(un);
#else
	union_newupper(un, NULLVP);
#endif

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_flags & UN_ULOCK) {
		un->un_flags &= ~UN_ULOCK;
		VOP_UNLOCK(un->un_uppervp, 0);
	}
}

#if 0
struct vnode *
union_lowervp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if ((un->un_lowervp != NULLVP) &&
	    (vp->v_type == un->un_lowervp->v_type)) {
		if (vget(un->un_lowervp, 0) == 0)
			return (un->un_lowervp);
	}

	return (NULLVP);
}
#endif

/*
 * determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, p)
	struct union_node *un;
	struct ucred *cred;
	struct proc *p;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, p) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}
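
/*
 * Illustrative only: remove/rmdir code would use the result to ask
 * the upper filesystem for a whiteout, e.g.
 *
 *	if (union_dowhiteout(un, cnp->cn_cred, p))
 *		cnp->cn_flags |= DOWHITEOUT;
 *
 * before passing the operation down to the upper layer.
 */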

static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}

struct vnode *
union_dircache(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int cnt;
	struct vnode *nvp = NULLVP;
	struct vnode **vpp;
	struct vnode **dircache;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == 0) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = (struct vnode **)
				malloc(cnt * sizeof(struct vnode *),
					M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		VTOUNION(vp)->un_dircache = dircache;
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0,
	    *vpp, NULLVP, 0);
	if (!error) {
		VTOUNION(vp)->un_dircache = 0;
		VTOUNION(nvp)->un_dircache = dircache;
	}

out:
	VOP_UNLOCK(vp, 0);
	return (nvp);
}

void
union_diruncache(un)
	struct union_node *un;
{
	struct vnode **vpp;

	if (un->un_dircache != 0) {
		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(un->un_dircache, M_TEMP);
		un->un_dircache = 0;
	}
}

/*
 * This hook is called from vn_readdir() to switch to the lower
 * directory after the upper directory has been read.
 */
int
union_readdirhook(struct vnode **vpp, struct file *fp, struct proc *p)
{
	struct vnode *vp = *vpp, *lvp;
	struct vattr va;
	int error;

	if (vp->v_op != union_vnodeop_p)
		return (0);

	if ((lvp = union_dircache(vp, p)) == NULLVP)
		return (0);

	/*
	 * If the directory is opaque,
	 * then don't show lower entries
	 */
	error = VOP_GETATTR(vp, &va, fp->f_cred, p);
	if (error || (va.va_flags & OPAQUE)) {
		vput(lvp);
		return (error);
	}

	error = VOP_OPEN(lvp, FREAD, fp->f_cred, p);
	if (error) {
		vput(lvp);
		return (error);
	}
	VOP_UNLOCK(lvp, 0);
	fp->f_data = (caddr_t) lvp;
	fp->f_offset = 0;
	error = vn_close(vp, FREAD, fp->f_cred, p);
	if (error)
		return (error);
	*vpp = lvp;
	return (0);
}
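
/*
 * Illustrative only: vn_readdir() is expected to reach this function
 * through the vn_union_readdir_hook pointer (cleared in union_done()
 * above), roughly as
 *
 *	if (vn_union_readdir_hook != NULL)
 *		error = (*vn_union_readdir_hook)(&vp, fp, p);
 *
 * after which reading continues on the lower directory's vnode.
 */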