xref: /dflybsd-src/sys/vfs/hammer2/hammer2_inode.c (revision a413fe45675e6d132bfaa7a0a089b5a6e670bb07)
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/cdefs.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/lock.h>
40 #include <sys/uuid.h>
41 
42 #include "hammer2.h"
43 
44 #define INODE_DEBUG	0
45 
46 static void hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
47 					 hammer2_cluster_t **cparentp,
48 					 hammer2_cluster_t **clusterp,
49 					 hammer2_tid_t inum);
50 
51 RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
52 	     hammer2_tid_t, inum);
53 
54 int
55 hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
56 {
57 	if (ip1->inum < ip2->inum)
58 		return(-1);
59 	if (ip1->inum > ip2->inum)
60 		return(1);
61 	return(0);
62 }
63 
64 /*
65  * HAMMER2 inode locks
66  *
67  * HAMMER2 offers shared locks and exclusive locks on inodes.
68  *
69  * The standard exclusive inode lock always resolves the inode meta-data,
70  * but there is a bypass version used by the vnode reclamation code that
71  * avoids the I/O.
72  *
73  * The inode locking function locks the inode itself, resolves any stale
74  * chains in the inode's cluster, and allocates a fresh copy of the
75  * cluster with 1 ref and all the underlying chains locked.  Duplication
76  * races are handled by this function.
77  *
78  * ip->cluster will be stable while the inode is locked.
79  *
80  * NOTE: We don't combine the inode/chain lock because putting away an
81  *       inode would otherwise confuse multiple lock holders of the inode.
82  *
83  * NOTE: In-memory inodes always point to hardlink targets (the actual file),
84  *	 and never point to a hardlink pointer.
85  */
86 hammer2_cluster_t *
87 hammer2_inode_lock_ex(hammer2_inode_t *ip)
88 {
89 	return hammer2_inode_lock_nex(ip, HAMMER2_RESOLVE_ALWAYS);
90 }
91 
hammer2_cluster_t *
hammer2_inode_lock_nex(hammer2_inode_t *ip, int how)
{
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Ref and exclusively lock the inode, then make a private
	 * structural copy of its cluster (COPY_NOCHAINS: no chain
	 * pointers are carried over yet).
	 */
	hammer2_inode_ref(ip);
	hammer2_mtx_ex(&ip->lock);
	cluster = hammer2_cluster_copy(&ip->cluster,
				       HAMMER2_CLUSTER_COPY_NOCHAINS);

	/* Recompute both focus pointers from the chains locked below */
	ip->cluster.focus = NULL;
	cluster->focus = NULL;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = ip->cluster.array[i].chain;
		if (chain == NULL) {
			/* degraded cluster element, skip it */
			kprintf("inode_lock: %p: missing chain\n", ip);
			continue;
		}

		/*
		 * Lock each chain with the caller-supplied resolve mode
		 * and install it in the returned cluster.  The first
		 * usable chain becomes the focus of both clusters.
		 */
		hammer2_chain_lock(chain, how);
		cluster->array[i].chain = chain;
		if (cluster->focus == NULL)
			cluster->focus = chain;
		if (ip->cluster.focus == NULL)
			ip->cluster.focus = chain;
	}

	/*
	 * Returned cluster must resolve hardlink pointers
	 */
	if ((how & HAMMER2_RESOLVE_MASK) == HAMMER2_RESOLVE_ALWAYS) {
		const hammer2_inode_data_t *ripdata;
		ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
		KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
		/*
		if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK &&
		    (cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0) {
			error = hammer2_hardlink_find(ip->pip, NULL, cluster);
			KKASSERT(error == 0);
		}
		*/
	}
	return (cluster);
}
139 
140 void
141 hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
142 {
143 	if (cluster)
144 		hammer2_cluster_unlock(cluster);
145 	hammer2_mtx_unlock(&ip->lock);
146 	hammer2_inode_drop(ip);
147 }
148 
149 /*
150  * Standard shared inode lock always resolves the inode meta-data.
151  *
152  * NOTE: We don't combine the inode/chain lock because putting away an
153  *       inode would otherwise confuse multiple lock holders of the inode.
154  *
155  *	 Shared locks are especially sensitive to having too many shared
156  *	 lock counts (from the same thread) on certain paths which might
157  *	 need to upgrade them.  Only one count of a shared lock can be
158  *	 upgraded.
159  */
160 hammer2_cluster_t *
161 hammer2_inode_lock_sh(hammer2_inode_t *ip)
162 {
163 	const hammer2_inode_data_t *ripdata;
164 	hammer2_cluster_t *cluster;
165 	hammer2_chain_t *chain;
166 	int i;
167 
168 	hammer2_inode_ref(ip);
169 	cluster = hammer2_cluster_copy(&ip->cluster,
170 				       HAMMER2_CLUSTER_COPY_NOCHAINS);
171 	hammer2_mtx_sh(&ip->lock);
172 
173 	cluster->focus = NULL;
174 
175 	for (i = 0; i < cluster->nchains; ++i) {
176 		chain = ip->cluster.array[i].chain;
177 
178 		if (chain == NULL) {
179 			kprintf("inode_lock: %p: missing chain\n", ip);
180 			continue;
181 		}
182 
183 		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
184 					  HAMMER2_RESOLVE_SHARED);
185 		cluster->array[i].chain = chain;
186 		if (cluster->focus == NULL)
187 			cluster->focus = chain;
188 	}
189 
190 	/*
191 	 * Returned cluster must resolve hardlink pointers
192 	 */
193 	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
194 	KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
195 	/*
196 	if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK &&
197 	    (cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0) {
198 		error = hammer2_hardlink_find(ip->pip, NULL, cluster);
199 		KKASSERT(error == 0);
200 	}
201 	*/
202 
203 	return (cluster);
204 }
205 
206 void
207 hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
208 {
209 	if (cluster)
210 		hammer2_cluster_unlock(cluster);
211 	hammer2_mtx_unlock(&ip->lock);
212 	hammer2_inode_drop(ip);
213 }
214 
215 /*
216  * Temporarily release a lock held shared or exclusive.  Caller must
217  * hold the lock shared or exclusive on call and lock will be released
218  * on return.
219  *
220  * Restore a lock that was temporarily released.
221  */
222 hammer2_mtx_state_t
223 hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
224 {
225 	return hammer2_mtx_temp_release(&ip->lock);
226 }
227 
void
hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate)
{
	/* Reacquire the inode lock in the state saved by temp_release() */
	hammer2_mtx_temp_restore(&ip->lock, ostate);
}
233 
234 /*
235  * Upgrade a shared inode lock to exclusive and return.  If the inode lock
236  * is already held exclusively this is a NOP.
237  *
238  * The caller MUST hold the inode lock either shared or exclusive on call
239  * and will own the lock exclusively on return.
240  *
241  * Returns non-zero if the lock was already exclusive prior to the upgrade.
242  */
243 int
244 hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
245 {
246 	int wasexclusive;
247 
248 	if (mtx_islocked_ex(&ip->lock)) {
249 		wasexclusive = 1;
250 	} else {
251 		hammer2_mtx_unlock(&ip->lock);
252 		hammer2_mtx_ex(&ip->lock);
253 		wasexclusive = 0;
254 	}
255 	return wasexclusive;
256 }
257 
258 /*
259  * Downgrade an inode lock from exclusive to shared only if the inode
260  * lock was previously shared.  If the inode lock was previously exclusive,
261  * this is a NOP.
262  */
263 void
264 hammer2_inode_lock_downgrade(hammer2_inode_t *ip, int wasexclusive)
265 {
266 	if (wasexclusive == 0)
267 		mtx_downgrade(&ip->lock);
268 }
269 
270 /*
271  * Lookup an inode by inode number
272  */
273 hammer2_inode_t *
274 hammer2_inode_lookup(hammer2_pfsmount_t *pmp, hammer2_tid_t inum)
275 {
276 	hammer2_inode_t *ip;
277 
278 	KKASSERT(pmp);
279 	if (pmp->spmp_hmp) {
280 		ip = NULL;
281 	} else {
282 		hammer2_spin_ex(&pmp->inum_spin);
283 		ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
284 		if (ip)
285 			hammer2_inode_ref(ip);
286 		hammer2_spin_unex(&pmp->inum_spin);
287 	}
288 	return(ip);
289 }
290 
291 /*
292  * Adding a ref to an inode is only legal if the inode already has at least
293  * one ref.
294  *
295  * (can be called with spinlock held)
296  */
void
hammer2_inode_ref(hammer2_inode_t *ip)
{
	/*
	 * Plain atomic bump; legal only when the caller already holds a
	 * ref (or the inum spinlock interlock), so the count cannot be
	 * concurrently transitioning to zero.
	 */
	atomic_add_int(&ip->refs, 1);
}
302 
303 /*
304  * Drop an inode reference, freeing the inode when the last reference goes
305  * away.
306  */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
	hammer2_pfsmount_t *pmp;
	hammer2_inode_t *pip;
	u_int refs;

	/*
	 * Loop so the implied ref from ip->pip can be disposed of by
	 * simply iterating on the parent after freeing the child.
	 */
	while (ip) {
		refs = ip->refs;
		cpu_ccfence();
		if (refs == 1) {
			/*
			 * Transition to zero, must interlock with
			 * the inode inumber lookup tree (if applicable).
			 * It should not be possible for anyone to race
			 * the transition to 0.
			 *
			 */
			pmp = ip->pmp;
			KKASSERT(pmp);
			hammer2_spin_ex(&pmp->inum_spin);

			/* cmpset may fail if another thread gained a ref */
			if (atomic_cmpset_int(&ip->refs, 1, 0)) {
				KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
				if (ip->flags & HAMMER2_INODE_ONRBTREE) {
					atomic_clear_int(&ip->flags,
						     HAMMER2_INODE_ONRBTREE);
					RB_REMOVE(hammer2_inode_tree,
						  &pmp->inum_tree, ip);
				}
				hammer2_spin_unex(&pmp->inum_spin);

				pip = ip->pip;
				ip->pip = NULL;
				ip->pmp = NULL;

				/*
				 * Cleaning out ip->cluster isn't entirely
				 * trivial.
				 */
				hammer2_inode_repoint(ip, NULL, NULL);

				/*
				 * We have to drop pip (if non-NULL) to
				 * dispose of our implied reference from
				 * ip->pip.  We can simply loop on it.
				 */
				kfree(ip, pmp->minode);
				atomic_add_long(&pmp->inmem_inodes, -1);
				ip = pip;
				/* continue with pip (can be NULL) */
			} else {
				/* lost the 1->0 race; retry the loop */
				hammer2_spin_unex(&ip->pmp->inum_spin);
			}
		} else {
			/*
			 * Non zero transition
			 */
			if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
				break;
		}
	}
}
370 
371 /*
372  * Get the vnode associated with the given inode, allocating the vnode if
373  * necessary.  The vnode will be returned exclusively locked.
374  *
375  * The caller must lock the inode (shared or exclusive).
376  *
377  * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
378  * races.
379  */
380 struct vnode *
381 hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
382 {
383 	const hammer2_inode_data_t *ripdata;
384 	hammer2_pfsmount_t *pmp;
385 	struct vnode *vp;
386 
387 	pmp = ip->pmp;
388 	KKASSERT(pmp != NULL);
389 	*errorp = 0;
390 
391 	ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
392 
393 	for (;;) {
394 		/*
395 		 * Attempt to reuse an existing vnode assignment.  It is
396 		 * possible to race a reclaim so the vget() may fail.  The
397 		 * inode must be unlocked during the vget() to avoid a
398 		 * deadlock against a reclaim.
399 		 */
400 		int wasexclusive;
401 
402 		vp = ip->vp;
403 		if (vp) {
404 			/*
405 			 * Inode must be unlocked during the vget() to avoid
406 			 * possible deadlocks, but leave the ip ref intact.
407 			 *
408 			 * vnode is held to prevent destruction during the
409 			 * vget().  The vget() can still fail if we lost
410 			 * a reclaim race on the vnode.
411 			 */
412 			hammer2_mtx_state_t ostate;
413 
414 			vhold(vp);
415 			ostate = hammer2_inode_lock_temp_release(ip);
416 			if (vget(vp, LK_EXCLUSIVE)) {
417 				vdrop(vp);
418 				hammer2_inode_lock_temp_restore(ip, ostate);
419 				continue;
420 			}
421 			hammer2_inode_lock_temp_restore(ip, ostate);
422 			vdrop(vp);
423 			/* vp still locked and ref from vget */
424 			if (ip->vp != vp) {
425 				kprintf("hammer2: igetv race %p/%p\n",
426 					ip->vp, vp);
427 				vput(vp);
428 				continue;
429 			}
430 			*errorp = 0;
431 			break;
432 		}
433 
434 		/*
435 		 * No vnode exists, allocate a new vnode.  Beware of
436 		 * allocation races.  This function will return an
437 		 * exclusively locked and referenced vnode.
438 		 */
439 		*errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
440 		if (*errorp) {
441 			kprintf("hammer2: igetv getnewvnode failed %d\n",
442 				*errorp);
443 			vp = NULL;
444 			break;
445 		}
446 
447 		/*
448 		 * Lock the inode and check for an allocation race.
449 		 */
450 		wasexclusive = hammer2_inode_lock_upgrade(ip);
451 		if (ip->vp != NULL) {
452 			vp->v_type = VBAD;
453 			vx_put(vp);
454 			hammer2_inode_lock_downgrade(ip, wasexclusive);
455 			continue;
456 		}
457 
458 		switch (ripdata->type) {
459 		case HAMMER2_OBJTYPE_DIRECTORY:
460 			vp->v_type = VDIR;
461 			break;
462 		case HAMMER2_OBJTYPE_REGFILE:
463 			vp->v_type = VREG;
464 			vinitvmio(vp, ripdata->size,
465 				  HAMMER2_LBUFSIZE,
466 				  (int)ripdata->size & HAMMER2_LBUFMASK);
467 			break;
468 		case HAMMER2_OBJTYPE_SOFTLINK:
469 			/*
470 			 * XXX for now we are using the generic file_read
471 			 * and file_write code so we need a buffer cache
472 			 * association.
473 			 */
474 			vp->v_type = VLNK;
475 			vinitvmio(vp, ripdata->size,
476 				  HAMMER2_LBUFSIZE,
477 				  (int)ripdata->size & HAMMER2_LBUFMASK);
478 			break;
479 		case HAMMER2_OBJTYPE_CDEV:
480 			vp->v_type = VCHR;
481 			/* fall through */
482 		case HAMMER2_OBJTYPE_BDEV:
483 			vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
484 			if (ripdata->type != HAMMER2_OBJTYPE_CDEV)
485 				vp->v_type = VBLK;
486 			addaliasu(vp, ripdata->rmajor, ripdata->rminor);
487 			break;
488 		case HAMMER2_OBJTYPE_FIFO:
489 			vp->v_type = VFIFO;
490 			vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
491 			break;
492 		default:
493 			panic("hammer2: unhandled objtype %d", ripdata->type);
494 			break;
495 		}
496 
497 		if (ip == pmp->iroot)
498 			vsetflags(vp, VROOT);
499 
500 		vp->v_data = ip;
501 		ip->vp = vp;
502 		hammer2_inode_ref(ip);		/* vp association */
503 		hammer2_inode_lock_downgrade(ip, wasexclusive);
504 		break;
505 	}
506 
507 	/*
508 	 * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
509 	 */
510 	if (hammer2_debug & 0x0002) {
511 		kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
512 			vp, vp->v_refcnt, vp->v_auxrefs);
513 	}
514 	return (vp);
515 }
516 
517 /*
518  * Returns the inode associated with the passed-in cluster, creating the
519  * inode if necessary and synchronizing it to the passed-in cluster otherwise.
520  *
521  * The passed-in chain must be locked and will remain locked on return.
522  * The returned inode will be locked and the caller may dispose of both
523  * via hammer2_inode_unlock_ex().  However, if the caller needs to resolve
524  * a hardlink it must ref/unlock/relock/drop the inode.
525  *
526  * The hammer2_inode structure regulates the interface between the high level
527  * kernel VNOPS API and the filesystem backend (the chains).
528  */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
		  hammer2_cluster_t *cluster)
{
	hammer2_inode_t *nip;
	const hammer2_inode_data_t *iptmp;
	const hammer2_inode_data_t *nipdata;

	KKASSERT(hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
	KKASSERT(pmp);

	/*
	 * Interlocked lookup/ref of the inode.  This code is only needed
	 * when looking up inodes with nlinks != 0 (TODO: optimize out
	 * otherwise and test for duplicates).
	 */
again:
	for (;;) {
		iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
		nip = hammer2_inode_lookup(pmp, iptmp->inum);
		if (nip == NULL)
			break;

		hammer2_mtx_ex(&nip->lock);

		/*
		 * Handle SMP race (not applicable to the super-root spmp
		 * which can't index inodes due to duplicative inode numbers).
		 */
		if (pmp->spmp_hmp == NULL &&
		    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
			/* raced a concurrent removal, drop and retry */
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			continue;
		}
		/* sync the existing inode to the passed-in cluster */
		hammer2_inode_repoint(nip, NULL, cluster);
		return nip;
	}

	/*
	 * We couldn't find the inode number, create a new inode.
	 */
	nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
	atomic_add_long(&pmp->inmem_inodes, 1);
	hammer2_pfs_memory_inc(pmp);
	hammer2_pfs_memory_wakeup(pmp);
	if (pmp->spmp_hmp)
		nip->flags = HAMMER2_INODE_SROOT;

	/*
	 * Initialize nip's cluster
	 */
	nip->cluster.refs = 1;
	nip->cluster.pmp = pmp;
	nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
	hammer2_cluster_replace(&nip->cluster, cluster);

	/* cache frequently-used meta-data fields in the inode */
	nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
	nip->inum = nipdata->inum;
	nip->size = nipdata->size;
	nip->mtime = nipdata->mtime;
	hammer2_inode_repoint(nip, NULL, cluster);
	nip->pip = dip;				/* can be NULL */
	if (dip)
		hammer2_inode_ref(dip);	/* ref dip for nip->pip */

	nip->pmp = pmp;

	/*
	 * ref and lock on nip gives it state compatible to after a
	 * hammer2_inode_lock_ex() call.
	 */
	nip->refs = 1;
	hammer2_mtx_init(&nip->lock, "h2inode");
	hammer2_mtx_ex(&nip->lock);
	/* combination of thread lock and chain lock == inode lock */

	/*
	 * Attempt to add the inode.  If it fails we raced another inode
	 * get.  Undo all the work and try again.
	 */
	if (pmp->spmp_hmp == NULL) {
		hammer2_spin_ex(&pmp->inum_spin);
		if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
			hammer2_spin_unex(&pmp->inum_spin);
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			goto again;
		}
		atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
		hammer2_spin_unex(&pmp->inum_spin);
	}

	return (nip);
}
624 
625 /*
626  * Create a new inode in the specified directory using the vattr to
627  * figure out the type of inode.
628  *
629  * If no error occurs the new inode with its cluster locked is returned in
630  * *nipp, otherwise an error is returned and *nipp is set to NULL.
631  *
632  * If vap and/or cred are NULL the related fields are not set and the
633  * inode type defaults to a directory.  This is used when creating PFSs
634  * under the super-root, so the inode number is set to 1 in this case.
635  *
636  * dip is not locked on entry.
637  *
638  * NOTE: When used to create a snapshot, the inode is temporarily associated
639  *	 with the super-root spmp. XXX should pass new pmp for snapshot.
640  */
hammer2_inode_t *
hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
		     struct vattr *vap, struct ucred *cred,
		     const uint8_t *name, size_t name_len,
		     hammer2_cluster_t **clusterp, int *errorp)
{
	const hammer2_inode_data_t *dipdata;
	hammer2_inode_data_t *nipdata;
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *cparent;
	hammer2_inode_t *nip;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	int error;
	uid_t xuid;
	uuid_t dip_uid;
	uuid_t dip_gid;
	uint32_t dip_mode;
	uint8_t dip_comp_algo;
	uint8_t dip_check_algo;
	int ddflag;

	lhc = hammer2_dirhash(name, name_len);
	*errorp = 0;

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  At the same time check for key collisions
	 * and iterate until we don't get one.
	 *
	 * NOTE: hidden inodes do not have iterators.
	 */
retry:
	cparent = hammer2_inode_lock_ex(dip);
	/*
	 * Snapshot the directory fields we need while the lock is
	 * held; dipdata may become stale once dip is unlocked below.
	 */
	dipdata = &hammer2_cluster_rdata(cparent)->ipdata;
	dip_uid = dipdata->uid;
	dip_gid = dipdata->gid;
	dip_mode = dipdata->mode;
	dip_comp_algo = dipdata->comp_algo;
	dip_check_algo = dipdata->check_algo;

	/*
	 * Iterate lhc until we find a free key.  Only keys with the
	 * VISIBLE bit and iteration space remaining are usable.
	 */
	error = 0;
	while (error == 0) {
		cluster = hammer2_cluster_lookup(cparent, &key_dummy,
						 lhc, lhc, 0, &ddflag);
		if (cluster == NULL)
			break;
		if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0)
			error = ENOSPC;
		if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
			error = ENOSPC;
		hammer2_cluster_unlock(cluster);
		cluster = NULL;
		++lhc;
	}

	if (error == 0) {
		error = hammer2_cluster_create(trans, cparent, &cluster,
					     lhc, 0,
					     HAMMER2_BREF_TYPE_INODE,
					     HAMMER2_INODE_BYTES,
					     0);
	}
#if INODE_DEBUG
	kprintf("CREATE INODE %*.*s chain=%p\n",
		(int)name_len, (int)name_len, name,
		(cluster ? cluster->focus : NULL));
#endif

	/*
	 * Cleanup and handle retries.
	 */
	if (error == EAGAIN) {
		/* extra ref keeps cparent stable across the wait */
		hammer2_cluster_ref(cparent);
		hammer2_inode_unlock_ex(dip, cparent);
		hammer2_cluster_wait(cparent);
		hammer2_cluster_drop(cparent);
		goto retry;
	}
	hammer2_inode_unlock_ex(dip, cparent);
	cparent = NULL;

	if (error) {
		KKASSERT(cluster == NULL);
		*errorp = error;
		return (NULL);
	}

	/*
	 * Set up the new inode.
	 *
	 * NOTE: *_get() integrates chain's lock into the inode lock.
	 *
	 * NOTE: Only one new inode can currently be created per
	 *	 transaction.  If the need arises we can adjust
	 *	 hammer2_trans_init() to allow more.
	 *
	 * NOTE: nipdata will have chain's blockset data.
	 */
	KKASSERT(cluster->focus->flags & HAMMER2_CHAIN_MODIFIED);
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	nipdata->inum = trans->inode_tid;
	hammer2_cluster_modsync(cluster);
	nip = hammer2_inode_get(dip->pmp, dip, cluster);
	/* re-fetch after inode_get; the cluster may have been refactored */
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;

	if (vap) {
		KKASSERT(trans->inodes_created == 0);
		nipdata->type = hammer2_get_obj_type(vap->va_type);
		nipdata->inum = trans->inode_tid;
		++trans->inodes_created;

		switch (nipdata->type) {
		case HAMMER2_OBJTYPE_CDEV:
		case HAMMER2_OBJTYPE_BDEV:
			nipdata->rmajor = vap->va_rmajor;
			nipdata->rminor = vap->va_rminor;
			break;
		default:
			break;
		}
	} else {
		/* no vattr: PFS creation under the super-root, inum 1 */
		nipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
		nipdata->inum = 1;
	}

	/* Inherit parent's inode compression mode. */
	nip->comp_heuristic = 0;
	nipdata->comp_algo = dip_comp_algo;
	nipdata->check_algo = dip_check_algo;
	nipdata->version = HAMMER2_INODE_VERSION_ONE;
	hammer2_update_time(&nipdata->ctime);
	nipdata->mtime = nipdata->ctime;
	if (vap)
		nipdata->mode = vap->va_mode;
	nipdata->nlinks = 1;
	if (vap) {
		/*
		 * Determine uid/gid from the vattr, falling back to the
		 * parent directory's values captured above.
		 */
		if (dip && dip->pmp) {
			xuid = hammer2_to_unix_xid(&dip_uid);
			xuid = vop_helper_create_uid(dip->pmp->mp,
						     dip_mode,
						     xuid,
						     cred,
						     &vap->va_mode);
		} else {
			/* super-root has no dip and/or pmp */
			xuid = 0;
		}
		if (vap->va_vaflags & VA_UID_UUID_VALID)
			nipdata->uid = vap->va_uid_uuid;
		else if (vap->va_uid != (uid_t)VNOVAL)
			hammer2_guid_to_uuid(&nipdata->uid, vap->va_uid);
		else
			hammer2_guid_to_uuid(&nipdata->uid, xuid);

		if (vap->va_vaflags & VA_GID_UUID_VALID)
			nipdata->gid = vap->va_gid_uuid;
		else if (vap->va_gid != (gid_t)VNOVAL)
			hammer2_guid_to_uuid(&nipdata->gid, vap->va_gid);
		else if (dip)
			nipdata->gid = dip_gid;
	}

	/*
	 * Regular files and softlinks allow a small amount of data to be
	 * directly embedded in the inode.  This flag will be cleared if
	 * the size is extended past the embedded limit.
	 */
	if (nipdata->type == HAMMER2_OBJTYPE_REGFILE ||
	    nipdata->type == HAMMER2_OBJTYPE_SOFTLINK) {
		nipdata->op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
	}

	KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
	bcopy(name, nipdata->filename, name_len);
	nipdata->name_key = lhc;
	nipdata->name_len = name_len;
	hammer2_cluster_modsync(cluster);
	*clusterp = cluster;

	return (nip);
}
823 
824 /*
825  * The cluster has been removed from the original directory and replaced
826  * with a hardlink pointer.  Move the cluster to the specified parent
827  * directory, change the filename to "0xINODENUMBER", and adjust the key.
828  * The cluster becomes our invisible hardlink target.
829  *
830  * The original cluster must be deleted on entry.
831  */
static
void
hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
			hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
			int nlinks, int *errorp)
{
	const hammer2_inode_data_t *iptmp;
	hammer2_inode_data_t *nipdata;
	hammer2_cluster_t *xcluster;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	hammer2_blockref_t bref;
	int ddflag;

	/* The hidden target is keyed by its inode number (invisible key) */
	iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
	lhc = iptmp->inum;
	KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  lhc represents the inode number so there is
	 * no collision iteration.
	 *
	 * There should be no key collisions with invisible inode keys.
	 *
	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
	 *	    dip->cluster cache.
	 */
	*errorp = 0;
	xcluster = hammer2_cluster_lookup(dcluster, &key_dummy,
				      lhc, lhc, 0, &ddflag);
	if (xcluster) {
		/* unexpected collision; report and bail with ENOSPC */
		kprintf("X3 chain %p dip %p dchain %p dip->chain %p\n",
			xcluster->focus, dip, dcluster->focus,
			dip->cluster.focus);
		hammer2_cluster_unlock(xcluster);
		xcluster = NULL;
		*errorp = ENOSPC;
#if 0
		Debugger("X3");
#endif
	}

	/*
	 * Handle the error case (should not happen; panics for analysis)
	 */
	if (*errorp) {
		panic("error2");
		KKASSERT(xcluster == NULL);
		return;
	}

	/*
	 * Use xcluster as a placeholder for (lhc).  Duplicate cluster to the
	 * same target bref as xcluster and then delete xcluster.  The
	 * duplication occurs after xcluster in flush order even though
	 * xcluster is deleted after the duplication. XXX
	 *
	 * WARNING! Duplications (to a different parent) can cause indirect
	 *	    blocks to be inserted, refactor xcluster.
	 *
	 * WARNING! Only key and keybits is extracted from a passed-in bref.
	 */
	hammer2_cluster_bref(cluster, &bref);
	bref.key = lhc;			/* invisible dir entry key */
	bref.keybits = 0;
	hammer2_cluster_rename(trans, &bref, dcluster, cluster, 0);

	/*
	 * cluster is now 'live' again.. adjust the filename.
	 *
	 * Directory entries are inodes but this is a hidden hardlink
	 * target.  The name isn't used but to ease debugging give it
	 * a name after its inode number.
	 */
	hammer2_cluster_modify(trans, cluster, 0);
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	ksnprintf(nipdata->filename, sizeof(nipdata->filename),
		  "0x%016jx", (intmax_t)nipdata->inum);
	nipdata->name_len = strlen(nipdata->filename);
	nipdata->name_key = lhc;
	nipdata->nlinks += nlinks;
	hammer2_cluster_modsync(cluster);
}
916 
917 /*
918  * Connect the target inode represented by (cluster) to the media topology
919  * at (dip, name, len).  The caller can pass a rough *chainp, this function
920  * will issue lookup()s to position the parent chain properly for the
921  * chain insertion.
922  *
923  * If hlink is TRUE this function creates an OBJTYPE_HARDLINK directory
924  * entry instead of connecting (cluster).
925  *
926  * If hlink is FALSE this function expects (cluster) to be unparented.
927  */
928 int
929 hammer2_inode_connect(hammer2_trans_t *trans,
930 		      hammer2_cluster_t **clusterp, int hlink,
931 		      hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
932 		      const uint8_t *name, size_t name_len,
933 		      hammer2_key_t lhc)
934 {
935 	hammer2_inode_data_t *wipdata;
936 	hammer2_cluster_t *ocluster;
937 	hammer2_cluster_t *ncluster;
938 	hammer2_key_t key_dummy;
939 	int ddflag;
940 	int error;
941 
942 	/*
943 	 * Since ocluster is either disconnected from the topology or
944 	 * represents a hardlink terminus which is always a parent of or
945 	 * equal to dip, we should be able to safely lock dip->chain for
946 	 * our setup.
947 	 *
948 	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
949 	 *	    dip->cluster.
950 	 *
951 	 * If name is non-NULL we calculate lhc, else we use the passed-in
952 	 * lhc.
953 	 */
954 	ocluster = *clusterp;
955 
956 	if (name) {
957 		lhc = hammer2_dirhash(name, name_len);
958 
959 		/*
960 		 * Locate the inode or indirect block to create the new
961 		 * entry in.  At the same time check for key collisions
962 		 * and iterate until we don't get one.
963 		 */
964 		error = 0;
965 		while (error == 0) {
966 			ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
967 						      lhc, lhc,
968 						      0, &ddflag);
969 			if (ncluster == NULL)
970 				break;
971 			if ((lhc & HAMMER2_DIRHASH_LOMASK) ==
972 			    HAMMER2_DIRHASH_LOMASK) {
973 				error = ENOSPC;
974 			}
975 			hammer2_cluster_unlock(ncluster);
976 			ncluster = NULL;
977 			++lhc;
978 		}
979 	} else {
980 		/*
981 		 * Reconnect to specific key (used when moving
982 		 * unlinked-but-open files into the hidden directory).
983 		 */
984 		ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
985 						  lhc, lhc,
986 						  0, &ddflag);
987 		KKASSERT(ncluster == NULL);
988 	}
989 
990 	if (error == 0) {
991 		if (hlink) {
992 			/*
993 			 * Hardlink pointer needed, create totally fresh
994 			 * directory entry.
995 			 *
996 			 * We must refactor ocluster because it might have
997 			 * been shifted into an indirect cluster by the
998 			 * create.
999 			 */
1000 			KKASSERT(ncluster == NULL);
1001 			error = hammer2_cluster_create(trans,
1002 						       dcluster, &ncluster,
1003 						       lhc, 0,
1004 						       HAMMER2_BREF_TYPE_INODE,
1005 						       HAMMER2_INODE_BYTES,
1006 						       0);
1007 		} else {
1008 			/*
1009 			 * Reconnect the original cluster under the new name.
1010 			 * Original cluster must have already been deleted by
			 * the caller.
1012 			 *
1013 			 * WARNING! Can cause held-over clusters to require a
1014 			 *	    refactor.  Fortunately we have none (our
1015 			 *	    locked clusters are passed into and
1016 			 *	    modified by the call).
1017 			 */
1018 			ncluster = ocluster;
1019 			ocluster = NULL;
1020 			error = hammer2_cluster_create(trans,
1021 						       dcluster, &ncluster,
1022 						       lhc, 0,
1023 						       HAMMER2_BREF_TYPE_INODE,
1024 						       HAMMER2_INODE_BYTES,
1025 						       0);
1026 		}
1027 	}
1028 
1029 	/*
1030 	 * Unlock stuff.
1031 	 */
1032 	KKASSERT(error != EAGAIN);
1033 
1034 	/*
1035 	 * ncluster should be NULL on error, leave ocluster
1036 	 * (ocluster == *clusterp) alone.
1037 	 */
1038 	if (error) {
1039 		KKASSERT(ncluster == NULL);
1040 		return (error);
1041 	}
1042 
1043 	/*
1044 	 * Directory entries are inodes so if the name has changed we have
1045 	 * to update the inode.
1046 	 *
1047 	 * When creating an OBJTYPE_HARDLINK entry remember to unlock the
1048 	 * cluster, the caller will access the hardlink via the actual hardlink
1049 	 * target file and not the hardlink pointer entry, so we must still
1050 	 * return ocluster.
1051 	 */
1052 	if (hlink && hammer2_hardlink_enable >= 0) {
1053 		/*
1054 		 * Create the HARDLINK pointer.  oip represents the hardlink
1055 		 * target in this situation.
1056 		 *
1057 		 * We will return ocluster (the hardlink target).
1058 		 */
1059 		hammer2_cluster_modify(trans, ncluster, 0);
1060 		hammer2_cluster_clr_chainflags(ncluster,
1061 					       HAMMER2_CHAIN_UNLINKED);
1062 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1063 		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
1064 		bcopy(name, wipdata->filename, name_len);
1065 		wipdata->name_key = lhc;
1066 		wipdata->name_len = name_len;
1067 		wipdata->target_type =
1068 				hammer2_cluster_rdata(ocluster)->ipdata.type;
1069 		wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
1070 		wipdata->inum = hammer2_cluster_rdata(ocluster)->ipdata.inum;
1071 		wipdata->version = HAMMER2_INODE_VERSION_ONE;
1072 		wipdata->nlinks = 1;
1073 		wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
1074 		hammer2_cluster_modsync(ncluster);
1075 		hammer2_cluster_unlock(ncluster);
1076 		ncluster = ocluster;
1077 		ocluster = NULL;
1078 	} else {
1079 		/*
1080 		 * ncluster is a duplicate of ocluster at the new location.
1081 		 * We must fixup the name stored in the inode data.
1082 		 * The bref key has already been adjusted by inode_connect().
1083 		 */
1084 		hammer2_cluster_modify(trans, ncluster, 0);
1085 		hammer2_cluster_clr_chainflags(ncluster,
1086 					       HAMMER2_CHAIN_UNLINKED);
1087 		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
1088 
1089 		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
1090 		bcopy(name, wipdata->filename, name_len);
1091 		wipdata->name_key = lhc;
1092 		wipdata->name_len = name_len;
1093 		wipdata->nlinks = 1;
1094 		hammer2_cluster_modsync(ncluster);
1095 	}
1096 
1097 	/*
1098 	 * We are replacing ocluster with ncluster, unlock ocluster.  In the
1099 	 * case where ocluster is left unchanged the code above sets
1100 	 * ncluster to ocluster and ocluster to NULL, resulting in a NOP here.
1101 	 */
1102 	if (ocluster)
1103 		hammer2_cluster_unlock(ocluster);
1104 	*clusterp = ncluster;
1105 
1106 	return (0);
1107 }
1108 
/*
 * Repoint ip->cluster's chains to cluster's chains.  Caller must hold
 * the inode exclusively locked.  cluster may be NULL to clean out any
 * chains in ip->cluster.
 *
 * If pip is non-NULL, ip->pip is repointed to it as well.
 */
void
hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
		      hammer2_cluster_t *cluster)
{
	hammer2_chain_t *ochain;	/* chain being replaced */
	hammer2_chain_t *nchain;	/* replacement chain */
	hammer2_inode_t *opip;		/* previous parent inode */
	int i;

	/*
	 * Replace chains in ip->cluster with chains from cluster and
	 * adjust the focus if necessary.  The focus is reset to the
	 * first non-NULL chain encountered.
	 *
	 * NOTE: nchain and/or ochain can be NULL due to gaps
	 *	 in the cluster arrays.
	 */
	ip->cluster.focus = NULL;
	for (i = 0; cluster && i < cluster->nchains; ++i) {
		nchain = cluster->array[i].chain;
		if (i < ip->cluster.nchains) {
			ochain = ip->cluster.array[i].chain;
			if (ochain == nchain) {
				/*
				 * Element unchanged, only the focus may
				 * need to be (re)established.
				 */
				if (ip->cluster.focus == NULL)
					ip->cluster.focus = nchain;
				continue;
			}
		} else {
			/* new cluster is larger than the old one */
			ochain = NULL;
		}

		/*
		 * Make adjustments.  The new chain is referenced before
		 * the old chain is dropped.
		 */
		ip->cluster.array[i].chain = nchain;
		if (ip->cluster.focus == NULL)
			ip->cluster.focus = nchain;
		if (nchain)
			hammer2_chain_ref(nchain);
		if (ochain)
			hammer2_chain_drop(ochain);
	}

	/*
	 * Release any left-over chains in ip->cluster (the new cluster
	 * is smaller than the old one, or is NULL).
	 */
	while (i < ip->cluster.nchains) {
		nchain = ip->cluster.array[i].chain;
		if (nchain) {
			ip->cluster.array[i].chain = NULL;
			hammer2_chain_drop(nchain);
		}
		++i;
	}
	ip->cluster.nchains = cluster ? cluster->nchains : 0;

	/*
	 * Repoint ip->pip if requested (non-NULL pip).  Reference the
	 * new parent before dropping the old one.
	 */
	if (pip && ip->pip != pip) {
		opip = ip->pip;
		hammer2_inode_ref(pip);
		ip->pip = pip;
		if (opip)
			hammer2_inode_drop(opip);
	}
}
1180 
/*
 * Unlink the file from the specified directory inode.  The directory inode
 * does not need to be locked.
 *
 * isdir determines whether a directory/non-directory check should be made.
 * No check is made if isdir is set to -1.  isdir < 0 also relaxes the
 * directory-must-be-empty requirement (rename), as does isdir > 1
 * (PFS/snapshot deletion).
 *
 * Open-state is determined via the (nch) namecache handle: if nch is
 * non-NULL, cache_isopen(nch) is true, and nlinks drops to 0, this
 * function moves the chain to a special hidden directory until last-close
 * occurs on the file instead of deleting it.
 *
 * (nlinks) is the adjustment applied to the target's link count (typically
 * negative; 0 when called as part of a rename).
 *
 * (*hlinkp), if hlinkp is non-NULL, is set to 1 if the directory entry was
 * a hardlink pointer, else 0.
 *
 * NOTE!  The underlying file can still be active with open descriptors
 *	  or if the chain is being manually held (e.g. for rename).
 *
 *	  The caller is responsible for fixing up ip->chain if e.g. a
 *	  rename occurs (see chain_duplicate()).
 *
 * NOTE!  The chain is not deleted if it is moved to the hidden directory,
 *	  but otherwise will be deleted.
 */
int
hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
		    const uint8_t *name, size_t name_len,
		    int isdir, int *hlinkp, struct nchandle *nch,
		    int nlinks)
{
	const hammer2_inode_data_t *ripdata;	/* read-only inode data */
	hammer2_inode_data_t *wipdata;		/* writable inode data */
	hammer2_cluster_t *cparent;		/* parent of the dir entry */
	hammer2_cluster_t *hcluster;		/* hardlink pointer entry */
	hammer2_cluster_t *hparent;		/* hardlink target's parent */
	hammer2_cluster_t *cluster;		/* the directory entry */
	hammer2_cluster_t *dparent;		/* empty-dir scan parent */
	hammer2_cluster_t *dcluster;		/* empty-dir scan result */
	hammer2_key_t key_dummy;
	hammer2_key_t key_next;
	hammer2_key_t lhc;
	int error;
	int ddflag;
	int hlink;
	uint8_t type;

	error = 0;
	hlink = 0;
	hcluster = NULL;
	hparent = NULL;
	lhc = hammer2_dirhash(name, name_len);

again:
	/*
	 * Search for the filename in the directory, scanning the
	 * dirhash collision range [lhc, lhc + LOMASK].
	 */
	cparent = hammer2_inode_lock_ex(dip);
	cluster = hammer2_cluster_lookup(cparent, &key_next,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
				     0, &ddflag);
	while (cluster) {
		if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
			ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
			if (ripdata->name_len == name_len &&
			    bcmp(ripdata->filename, name, name_len) == 0) {
				break;
			}
		}
		cluster = hammer2_cluster_next(cparent, cluster, &key_next,
					       key_next,
					       lhc + HAMMER2_DIRHASH_LOMASK,
					       0);
	}
	hammer2_inode_unlock_ex(dip, NULL);	/* retain cparent */

	/*
	 * Not found or wrong type (isdir < 0 disables the type check).
	 * If a hardlink pointer, type checks use the hardlink target.
	 */
	if (cluster == NULL) {
		error = ENOENT;
		goto done;
	}
	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	type = ripdata->type;
	if (type == HAMMER2_OBJTYPE_HARDLINK) {
		hlink = 1;
		type = ripdata->target_type;
	}

	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 0) {
		error = ENOTDIR;
		goto done;
	}
	if (type != HAMMER2_OBJTYPE_DIRECTORY && isdir >= 1) {
		error = EISDIR;
		goto done;
	}

	/*
	 * Hardlink must be resolved.  We can't hold the parent locked
	 * while we do this or we could deadlock.  The physical file will
	 * be located at or above the current directory.
	 *
	 * We loop to reacquire the hardlink origination.
	 *
	 * NOTE: hammer2_hardlink_find() will locate the hardlink target,
	 *	 returning a modified hparent and hcluster.
	 */
	if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK) {
		if (hcluster == NULL) {
			hcluster = cluster;
			cluster = NULL;	/* safety */
			hammer2_cluster_unlock(cparent);
			cparent = NULL; /* safety */
			ripdata = NULL;	/* safety (associated w/cparent) */
			error = hammer2_hardlink_find(dip, &hparent, hcluster);

			/*
			 * If we couldn't find the hardlink target then some
			 * parent directory containing the hardlink pointer
			 * probably got renamed to above the original target,
			 * a case not yet handled by H2.
			 */
			if (error) {
				kprintf("H2 unlink_file: hardlink target for "
					"\"%s\" not found\n",
					name);
				kprintf("(likely due to known directory "
					"rename bug)\n");
				goto done;
			}
			goto again;
		}
	}

	/*
	 * If this is a directory the directory must be empty.  However, if
	 * isdir < 0 we are doing a rename and the directory does not have
	 * to be empty, and if isdir > 1 we are deleting a PFS/snapshot
	 * and the directory does not have to be empty.
	 *
	 * NOTE: We check the full key range here which covers both visible
	 *	 and invisible entries.  Theoretically there should be no
	 *	 invisible (hardlink target) entries if there are no visible
	 *	 entries.
	 */
	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 1) {
		dparent = hammer2_cluster_lookup_init(cluster, 0);
		dcluster = hammer2_cluster_lookup(dparent, &key_dummy,
					          0, (hammer2_key_t)-1,
					          HAMMER2_LOOKUP_NODATA,
						  &ddflag);
		if (dcluster) {
			/* any entry at all means not empty */
			hammer2_cluster_unlock(dcluster);
			hammer2_cluster_lookup_done(dparent);
			error = ENOTEMPTY;
			goto done;
		}
		hammer2_cluster_lookup_done(dparent);
		dparent = NULL;
		/* dcluster NULL */
	}

	/*
	 * If this was a hardlink then (cparent, cluster) is the hardlink
	 * pointer, which we can simply destroy outright.  Discard the
	 * clusters and replace with the hardlink target.
	 */
	if (hcluster) {
		hammer2_cluster_delete(trans, cparent, cluster,
				       HAMMER2_DELETE_PERMANENT);
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_unlock(cluster);
		cparent = hparent;
		cluster = hcluster;
		hparent = NULL;
		hcluster = NULL;
	}

	/*
	 * This leaves us with the hardlink target or non-hardlinked file
	 * or directory in (cparent, cluster).
	 *
	 * Delete the target when nlinks reaches 0 with special handling
	 * if (nch) indicates the file is still open.
	 *
	 * NOTE! In DragonFly the vnops function calls cache_unlink() after
	 *	 calling us here to clean out the namecache association,
	 *	 (which does not represent a ref for the open-test), and to
	 *	 force finalization of the vnode if/when the last ref gets
	 *	 dropped.
	 *
	 * NOTE! Files are unlinked by rename and then relinked.  nch will be
	 *	 passed as NULL in this situation.  hammer2_inode_connect()
	 *	 will bump nlinks.
	 */
	KKASSERT(cluster != NULL);
	hammer2_cluster_modify(trans, cluster, 0);
	wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	ripdata = wipdata;
	wipdata->nlinks += nlinks;	/* nlinks is typically negative */
	if ((int64_t)wipdata->nlinks < 0) {	/* XXX debugging */
		wipdata->nlinks = 0;
	}
	hammer2_cluster_modsync(cluster);

	if (wipdata->nlinks == 0) {
		/*
		 * Target nlinks has reached 0, file now unlinked (but may
		 * still be open).
		 */
		/* XXX need interlock if mounted
		if ((cluster->focus->flags & HAMMER2_CHAIN_PFSROOT) &&
		    cluster->pmp) {
			error = EINVAL;
			kprintf("hammer2: PFS \"%s\" cannot be deleted "
				"while still mounted\n",
				wipdata->filename);
			goto done;
		}
		*/
		hammer2_cluster_set_chainflags(cluster, HAMMER2_CHAIN_UNLINKED);
		if (nch && cache_isopen(nch)) {
			/*
			 * Still open; keep the backing store alive by
			 * parking the inode in the hidden directory.
			 */
			hammer2_inode_move_to_hidden(trans, &cparent, &cluster,
						     wipdata->inum);
		} else {
			/*
			 * This won't get everything if a vnode is still
			 * present, but the cache_unlink() call the caller
			 * makes will.
			 */
			hammer2_cluster_delete(trans, cparent, cluster,
					       HAMMER2_DELETE_PERMANENT);
		}
	} else if (hlink == 0) {
		/*
		 * In this situation a normal non-hardlinked file (which can
		 * only have nlinks == 1) still has a non-zero nlinks, the
		 * caller must be doing a RENAME operation and so is passing
		 * a nlinks adjustment of 0, and only wishes to remove file
		 * in order to be able to reconnect it under a different name.
		 *
		 * In this situation we do a non-permanent deletion of the
		 * chain in order to allow the file to be reconnected in
		 * a different location.
		 */
		KKASSERT(nlinks == 0);
		hammer2_cluster_delete(trans, cparent, cluster, 0);
	}
	error = 0;
done:
	/* unwind whichever locks are still held */
	if (cparent)
		hammer2_cluster_unlock(cparent);
	if (cluster)
		hammer2_cluster_unlock(cluster);
	if (hparent)
		hammer2_cluster_unlock(hparent);
	if (hcluster)
		hammer2_cluster_unlock(hcluster);
	if (hlinkp)
		*hlinkp = hlink;

	return error;
}
1446 
/*
 * This is called from the mount code to initialize pmp->ihidden, the
 * PFS-root-level hidden directory used to park unlinked-but-open files.
 *
 * If the hidden directory already exists any stale entries left over
 * from a crash are destroyed; otherwise the directory is created.
 */
void
hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
{
	hammer2_trans_t trans;
	hammer2_cluster_t *cparent;	/* locked PFS root (iroot) */
	hammer2_cluster_t *cluster;	/* hidden directory inode */
	hammer2_cluster_t *scan;	/* stale-entry iterator */
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_key_t key_dummy;
	hammer2_key_t key_next;
	int ddflag;
	int error;
	int count;			/* # of dead entries removed */
	int dip_check_algo;
	int dip_comp_algo;

	/* already installed (e.g. remount) */
	if (pmp->ihidden)
		return;

	/*
	 * Find the hidden directory
	 */
	bzero(&key_dummy, sizeof(key_dummy));
	hammer2_trans_init(&trans, pmp, 0);

	/*
	 * Setup for lookup, retrieve iroot's check and compression
	 * algorithm request which was likely generated by newfs_hammer2.
	 *
	 * The check/comp fields will probably never be used since inodes
	 * are renamed into the hidden directory and not created relative to
	 * the hidden directory, chain creation inherits from bref.methods,
	 * and data chains inherit from their respective file inode *_algo
	 * fields.
	 */
	cparent = hammer2_inode_lock_ex(pmp->iroot);
	ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
	dip_check_algo = ripdata->check_algo;
	dip_comp_algo = ripdata->comp_algo;
	ripdata = NULL;

	cluster = hammer2_cluster_lookup(cparent, &key_dummy,
					 HAMMER2_INODE_HIDDENDIR,
					 HAMMER2_INODE_HIDDENDIR,
					 0, &ddflag);
	if (cluster) {
		pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
		hammer2_inode_ref(pmp->ihidden);

		/*
		 * Remove any unlinked files which were left open as-of
		 * any system crash.
		 *
		 * Don't pass NODATA, we need the inode data so the delete
		 * can do proper statistics updates.
		 */
		count = 0;
		scan = hammer2_cluster_lookup(cluster, &key_next,
					      0, HAMMER2_TID_MAX,
					      0, &ddflag);
		while (scan) {
			if (hammer2_cluster_type(scan) ==
			    HAMMER2_BREF_TYPE_INODE) {
				hammer2_cluster_delete(&trans, cluster, scan,
						   HAMMER2_DELETE_PERMANENT);
				++count;
			}
			scan = hammer2_cluster_next(cluster, scan, &key_next,
						    0, HAMMER2_TID_MAX, 0);
		}

		hammer2_inode_unlock_ex(pmp->ihidden, cluster);
		hammer2_inode_unlock_ex(pmp->iroot, cparent);
		hammer2_trans_done(&trans);
		kprintf("hammer2: PFS loaded hidden dir, "
			"removed %d dead entries\n", count);
		return;
	}

	/*
	 * Create the hidden directory
	 */
	error = hammer2_cluster_create(&trans, cparent, &cluster,
				       HAMMER2_INODE_HIDDENDIR, 0,
				       HAMMER2_BREF_TYPE_INODE,
				       HAMMER2_INODE_BYTES,
				       0);
	hammer2_inode_unlock_ex(pmp->iroot, cparent);

	/*
	 * NOTE(review): error from hammer2_cluster_create() is not checked
	 * before cluster is dereferenced below; a create failure here would
	 * be fatal.  Confirm whether create can fail in this path.
	 */
	hammer2_cluster_modify(&trans, cluster, 0);
	wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	wipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
	wipdata->inum = HAMMER2_INODE_HIDDENDIR;
	wipdata->nlinks = 1;
	wipdata->comp_algo = dip_comp_algo;
	wipdata->check_algo = dip_check_algo;
	hammer2_cluster_modsync(cluster);
	kprintf("hammer2: PFS root missing hidden directory, creating\n");

	pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
	hammer2_inode_ref(pmp->ihidden);
	hammer2_inode_unlock_ex(pmp->ihidden, cluster);
	hammer2_trans_done(&trans);
}
1555 
1556 /*
1557  * If an open file is unlinked H2 needs to retain the file in the topology
1558  * to ensure that its backing store is not recovered by the bulk free scan.
1559  * This also allows us to avoid having to special-case the CHAIN_DELETED flag.
1560  *
1561  * To do this the file is moved to a hidden directory in the PFS root and
1562  * renamed.  The hidden directory must be created if it does not exist.
1563  */
1564 static
1565 void
1566 hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
1567 			     hammer2_cluster_t **cparentp,
1568 			     hammer2_cluster_t **clusterp,
1569 			     hammer2_tid_t inum)
1570 {
1571 	hammer2_cluster_t *dcluster;
1572 	hammer2_pfsmount_t *pmp;
1573 	int error;
1574 
1575 	pmp = (*clusterp)->pmp;
1576 	KKASSERT(pmp != NULL);
1577 	KKASSERT(pmp->ihidden != NULL);
1578 
1579 	hammer2_cluster_delete(trans, *cparentp, *clusterp, 0);
1580 	dcluster = hammer2_inode_lock_ex(pmp->ihidden);
1581 	error = hammer2_inode_connect(trans, clusterp, 0,
1582 				      pmp->ihidden, dcluster,
1583 				      NULL, 0, inum);
1584 	hammer2_inode_unlock_ex(pmp->ihidden, dcluster);
1585 	KKASSERT(error == 0);
1586 }
1587 
/*
 * Given an exclusively locked inode and cluster we consolidate the cluster
 * for hardlink creation, adding (nlinks) to the file's link count and
 * potentially relocating the inode to (cdip) which is a parent directory
 * common to both the current location of the inode and the intended new
 * hardlink.
 *
 * Replaces (*clusterp) if consolidation occurred, unlocking the old cluster
 * and returning a new locked cluster.
 *
 * NOTE!  This function will also replace ip->cluster.
 */
int
hammer2_hardlink_consolidate(hammer2_trans_t *trans,
			     hammer2_inode_t *ip,
			     hammer2_cluster_t **clusterp,
			     hammer2_inode_t *cdip,
			     hammer2_cluster_t *cdcluster,
			     int nlinks)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *cparent;
	int error;

	cluster = *clusterp;
	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	if (nlinks == 0 &&			/* no hardlink needed */
	    (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE)) {
		return (0);
	}

	if (hammer2_hardlink_enable == 0) {	/* disallow hardlinks */
		hammer2_cluster_unlock(cluster);
		*clusterp = NULL;
		return (ENOTSUP);
	}

	cparent = NULL;

	/*
	 * If no change in the hardlink's target directory is required and
	 * this is already a hardlink target, all we need to do is adjust
	 * the link count.
	 */
	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	if (cdip == ip->pip &&
	    (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
		if (nlinks) {
			hammer2_cluster_modify(trans, cluster, 0);
			wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
			wipdata->nlinks += nlinks;
			hammer2_cluster_modsync(cluster);
			ripdata = wipdata;
		}
		error = 0;
		goto done;
	}

	/*
	 * Cluster is the real inode.  The originating directory is locked
	 * by the caller so we can manipulate it without worrying about races
	 * against other lookups.
	 *
	 * If cluster is visible we need to delete it from the current
	 * location and create a hardlink pointer in its place.  If it is
	 * not visible we need only delete it.  Then later cluster will be
	 * renamed to a parent directory and converted (if necessary) to
	 * a hidden inode (via shiftup).
	 *
	 * NOTE! We must hold cparent locked through the delete/create/rename
	 *	 operation to ensure that other threads block resolving to
	 *	 the same hardlink, otherwise the other threads may not see
	 *	 the hardlink.
	 */
	KKASSERT((cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0);
	cparent = hammer2_cluster_parent(cluster);

	/* non-permanent delete so the inode can be reconnected below */
	hammer2_cluster_delete(trans, cparent, cluster, 0);

	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
	if (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) {
		hammer2_cluster_t *ncluster;
		hammer2_key_t lhc;

		/*
		 * Create a fresh OBJTYPE_HARDLINK pointer inode at the
		 * old visible location, keyed the same as the original.
		 */
		ncluster = NULL;
		lhc = cluster->focus->bref.key;
		error = hammer2_cluster_create(trans, cparent, &ncluster,
					     lhc, 0,
					     HAMMER2_BREF_TYPE_INODE,
					     HAMMER2_INODE_BYTES,
					     0);
		hammer2_cluster_modify(trans, ncluster, 0);
		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;

		/* initialize the ipdata for the pointer inode */
		/* wipdata->comp_algo = ripdata->comp_algo; */
		wipdata->comp_algo = 0;
		wipdata->check_algo = 0;
		wipdata->version = HAMMER2_INODE_VERSION_ONE;
		wipdata->inum = ripdata->inum;
		wipdata->target_type = ripdata->type;
		wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
		wipdata->uflags = 0;
		wipdata->rmajor = 0;
		wipdata->rminor = 0;
		wipdata->ctime = 0;
		wipdata->mtime = 0;
		wipdata->atime = 0;
		wipdata->btime = 0;
		bzero(&wipdata->uid, sizeof(wipdata->uid));
		bzero(&wipdata->gid, sizeof(wipdata->gid));
		wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
		wipdata->cap_flags = 0;
		wipdata->mode = 0;
		wipdata->size = 0;
		wipdata->nlinks = 1;
		wipdata->iparent = 0;	/* XXX */
		wipdata->pfs_type = 0;
		wipdata->pfs_inum = 0;
		bzero(&wipdata->pfs_clid, sizeof(wipdata->pfs_clid));
		bzero(&wipdata->pfs_fsid, sizeof(wipdata->pfs_fsid));
		wipdata->data_quota = 0;
		wipdata->data_count = 0;
		wipdata->inode_quota = 0;
		wipdata->inode_count = 0;
		wipdata->attr_tid = 0;
		wipdata->dirent_tid = 0;
		bzero(&wipdata->u, sizeof(wipdata->u));
		bcopy(ripdata->filename, wipdata->filename, ripdata->name_len);
		wipdata->name_key = ncluster->focus->bref.key;
		wipdata->name_len = ripdata->name_len;
		/* XXX transaction ids */
		hammer2_cluster_modsync(ncluster);
		hammer2_cluster_unlock(ncluster);
	}
	/*
	 * NOTE(review): wipdata is only assigned inside the VISIBLE branch
	 * above; when the inode was already invisible this copies an
	 * indeterminate pointer.  ripdata is not dereferenced again before
	 * 'done', so it appears harmless, but worth confirming/cleaning up.
	 */
	ripdata = wipdata;

	/*
	 * cluster represents the hardlink target and is now flagged deleted.
	 * duplicate it to the parent directory and adjust nlinks.
	 *
	 * WARNING! The shiftup() call can cause ncluster to be moved into
	 *	    an indirect block, and our ncluster will wind up pointing
	 *	    to the older/original version.
	 */
	KKASSERT(cluster->focus->flags & HAMMER2_CHAIN_DELETED);
	hammer2_hardlink_shiftup(trans, cluster, cdip, cdcluster,
				 nlinks, &error);

	if (error == 0)
		hammer2_inode_repoint(ip, cdip, cluster);

done:
	/*
	 * Cleanup, cluster/ncluster already dealt with.
	 *
	 * Return the shifted cluster in *clusterp.
	 */
	if (cparent)
		hammer2_cluster_unlock(cparent);
	*clusterp = cluster;

	return (error);
}
1754 
1755 /*
1756  * If (*ochainp) is non-NULL it points to the forward OBJTYPE_HARDLINK
1757  * inode while (*chainp) points to the resolved (hidden hardlink
1758  * target) inode.  In this situation when nlinks is 1 we wish to
1759  * deconsolidate the hardlink, moving it back to the directory that now
1760  * represents the only remaining link.
1761  */
1762 int
1763 hammer2_hardlink_deconsolidate(hammer2_trans_t *trans,
1764 			       hammer2_inode_t *dip,
1765 			       hammer2_chain_t **chainp,
1766 			       hammer2_chain_t **ochainp)
1767 {
1768 	if (*ochainp == NULL)
1769 		return (0);
1770 	/* XXX */
1771 	return (0);
1772 }
1773 
/*
 * The caller presents a locked cluster with an obj_type of
 * HAMMER2_OBJTYPE_HARDLINK.  This routine will locate and replace the
 * cluster with the target hardlink, also locked.
 *
 * If cparentp is not NULL a locked cluster representing the hardlink's
 * parent is also returned.
 *
 * If we are unable to locate the hardlink target EIO is returned and
 * (*cparentp) is set to NULL.  The passed-in cluster still needs to be
 * unlocked by the caller but will be degenerate... not have any chains.
 */
int
hammer2_hardlink_find(hammer2_inode_t *dip,
		      hammer2_cluster_t **cparentp, hammer2_cluster_t *cluster)
{
	const hammer2_inode_data_t *ipdata;
	hammer2_cluster_t *cparent;	/* locked directory being searched */
	hammer2_cluster_t *rcluster;	/* lookup result (hardlink target) */
	hammer2_inode_t *ip;		/* directory currently being probed */
	hammer2_inode_t *pip;		/* next directory up the chain */
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	int ddflag;

	pip = dip;
	hammer2_inode_ref(pip);		/* for loop */

	/*
	 * Locate the hardlink.  pip is referenced and not locked.
	 * The hardlink target is keyed by the pointer inode's inum.
	 */
	ipdata = &hammer2_cluster_rdata(cluster)->ipdata;
	lhc = ipdata->inum;

	/*
	 * We don't need the cluster's chains, but we need to retain the
	 * cluster structure itself so we can load the hardlink search
	 * result into it.  The extra ref compensates for the unlock.
	 */
	KKASSERT(cluster->refs == 1);
	atomic_add_int(&cluster->refs, 1);
	hammer2_cluster_unlock(cluster);	/* hack */
	cluster->nchains = 0;			/* hack */

	rcluster = NULL;
	cparent = NULL;

	/*
	 * Walk upward from dip toward the PFS root looking for an entry
	 * keyed (lhc, lhc) -- the invisible hardlink target.
	 */
	while ((ip = pip) != NULL) {
		cparent = hammer2_inode_lock_ex(ip);
		hammer2_inode_drop(ip);			/* loop */
		KKASSERT(hammer2_cluster_type(cparent) ==
			 HAMMER2_BREF_TYPE_INODE);
		rcluster = hammer2_cluster_lookup(cparent, &key_dummy,
					     lhc, lhc, 0, &ddflag);
		if (rcluster)
			break;
		hammer2_cluster_lookup_done(cparent);	/* discard parent */
		cparent = NULL;				/* safety */
		pip = ip->pip;		/* safe, ip held locked */
		if (pip)
			hammer2_inode_ref(pip);		/* loop */
		hammer2_inode_unlock_ex(ip, NULL);
	}

	/*
	 * On success load rcluster's chains into the caller's cluster
	 * structure and drop the temporary rcluster.  Return cparent
	 * locked if requested, otherwise release it along with ip.
	 */
	if (rcluster) {
		hammer2_cluster_replace(cluster, rcluster);
		hammer2_cluster_drop(rcluster);
		if (cparentp) {
			*cparentp = cparent;
			hammer2_inode_unlock_ex(ip, NULL);
		} else {
			hammer2_inode_unlock_ex(ip, cparent);
		}
		return (0);
	} else {
		/* not found; caller's cluster is left degenerate */
		if (cparentp)
			*cparentp = NULL;
		if (ip)
			hammer2_inode_unlock_ex(ip, cparent);
		return (EIO);
	}
}
1862 
1863 /*
1864  * Find the directory common to both fdip and tdip.
1865  *
1866  * Returns a held but not locked inode.  Caller typically locks the inode,
1867  * and when through unlocks AND drops it.
1868  */
1869 hammer2_inode_t *
1870 hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
1871 {
1872 	hammer2_inode_t *scan1;
1873 	hammer2_inode_t *scan2;
1874 
1875 	/*
1876 	 * We used to have a depth field but it complicated matters too
1877 	 * much for directory renames.  So now its ugly.  Check for
1878 	 * simple cases before giving up and doing it the expensive way.
1879 	 *
1880 	 * XXX need a bottom-up topology stability lock
1881 	 */
1882 	if (fdip == tdip || fdip == tdip->pip) {
1883 		hammer2_inode_ref(fdip);
1884 		return(fdip);
1885 	}
1886 	if (fdip->pip == tdip) {
1887 		hammer2_inode_ref(tdip);
1888 		return(tdip);
1889 	}
1890 
1891 	/*
1892 	 * XXX not MPSAFE
1893 	 */
1894 	for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
1895 		scan2 = tdip;
1896 		while (scan2->pmp == tdip->pmp) {
1897 			if (scan1 == scan2) {
1898 				hammer2_inode_ref(scan1);
1899 				return(scan1);
1900 			}
1901 			scan2 = scan2->pip;
1902 			if (scan2 == NULL)
1903 				break;
1904 		}
1905 	}
1906 	panic("hammer2_inode_common_parent: no common parent %p %p\n",
1907 	      fdip, tdip);
1908 	/* NOT REACHED */
1909 	return(NULL);
1910 }
1911 
/*
 * Synchronize the inode's frontend state with the chain state prior
 * to any explicit flush of the inode or any strategy write call.
 * Propagates a pending mtime update and handles both shrink (delete
 * chains beyond the new EOF) and grow (clear DIRECTDATA if the file
 * no longer fits in the embedded area) resize cases.
 *
 * Called with a locked inode.
 */
void
hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
		    hammer2_cluster_t *cparent)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_cluster_t *dparent;	/* truncation scan parent */
	hammer2_cluster_t *cluster;	/* truncation scan iterator */
	hammer2_key_t lbase;		/* first block past new EOF */
	hammer2_key_t key_next;
	int dosync = 0;			/* set when cparent was modified */
	int ddflag;

	ripdata = &hammer2_cluster_rdata(cparent)->ipdata;    /* target file */

	/*
	 * Flush a pending frontend mtime update into the inode data.
	 */
	if (ip->flags & HAMMER2_INODE_MTIME) {
		wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
		atomic_clear_int(&ip->flags, HAMMER2_INODE_MTIME);
		wipdata->mtime = ip->mtime;
		dosync = 1;
		ripdata = wipdata;
	}
	if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size < ripdata->size) {
		/*
		 * File shrank: record the new size then prune data
		 * chains past the new EOF.
		 */
		wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
		wipdata->size = ip->size;
		dosync = 1;
		ripdata = wipdata;
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);

		/*
		 * We must delete any chains beyond the EOF.  The chain
		 * straddling the EOF will be pending in the bioq.
		 */
		lbase = (ripdata->size + HAMMER2_PBUFMASK64) &
			~HAMMER2_PBUFMASK64;
		dparent = hammer2_cluster_lookup_init(&ip->cluster, 0);
		cluster = hammer2_cluster_lookup(dparent, &key_next,
					         lbase, (hammer2_key_t)-1,
						 HAMMER2_LOOKUP_NODATA,
						 &ddflag);
		while (cluster) {
			/*
			 * Degenerate embedded case, nothing to loop on
			 */
			switch (hammer2_cluster_type(cluster)) {
			case HAMMER2_BREF_TYPE_INODE:
				hammer2_cluster_unlock(cluster);
				cluster = NULL;
				break;
			case HAMMER2_BREF_TYPE_DATA:
				hammer2_cluster_delete(trans, dparent, cluster,
						   HAMMER2_DELETE_PERMANENT);
				/* fall through */
			default:
				cluster = hammer2_cluster_next(dparent, cluster,
						   &key_next,
						   key_next, (hammer2_key_t)-1,
						   HAMMER2_LOOKUP_NODATA);
				break;
			}
		}
		hammer2_cluster_lookup_done(dparent);
	} else
	if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size > ripdata->size) {
		/*
		 * File grew: record the new size.
		 */
		wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
		wipdata->size = ip->size;
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);

		/*
		 * When resizing larger we may not have any direct-data
		 * available.
		 */
		if ((wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
		    ip->size > HAMMER2_EMBEDDED_BYTES) {
			wipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
			bzero(&wipdata->u.blockset,
			      sizeof(wipdata->u.blockset));
		}
		dosync = 1;
		ripdata = wipdata;
	}
	if (dosync)
		hammer2_cluster_modsync(cparent);
}
2002