xref: /dflybsd-src/sys/vfs/hammer2/hammer2_inode.c (revision d63676ccce44debffd22823892f437449f6acdad)
/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define INODE_DEBUG	0

RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
	     hammer2_tid_t, meta.inum);

int
hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
{
	if (ip1->meta.inum < ip2->meta.inum)
		return(-1);
	if (ip1->meta.inum > ip2->meta.inum)
		return(1);
	return(0);
}

/*
 * HAMMER2 inode locks
 *
 * HAMMER2 offers shared and exclusive locks on inodes.  Pass a mask of
 * flags for options:
 *
 *	- pass HAMMER2_RESOLVE_SHARED if a shared lock is desired.  The
 *	  inode locking function will automatically set the RDONLY flag.
 *
 *	- pass HAMMER2_RESOLVE_ALWAYS if you need the inode's meta-data.
 *	  Most front-end inode locks do.
 *
 *	- pass HAMMER2_RESOLVE_NEVER if you do not want to require that
 *	  the inode data be resolved.  This is used by the syncthr because
 *	  it can run on an unresolved/out-of-sync cluster, and also by the
 *	  vnode reclamation code to avoid unnecessary I/O (particularly when
 *	  disposing of hundreds of thousands of cached vnodes).
 *
 * The inode locking function locks the inode itself, resolves any stale
 * chains in the inode's cluster, and allocates a fresh copy of the
 * cluster with 1 ref and all the underlying chains locked.
 *
 * ip->cluster will be stable while the inode is locked.
 *
 * NOTE: We don't combine the inode/chain lock because putting away an
 *       inode would otherwise confuse multiple lock holders of the inode.
 *
 * NOTE: In-memory inodes always point to hardlink targets (the actual file),
 *	 and never point to a hardlink pointer.
 *
 * NOTE: If caller passes HAMMER2_RESOLVE_RDONLY the exclusive locking code
 *	 will feel free to reduce the chain set in the cluster as an
 *	 optimization.  It will still be validated against the quorum if
 *	 appropriate, but the optimization might be able to reduce data
 *	 accesses to one node.  This flag is automatically set if the inode
 *	 is locked with HAMMER2_RESOLVE_SHARED.
 */
void
hammer2_inode_lock(hammer2_inode_t *ip, int how)
{
	hammer2_inode_ref(ip);

	/*
	 * Inode structure mutex
	 */
	if (how & HAMMER2_RESOLVE_SHARED) {
		/*how |= HAMMER2_RESOLVE_RDONLY; not used */
		hammer2_mtx_sh(&ip->lock);
	} else {
		hammer2_mtx_ex(&ip->lock);
	}
}

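/*
 * Illustrative usage sketch (not part of the build): a front-end path
 * that only needs to read meta-data takes the shared flavor and brackets
 * the access with hammer2_inode_lock()/hammer2_inode_unlock().  The body
 * shown here is a placeholder.
 *
 *	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
 *	... read ip->meta fields ...
 *	hammer2_inode_unlock(ip);
 */
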
/*
 * Create a locked copy of ip->cluster.  Note that the copy will have a
 * ref on the cluster AND its chains and we don't want a second ref to
 * either when we lock it.
 *
 * Exclusive inode locks set the template focus chain in (ip)
 * as a hint.  Cluster locks can ALWAYS replace the focus in the
 * working copy if the hint does not work out, so beware.
 */
hammer2_cluster_t *
hammer2_inode_cluster(hammer2_inode_t *ip, int how)
{
	hammer2_cluster_t *cluster;

	cluster = hammer2_cluster_copy(&ip->cluster);
	hammer2_cluster_lock(cluster, how);
	hammer2_cluster_resolve(cluster);

	/*
	 * cluster->focus will be set if resolving RESOLVE_ALWAYS, but
	 * only update the cached focus in the inode structure when taking
	 * out an exclusive lock.
	 */
	if ((how & HAMMER2_RESOLVE_SHARED) == 0)
		ip->cluster.focus = cluster->focus;

	return cluster;
}

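/*
 * Illustrative usage sketch (not part of the build): the fsync path
 * later in this file uses this pattern when no cluster is supplied.
 * The returned copy carries its own ref and lock and must be both
 * unlocked AND dropped.
 *
 *	cparent = hammer2_inode_cluster(ip, HAMMER2_RESOLVE_ALWAYS);
 *	... operate on cparent ...
 *	hammer2_cluster_unlock(cparent);
 *	hammer2_cluster_drop(cparent);
 */
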
/*
 * Select a chain out of an inode's cluster and lock it.
 *
 * The inode does not have to be locked.
 */
hammer2_chain_t *
hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how)
{
	hammer2_chain_t *chain;

	hammer2_spin_sh(&ip->cluster_spin);
	if (clindex >= ip->cluster.nchains)
		chain = NULL;
	else
		chain = ip->cluster.array[clindex].chain;
	if (chain) {
		hammer2_chain_ref(chain);
		hammer2_spin_unsh(&ip->cluster_spin);
		hammer2_chain_lock(chain, how);
	} else {
		hammer2_spin_unsh(&ip->cluster_spin);
	}
	return chain;
}

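/*
 * Illustrative usage sketch (not part of the build): the backend XOP
 * helpers at the end of this file select a per-node chain this way.
 * The returned chain, if not NULL, is referenced and locked and must
 * be unlocked and dropped when the caller is done with it.
 *
 *	chain = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
 *	if (chain) {
 *		... use chain ...
 *		hammer2_chain_unlock(chain);
 *		hammer2_chain_drop(chain);
 *	}
 */
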
void
hammer2_inode_unlock(hammer2_inode_t *ip)
{
	hammer2_mtx_unlock(&ip->lock);
	hammer2_inode_drop(ip);
}

/*
 * Temporarily release a lock held shared or exclusive.  The caller must
 * hold the lock shared or exclusive on call; the lock is released on
 * return and the prior state is returned so the companion function
 * below can restore it.
 */
hammer2_mtx_state_t
hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
{
	return hammer2_mtx_temp_release(&ip->lock);
}

/*
 * Restore a lock that was temporarily released.
 */
void
hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate)
{
	hammer2_mtx_temp_restore(&ip->lock, ostate);
}

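/*
 * Illustrative usage sketch (not part of the build): hammer2_igetv()
 * below uses the temp-release/restore pair to drop the inode lock
 * across a blocking vget() and then re-acquire it in its prior state.
 *
 *	ostate = hammer2_inode_lock_temp_release(ip);
 *	error = vget(vp, LK_EXCLUSIVE);
 *	hammer2_inode_lock_temp_restore(ip, ostate);
 */
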
/*
 * Upgrade a shared inode lock to exclusive and return.  If the inode lock
 * is already held exclusively this is a NOP.
 *
 * The caller MUST hold the inode lock either shared or exclusive on call
 * and will own the lock exclusively on return.
 *
 * Returns non-zero if the lock was already exclusive prior to the upgrade.
 */
int
hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
{
	int wasexclusive;

	if (mtx_islocked_ex(&ip->lock)) {
		wasexclusive = 1;
	} else {
		hammer2_mtx_unlock(&ip->lock);
		hammer2_mtx_ex(&ip->lock);
		wasexclusive = 0;
	}
	return wasexclusive;
}

/*
 * Downgrade an inode lock from exclusive to shared only if the inode
 * lock was previously shared.  If the inode lock was previously exclusive,
 * this is a NOP.
 */
void
hammer2_inode_lock_downgrade(hammer2_inode_t *ip, int wasexclusive)
{
	if (wasexclusive == 0)
		mtx_downgrade(&ip->lock);
}

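/*
 * Illustrative usage sketch (not part of the build): the upgrade and
 * downgrade helpers pair up around a section needing exclusive access,
 * restoring the original shared state afterwards.  This is the pattern
 * hammer2_igetv() uses when assigning ip->vp.
 *
 *	wasexclusive = hammer2_inode_lock_upgrade(ip);
 *	... modify fields requiring the exclusive lock ...
 *	hammer2_inode_lock_downgrade(ip, wasexclusive);
 */
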
/*
 * Lookup an inode by inode number
 */
hammer2_inode_t *
hammer2_inode_lookup(hammer2_pfs_t *pmp, hammer2_tid_t inum)
{
	hammer2_inode_t *ip;

	KKASSERT(pmp);
	if (pmp->spmp_hmp) {
		ip = NULL;
	} else {
		hammer2_spin_ex(&pmp->inum_spin);
		ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
		if (ip)
			hammer2_inode_ref(ip);
		hammer2_spin_unex(&pmp->inum_spin);
	}
	return(ip);
}

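/*
 * Illustrative usage sketch (not part of the build): the lookup returns
 * a referenced but unlocked inode (or NULL), so the caller is
 * responsible for the matching drop.
 *
 *	ip = hammer2_inode_lookup(pmp, inum);
 *	if (ip) {
 *		... use ip ...
 *		hammer2_inode_drop(ip);
 *	}
 */
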
/*
 * Adding a ref to an inode is only legal if the inode already has at least
 * one ref.
 *
 * (can be called with spinlock held)
 */
void
hammer2_inode_ref(hammer2_inode_t *ip)
{
	atomic_add_int(&ip->refs, 1);
}

/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
	hammer2_pfs_t *pmp;
	hammer2_inode_t *pip;
	u_int refs;

	while (ip) {
		refs = ip->refs;
		cpu_ccfence();
		if (refs == 1) {
			/*
			 * Transition to zero, must interlock with
			 * the inode inumber lookup tree (if applicable).
			 * It should not be possible for anyone to race
			 * the transition to 0.
			 */
			pmp = ip->pmp;
			KKASSERT(pmp);
			hammer2_spin_ex(&pmp->inum_spin);

			if (atomic_cmpset_int(&ip->refs, 1, 0)) {
				KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
				if (ip->flags & HAMMER2_INODE_ONRBTREE) {
					atomic_clear_int(&ip->flags,
						     HAMMER2_INODE_ONRBTREE);
					RB_REMOVE(hammer2_inode_tree,
						  &pmp->inum_tree, ip);
				}
				hammer2_spin_unex(&pmp->inum_spin);

				pip = ip->pip;
				ip->pip = NULL;
				ip->pmp = NULL;

				/*
				 * Cleaning out ip->cluster isn't entirely
				 * trivial.
				 */
				hammer2_inode_repoint(ip, NULL, NULL);

				/*
				 * We have to drop pip (if non-NULL) to
				 * dispose of our implied reference from
				 * ip->pip.  We can simply loop on it.
				 */
				kfree(ip, pmp->minode);
				atomic_add_long(&pmp->inmem_inodes, -1);
				ip = pip;
				/* continue with pip (can be NULL) */
			} else {
				hammer2_spin_unex(&ip->pmp->inum_spin);
			}
		} else {
			/*
			 * Non-zero transition
			 */
			if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
				break;
		}
	}
}

/*
 * Get the vnode associated with the given inode, allocating the vnode if
 * necessary.  The vnode will be returned exclusively locked.
 *
 * The caller must lock the inode (shared or exclusive).
 *
 * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
 * races.
 */
struct vnode *
hammer2_igetv(hammer2_inode_t *ip, int *errorp)
{
	hammer2_pfs_t *pmp;
	struct vnode *vp;

	pmp = ip->pmp;
	KKASSERT(pmp != NULL);
	*errorp = 0;

	for (;;) {
		/*
		 * Attempt to reuse an existing vnode assignment.  It is
		 * possible to race a reclaim so the vget() may fail.  The
		 * inode must be unlocked during the vget() to avoid a
		 * deadlock against a reclaim.
		 */
		int wasexclusive;

		vp = ip->vp;
		if (vp) {
			/*
			 * Inode must be unlocked during the vget() to avoid
			 * possible deadlocks, but leave the ip ref intact.
			 *
			 * vnode is held to prevent destruction during the
			 * vget().  The vget() can still fail if we lost
			 * a reclaim race on the vnode.
			 */
			hammer2_mtx_state_t ostate;

			vhold(vp);
			ostate = hammer2_inode_lock_temp_release(ip);
			if (vget(vp, LK_EXCLUSIVE)) {
				vdrop(vp);
				hammer2_inode_lock_temp_restore(ip, ostate);
				continue;
			}
			hammer2_inode_lock_temp_restore(ip, ostate);
			vdrop(vp);
			/* vp still locked and ref from vget */
			if (ip->vp != vp) {
				kprintf("hammer2: igetv race %p/%p\n",
					ip->vp, vp);
				vput(vp);
				continue;
			}
			*errorp = 0;
			break;
		}

		/*
		 * No vnode exists, allocate a new vnode.  Beware of
		 * allocation races.  This function will return an
		 * exclusively locked and referenced vnode.
		 */
		*errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
		if (*errorp) {
			kprintf("hammer2: igetv getnewvnode failed %d\n",
				*errorp);
			vp = NULL;
			break;
		}

		/*
		 * Lock the inode and check for an allocation race.
		 */
		wasexclusive = hammer2_inode_lock_upgrade(ip);
		if (ip->vp != NULL) {
			vp->v_type = VBAD;
			vx_put(vp);
			hammer2_inode_lock_downgrade(ip, wasexclusive);
			continue;
		}

		switch (ip->meta.type) {
		case HAMMER2_OBJTYPE_DIRECTORY:
			vp->v_type = VDIR;
			break;
		case HAMMER2_OBJTYPE_REGFILE:
			vp->v_type = VREG;
			vinitvmio(vp, ip->meta.size,
				  HAMMER2_LBUFSIZE,
				  (int)ip->meta.size & HAMMER2_LBUFMASK);
			break;
		case HAMMER2_OBJTYPE_SOFTLINK:
			/*
			 * XXX for now we are using the generic file_read
			 * and file_write code so we need a buffer cache
			 * association.
			 */
			vp->v_type = VLNK;
			vinitvmio(vp, ip->meta.size,
				  HAMMER2_LBUFSIZE,
				  (int)ip->meta.size & HAMMER2_LBUFMASK);
			break;
		case HAMMER2_OBJTYPE_CDEV:
			vp->v_type = VCHR;
			/* fall through */
		case HAMMER2_OBJTYPE_BDEV:
			vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
			if (ip->meta.type != HAMMER2_OBJTYPE_CDEV)
				vp->v_type = VBLK;
			addaliasu(vp,
				  ip->meta.rmajor,
				  ip->meta.rminor);
			break;
		case HAMMER2_OBJTYPE_FIFO:
			vp->v_type = VFIFO;
			vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
			break;
		default:
			panic("hammer2: unhandled objtype %d",
			      ip->meta.type);
			break;
		}

		if (ip == pmp->iroot)
			vsetflags(vp, VROOT);

		vp->v_data = ip;
		ip->vp = vp;
		hammer2_inode_ref(ip);		/* vp association */
		hammer2_inode_lock_downgrade(ip, wasexclusive);
		break;
	}

	/*
	 * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
	 * (vp can be NULL here if getnewvnode() failed, so guard the debug
	 * dereference).
	 */
	if ((hammer2_debug & 0x0002) && vp) {
		kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
			vp, vp->v_refcnt, vp->v_auxrefs);
	}
	return (vp);
}

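/*
 * Illustrative usage sketch (not part of the build): a VOP typically
 * resolves the vnode while holding the inode lock; the returned vnode,
 * if not NULL, comes back exclusively locked.
 *
 *	hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
 *	vp = hammer2_igetv(ip, &error);
 *	hammer2_inode_unlock(ip);
 */
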
/*
 * Returns the inode associated with the passed-in cluster, creating the
 * inode if necessary and synchronizing it to the passed-in cluster otherwise.
 *
 * The passed-in cluster must be locked and will remain locked on return.
 * The returned inode will be locked; the caller may dispose of both the
 * lock and the reference via hammer2_inode_unlock().  However, if the
 * caller needs to resolve a hardlink it must ref/unlock/relock/drop
 * the inode.
 *
 * The hammer2_inode structure regulates the interface between the high level
 * kernel VNOPS API and the filesystem backend (the chains).
 *
 * On return the inode is locked with the supplied cluster.
 */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
		  hammer2_cluster_t *cluster)
{
	hammer2_inode_t *nip;
	const hammer2_inode_data_t *iptmp;
	const hammer2_inode_data_t *nipdata;

	KKASSERT(cluster == NULL ||
		 hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
	KKASSERT(pmp);

	/*
	 * Interlocked lookup/ref of the inode.  This code is only needed
	 * when looking up inodes with nlinks != 0 (TODO: optimize out
	 * otherwise and test for duplicates).
	 *
	 * Cluster can be NULL during the initial pfs allocation.
	 */
again:
	while (cluster) {
		iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
		nip = hammer2_inode_lookup(pmp, iptmp->meta.inum);
		if (nip == NULL)
			break;

		hammer2_mtx_ex(&nip->lock);

		/*
		 * Handle SMP race (not applicable to the super-root spmp
		 * which can't index inodes due to duplicative inode numbers).
		 */
		if (pmp->spmp_hmp == NULL &&
		    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			continue;
		}
		hammer2_inode_repoint(nip, NULL, cluster);

		return nip;
	}

	/*
	 * We couldn't find the inode number, create a new inode.
	 */
	nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
	spin_init(&nip->cluster_spin, "h2clspin");
	atomic_add_long(&pmp->inmem_inodes, 1);
	hammer2_pfs_memory_inc(pmp);
	hammer2_pfs_memory_wakeup(pmp);
	if (pmp->spmp_hmp)
		nip->flags = HAMMER2_INODE_SROOT;

	/*
	 * Initialize nip's cluster.  A cluster is provided for normal
	 * inodes but typically not for the super-root or PFS inodes.
	 */
	nip->cluster.refs = 1;
	nip->cluster.pmp = pmp;
	nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
	if (cluster) {
		nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
		nip->meta = nipdata->meta;
		hammer2_cluster_bref(cluster, &nip->bref);
		atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);
		hammer2_inode_repoint(nip, NULL, cluster);
	} else {
		nip->meta.inum = 1;		/* PFS inum is always 1 XXX */
		/* mtime will be updated when a cluster is available */
		atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);/*XXX*/
	}

	nip->pip = dip;				/* can be NULL */
	if (dip)
		hammer2_inode_ref(dip);	/* ref dip for nip->pip */

	nip->pmp = pmp;

	/*
	 * ref and lock on nip gives it state compatible to after a
	 * hammer2_inode_lock() call.
	 */
	nip->refs = 1;
	hammer2_mtx_init(&nip->lock, "h2inode");
	hammer2_mtx_ex(&nip->lock);
	/* combination of thread lock and chain lock == inode lock */

	/*
	 * Attempt to add the inode.  If it fails we raced another inode
	 * get.  Undo all the work and try again.
	 */
	if (pmp->spmp_hmp == NULL) {
		hammer2_spin_ex(&pmp->inum_spin);
		if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
			hammer2_spin_unex(&pmp->inum_spin);
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			goto again;
		}
		atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
		hammer2_spin_unex(&pmp->inum_spin);
	}

	return (nip);
}

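/*
 * Illustrative usage sketch (not part of the build): the hidden-directory
 * install code below wraps a collected XOP cluster this way, adding its
 * own long-term ref before releasing the lock returned by *_get().
 *
 *	pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, &xop->head.cluster);
 *	hammer2_inode_ref(pmp->ihidden);
 *	hammer2_inode_unlock(pmp->ihidden);
 */
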
/*
 * Create a new inode in the specified directory using the vattr to
 * figure out the type of inode.
 *
 * If no error occurs the new inode is returned with its cluster locked,
 * otherwise NULL is returned and *errorp is set to the error.
 *
 * If vap and/or cred are NULL the related fields are not set and the
 * inode type defaults to a directory.  This is used when creating PFSs
 * under the super-root, so the inode number is set to 1 in this case.
 *
 * dip is not locked on entry.
 *
 * NOTE: When used to create a snapshot, the inode is temporarily associated
 *	 with the super-root spmp. XXX should pass new pmp for snapshot.
 */
hammer2_inode_t *
hammer2_inode_create(hammer2_inode_t *dip,
		     struct vattr *vap, struct ucred *cred,
		     const uint8_t *name, size_t name_len, hammer2_key_t lhc,
		     hammer2_key_t inum, uint8_t type, uint8_t target_type,
		     int flags, int *errorp)
{
	hammer2_xop_create_t *xop;
	hammer2_inode_t *nip;
	int error;
	uid_t xuid;
	uuid_t dip_uid;
	uuid_t dip_gid;
	uint32_t dip_mode;
	uint8_t dip_comp_algo;
	uint8_t dip_check_algo;

	if (name)
		lhc = hammer2_dirhash(name, name_len);
	*errorp = 0;
	nip = NULL;

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  At the same time check for key collisions
	 * and iterate until we don't get one.
	 *
	 * NOTE: hidden inodes do not have iterators.
	 *
	 * Lock the directory exclusively for now to guarantee that
	 * we can find an unused lhc for the name.  Due to collisions,
	 * two different creates can end up with the same lhc so we
	 * cannot depend on the OS to prevent the collision.
	 */
	hammer2_inode_lock(dip, 0);

	dip_uid = dip->meta.uid;
	dip_gid = dip->meta.gid;
	dip_mode = dip->meta.mode;
	dip_comp_algo = dip->meta.comp_algo;
	dip_check_algo = dip->meta.check_algo;

	/*
	 * If name specified, locate an unused key in the collision space.
	 * Otherwise use the passed-in lhc directly.
	 */
	if (name) {
		hammer2_xop_scanlhc_t *sxop;
		hammer2_key_t lhcbase;

		lhcbase = lhc;
		sxop = &hammer2_xop_alloc(dip)->xop_scanlhc;
		sxop->lhc = lhc;
		hammer2_xop_start(&sxop->head, hammer2_xop_scanlhc);
		while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
			if (lhc != sxop->head.cluster.focus->bref.key)
				break;
			++lhc;
		}
		hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);

		if (error) {
			if (error != ENOENT)
				goto done2;
			++lhc;
			error = 0;
		}
		if ((lhcbase ^ lhc) & ~HAMMER2_DIRHASH_LOMASK) {
			error = ENOSPC;
			goto done2;
		}
	}

	/*
	 * Create the inode with the lhc as the key.
	 */
	xop = &hammer2_xop_alloc(dip)->xop_create;
	xop->lhc = lhc;
	xop->flags = flags;
	bzero(&xop->meta, sizeof(xop->meta));

	if (vap) {
		xop->meta.type = hammer2_get_obj_type(vap->va_type);

		switch (xop->meta.type) {
		case HAMMER2_OBJTYPE_CDEV:
		case HAMMER2_OBJTYPE_BDEV:
			xop->meta.rmajor = vap->va_rmajor;
			xop->meta.rminor = vap->va_rminor;
			break;
		default:
			break;
		}
		type = xop->meta.type;
	} else {
		xop->meta.type = type;
		xop->meta.target_type = target_type;
	}
	xop->meta.inum = inum;

	/* Inherit parent's inode compression mode. */
	xop->meta.comp_algo = dip_comp_algo;
	xop->meta.check_algo = dip_check_algo;
	xop->meta.version = HAMMER2_INODE_VERSION_ONE;
	hammer2_update_time(&xop->meta.ctime);
	xop->meta.mtime = xop->meta.ctime;
	if (vap)
		xop->meta.mode = vap->va_mode;
	xop->meta.nlinks = 1;
	if (vap) {
		if (dip && dip->pmp) {
			xuid = hammer2_to_unix_xid(&dip_uid);
			xuid = vop_helper_create_uid(dip->pmp->mp,
						     dip_mode,
						     xuid,
						     cred,
						     &vap->va_mode);
		} else {
			/* super-root has no dip and/or pmp */
			xuid = 0;
		}
		if (vap->va_vaflags & VA_UID_UUID_VALID)
			xop->meta.uid = vap->va_uid_uuid;
		else if (vap->va_uid != (uid_t)VNOVAL)
			hammer2_guid_to_uuid(&xop->meta.uid, vap->va_uid);
		else
			hammer2_guid_to_uuid(&xop->meta.uid, xuid);

		if (vap->va_vaflags & VA_GID_UUID_VALID)
			xop->meta.gid = vap->va_gid_uuid;
		else if (vap->va_gid != (gid_t)VNOVAL)
			hammer2_guid_to_uuid(&xop->meta.gid, vap->va_gid);
		else if (dip)
			xop->meta.gid = dip_gid;
	}

	/*
	 * Regular files and softlinks allow a small amount of data to be
	 * directly embedded in the inode.  This flag will be cleared if
	 * the size is extended past the embedded limit.
	 */
	if (xop->meta.type == HAMMER2_OBJTYPE_REGFILE ||
	    xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK ||
	    xop->meta.type == HAMMER2_OBJTYPE_HARDLINK) {
		xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
	}
	if (name)
		hammer2_xop_setname(&xop->head, name, name_len);
	xop->meta.name_len = name_len;
	xop->meta.name_key = lhc;
	KKASSERT(name_len < HAMMER2_INODE_MAXNAME);

	hammer2_xop_start(&xop->head, hammer2_inode_xop_create);

	error = hammer2_xop_collect(&xop->head, 0);
#if INODE_DEBUG
	kprintf("CREATE INODE %*.*s\n",
		(int)name_len, (int)name_len, name);
#endif

	if (error) {
		*errorp = error;
		goto done;
	}

	/*
	 * Set up the new inode if not a hardlink pointer.
	 *
	 * NOTE: *_get() integrates chain's lock into the inode lock.
	 *
	 * NOTE: Only one new inode can currently be created per
	 *	 transaction.  If the need arises we can adjust
	 *	 hammer2_trans_init() to allow more.
	 *
	 * NOTE: nipdata will have chain's blockset data.
	 */
	if (type != HAMMER2_OBJTYPE_HARDLINK) {
		nip = hammer2_inode_get(dip->pmp, dip, &xop->head.cluster);
		nip->comp_heuristic = 0;
	} else {
		nip = NULL;
	}

done:
	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
done2:
	hammer2_inode_unlock(dip);

	return (nip);
}

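/*
 * Illustrative usage sketch (not part of the build): the hidden-directory
 * code below creates a directory inode without a vattr/cred, driving the
 * key, inode number and type explicitly.
 *
 *	nip = hammer2_inode_create(pmp->iroot, NULL, NULL, NULL, 0,
 *				   HAMMER2_INODE_HIDDENDIR,
 *				   HAMMER2_INODE_HIDDENDIR,
 *				   HAMMER2_OBJTYPE_DIRECTORY, 0, 0, &error);
 */
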
/*
 * Connect the disconnected inode (ip) to the directory (dip) with the
 * specified (name, name_len).  If name is NULL, (lhc) will be used as
 * the directory key and the inode's embedded name will not be modified
 * for future recovery purposes.
 *
 * dip and ip must both be locked exclusively (dip in particular to avoid
 * lhc collisions).
 */
int
hammer2_inode_connect_simple(hammer2_inode_t *dip, hammer2_inode_t *ip,
			     const char *name, size_t name_len,
			     hammer2_key_t lhc)
{
	hammer2_xop_scanlhc_t *sxop;
	hammer2_xop_connect_t *xop;
	hammer2_inode_t *opip;
	hammer2_key_t lhcbase;
	int error;

	/*
	 * Calculate the lhc and resolve the collision space.
	 */
	if (name) {
		lhc = lhcbase = hammer2_dirhash(name, name_len);
		sxop = &hammer2_xop_alloc(dip)->xop_scanlhc;
		sxop->lhc = lhc;
		hammer2_xop_start(&sxop->head, hammer2_xop_scanlhc);
		while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
			if (lhc != sxop->head.cluster.focus->bref.key)
				break;
			++lhc;
		}
		hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);

		if (error) {
			if (error != ENOENT)
				goto done;
			++lhc;
			error = 0;
		}
		if ((lhcbase ^ lhc) & ~HAMMER2_DIRHASH_LOMASK) {
			error = ENOSPC;
			goto done;
		}
	} else {
		error = 0;
	}

	/*
	 * Formally reconnect the in-memory structure.  ip must
	 * be locked exclusively to safely change ip->pip.
	 */
	if (ip->pip != dip) {
		hammer2_inode_ref(dip);
		opip = ip->pip;
		ip->pip = dip;
		if (opip)
			hammer2_inode_drop(opip);
	}

	/*
	 * Connect her up
	 */
	xop = &hammer2_xop_alloc(dip)->xop_connect;
	if (name)
		hammer2_xop_setname(&xop->head, name, name_len);
	hammer2_xop_setip2(&xop->head, ip);
	xop->lhc = lhc;
	hammer2_xop_start(&xop->head, hammer2_inode_xop_connect);
	error = hammer2_xop_collect(&xop->head, 0);
	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);

	/*
	 * On success make the same adjustments to ip->meta or the
	 * next flush may blow up the chain.
	 */
	if (error == 0) {
		hammer2_inode_modify(ip);
		ip->meta.name_key = lhc;
		if (name)
			ip->meta.name_len = name_len;
	}
done:
	return error;
}

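/*
 * Illustrative usage sketch (not part of the build): the unlink finisher
 * below reconnects a still-open unlinked inode into the hidden directory,
 * keyed by its inode number instead of a name.
 *
 *	hammer2_inode_lock(pmp->ihidden, 0);
 *	error = hammer2_inode_connect_simple(pmp->ihidden, ip,
 *					     NULL, 0, ip->meta.inum);
 *	hammer2_inode_unlock(pmp->ihidden);
 */
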
/*
 * Repoint ip->cluster's chains to cluster's chains and fixup the default
 * focus.  Only valid elements are repointed.  Invalid elements have to be
 * adjusted by the appropriate slave sync threads.
 *
 * Caller must hold the inode exclusively locked.  The cluster, if not
 * NULL, must also be locked.
 *
 * Cluster may be NULL to clean out any chains in ip->cluster.
 */
void
hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
		      hammer2_cluster_t *cluster)
{
	hammer2_chain_t *dropch[HAMMER2_MAXCLUSTER];
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	hammer2_inode_t *opip;
	int i;

	bzero(dropch, sizeof(dropch));

	/*
	 * Replace chains in ip->cluster with chains from cluster and
	 * adjust the focus if necessary.
	 *
	 * NOTE: nchain and/or ochain can be NULL due to gaps
	 *	 in the cluster arrays.
	 */
	hammer2_spin_ex(&ip->cluster_spin);
	for (i = 0; cluster && i < cluster->nchains; ++i) {
		/*
		 * Do not replace invalid elements as this might race
		 * syncthr replacements.
		 */
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		/*
		 * Do not replace elements which are the same.  Also handle
		 * element count discrepancies.
		 */
		nchain = cluster->array[i].chain;
		if (i < ip->cluster.nchains) {
			ochain = ip->cluster.array[i].chain;
			if (ochain == nchain)
				continue;
		} else {
			ochain = NULL;
		}

		/*
		 * Make adjustments
		 */
		ip->cluster.array[i].chain = nchain;
		ip->cluster.array[i].flags &= ~HAMMER2_CITEM_INVALID;
		ip->cluster.array[i].flags |= cluster->array[i].flags &
					      HAMMER2_CITEM_INVALID;
		if (nchain)
			hammer2_chain_ref(nchain);
		dropch[i] = ochain;
	}

	/*
	 * Release any left-over chains in ip->cluster.
	 */
	while (i < ip->cluster.nchains) {
		nchain = ip->cluster.array[i].chain;
		if (nchain) {
			ip->cluster.array[i].chain = NULL;
			ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
		}
		dropch[i] = nchain;
		++i;
	}

	/*
	 * Fixup fields.  Note that the inode-embedded cluster is never
	 * directly locked.
	 */
	if (cluster) {
		ip->cluster.nchains = cluster->nchains;
		ip->cluster.focus = cluster->focus;
		ip->cluster.flags = cluster->flags & ~HAMMER2_CLUSTER_LOCKED;
	} else {
		ip->cluster.nchains = 0;
		ip->cluster.focus = NULL;
		ip->cluster.flags &= ~HAMMER2_CLUSTER_ZFLAGS;
	}

	/*
	 * Repoint ip->pip if requested (non-NULL pip).
	 */
	if (pip && ip->pip != pip) {
		opip = ip->pip;
		hammer2_inode_ref(pip);
		ip->pip = pip;
	} else {
		opip = NULL;
	}
	hammer2_spin_unex(&ip->cluster_spin);

	/*
	 * Cleanup outside of spinlock
	 */
	while (--i >= 0) {
		if (dropch[i])
			hammer2_chain_drop(dropch[i]);
	}
	if (opip)
		hammer2_inode_drop(opip);
}

/*
 * Repoint a single element from the cluster to the ip.  Used by the
 * synchronization threads to piecemeal update inodes.  Does not change
 * focus and requires the inode to be re-locked to clean up flags (XXX).
 */
void
hammer2_inode_repoint_one(hammer2_inode_t *ip, hammer2_cluster_t *cluster,
			  int idx)
{
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	int i;

	hammer2_spin_ex(&ip->cluster_spin);
	KKASSERT(idx < cluster->nchains);
	if (idx < ip->cluster.nchains) {
		ochain = ip->cluster.array[idx].chain;
		nchain = cluster->array[idx].chain;
	} else {
		ochain = NULL;
		nchain = cluster->array[idx].chain;

		/*
		 * Zero out any gap elements before extending nchains so
		 * stale pointers do not become visible as part of the
		 * cluster.
		 */
		for (i = ip->cluster.nchains; i <= idx; ++i) {
			bzero(&ip->cluster.array[i],
			      sizeof(ip->cluster.array[i]));
			ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
		}
		ip->cluster.nchains = idx + 1;
	}
	if (ochain != nchain) {
		/*
		 * Make adjustments.
		 */
		ip->cluster.array[idx].chain = nchain;
		ip->cluster.array[idx].flags &= ~HAMMER2_CITEM_INVALID;
		ip->cluster.array[idx].flags |= cluster->array[idx].flags &
						HAMMER2_CITEM_INVALID;
	}
	hammer2_spin_unex(&ip->cluster_spin);
	if (ochain != nchain) {
		if (nchain)
			hammer2_chain_ref(nchain);
		if (ochain)
			hammer2_chain_drop(ochain);
	}
}

/*
 * Called with a locked inode to finish unlinking an inode after xop_unlink
 * had been run.  This function is responsible for decrementing nlinks and
 * moving deleted inodes to the hidden directory if they are still open.
 *
 * We don't bother decrementing nlinks if the file is not open and this was
 * the last link.
 *
 * If the inode is a hardlink target its chain has not yet been deleted,
 * otherwise its chain has been deleted.
 *
 * If isopen then any prior deletion was not permanent and the inode must
 * be moved to the hidden directory.
 */
int
hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen)
{
	hammer2_pfs_t *pmp;
	int error;

	pmp = ip->pmp;

	/*
	 * Decrement nlinks.  If this is the last link and the file is
	 * not open, the chain has already been removed and we don't bother
	 * dirtying the inode.
	 */
	if (ip->meta.nlinks == 1) {
		atomic_set_int(&ip->flags, HAMMER2_INODE_ISUNLINKED);
		if (isopen == 0)
			return 0;
	}

	hammer2_inode_modify(ip);
	--ip->meta.nlinks;
	if ((int64_t)ip->meta.nlinks < 0)
		ip->meta.nlinks = 0;	/* safety */

	/*
	 * If nlinks is not zero we are done.  However, this should only be
	 * possible with a hardlink target.  If the inode is an embedded
	 * hardlink, nlinks should have dropped to zero; warn and proceed
	 * with the next step.
	 */
	if (ip->meta.nlinks) {
		if ((ip->meta.name_key & HAMMER2_DIRHASH_VISIBLE) == 0)
			return 0;
		kprintf("hammer2_inode_unlink: nlinks was not 0 (%jd)\n",
			(intmax_t)ip->meta.nlinks);
		return 0;
	}

	/*
	 * nlinks is now zero, the inode should have already been deleted.
	 * If the file is open it was deleted non-permanently and must be
	 * moved to the hidden directory.
	 *
	 * When moving to the hidden directory we force the name_key to the
	 * inode number to avoid collisions.
	 */
	if (isopen) {
		hammer2_inode_lock(pmp->ihidden, 0);
		error = hammer2_inode_connect_simple(pmp->ihidden, ip,
						     NULL, 0, ip->meta.inum);
		hammer2_inode_unlock(pmp->ihidden);
	} else {
		error = 0;
	}
	return error;
}

/*
 * This is called from the mount code to initialize pmp->ihidden.
 */
void
hammer2_inode_install_hidden(hammer2_pfs_t *pmp)
{
	int error;

	if (pmp->ihidden)
		return;

	hammer2_trans_init(pmp, 0);
	hammer2_inode_lock(pmp->iroot, 0);

	/*
	 * Find the hidden directory
	 */
	{
		hammer2_xop_lookup_t *xop;

		xop = &hammer2_xop_alloc(pmp->iroot)->xop_lookup;
		xop->lhc = HAMMER2_INODE_HIDDENDIR;
		hammer2_xop_start(&xop->head, hammer2_xop_lookup);
		error = hammer2_xop_collect(&xop->head, 0);

		if (error == 0) {
			/*
			 * Found the hidden directory
			 */
			kprintf("PFS FOUND HIDDEN DIR\n");
			pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot,
							 &xop->head.cluster);
			hammer2_inode_ref(pmp->ihidden);
			hammer2_inode_unlock(pmp->ihidden);
		}
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
	}

	/*
	 * Create the hidden directory if it could not be found.
	 */
	if (error == ENOENT) {
		kprintf("PFS CREATE HIDDEN DIR\n");

		pmp->ihidden = hammer2_inode_create(pmp->iroot, NULL, NULL,
						    NULL, 0,
				/* lhc */	    HAMMER2_INODE_HIDDENDIR,
				/* inum */	    HAMMER2_INODE_HIDDENDIR,
				/* type */	    HAMMER2_OBJTYPE_DIRECTORY,
				/* target_type */   0,
				/* flags */	    0,
						    &error);
		if (pmp->ihidden) {
			hammer2_inode_ref(pmp->ihidden);
			hammer2_inode_unlock(pmp->ihidden);
		}
		if (error)
			kprintf("PFS CREATE ERROR %d\n", error);
	}

	/*
	 * Scan the hidden directory on-mount and destroy its contents
	 */
	if (error == 0) {
		hammer2_xop_unlinkall_t *xop;

		hammer2_inode_lock(pmp->ihidden, 0);
		xop = &hammer2_xop_alloc(pmp->ihidden)->xop_unlinkall;
		xop->head.lkey = 0;
		hammer2_xop_start(&xop->head, hammer2_inode_xop_unlinkall);

		while ((error = hammer2_xop_collect(&xop->head, 0)) == 0) {
			;
		}
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_inode_unlock(pmp->ihidden);
	}

	hammer2_inode_unlock(pmp->iroot);
	hammer2_trans_done(pmp);
}

/*
 * Find the directory common to both fdip and tdip.
 *
 * Returns a held but not locked inode.  Caller typically locks the inode,
 * and when through unlocks AND drops it.
 */
hammer2_inode_t *
hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
{
	hammer2_inode_t *scan1;
	hammer2_inode_t *scan2;

	/*
	 * We used to have a depth field but it complicated matters too
	 * much for directory renames.  So now it's ugly.  Check for
	 * simple cases before giving up and doing it the expensive way.
	 *
	 * XXX need a bottom-up topology stability lock
	 */
	if (fdip == tdip || fdip == tdip->pip) {
		hammer2_inode_ref(fdip);
		return(fdip);
	}
	if (fdip->pip == tdip) {
		hammer2_inode_ref(tdip);
		return(tdip);
	}

	/*
	 * XXX not MPSAFE
	 */
	for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
		scan2 = tdip;
		while (scan2->pmp == tdip->pmp) {
			if (scan1 == scan2) {
				hammer2_inode_ref(scan1);
				return(scan1);
			}
			scan2 = scan2->pip;
			if (scan2 == NULL)
				break;
		}
	}
	panic("hammer2_inode_common_parent: no common parent %p %p\n",
	      fdip, tdip);
	/* NOT REACHED */
	return(NULL);
}

/*
 * Mark an inode modified.  This only sets the MODIFIED flag and dirties
 * the vnode (if any); the inode meta-data is synchronized back to the
 * backing chains later, via hammer2_inode_fsync().
 */
void
hammer2_inode_modify(hammer2_inode_t *ip)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	if (ip->vp)
		vsetisdirty(ip->vp);
}

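/*
 * Illustrative usage sketch (not part of the build): callers mark the
 * inode modified before adjusting ip->meta, as connect_simple() above
 * does, so the change is picked up by the next fsync/flush.
 *
 *	hammer2_inode_modify(ip);
 *	ip->meta.name_key = lhc;
 */
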
/*
 * Synchronize the inode's frontend state with the chain state prior
 * to any explicit flush of the inode or any strategy write call.
 *
 * Called with a locked inode.
 */
void
hammer2_inode_fsync(hammer2_inode_t *ip, hammer2_cluster_t *cparent)
{
	int clear_directdata = 0;

	/* temporary hack, allow cparent to be NULL */
	if (cparent == NULL) {
		cparent = hammer2_inode_cluster(ip, HAMMER2_RESOLVE_ALWAYS);
		hammer2_inode_fsync(ip, cparent);
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_drop(cparent);
		return;
	}

	if ((ip->flags & HAMMER2_INODE_RESIZED) == 0) {
		/* do nothing */
	} else if (ip->meta.size < ip->osize) {
		/*
		 * We must delete any chains beyond the EOF.  The chain
		 * straddling the EOF will be pending in the bioq.
		 */
		hammer2_cluster_t *dparent;
		hammer2_cluster_t *cluster;
		hammer2_key_t lbase;
		hammer2_key_t key_next;

		lbase = (ip->meta.size + HAMMER2_PBUFMASK64) &
			~HAMMER2_PBUFMASK64;
		dparent = hammer2_cluster_lookup_init(&ip->cluster, 0);
		cluster = hammer2_cluster_lookup(dparent, &key_next,
					         lbase, (hammer2_key_t)-1,
						 HAMMER2_LOOKUP_NODATA);
		while (cluster) {
			/*
			 * Degenerate embedded case, nothing to loop on
			 */
			switch (hammer2_cluster_type(cluster)) {
			case HAMMER2_BREF_TYPE_INODE:
				hammer2_cluster_unlock(cluster);
				hammer2_cluster_drop(cluster);
				cluster = NULL;
				break;
			case HAMMER2_BREF_TYPE_DATA:
				hammer2_cluster_delete(dparent, cluster,
						   HAMMER2_DELETE_PERMANENT);
				/* fall through */
			default:
				cluster = hammer2_cluster_next(dparent, cluster,
						   &key_next,
						   key_next, (hammer2_key_t)-1,
						   HAMMER2_LOOKUP_NODATA);
				break;
			}
		}
		hammer2_cluster_lookup_done(dparent);
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
		KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
	} else if (ip->meta.size > ip->osize) {
		/*
		 * When resizing larger we may not have any direct-data
		 * available.
		 */
		if ((ip->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
		    ip->meta.size > HAMMER2_EMBEDDED_BYTES) {
			ip->meta.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
			clear_directdata = 1;
		}
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
		KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
	} else {
		/*
		 * RESIZED was set but size didn't change.
		 */
		atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
		KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
	}

	/*
	 * Sync inode meta-data
	 */
	if (ip->flags & HAMMER2_INODE_MODIFIED) {
		hammer2_inode_data_t *wipdata;

		atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
		hammer2_cluster_modify(cparent, 0);
		hammer2_inode_repoint(ip, NULL, cparent);

		wipdata = &hammer2_cluster_wdata(cparent)->ipdata;
		wipdata->meta = ip->meta;
		if (clear_directdata) {
			bzero(&wipdata->u.blockset,
			      sizeof(wipdata->u.blockset));
		}
		hammer2_cluster_modsync(cparent);
	}
}

/*
 * This handles unlinked open files after the vnode is finally dereferenced.
 * To avoid deadlocks it cannot be called from the normal vnode recycling
 * path, so we call it (1) after an unlink, rmdir, or rename, (2) on every
 * flush, and (3) on umount.
 *
 * Caller must be in a transaction.
 */
void
hammer2_inode_run_unlinkq(hammer2_pfs_t *pmp)
{
	hammer2_xop_destroy_t *xop;
	hammer2_inode_unlink_t *ipul;
	hammer2_inode_t *ip;
	int error;

	if (TAILQ_EMPTY(&pmp->unlinkq))
		return;

	LOCKSTART;
	hammer2_spin_ex(&pmp->list_spin);
	while ((ipul = TAILQ_FIRST(&pmp->unlinkq)) != NULL) {
		TAILQ_REMOVE(&pmp->unlinkq, ipul, entry);
		hammer2_spin_unex(&pmp->list_spin);
		ip = ipul->ip;
		kfree(ipul, pmp->minode);

		hammer2_inode_lock(ip, 0);
		xop = &hammer2_xop_alloc(ip)->xop_destroy;
		hammer2_xop_start(&xop->head, hammer2_inode_xop_destroy);
		error = hammer2_xop_collect(&xop->head, 0);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);

		hammer2_inode_unlock(ip);
		hammer2_inode_drop(ip);			/* ipul ref */

		hammer2_spin_ex(&pmp->list_spin);
	}
	hammer2_spin_unex(&pmp->list_spin);
	LOCKSTOP;
}

/*
 * Inode create helper (threaded, backend)
 *
 * Used by ncreate, nmknod, nsymlink, nmkdir.
 * Used by nlink and rename to create HARDLINK pointers.
 *
 * Frontend holds the parent directory ip locked exclusively.  We
 * create the inode and feed the resulting chain, relocked shared,
 * to the frontend.
 */
void
hammer2_inode_xop_create(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_create_t *xop = &arg->xop_create;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_next;
	int cache_index = -1;
	int error;

	chain = NULL;
	parent = hammer2_inode_chain(xop->head.ip, clindex,
				     HAMMER2_RESOLVE_ALWAYS);
	if (parent == NULL) {
		error = EIO;
		goto fail;
	}
	chain = hammer2_chain_lookup(&parent, &key_next,
				     xop->lhc, xop->lhc,
				     &cache_index, 0);
	if (chain) {
		hammer2_chain_unlock(chain);
		error = EEXIST;
		goto fail;
	}

	error = hammer2_chain_create(&parent, &chain,
				     xop->head.ip->pmp,
				     xop->lhc, 0,
				     HAMMER2_BREF_TYPE_INODE,
				     HAMMER2_INODE_BYTES,
				     xop->flags);
	if (error == 0) {
		hammer2_chain_modify(chain, 0);
		chain->data->ipdata.meta = xop->meta;
		if (xop->head.name) {
			bcopy(xop->head.name,
			      chain->data->ipdata.filename,
			      xop->head.name_len);
			chain->data->ipdata.meta.name_len = xop->head.name_len;
		}
		chain->data->ipdata.meta.name_key = xop->lhc;
	}
	hammer2_chain_unlock(chain);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
				  HAMMER2_RESOLVE_SHARED);
fail:
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	error = hammer2_xop_feed(&xop->head, chain, clindex, error);
	if (chain)
		hammer2_chain_drop(chain);
}

/*
 * Inode delete helper (backend, threaded)
 *
 * Generally used by hammer2_inode_run_unlinkq()
 */
void
hammer2_inode_xop_destroy(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_destroy_t *xop = &arg->xop_destroy;
	hammer2_pfs_t *pmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_inode_t *ip;
	int error;

	/*
	 * We need the precise parent chain to issue the deletion.
	 */
	ip = xop->head.ip;
	pmp = ip->pmp;
	chain = NULL;

	parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
	if (parent)
		hammer2_chain_getparent(&parent, HAMMER2_RESOLVE_ALWAYS);
	if (parent == NULL) {
		error = EIO;
		goto done;
	}
	chain = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
	if (chain == NULL) {
		error = EIO;
		goto done;
	}
	hammer2_chain_delete(parent, chain, 0);
	error = 0;
done:
	hammer2_xop_feed(&xop->head, NULL, clindex, error);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
}

void
hammer2_inode_xop_unlinkall(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_unlinkall_t *xop = &arg->xop_unlinkall;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_next;
	int cache_index = -1;

	/*
	 * We need the precise parent chain to issue the deletion.
	 */
	parent = hammer2_inode_chain(xop->head.ip, clindex,
				     HAMMER2_RESOLVE_ALWAYS);
	chain = hammer2_chain_lookup(&parent, &key_next,
				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
				     &cache_index,
				     HAMMER2_LOOKUP_ALWAYS);
	while (chain) {
		hammer2_chain_delete(parent, chain, HAMMER2_DELETE_PERMANENT);
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
					  HAMMER2_RESOLVE_SHARED);
		hammer2_xop_feed(&xop->head, chain, clindex, chain->error);
		chain = hammer2_chain_next(&parent, chain, &key_next,
					   key_next, HAMMER2_KEY_MAX,
					   &cache_index,
					   HAMMER2_LOOKUP_ALWAYS |
					   HAMMER2_LOOKUP_NOUNLOCK);
	}
	hammer2_xop_feed(&xop->head, NULL, clindex, ENOENT);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
}

void
hammer2_inode_xop_connect(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_connect_t *xop = &arg->xop_connect;
	hammer2_inode_data_t *wipdata;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_pfs_t *pmp;
	hammer2_key_t key_dummy;
	int cache_index = -1;
	int error;

	/*
	 * Get directory, then issue a lookup to prime the parent chain
	 * for the create.  The lookup is expected to fail.
	 */
	pmp = xop->head.ip->pmp;
	parent = hammer2_inode_chain(xop->head.ip, clindex,
				     HAMMER2_RESOLVE_ALWAYS);
	if (parent == NULL) {
		chain = NULL;
		error = EIO;
		goto fail;
	}
	chain = hammer2_chain_lookup(&parent, &key_dummy,
				     xop->lhc, xop->lhc,
				     &cache_index, 0);
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		error = EEXIST;
		goto fail;
	}

	/*
	 * Adjust the filename in the inode, set the name key.
	 *
	 * NOTE: Frontend must also adjust ip2->meta on success, we can't
	 *	 do it here.
	 */
	chain = hammer2_inode_chain(xop->head.ip2, clindex,
				    HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_modify(chain, 0);
	wipdata = &chain->data->ipdata;

	hammer2_inode_modify(xop->head.ip2);
	if (xop->head.name) {
		bzero(wipdata->filename, sizeof(wipdata->filename));
		bcopy(xop->head.name, wipdata->filename, xop->head.name_len);
		wipdata->meta.name_len = xop->head.name_len;
	}
	wipdata->meta.name_key = xop->lhc;

	/*
	 * Reconnect the chain to the new parent directory
	 */
	error = hammer2_chain_create(&parent, &chain, pmp,
				     xop->lhc, 0,
				     HAMMER2_BREF_TYPE_INODE,
				     HAMMER2_INODE_BYTES,
				     0);

	/*
	 * Feed result back.
	 */
fail:
	hammer2_xop_feed(&xop->head, NULL, clindex, error);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
}
1644