/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.25 2008/01/25 05:49:08 dillon Exp $
 */

#include "hammer.h"
#include <sys/buf.h>
#include <sys/buf2.h>

/*
 * The kernel is not actively referencing this vnode but is still holding
 * it cached.
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);

	/*
	 * Degenerate case
	 */
	if (ip == NULL) {
		vrecycle(ap->a_vp);
		return(0);
	}

	/*
	 * If the inode no longer has any links we recover its
	 * in-memory resources immediately by recycling the vnode.
	 */
	if (ip->ino_rec.ino_nlinks == 0)
		vrecycle(ap->a_vp);
	return(0);
}

/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode and will flush the inode to the
 * buffer cache.
 *
 * XXX Currently our sync code only runs through inodes with vnode
 * associations, so we depend on hammer_rel_inode() to sync any inode
 * record data to the block device prior to losing the association.
 * Otherwise transactions that the user expected to be distinct by
 * doing a manual sync may be merged.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
	struct hammer_inode *ip;
	struct vnode *vp;

	vp = ap->a_vp;

	if ((ip = vp->v_data) != NULL) {
		vp->v_data = NULL;
		ip->vp = NULL;
		hammer_rel_inode(ip, 0);
	}
	return(0);
}

/*
 * Obtain a vnode for the specified inode number.  An exclusively locked
 * vnode is returned.
 */
int
hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
{
	struct hammer_mount *hmp = (void *)mp->mnt_data;
	struct hammer_inode *ip;
	int error;

	/*
	 * Get/allocate the hammer_inode structure.  The structure must be
	 * unlocked while we manipulate the related vnode to avoid a
	 * deadlock.
	 */
	ip = hammer_get_inode(hmp, NULL, ino, hmp->asof, 0, &error);
	if (ip == NULL) {
		*vpp = NULL;
		return(error);
	}
	error = hammer_get_vnode(ip, LK_EXCLUSIVE, vpp);
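	/*
	 * The vnode association created by hammer_get_vnode() holds its
	 * own reference on the inode, so the reference acquired by
	 * hammer_get_inode() above can be dropped here.
	 */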
	hammer_rel_inode(ip, 0);
	return (error);
}

/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 */
int
hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp)
{
	struct vnode *vp;
	int error = 0;

	for (;;) {
		if ((vp = ip->vp) == NULL) {
			error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0);
			if (error)
				break;
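			/*
			 * Interlock against another thread which may have
			 * associated a vnode with the inode while we were
			 * blocked in getnewvnode().  If we lost the race,
			 * throw away our vnode and retry with the winner.
			 */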
			hammer_lock_ex(&ip->lock);
			if (ip->vp != NULL) {
				hammer_unlock(&ip->lock);
				vp->v_type = VBAD;
				vx_put(vp);
				continue;
			}
			hammer_ref(&ip->lock);
			vp = *vpp;
			ip->vp = vp;
			vp->v_type = hammer_get_vnode_type(
					    ip->ino_rec.base.base.obj_type);

			switch(ip->ino_rec.base.base.obj_type) {
			case HAMMER_OBJTYPE_CDEV:
			case HAMMER_OBJTYPE_BDEV:
				vp->v_ops = &ip->hmp->mp->mnt_vn_spec_ops;
				addaliasu(vp, ip->ino_data.rmajor,
					  ip->ino_data.rminor);
				break;
			case HAMMER_OBJTYPE_FIFO:
				vp->v_ops = &ip->hmp->mp->mnt_vn_fifo_ops;
				break;
			default:
				break;
			}
			if (ip->obj_id == HAMMER_OBJID_ROOT)
				vp->v_flag |= VROOT;

			vp->v_data = (void *)ip;
			/* vnode locked by getnewvnode() */
			/* make related vnode dirty if inode dirty? */
			hammer_unlock(&ip->lock);
			if (vp->v_type == VREG)
				vinitvmio(vp, ip->ino_rec.ino_size);
			break;
		}

		/*
		 * loop if the vget fails (aka races), or if the vp
		 * no longer matches ip->vp.
		 */
		if (vget(vp, LK_EXCLUSIVE) == 0) {
			if (vp == ip->vp)
				break;
			vput(vp);
		}
	}
	*vpp = vp;
	return(error);
}

/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 * do not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 */
struct hammer_inode *
hammer_get_inode(struct hammer_mount *hmp, struct hammer_node **cache,
		 u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
{
	struct hammer_inode_info iinfo;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;

	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
loop:
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		hammer_ref(&ip->lock);
		*errorp = 0;
		return(ip);
	}

	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	ip->obj_id = obj_id;
	ip->obj_asof = iinfo.obj_asof;
	ip->hmp = hmp;
	ip->flags = flags & HAMMER_INODE_RO;
	if (hmp->ronly)
		ip->flags |= HAMMER_INODE_RO;
	RB_INIT(&ip->rec_tree);

	/*
	 * Locate the on-disk inode.
	 */
retry:
	hammer_init_cursor_hmp(&cursor, cache, hmp);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = 0;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor.key_beg.obj_type = 0;
	cursor.asof = iinfo.obj_asof;
	cursor.flags = HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_GET_DATA |
		       HAMMER_CURSOR_ASOF;

	*errorp = hammer_btree_lookup(&cursor);
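	/*
	 * An EDEADLK return means the lookup had to give up its locks to
	 * avoid a deadlock.  Tear the cursor down and retry the lookup
	 * from scratch.
	 */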
	if (*errorp == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * On success the B-Tree lookup will hold the appropriate
	 * buffer cache buffers and provide a pointer to the requested
	 * information.  Copy the information to the in-memory inode
	 * and cache the B-Tree node to improve future operations.
	 */
	if (*errorp == 0) {
		ip->ino_rec = cursor.record->inode;
		ip->ino_data = cursor.data->inode;
		hammer_cache_node(cursor.node, &ip->cache[0]);
		if (cache)
			hammer_cache_node(cursor.node, cache);
	}

	/*
	 * On success insert the now-loaded inode into the in-memory inode
	 * RB tree.  It is possible to race another lookup's insertion of
	 * the same inode so deal with that condition too.
	 *
	 * The cursor's locked node interlocks against others creating and
	 * destroying ip while we were blocked.
	 */
	if (*errorp == 0) {
		hammer_ref(&ip->lock);
		if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
			hammer_uncache_node(&ip->cache[0]);
			hammer_uncache_node(&ip->cache[1]);
			hammer_unref(&ip->lock);
			--hammer_count_inodes;
			kfree(ip, M_HAMMER);
			hammer_done_cursor(&cursor);
			goto loop;
		}
		ip->flags |= HAMMER_INODE_ONDISK;
	} else {
		--hammer_count_inodes;
		kfree(ip, M_HAMMER);
		ip = NULL;
	}
	hammer_done_cursor(&cursor);
	return (ip);
}

/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced but not locked.
 *
 * The inode is created in-memory and will be delay-synchronized to the
 * disk.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
		    struct ucred *cred, hammer_inode_t dip,
		    struct hammer_inode **ipp)
{
	hammer_mount_t hmp;
	hammer_inode_t ip;
	uid_t xuid;

	hmp = trans->hmp;
	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	ip->obj_id = hammer_alloc_tid(trans);
	KKASSERT(ip->obj_id != 0);
	ip->obj_asof = hmp->asof;
	ip->hmp = hmp;
	ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY |
		    HAMMER_INODE_ITIMES;
	ip->last_tid = trans->tid;

	RB_INIT(&ip->rec_tree);

	ip->ino_rec.ino_atime = trans->tid;
	ip->ino_rec.ino_mtime = trans->tid;
	ip->ino_rec.ino_size = 0;
	ip->ino_rec.ino_nlinks = 0;
	/* XXX */
	ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	ip->ino_rec.base.base.obj_id = ip->obj_id;
	ip->ino_rec.base.base.key = 0;
	ip->ino_rec.base.base.create_tid = trans->tid;
	ip->ino_rec.base.base.delete_tid = 0;
	ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE;
	ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type);

	ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
	ip->ino_data.mode = vap->va_mode;
	ip->ino_data.ctime = trans->tid;
	ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0;

	switch(ip->ino_rec.base.base.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		ip->ino_data.rmajor = vap->va_rmajor;
		ip->ino_data.rminor = vap->va_rminor;
		break;
	default:
		break;
	}

	/*
	 * Calculate default uid/gid and overwrite with information from
	 * the vap.
	 */
	xuid = hammer_to_unix_xid(&dip->ino_data.uid);
	ip->ino_data.gid = dip->ino_data.gid;
	xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
				     &vap->va_mode);
	ip->ino_data.mode = vap->va_mode;

	if (vap->va_vaflags & VA_UID_UUID_VALID)
		ip->ino_data.uid = vap->va_uid_uuid;
	else if (vap->va_uid != (uid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
	if (vap->va_vaflags & VA_GID_UUID_VALID)
		ip->ino_data.gid = vap->va_gid_uuid;
	else if (vap->va_gid != (gid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);

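	/*
	 * The new inode exists only in memory at this point; it is not
	 * marked HAMMER_INODE_ONDISK and will be written to the media
	 * when it is synced.  Insert it into the in-memory inode tree.
	 */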
	hammer_ref(&ip->lock);
	if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
		hammer_unref(&ip->lock);
		panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
	}
	*ipp = ip;
	return(0);
}

/*
 * Called by hammer_sync_inode().
 */
static int
hammer_update_inode(hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	struct hammer_cursor *spike = NULL;
	hammer_record_t record;
	int error;
	hammer_tid_t last_tid;

	/*
	 * Locate the record on-disk and mark it as deleted.  Both the B-Tree
	 * node and the record must be marked deleted.  The record may or
	 * may not be physically deleted, depending on the retention policy.
	 *
	 * If the inode has already been deleted on-disk we have nothing
	 * to do.
	 *
	 * XXX Update the inode record and data in-place if the retention
	 * policy allows it.
	 */
	last_tid = ip->last_tid;
retry:
	error = 0;

	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
	    HAMMER_INODE_ONDISK) {
		hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
		cursor.key_beg.obj_id = ip->obj_id;
		cursor.key_beg.key = 0;
		cursor.key_beg.create_tid = 0;
		cursor.key_beg.delete_tid = 0;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
		cursor.key_beg.obj_type = 0;
		cursor.asof = ip->obj_asof;
		cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;

		error = hammer_btree_lookup(&cursor);

		if (error == 0) {
			error = hammer_ip_delete_record(&cursor, last_tid);
			if (error == 0)
				ip->flags |= HAMMER_INODE_DELONDISK;
			hammer_cache_node(cursor.node, &ip->cache[0]);
		}
		hammer_done_cursor(&cursor);
		if (error == EDEADLK)
			goto retry;
	}

	/*
	 * Write out a new record if the in-memory inode is not marked
	 * as having been deleted.  Update our inode statistics if this
	 * is the first application of the inode on-disk.
	 *
	 * If the inode has been deleted permanently, HAMMER_INODE_DELONDISK
	 * will remain set and prevent further updates.
	 */
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
		record = hammer_alloc_mem_record(ip);
		record->rec.inode = ip->ino_rec;
		record->rec.inode.base.base.create_tid = last_tid;
		record->rec.inode.base.data_len = sizeof(ip->ino_data);
		record->data = (void *)&ip->ino_data;
		error = hammer_ip_sync_record(record, &spike);
		record->flags |= HAMMER_RECF_DELETED;
		hammer_rel_mem_record(record);
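		/*
		 * ENOSPC means the target cluster filled up while syncing
		 * the record.  Resolve the spike that was filled in and
		 * retry the whole update.
		 */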
		if (error == ENOSPC) {
			error = hammer_spike(&spike);
			if (error == 0)
				goto retry;
		}
		KKASSERT(spike == NULL);
		if (error == 0) {
			ip->flags &= ~(HAMMER_INODE_RDIRTY |
				       HAMMER_INODE_DDIRTY |
				       HAMMER_INODE_DELONDISK |
				       HAMMER_INODE_ITIMES);
			if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
				hammer_modify_volume(ip->hmp->rootvol);
				++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				ip->flags |= HAMMER_INODE_ONDISK;
			}
		}
	}
	return(error);
}

/*
 * Update only the itimes fields.  This is done non-historically.  The
 * record is updated in-place on the disk.
 */
static int
hammer_update_itimes(hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	struct hammer_inode_record *rec;
	int error;

retry:
	error = 0;
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
	    HAMMER_INODE_ONDISK) {
		hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
		cursor.key_beg.obj_id = ip->obj_id;
		cursor.key_beg.key = 0;
		cursor.key_beg.create_tid = 0;
		cursor.key_beg.delete_tid = 0;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
		cursor.key_beg.obj_type = 0;
		cursor.asof = ip->obj_asof;
		cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;

		error = hammer_btree_lookup(&cursor);
		if (error == 0) {
			rec = &cursor.record->inode;
			hammer_modify_buffer_nodep(cursor.record_buffer);
			rec->ino_atime = ip->ino_rec.ino_atime;
			rec->ino_mtime = ip->ino_rec.ino_mtime;
			ip->flags &= ~HAMMER_INODE_ITIMES;
			/* XXX recalculate crc */
			hammer_cache_node(cursor.node, &ip->cache[0]);
		}
		hammer_done_cursor(&cursor);
		if (error == EDEADLK)
			goto retry;
	}
	return(error);
}

/*
 * Release a reference on an inode.  If asked to flush, the last release
 * will flush the inode.
 */
void
hammer_rel_inode(struct hammer_inode *ip, int flush)
{
	hammer_unref(&ip->lock);
	if (flush)
		ip->flags |= HAMMER_INODE_FLUSH;
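	/*
	 * On the last reference unload the inode.  The waitfor argument
	 * is passed through hammer_unload_inode()'s RB_SCAN-compatible
	 * (void *) data argument.
	 */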
	if (ip->lock.refs == 0) {
		if (ip->flags & HAMMER_INODE_FLUSH)
			hammer_unload_inode(ip, (void *)MNT_WAIT);
		else
			hammer_unload_inode(ip, (void *)MNT_NOWAIT);
	}
}

/*
 * Unload and destroy the specified inode.
 *
 * (typically called via RB_SCAN)
 */
int
hammer_unload_inode(struct hammer_inode *ip, void *data)
{
	int error;

	KASSERT(ip->lock.refs == 0,
		("hammer_unload_inode: %d refs\n", ip->lock.refs));
	KKASSERT(ip->vp == NULL);
	hammer_ref(&ip->lock);

	error = hammer_sync_inode(ip, (int)data, 1);
	if (error)
		kprintf("hammer_sync_inode failed error %d\n", error);
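	/*
	 * If the sync left us with the only remaining reference the inode
	 * can be torn down, otherwise just drop the temporary reference
	 * we acquired above and leave the inode in the tree.
	 */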
	if (ip->lock.refs == 1) {
		KKASSERT(RB_EMPTY(&ip->rec_tree));
		RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);

		hammer_uncache_node(&ip->cache[0]);
		hammer_uncache_node(&ip->cache[1]);
		--hammer_count_inodes;
		kfree(ip, M_HAMMER);
	} else {
		hammer_unref(&ip->lock);
	}
	return(0);
}

/*
 * A transaction has modified an inode, requiring updates as specified by
 * the passed flags.
 *
 * HAMMER_INODE_RDIRTY:	Inode record has been updated
 * HAMMER_INODE_DDIRTY: Inode data has been updated
 * HAMMER_INODE_DELETED: Inode record/data must be deleted
 * HAMMER_INODE_ITIMES: mtime/atime has been updated
 *
 * The transaction's TID is saved in the inode's last_tid field so the
 * correct TID is used when the inode is later synced to disk.
 */
void
hammer_modify_inode(struct hammer_transaction *trans,
		    struct hammer_inode *ip, int flags)
{
	KKASSERT((ip->flags & HAMMER_INODE_RO) == 0 ||
		 (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
			   HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0);

	if (flags &
	    (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|HAMMER_INODE_DELETED)) {
		if (hammer_debug_tid) {
			kprintf("hammer_modify_inode: %016llx (%08x)\n",
				trans->tid, (int)(trans->tid / 1000000000LL));
		}
		ip->last_tid = trans->tid;
	}
	ip->flags |= flags;
}

/*
 * Sync any dirty buffers and records associated with an inode.  The
 * inode's last_tid field is used as the transaction id for the sync,
 * overriding any intermediate TIDs that were used for records.  Note
 * that the dirty buffer cache buffers do not have any knowledge of
 * the transaction id they were modified under.
 *
 * If we can't sync due to a cluster becoming full the spike structure
 * will be filled in and ENOSPC returned.  We must return -ENOSPC to
 * terminate the RB_SCAN.
 */
static int
hammer_sync_inode_callback(hammer_record_t rec, void *data)
{
	struct hammer_cursor **spike = data;
	int error;

	hammer_ref(&rec->lock);
	error = hammer_ip_sync_record(rec, spike);
	hammer_rel_mem_record(rec);

	if (error) {
		error = -error;
		if (error != -ENOSPC) {
			kprintf("hammer_sync_inode_callback: sync failed rec "
				"%p, error %d\n", rec, error);
		}
	}
	return(error);
}

/*
 * XXX error handling
 */
int
hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete)
{
	struct hammer_transaction trans;
	struct hammer_cursor *spike = NULL;
	int error;

	if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
		return(0);
	}

	hammer_lock_ex(&ip->lock);

	/*
	 * Use the transaction id of the last operation to sync.
	 */
	if (ip->last_tid)
		hammer_start_transaction_tid(&trans, ip->hmp, ip->last_tid);
	else
		hammer_start_transaction(&trans, ip->hmp);

	/*
	 * If the inode has been deleted (nlinks == 0), and the OS no longer
	 * has any references to it (handle_delete != 0), clean up in-memory
	 * data.
	 *
	 * NOTE: We do not set the RDIRTY flag when updating the delete_tid,
	 * setting HAMMER_INODE_DELETED takes care of it.
	 *
	 * NOTE: Because we may sync records within this new transaction,
	 * force the inode update later on to use our transaction id or
	 * the delete_tid of the inode may be less than the create_tid of
	 * the inode update.  XXX shouldn't happen but don't take the chance.
	 *
	 * NOTE: The call to hammer_ip_delete_range() cannot return ENOSPC
	 * so we can pass a NULL spike structure, because no partial data
	 * deletion can occur (yet).
	 */
	if (ip->ino_rec.ino_nlinks == 0 && handle_delete &&
	    (ip->flags & HAMMER_INODE_GONE) == 0) {
		ip->flags |= HAMMER_INODE_GONE;
		if (ip->vp)
			vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
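		/*
		 * Destroy all of the inode's data and directory records on
		 * the media.  No in-memory records may remain afterwards,
		 * hence the assertion below.
		 */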
		error = hammer_ip_delete_range_all(&trans, ip);
		KKASSERT(RB_EMPTY(&ip->rec_tree));
		ip->ino_rec.base.base.delete_tid = trans.tid;
		hammer_modify_inode(&trans, ip, HAMMER_INODE_DELETED);
		hammer_modify_volume(ip->hmp->rootvol);
		--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
	}

	/*
	 * Sync the buffer cache.
	 */
	if (ip->vp != NULL) {
		error = vfsync(ip->vp, waitfor, 1, NULL, NULL);
		if (RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL)
			ip->flags &= ~HAMMER_INODE_BUFS;
	} else {
		error = 0;
	}

	/*
	 * Now sync related records
	 */
	for (;;) {
		error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				hammer_sync_inode_callback, &spike);
		KKASSERT(error <= 0);
		if (error < 0)
			error = -error;
		if (error == ENOSPC) {
			error = hammer_spike(&spike);
			if (error == 0)
				continue;
		}
		break;
	}
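	/*
	 * If no in-memory records remain the in-memory record dirty
	 * flag can be cleared.
	 */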
	if (RB_EMPTY(&ip->rec_tree))
		ip->flags &= ~HAMMER_INODE_XDIRTY;

	/*
	 * Now update the inode's on-disk inode-data and/or on-disk record.
	 */
	switch(ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK)) {
	case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
		/*
		 * If deleted and on-disk, don't set any additional flags.
		 * The delete flag takes care of things.
		 */
		break;
	case HAMMER_INODE_DELETED:
		/*
		 * Take care of the case where a deleted inode was never
		 * flushed to the disk in the first place.
		 */
		ip->flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
			       HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES);
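		/*
		 * Destroy any in-memory records still queued for the
		 * inode; they will never make it to the media.
		 */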
		while (RB_ROOT(&ip->rec_tree)) {
			hammer_record_t rec = RB_ROOT(&ip->rec_tree);
			hammer_ref(&rec->lock);
			rec->flags |= HAMMER_RECF_DELETED;
			hammer_rel_mem_record(rec);
		}
		break;
	case HAMMER_INODE_ONDISK:
		/*
		 * If already on-disk, do not set any additional flags.
		 */
		break;
	default:
		/*
		 * If not on-disk and not deleted, set both dirty flags
		 * to force an initial record to be written.
		 */
		ip->flags |= HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY;
		break;
	}

	/*
	 * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
	 * is already on-disk the old record is marked as deleted.
	 *
	 * If DELETED is set hammer_update_inode() will delete the existing
	 * record without writing out a new one.
	 *
	 * If *ONLY* the ITIMES flag is set we can update the record in-place.
	 */
	if ((ip->flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
			 HAMMER_INODE_ITIMES | HAMMER_INODE_DELETED)) ==
	    HAMMER_INODE_ITIMES) {
		error = hammer_update_itimes(ip);
	} else
	if (ip->flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
			 HAMMER_INODE_ITIMES | HAMMER_INODE_DELETED)) {
		error = hammer_update_inode(ip);
	}
	hammer_commit_transaction(&trans);
	hammer_unlock(&ip->lock);
	return(error);
}

/*
 * Access the filesystem buffer containing the cluster-relative byte
 * offset, validate the buffer type, load *bufferp and return a
 * pointer to the requested data.  The buffer is referenced and locked on
 * return.
 *
 * If buf_type is 0 the buffer is assumed to be a pure-data buffer and
 * no type or crc check is performed.
 *
 * If *bufferp is not NULL on entry it is assumed to contain a locked
 * and referenced buffer which will then be replaced.
 *
 * If the caller is holding another unrelated buffer locked it must be
 * passed in reorderbuf so we can properly order buffer locks.
 *
 * XXX add a flag for the buffer type and check the CRC here XXX
 */
void *
hammer_bread(hammer_cluster_t cluster, int32_t cloff,
	     u_int64_t buf_type, int *errorp,
	     struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t buf_no;
	int32_t buf_off;

	/*
	 * Load the correct filesystem buffer, replacing *bufferp.
	 */
	buf_no = cloff / HAMMER_BUFSIZE;
	buffer = *bufferp;
	if (buffer == NULL || buffer->cluster != cluster ||
	    buffer->buf_no != buf_no) {
		if (buffer) {
			/*hammer_unlock(&buffer->io.lock);*/
			hammer_rel_buffer(buffer, 0);
		}
		buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
		*bufferp = buffer;
		if (buffer == NULL)
			return(NULL);
		/*hammer_lock_ex(&buffer->io.lock);*/
	}

	/*
	 * Validate the buffer type
	 */
	buf_off = cloff & HAMMER_BUFMASK;
	if (buf_type) {
		if (buf_type != buffer->ondisk->head.buf_type) {
			kprintf("BUFFER HEAD TYPE MISMATCH %llx %llx\n",
				buf_type, buffer->ondisk->head.buf_type);
			KKASSERT(0);
			*errorp = EIO;
			return(NULL);
		}
		if (buf_off < sizeof(buffer->ondisk->head)) {
			kprintf("BUFFER OFFSET TOO LOW %d\n", buf_off);
			*errorp = EIO;
			KKASSERT(0);
			return(NULL);
		}
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	*errorp = 0;
	return((char *)buffer->ondisk + buf_off);
}
