xref: /dflybsd-src/sys/vfs/hammer/hammer_object.c (revision e7b4468ce80913950cd099c393f3ce6ece6fcb2c)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.15 2008/01/03 06:48:49 dillon Exp $
35  */
36 
37 #include "hammer.h"
38 
39 static int hammer_mem_add(hammer_transaction_t trans,
40 			     hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43 
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50 	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51 		return(-1);
52 	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53 		return(1);
54 
55 	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56 		return(-1);
57 	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58 		return(1);
59 
60 	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61 		return(-1);
62 	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63 		return(1);
64         return(0);
65 }
66 
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70 	if (info->rec_type < rec->rec.base.base.rec_type)
71 		return(-3);
72 	if (info->rec_type > rec->rec.base.base.rec_type)
73 		return(3);
74 
75         if (info->key < rec->rec.base.base.key)
76                 return(-2);
77         if (info->key > rec->rec.base.base.key)
78                 return(2);
79 
80         /*
81          * This test has a number of special cases.  create_tid in key1 is
82          * the as-of transction id, and delete_tid in key1 is NOT USED.
83          *
84          * A key1->create_tid of 0 matches any record regardles of when
85          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
86          * used to search for the most current state of the object.
87          *
88          * key2->create_tid is a HAMMER record and will never be
89          * 0.   key2->delete_tid is the deletion transaction id or 0 if
90          * the record has not yet been deleted.
91          */
92         if (info->create_tid) {
93                 if (info->create_tid < rec->rec.base.base.create_tid)
94                         return(-1);
95                 if (rec->rec.base.base.delete_tid &&
96 		    info->create_tid >= rec->rec.base.base.delete_tid) {
97                         return(1);
98 		}
99         }
100         return(0);
101 }
102 
103 /*
104  * RB_SCAN comparison code for hammer_mem_first().  The argument order
105  * is reversed so the comparison result has to be negated.  key_beg and
106  * key_end are both range-inclusive.
107  *
108  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
109  * These do not stop the scan.
110  *
111  * Localized deletions are not cached in-memory.
112  */
113 static
114 int
115 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
116 {
117 	hammer_cursor_t cursor = data;
118 	int r;
119 
120 	r = hammer_rec_compare(&cursor->key_beg, rec);
121 	if (r > 1)
122 		return(-1);
123 	if (r == 0)
124 		return(0);
125 	r = hammer_rec_compare(&cursor->key_end, rec);
126 	if (r < -1)
127 		return(1);
128 	return(0);
129 }
130 
/*
 * Generate the RB-tree support functions for the per-inode in-memory
 * record tree, plus the INFO lookup variant keyed on a hammer_base_elm_t
 * (used for keyed and ranged searches via hammer_rec_compare()).
 */
RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_compare, hammer_base_elm_t);
134 
135 /*
136  * Allocate a record for the caller to finish filling in.  The record is
137  * returned referenced.
138  */
139 hammer_record_t
140 hammer_alloc_mem_record(hammer_inode_t ip)
141 {
142 	hammer_record_t record;
143 
144 	++hammer_count_records;
145 	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
146 	record->ip = ip;
147 	hammer_ref(&record->lock);
148 	return (record);
149 }
150 
151 /*
152  * Release a memory record.  Records marked for deletion are immediately
153  * removed from the RB-Tree but otherwise left intact until the last ref
154  * goes away.
155  */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_unref(&record->lock);
	if (record->flags & HAMMER_RECF_DELETED) {
		/*
		 * A deleted record is pulled out of the RB-tree as soon
		 * as any reference is dropped, even while other refs
		 * remain, so new lookups cannot find it.
		 */
		if (record->flags & HAMMER_RECF_ONRBTREE) {
			RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
				  record);
			record->flags &= ~HAMMER_RECF_ONRBTREE;
		}
		/*
		 * Physically destroy the record on last dereference,
		 * including any separately kmalloc'd data payload.
		 */
		if (record->lock.refs == 0) {
			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, M_HAMMER);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, M_HAMMER);
		}
	}
}
178 
179 /*
180  * Lookup an in-memory record given the key specified in the cursor.  Works
181  * just like hammer_btree_lookup() but operates on an inode's in-memory
182  * record list.
183  *
184  * The lookup must fail if the record is marked for deferred deletion.
185  */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/*
	 * Drop any record still referenced from a previous operation.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/*
	 * Unlink the scan tracking structure from the previous inode's
	 * record tree before re-targeting the cursor at the new inode.
	 */
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	/*
	 * Exact keyed lookup via the INFO comparator.  A hit is returned
	 * referenced in cursor->iprec.
	 */
	cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
				&ip->rec_tree, &cursor->key_beg);
	if (cursor->iprec == NULL) {
		error = ENOENT;
	} else {
		hammer_ref(&cursor->iprec->lock);
		error = 0;
	}
	return(error);
}
213 
214 /*
215  * hammer_mem_first() - locate the first in-memory record matching the
216  * cursor.
217  *
218  * The RB_SCAN function we use is designed as a callback.  We terminate it
219  * (return -1) as soon as we get a match.
220  */
221 static
222 int
223 hammer_rec_scan_callback(hammer_record_t rec, void *data)
224 {
225 	hammer_cursor_t cursor = data;
226 
227 	/*
228 	 * Skip if not visible due to our as-of TID
229 	 */
230         if (cursor->key_beg.create_tid) {
231                 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
232                         return(0);
233                 if (rec->rec.base.base.delete_tid &&
234 		    cursor->key_beg.create_tid >=
235 		     rec->rec.base.base.delete_tid) {
236                         return(0);
237 		}
238         }
239 
240 	/*
241 	 * Return the first matching record and stop the scan
242 	 */
243 	if (cursor->iprec == NULL) {
244 		cursor->iprec = rec;
245 		hammer_ref(&rec->lock);
246 		return(-1);
247 	}
248 	return(0);
249 }
250 
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/*
	 * Release any record still referenced from a prior operation.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/*
	 * Re-target the scan tracking structure from the old inode's
	 * record tree (if any) to the new inode's tree.
	 */
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);

	/*
	 * Scan for the first in-range, visible record.  The callback
	 * references the match in cursor->iprec and terminates the scan.
	 */
	cursor->scan.node = NULL;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
		return(0);
	}
	return(ENOENT);
}
280 
/*
 * Clean up the in-memory scan state on a cursor: unlink the scan tracking
 * structure from the inode's record tree and drop any referenced record.
 */
void
hammer_mem_done(hammer_cursor_t cursor)
{
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
		cursor->ip = NULL;
	}
        if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
}
294 
295 /************************************************************************
296  *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
297  ************************************************************************
298  *
299  * These functions manipulate in-memory records.  Such records typically
300  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
301  */
302 
303 /*
304  * Add a directory entry (dip,ncp) which references inode (ip).
305  *
306  * Note that the low 32 bits of the namekey are set temporarily to create
307  * a unique in-memory record, and may be modified a second time when the
308  * record is synchronized to disk.  In particular, the low 32 bits cannot be
309  * all 0's when synching to disk, which is not handled here.
310  */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	/*
	 * Advance the iterator used to uniquify the low 32 bits of the
	 * directory key, skipping 0.
	 */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->rec.entry.base.base.obj_id = dip->obj_id;
	/*
	 * 64 bit directory key = name hash (upper bits) plus the
	 * iterator; may be regenerated when synced to disk (see above).
	 */
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.create_tid = trans->tid;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	/*
	 * Short names are embedded directly in the record; longer names
	 * get a separately kmalloc'd data buffer.
	 */
	if (bytes <= sizeof(record->rec.entry.den_name)) {
		record->data = (void *)record->rec.entry.den_name;
		record->flags |= HAMMER_RECF_EMBEDDED_DATA;
	} else {
		++hammer_count_record_datas;
		record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
		record->flags |= HAMMER_RECF_ALLOCDATA;
	}
	bcopy(ncp->nc_name, record->data, bytes);
	record->rec.entry.base.data_len = bytes;
	/*
	 * The target inode gains a link; mark its inode record dirty.
	 */
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	/* hammer_mem_add() eats our record reference */
	error = hammer_mem_add(trans, record);
	return(error);
}
349 
350 /*
351  * Delete the directory entry and update the inode link count.  The
352  * cursor must be seeked to the directory entry record being deleted.
353  *
354  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
355  */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	int error;

	/*
	 * Delete the directory entry the cursor is positioned at.
	 */
	error = hammer_ip_delete_record(cursor, trans->tid);

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we only try to sync if the OS has
	 * no references and nlinks falls to 0.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		if (ip->ino_rec.ino_nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_sync_inode(ip, MNT_NOWAIT, 1);
		}

	}
	return(error);
}
381 
382 /*
383  * Add a record to an inode.
384  *
385  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
386  * initialize the following additional fields:
387  *
388  * record->rec.entry.base.base.key
389  * record->rec.entry.base.base.rec_type
390  * record->rec.entry.base.base.data_len
391  * record->data		(a copy will be kmalloc'd if not embedded)
392  */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;
	int bytes;
	void *data;

	/*
	 * Fill in the base fields common to all records of this inode.
	 */
	record->rec.base.base.obj_id = ip->obj_id;
	record->rec.base.base.create_tid = trans->tid;
	record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	bytes = record->rec.base.data_len;

	if (record->data) {
		/*
		 * If the caller's data pointer does not point inside the
		 * record union it must be copied into a private
		 * allocation; otherwise it is flagged as embedded.
		 */
		if ((char *)record->data < (char *)&record->rec ||
		    (char *)record->data >= (char *)(&record->rec + 1)) {
			++hammer_count_record_datas;
			data = kmalloc(bytes, M_HAMMER, M_WAITOK);
			record->flags |= HAMMER_RECF_ALLOCDATA;
			bcopy(record->data, data, bytes);
			record->data = data;
		} else {
			record->flags |= HAMMER_RECF_EMBEDDED_DATA;
		}
	}
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	/* hammer_mem_add() eats our record reference */
	error = hammer_mem_add(trans, record);
	return(error);
}
422 
423 /*
424  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
425  * is called via the strategy called from a cached data source.  This code
426  * is responsible for actually writing a data record out to the disk.
427  */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes,
		       struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

	error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
	if (error)
		return(error);
	/*
	 * Data records are keyed by the ending offset of the range
	 * (offset + bytes).
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.flags = HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		/* the target key must not already exist on-disk */
		kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
			offset, bytes);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
				  &cursor.data_buffer);
	if (bdata == NULL)
		goto done;
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	hammer_modify_buffer(cursor.record_buffer);
	rec->base.base = cursor.key_beg;
	rec->base.data_crc = crc32(data, bytes);
	rec->base.rec_id = 0;	/* XXX */
	rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
	rec->base.data_len = bytes;
	hammer_modify_buffer_done(cursor.record_buffer);

	/* copy the caller's data into the allocated on-disk buffer */
	hammer_modify_buffer(cursor.data_buffer);
	bcopy(data, bdata, bytes);
	hammer_modify_buffer_done(cursor.data_buffer);

	/*
	 * Build the leaf element referencing the record and data just
	 * allocated, then insert it.
	 */
	elm.leaf.base = cursor.key_beg;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/*
	 * Insert failed: back out the record, then (fail1) the data.
	 */
	hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
	hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	return(error);
}
514 
/*
 * Sync an in-memory record to the disk.  This is typically called via fsync
 * from a cached record source.  This code is responsible for actually
 * writing a record out to the disk.
 */
519  */
int
hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_mount_t hmp;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

	error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
				       record->ip->hmp);
	if (error)
		return(error);
	cursor.key_beg = record->rec.base.base;
	cursor.flags = HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.  The
	 * target key should not exist.  If we are creating a directory entry
	 * we may have to iterate the low 32 bits of the key to find an unused
	 * key.
	 *
	 * If we run out of space trying to adjust the B-Tree for the
	 * insert, re-lookup without the insert flag so the cursor
	 * is properly positioned for the spike.
	 */
again:
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		/*
		 * Collision: directory entries retry with a new iterator
		 * in the low 32 bits of the key; anything else is a
		 * hard duplicate.
		 */
		if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
			hmp = cursor.node->cluster->volume->hmp;
			if (++hmp->namekey_iterator == 0)
				++hmp->namekey_iterator;
			record->rec.base.base.key &= ~(0xFFFFFFFFLL);
			record->rec.base.base.key |= hmp->namekey_iterator;
			goto again;
		}
		kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
			record->rec.base.base.key);
		Debugger("duplicate record1");
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Mark the record as undergoing synchronization.  Our cursor is
	 * holding a locked B-Tree node for the insertion which interlocks
	 * anyone trying to access this record.
	 *
	 * XXX There is still a race present related to iterations.  An
	 * iteration may process the record, a sync may occur, and then
	 * later process the B-Tree element for the same record.
	 *
	 * We do not try to synchronize a deleted record.
	 */
	if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING)) {
		error = 0;
		goto done;
	}
	record->flags |= HAMMER_RECF_SYNCING;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.  Embedded (or absent) data needs
	 * no separate on-disk data allocation.
	 */
	if (record->data == NULL ||
	    (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
		bdata = record->data;
	} else {
		bdata = hammer_alloc_data(cursor.node->cluster,
					  record->rec.base.data_len, &error,
					  &cursor.data_buffer);
		if (bdata == NULL)
			goto fail2;
	}
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 *
	 * XXX assign rec_id here
	 */
	hammer_modify_buffer(cursor.record_buffer);
	*rec = record->rec;
	if (bdata) {
		rec->base.data_crc = crc32(record->data,
					   record->rec.base.data_len);
		if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
			/*
			 * Data embedded in record: the on-disk data offset
			 * is the record's cluster offset plus the data's
			 * offset within the record union.
			 */
			rec->base.data_offset = ((char *)bdata -
						 (char *)&record->rec);
			KKASSERT(rec->base.data_offset >= 0 &&
				 rec->base.data_offset + rec->base.data_len <=
				  sizeof(*rec));
			rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
		} else {
			/*
			 * Data separate from record
			 */
			rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
			hammer_modify_buffer(cursor.data_buffer);
			bcopy(record->data, bdata, rec->base.data_len);
			hammer_modify_buffer_done(cursor.data_buffer);
		}
	}
	rec->base.rec_id = 0;	/* XXX */
	hammer_modify_buffer_done(cursor.record_buffer);

	elm.leaf.base = cursor.key_beg;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = rec->base.data_len;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);

	/*
	 * Clean up on success, or fall through on error.  On success the
	 * in-memory copy is superseded by the B-Tree element and is
	 * marked deleted for destruction on last dereference.
	 */
	if (error == 0) {
		record->flags |= HAMMER_RECF_DELETED;
		record->flags &= ~HAMMER_RECF_SYNCING;
		goto done;
	}

	/*
	 * Insert failed: back out the record, then (fail1) any separate
	 * data allocation, then (fail2) the SYNCING flag.
	 */
	hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
	if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
		hammer_free_data_ptr(cursor.data_buffer, bdata,
				     record->rec.base.data_len);
	}
fail2:
	record->flags &= ~HAMMER_RECF_SYNCING;
done:
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	return(error);
}
670 
671 /*
672  * Write out a record using the specified cursor.  The caller does not have
673  * to seek the cursor.  The flags are used to determine whether the data
674  * (if any) is embedded in the record or not.
675  *
676  * The target cursor will be modified by this call.  Note in particular
677  * that HAMMER_CURSOR_INSERT is set.
678  */
679 int
680 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
681 		    void *data, int cursor_flags)
682 {
683 	union hammer_btree_elm elm;
684 	hammer_record_ondisk_t nrec;
685 	void *bdata;
686 	int error;
687 
688 	cursor->key_beg = orec->base.base;
689 	cursor->flags |= HAMMER_CURSOR_INSERT;
690 
691 	/*
692 	 * Issue a lookup to position the cursor and locate the cluster.  The
693 	 * target key should not exist.
694 	 *
695 	 * If we run out of space trying to adjust the B-Tree for the
696 	 * insert, re-lookup without the insert flag so the cursor
697 	 * is properly positioned for the spike.
698 	 */
699 	error = hammer_btree_lookup(cursor);
700 	if (error == 0) {
701 		kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
702 			orec->base.base.key);
703 		Debugger("duplicate record2");
704 		error = EIO;
705 	}
706 	if (error != ENOENT)
707 		goto done;
708 
709 	/*
710 	 * Allocate record and data space now that we know which cluster
711 	 * the B-Tree node ended up in.
712 	 */
713 	if (data == NULL ||
714 	    (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
715 		bdata = data;
716 	} else {
717 		bdata = hammer_alloc_data(cursor->node->cluster,
718 					  orec->base.data_len, &error,
719 					  &cursor->data_buffer);
720 		if (bdata == NULL)
721 			goto done;
722 	}
723 	nrec = hammer_alloc_record(cursor->node->cluster, &error,
724 				  &cursor->record_buffer);
725 	if (nrec == NULL)
726 		goto fail1;
727 
728 	/*
729 	 * Fill everything in and insert our B-Tree node.
730 	 *
731 	 * XXX assign rec_id here
732 	 */
733 	hammer_modify_buffer(cursor->record_buffer);
734 	*nrec = *orec;
735 	nrec->base.data_offset = 0;
736 	if (bdata) {
737 		nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
738 		if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
739 			/*
740 			 * Data embedded in record
741 			 */
742 			nrec->base.data_offset = ((char *)bdata - (char *)orec);
743 			KKASSERT(nrec->base.data_offset >= 0 &&
744 				 nrec->base.data_offset + nrec->base.data_len <
745 				  sizeof(*nrec));
746 			nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
747 		} else {
748 			/*
749 			 * Data separate from record
750 			 */
751 			nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
752 			hammer_modify_buffer(cursor->data_buffer);
753 			bcopy(data, bdata, nrec->base.data_len);
754 			hammer_modify_buffer_done(cursor->data_buffer);
755 		}
756 	}
757 	nrec->base.rec_id = 0;	/* XXX */
758 	hammer_modify_buffer_done(cursor->record_buffer);
759 
760 	elm.leaf.base = nrec->base.base;
761 	elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
762 	elm.leaf.data_offset = nrec->base.data_offset;
763 	elm.leaf.data_len = nrec->base.data_len;
764 	elm.leaf.data_crc = nrec->base.data_crc;
765 
766 	error = hammer_btree_insert(cursor, &elm);
767 	if (error == 0)
768 		goto done;
769 
770 	hammer_free_record_ptr(cursor->record_buffer, nrec);
771 fail1:
772 	if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
773 		hammer_free_data_ptr(cursor->data_buffer, bdata,
774 				     orec->base.data_len);
775 	}
776 done:
777 	/* leave cursor intact */
778 	return(error);
779 }
780 
781 /*
782  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
783  * entry's key is used to deal with hash collisions in the upper 32 bits.
784  * A unique 64 bit key is generated in-memory and may be regenerated a
785  * second time when the directory record is flushed to the on-disk B-Tree.
786  *
787  * A referenced record is passed to this function.  This function
788  * eats the reference.  If an error occurs the record will be deleted.
789  */
static
int
hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
{
	/*
	 * Loop on insertion collisions.  Only directory entries may
	 * iterate the low 32 bits of their key to resolve a collision;
	 * any other record type fails with EEXIST (the record is marked
	 * deleted and our reference dropped, destroying it).
	 */
	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
			record->flags |= HAMMER_RECF_DELETED;
			hammer_rel_mem_record(record);
			return (EEXIST);
		}
		/* regenerate low 32 bits of the key, skipping iterator 0 */
		if (++trans->hmp->namekey_iterator == 0)
			++trans->hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= trans->hmp->namekey_iterator;
	}
	/*
	 * Inserted.  The tree now effectively holds the reference we
	 * were passed; drop ours.
	 */
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}
810 
811 /************************************************************************
812  *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
813  ************************************************************************
814  *
815  * These functions augment the B-Tree scanning functions in hammer_btree.c
816  * by merging in-memory records with on-disk records.
817  */
818 
819 /*
820  * Locate a particular record either in-memory or on-disk.
821  *
822  * NOTE: This is basically a standalone routine, hammer_ip_next() may
823  * NOT be called to iterate results.
824  */
int
hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	error = hammer_mem_lookup(cursor, ip);
	if (error == 0) {
		/* point the cursor's record at the in-memory copy */
		cursor->record = &cursor->iprec->rec;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 * Otherwise propagate the ENOENT from the memory lookup.
	 */
	if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	return(error);
}
852 
853 /*
854  * Locate the first record within the cursor's key_beg/key_end range,
855  * restricted to a particular inode.  0 is returned on success, ENOENT
856  * if no records matched the requested range, or some other error.
857  *
858  * When 0 is returned hammer_ip_next() may be used to iterate additional
859  * records within the requested range.
860  */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.  Start in the
	 * "everything eaten / at EOF" state and clear the flags below
	 * as each source (disk, memory) produces a record.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 */
	if (ip->flags & HAMMER_INODE_ONDISK) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			/* a disk record is pending */
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		/* an in-memory record is pending */
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}
919 
920 /*
921  * Retrieve the next record in a merged iteration within the bounds of the
922  * cursor.  This call may be made multiple times after the cursor has been
923  * initially searched with hammer_ip_first().
924  *
925  * 0 is returned on success, ENOENT if no further records match the
926  * requested range, or some other error code is returned.
927  */
928 int
929 hammer_ip_next(hammer_cursor_t cursor)
930 {
931 	hammer_btree_elm_t elm;
932 	hammer_record_t rec;
933 	int error;
934 	int r;
935 
936 	/*
937 	 * Load the current on-disk and in-memory record.  If we ate any
938 	 * records we have to get the next one.
939 	 *
940 	 * If we deleted the last on-disk record we had scanned ATEDISK will
941 	 * be clear and DELBTREE will be set, forcing a call to iterate. The
942 	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
943 	 * element.  If ATEDISK is set, iterate will skip the 'current'
944 	 * element.
945 	 *
946 	 * Get the next on-disk record
947 	 */
948 	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
949 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
950 			error = hammer_btree_iterate(cursor);
951 			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
952 			if (error == 0)
953 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
954 			else
955 				cursor->flags |= HAMMER_CURSOR_DISKEOF |
956 						 HAMMER_CURSOR_ATEDISK;
957 		}
958 	}
959 
960 	/*
961 	 * Get the next in-memory record.  The record can be ripped out
962 	 * of the RB tree so we maintain a scan_info structure to track
963 	 * the next node.
964 	 *
965 	 * hammer_rec_scan_cmp:  Is the record still in our general range,
966 	 *			 (non-inclusive of snapshot exclusions)?
967 	 * hammer_rec_scan_callback: Is the record in our snapshot?
968 	 */
969 	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
970 		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
971 			if (cursor->iprec) {
972 				hammer_rel_mem_record(cursor->iprec);
973 				cursor->iprec = NULL;
974 			}
975 			rec = cursor->scan.node;	/* next node */
976 			while (rec) {
977 				if (hammer_rec_scan_cmp(rec, cursor) != 0)
978 					break;
979 				if (hammer_rec_scan_callback(rec, cursor) != 0)
980 					break;
981 				rec = hammer_rec_rb_tree_RB_NEXT(rec);
982 			}
983 			if (cursor->iprec) {
984 				KKASSERT(cursor->iprec == rec);
985 				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
986 				cursor->scan.node =
987 					hammer_rec_rb_tree_RB_NEXT(rec);
988 			} else {
989 				cursor->flags |= HAMMER_CURSOR_MEMEOF;
990 			}
991 		}
992 	}
993 
994 	/*
995 	 * Extract either the disk or memory record depending on their
996 	 * relative position.
997 	 */
998 	error = 0;
999 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1000 	case 0:
1001 		/*
1002 		 * Both entries valid
1003 		 */
1004 		elm = &cursor->node->ondisk->elms[cursor->index];
1005 		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1006 		if (r < 0) {
1007 			error = hammer_btree_extract(cursor,
1008 						     HAMMER_CURSOR_GET_RECORD);
1009 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1010 			break;
1011 		}
1012 		/* fall through to the memory entry */
1013 	case HAMMER_CURSOR_ATEDISK:
1014 		/*
1015 		 * Only the memory entry is valid
1016 		 */
1017 		cursor->record = &cursor->iprec->rec;
1018 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
1019 		break;
1020 	case HAMMER_CURSOR_ATEMEM:
1021 		/*
1022 		 * Only the disk entry is valid
1023 		 */
1024 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1025 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
1026 		break;
1027 	default:
1028 		/*
1029 		 * Neither entry is valid
1030 		 *
1031 		 * XXX error not set properly
1032 		 */
1033 		cursor->record = NULL;
1034 		error = ENOENT;
1035 		break;
1036 	}
1037 	return(error);
1038 }
1039 
1040 /*
1041  * Resolve the cursor->data pointer for the current cursor position in
1042  * a merged iteration.
1043  */
1044 int
1045 hammer_ip_resolve_data(hammer_cursor_t cursor)
1046 {
1047 	int error;
1048 
1049 	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1050 		cursor->data = cursor->iprec->data;
1051 		error = 0;
1052 	} else {
1053 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1054 	}
1055 	return(error);
1056 }
1057 
1058 /*
1059  * Delete all records within the specified range for inode ip.
1060  *
1061  * NOTE: An unaligned range will cause new records to be added to cover
1062  * the edge cases. (XXX not implemented yet).
1063  *
1064  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1065  *
1066  * NOTE: Record keys for regular file data have to be special-cased since
1067  * they indicate the end of the range (key = base + bytes).
1068  *
1069  * NOTE: The spike structure must be filled in if we return ENOSPC.
1070  */
1071 int
1072 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1073 		       int64_t ran_beg, int64_t ran_end,
1074 		       struct hammer_cursor **spike)
1075 {
1076 	struct hammer_cursor cursor;
1077 	hammer_record_ondisk_t rec;
1078 	hammer_base_elm_t base;
1079 	int error;
1080 	int64_t off;
1081 
1082 	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1083 
1084 	cursor.key_beg.obj_id = ip->obj_id;
1085 	cursor.key_beg.create_tid = ip->obj_asof;
1086 	cursor.key_beg.delete_tid = 0;
1087 	cursor.key_beg.obj_type = 0;
1088 
1089 	cursor.key_end = cursor.key_beg;
1090 	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1091 		cursor.key_beg.key = ran_beg;
1092 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1093 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1094 		cursor.key_end.key = ran_end;
1095 	} else {
1096 		/*
1097 		 * The key in the B-Tree is (base+bytes), so the first possible
1098 		 * matching key is ran_beg + 1.
1099 		 */
1100 		int64_t tmp64;
1101 
1102 		cursor.key_beg.key = ran_beg + 1;
1103 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1104 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1105 
1106 		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
1107 		if (tmp64 < ran_end)
1108 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1109 		else
1110 			cursor.key_end.key = ran_end + MAXPHYS + 1;
1111 	}
1112 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1113 
1114 	error = hammer_ip_first(&cursor, ip);
1115 
1116 	/*
1117 	 * Iterate through matching records and mark them as deleted.
1118 	 */
1119 	while (error == 0) {
1120 		rec = cursor.record;
1121 		base = &rec->base.base;
1122 
1123 		KKASSERT(base->delete_tid == 0);
1124 
1125 		/*
1126 		 * There may be overlap cases for regular file data.  Also
1127 		 * remember the key for a regular file record is the offset
1128 		 * of the last byte of the record (base + len - 1), NOT the
1129 		 * base offset.
1130 		 */
1131 #if 0
1132 		kprintf("delete_range rec_type %02x\n", base->rec_type);
1133 #endif
1134 		if (base->rec_type == HAMMER_RECTYPE_DATA) {
1135 #if 0
1136 			kprintf("delete_range loop key %016llx\n",
1137 				base->key - rec->base.data_len);
1138 #endif
1139 			off = base->key - rec->base.data_len;
1140 			/*
1141 			 * Check the left edge case.  We currently do not
1142 			 * split existing records.
1143 			 */
1144 			if (off < ran_beg) {
1145 				panic("hammer left edge case %016llx %d\n",
1146 					base->key, rec->base.data_len);
1147 			}
1148 
1149 			/*
1150 			 * Check the right edge case.  Note that the
1151 			 * record can be completely out of bounds, which
1152 			 * terminates the search.
1153 			 *
1154 			 * base->key is exclusive of the right edge while
1155 			 * ran_end is inclusive of the right edge.  The
1156 			 * (key - data_len) left boundary is inclusive.
1157 			 *
1158 			 * XXX theory-check this test at some point, are
1159 			 * we missing a + 1 somewhere?  Note that ran_end
1160 			 * could overflow.
1161 			 */
1162 			if (base->key - 1 > ran_end) {
1163 				if (base->key - rec->base.data_len > ran_end) {
1164 					kprintf("right edge OOB\n");
1165 					break;
1166 				}
1167 				panic("hammer right edge case\n");
1168 			}
1169 		}
1170 
1171 		/*
1172 		 * Mark the record and B-Tree entry as deleted.  This will
1173 		 * also physically delete the B-Tree entry, record, and
1174 		 * data if the retention policy dictates.  The function
1175 		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1176 		 * uses to perform a fixup.
1177 		 */
1178 		error = hammer_ip_delete_record(&cursor, trans->tid);
1179 		if (error)
1180 			break;
1181 		error = hammer_ip_next(&cursor);
1182 	}
1183 	hammer_done_cursor(&cursor);
1184 	if (error == ENOENT)
1185 		error = 0;
1186 	return(error);
1187 }
1188 
1189 /*
1190  * Delete all records associated with an inode except the inode record
1191  * itself.
1192  */
1193 int
1194 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1195 {
1196 	struct hammer_cursor cursor;
1197 	hammer_record_ondisk_t rec;
1198 	hammer_base_elm_t base;
1199 	int error;
1200 
1201 	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1202 
1203 	cursor.key_beg.obj_id = ip->obj_id;
1204 	cursor.key_beg.create_tid = ip->obj_asof;
1205 	cursor.key_beg.delete_tid = 0;
1206 	cursor.key_beg.obj_type = 0;
1207 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1208 	cursor.key_beg.key = HAMMER_MIN_KEY;
1209 
1210 	cursor.key_end = cursor.key_beg;
1211 	cursor.key_end.rec_type = 0xFFFF;
1212 	cursor.key_end.key = HAMMER_MAX_KEY;
1213 
1214 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1215 
1216 	error = hammer_ip_first(&cursor, ip);
1217 
1218 	/*
1219 	 * Iterate through matching records and mark them as deleted.
1220 	 */
1221 	while (error == 0) {
1222 		rec = cursor.record;
1223 		base = &rec->base.base;
1224 
1225 		KKASSERT(base->delete_tid == 0);
1226 
1227 		/*
1228 		 * Mark the record and B-Tree entry as deleted.  This will
1229 		 * also physically delete the B-Tree entry, record, and
1230 		 * data if the retention policy dictates.  The function
1231 		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1232 		 * uses to perform a fixup.
1233 		 */
1234 		error = hammer_ip_delete_record(&cursor, trans->tid);
1235 		if (error)
1236 			break;
1237 		error = hammer_ip_next(&cursor);
1238 	}
1239 	hammer_done_cursor(&cursor);
1240 	if (error == ENOENT)
1241 		error = 0;
1242 	return(error);
1243 }
1244 
1245 /*
1246  * Delete the record at the current cursor
1247  */
1248 int
1249 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1250 {
1251 	hammer_btree_elm_t elm;
1252 	hammer_mount_t hmp;
1253 	int error;
1254 
1255 	/*
1256 	 * In-memory (unsynchronized) records can simply be freed.
1257 	 */
1258 	if (cursor->record == &cursor->iprec->rec) {
1259 		cursor->iprec->flags |= HAMMER_RECF_DELETED;
1260 		return(0);
1261 	}
1262 
1263 	/*
1264 	 * On-disk records are marked as deleted by updating their delete_tid.
1265 	 */
1266 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1267 	elm = NULL;
1268 	hmp = cursor->node->cluster->volume->hmp;
1269 
1270 	if (error == 0) {
1271 		hammer_modify_buffer(cursor->record_buffer);
1272 		cursor->record->base.base.delete_tid = tid;
1273 
1274 		hammer_modify_buffer_done(cursor->record_buffer);
1275 		hammer_modify_node(cursor->node);
1276 		elm = &cursor->node->ondisk->elms[cursor->index];
1277 		elm->leaf.base.delete_tid = tid;
1278 		hammer_modify_node_done(cursor->node);
1279 	}
1280 
1281 	/*
1282 	 * If we were mounted with the nohistory option, we physically
1283 	 * delete the record.
1284 	 */
1285 	if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1286 		int32_t rec_offset;
1287 		int32_t data_offset;
1288 		int32_t data_len;
1289 		hammer_cluster_t cluster;
1290 
1291 		rec_offset = elm->leaf.rec_offset;
1292 		data_offset = elm->leaf.data_offset;
1293 		data_len = elm->leaf.data_len;
1294 #if 0
1295 		kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1296 			rec_offset, data_offset, data_len);
1297 #endif
1298 		cluster = cursor->node->cluster;
1299 		hammer_ref_cluster(cluster);
1300 
1301 		error = hammer_btree_delete(cursor);
1302 		if (error == 0) {
1303 			/*
1304 			 * This forces a fixup for the iteration because
1305 			 * the cursor is now either sitting at the 'next'
1306 			 * element or sitting at the end of a leaf.
1307 			 */
1308 			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1309 				cursor->flags |= HAMMER_CURSOR_DELBTREE;
1310 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1311 			}
1312 			hammer_free_record(cluster, rec_offset);
1313 			if (data_offset && (data_offset - rec_offset < 0 ||
1314 			    data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1315 				hammer_free_data(cluster, data_offset,data_len);
1316 			}
1317 		}
1318 		hammer_rel_cluster(cluster, 0);
1319 		if (error) {
1320 			panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1321 			error = 0;
1322 		}
1323 	}
1324 	return(error);
1325 }
1326 
1327 /*
1328  * Determine whether a directory is empty or not.  Returns 0 if the directory
1329  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1330  */
1331 int
1332 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1333 {
1334 	struct hammer_cursor cursor;
1335 	int error;
1336 
1337 	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1338 
1339 	cursor.key_beg.obj_id = ip->obj_id;
1340 	cursor.key_beg.create_tid = ip->obj_asof;
1341 	cursor.key_beg.delete_tid = 0;
1342 	cursor.key_beg.obj_type = 0;
1343 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1344 	cursor.key_beg.key = HAMMER_MIN_KEY;
1345 
1346 	cursor.key_end = cursor.key_beg;
1347 	cursor.key_end.rec_type = 0xFFFF;
1348 	cursor.key_end.key = HAMMER_MAX_KEY;
1349 
1350 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1351 
1352 	error = hammer_ip_first(&cursor, ip);
1353 	if (error == ENOENT)
1354 		error = 0;
1355 	else if (error == 0)
1356 		error = ENOTEMPTY;
1357 	hammer_done_cursor(&cursor);
1358 	return(error);
1359 }
1360 
1361