xref: /dflybsd-src/sys/vfs/hammer/hammer_object.c (revision 21d8bc42f570abe46007647277c2c8d68cf8b55a)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.26 2008/01/25 21:50:56 dillon Exp $
35  */
36 
37 #include "hammer.h"
38 
39 static int hammer_mem_add(hammer_transaction_t trans,
40 			     hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43 
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50 	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51 		return(-1);
52 	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53 		return(1);
54 
55 	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56 		return(-1);
57 	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58 		return(1);
59 
60 	if (rec1->rec.base.base.create_tid == 0) {
61 		if (rec2->rec.base.base.create_tid == 0)
62 			return(0);
63 		return(1);
64 	}
65 	if (rec2->rec.base.base.create_tid == 0)
66 		return(-1);
67 
68 	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
69 		return(-1);
70 	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
71 		return(1);
72         return(0);
73 }
74 
75 static int
76 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
77 {
78 	if (info->rec_type < rec->rec.base.base.rec_type)
79 		return(-3);
80 	if (info->rec_type > rec->rec.base.base.rec_type)
81 		return(3);
82 
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87 
88 	if (info->create_tid == 0) {
89 		if (rec->rec.base.base.create_tid == 0)
90 			return(0);
91 		return(1);
92 	}
93 	if (rec->rec.base.base.create_tid == 0)
94 		return(-1);
95 	if (info->create_tid < rec->rec.base.base.create_tid)
96 		return(-1);
97 	if (info->create_tid > rec->rec.base.base.create_tid)
98 		return(1);
99         return(0);
100 }
101 
102 /*
103  * RB_SCAN comparison code for hammer_mem_first().  The argument order
104  * is reversed so the comparison result has to be negated.  key_beg and
105  * key_end are both range-inclusive.
106  *
107  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
108  * These do not stop the scan.
109  *
110  * Localized deletions are not cached in-memory.
111  */
112 static
113 int
114 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
115 {
116 	hammer_cursor_t cursor = data;
117 	int r;
118 
119 	r = hammer_rec_compare(&cursor->key_beg, rec);
120 	if (r > 1)
121 		return(-1);
122 	r = hammer_rec_compare(&cursor->key_end, rec);
123 	if (r < -1)
124 		return(1);
125 	return(0);
126 }
127 
/*
 * Instantiate the RB-Tree functions for the in-memory record tree, plus
 * the extended INFO lookup keyed by hammer_base_elm_t (used by
 * hammer_mem_lookup()).
 */
RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_compare, hammer_base_elm_t);
131 
132 /*
133  * Allocate a record for the caller to finish filling in.  The record is
134  * returned referenced.
135  */
136 hammer_record_t
137 hammer_alloc_mem_record(hammer_inode_t ip)
138 {
139 	hammer_record_t record;
140 
141 	++hammer_count_records;
142 	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
143 	record->ip = ip;
144 	record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
145 	hammer_ref(&record->lock);
146 	return (record);
147 }
148 
/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_unref(&record->lock);

	if (record->flags & HAMMER_RECF_DELETED) {
		/*
		 * Unlink deleted records from the RB-Tree right away so
		 * lookups no longer find them, even while refs remain.
		 */
		if (record->flags & HAMMER_RECF_ONRBTREE) {
			RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
				  record);
			record->flags &= ~HAMMER_RECF_ONRBTREE;
		}
		if (record->lock.refs == 0) {
			/*
			 * Last reference is gone.  Free separately allocated
			 * data (embedded data lives inside the record and
			 * needs no separate free), then the record itself.
			 */
			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, M_HAMMER);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, M_HAMMER);
			return;
		}
	}

	/*
	 * If someone wanted the record wake them up.
	 */
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
}
186 
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/*
	 * Drop any record held over from a prior operation and move the
	 * cursor's scan tracker from its previous inode (if any) to the
	 * target inode's record tree.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	/*
	 * Exact lookup on key_beg.  On a hit the record is returned
	 * referenced in cursor->iprec.
	 */
	cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
				&ip->rec_tree, &cursor->key_beg);
	if (cursor->iprec == NULL) {
		error = ENOENT;
	} else {
		hammer_ref(&cursor->iprec->lock);
		error = 0;
	}
	return(error);
}
221 
222 /*
223  * hammer_mem_first() - locate the first in-memory record matching the
224  * cursor.
225  *
226  * The RB_SCAN function we use is designed as a callback.  We terminate it
227  * (return -1) as soon as we get a match.
228  */
229 static
230 int
231 hammer_rec_scan_callback(hammer_record_t rec, void *data)
232 {
233 	hammer_cursor_t cursor = data;
234 
235 	/*
236 	 * We terminate on success, so this should be NULL on entry.
237 	 */
238 	KKASSERT(cursor->iprec == NULL);
239 
240 	/*
241 	 * Skip if the record was marked deleted
242 	 */
243 	if (rec->flags & HAMMER_RECF_DELETED)
244 		return(0);
245 
246 	/*
247 	 * Skip if not visible due to our as-of TID
248 	 */
249         if (cursor->flags & HAMMER_CURSOR_ASOF) {
250                 if (cursor->asof < rec->rec.base.base.create_tid)
251                         return(0);
252                 if (rec->rec.base.base.delete_tid &&
253 		    cursor->asof >= rec->rec.base.base.delete_tid) {
254                         return(0);
255 		}
256         }
257 
258 	/*
259 	 * Block if currently being synchronized to disk, otherwise we
260 	 * may get a duplicate.  Wakeup the syncer if it's stuck on
261 	 * the record.
262 	 */
263 	hammer_ref(&rec->lock);
264 	++rec->blocked;
265 	while (rec->flags & HAMMER_RECF_SYNCING) {
266 		rec->flags |= HAMMER_RECF_WANTED;
267 		tsleep(rec, 0, "hmrrc2", 0);
268 	}
269 	--rec->blocked;
270 
271 	/*
272 	 * The record may have been deleted while we were blocked.
273 	 */
274 	if (rec->flags & HAMMER_RECF_DELETED) {
275 		hammer_rel_mem_record(cursor->iprec);
276 		return(0);
277 	}
278 
279 	/*
280 	 * Set the matching record and stop the scan.
281 	 */
282 	cursor->iprec = rec;
283 	return(-1);
284 }
285 
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/*
	 * Drop any record held over from a prior operation and move the
	 * cursor's scan tracker to the target inode's record tree.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);

	/*
	 * Range-scan the tree.  The callback terminates the scan and sets
	 * cursor->iprec (referenced) on the first visible match.
	 */
	cursor->scan.node = NULL;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
		return(0);
	}
	return(ENOENT);
}
315 
316 void
317 hammer_mem_done(hammer_cursor_t cursor)
318 {
319 	if (cursor->ip) {
320 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
321 						  &cursor->ip->rec_tree);
322 		cursor->ip = NULL;
323 	}
324         if (cursor->iprec) {
325 		hammer_rel_mem_record(cursor->iprec);
326 		cursor->iprec = NULL;
327 	}
328 }
329 
330 /************************************************************************
331  *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
332  ************************************************************************
333  *
334  * These functions manipulate in-memory records.  Such records typically
335  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
336  */
337 
/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	/*
	 * Advance the per-mount namekey iterator, skipping 0 so the low
	 * 32 bits of the key are never all-zero.
	 */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	/*
	 * Key = name hash (upper bits) + iterator (low 32 bits) to
	 * disambiguate hash collisions in-memory.
	 */
	record->rec.entry.base.base.obj_id = dip->obj_id;
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.create_tid = trans->tid;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	/*
	 * Short names are embedded directly in the record; longer names
	 * get a separate allocation tracked via HAMMER_RECF_ALLOCDATA.
	 */
	if (bytes <= sizeof(record->rec.entry.den_name)) {
		record->data = (void *)record->rec.entry.den_name;
		record->flags |= HAMMER_RECF_EMBEDDED_DATA;
	} else {
		++hammer_count_record_datas;
		record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
		record->flags |= HAMMER_RECF_ALLOCDATA;
	}
	bcopy(ncp->nc_name, record->data, bytes);
	record->rec.entry.base.data_len = bytes;
	/*
	 * The target inode gains a link; mark it dirty and queue the
	 * directory record (hammer_mem_add eats our reference).
	 */
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	error = hammer_mem_add(trans, record);
	return(error);
}
384 
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	int error;

	error = hammer_ip_delete_record(cursor, trans->tid);

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we only try to sync if the OS has
	 * no references and nlinks falls to 0.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		if (ip->ino_rec.ino_nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_done_cursor(cursor);
			hammer_sync_inode(ip, MNT_NOWAIT, 1);
		}

	}
	return(error);
}
423 
/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data		(a copy will be kmalloc'd if not embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;
	int bytes;
	void *data;

	record->rec.base.base.obj_id = ip->obj_id;
	record->rec.base.base.create_tid = trans->tid;
	record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	bytes = record->rec.base.data_len;

	if (record->data) {
		/*
		 * If record->data points outside the record structure
		 * itself, make a private copy (ALLOCDATA); otherwise the
		 * data is embedded within the record (EMBEDDED_DATA).
		 */
		if ((char *)record->data < (char *)&record->rec ||
		    (char *)record->data >= (char *)(&record->rec + 1)) {
			++hammer_count_record_datas;
			data = kmalloc(bytes, M_HAMMER, M_WAITOK);
			record->flags |= HAMMER_RECF_ALLOCDATA;
			bcopy(record->data, data, bytes);
			record->data = data;
		} else {
			record->flags |= HAMMER_RECF_EMBEDDED_DATA;
		}
	}
	/*
	 * Queue the record; hammer_mem_add eats our reference.
	 */
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	error = hammer_mem_add(trans, record);
	return(error);
}
464 
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes,
		       struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

	/* only whole, buffer-aligned extents are handled here */
	KKASSERT((offset & HAMMER_BUFMASK) == 0);
	KKASSERT((bytes & HAMMER_BUFMASK) == 0);
retry:
	error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
	if (error)
		return(error);
	/*
	 * Data records are keyed by the END of the range (offset + bytes).
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.asof = trans->tid;
	cursor.flags |= HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		/*
		 * The key should not already exist -- a hit indicates a
		 * duplicate data record and is treated as an error.
		 */
		kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
			offset, bytes);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
				  &cursor.data_buffer);
	if (bdata == NULL)
		goto done;
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  HAMMER_RECTYPE_DATA, &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	hammer_modify_buffer(cursor.record_buffer);
	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	rec->base.base.obj_id = ip->obj_id;
	rec->base.base.key = offset + bytes;
	rec->base.base.create_tid = trans->tid;
	rec->base.base.delete_tid = 0;
	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
	rec->base.data_crc = crc32(data, bytes);
	rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster);
	rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
	rec->base.data_len = bytes;

	hammer_modify_buffer(cursor.data_buffer);
	bcopy(data, bdata, bytes);

	/*
	 * Build the leaf element referencing the record and data we just
	 * allocated.
	 */
	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	/*
	 * Data records can wind up on-disk before the inode itself is
	 * on-disk.  One must assume data records may be on-disk if either
	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
	 */
	ip->flags |= HAMMER_INODE_DONDISK;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/*
	 * Insert failed: unwind the record and data allocations in
	 * reverse order.
	 */
	hammer_free_record_ptr(cursor.record_buffer, rec, HAMMER_RECTYPE_DATA);
fail1:
	hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}
574 
575 /*
576  * Sync an in-memory record to the disk.  this is typically called via fsync
577  * from a cached record source.  This code is responsible for actually
578  * writing a record out to the disk.
579  */
580 int
581 hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
582 {
583 	struct hammer_cursor cursor;
584 	hammer_record_ondisk_t rec;
585 	hammer_mount_t hmp;
586 	union hammer_btree_elm elm;
587 	void *bdata;
588 	int error;
589 
590 retry:
591 	/*
592 	 * If the record has been deleted or is being synchronized, stop.
593 	 * Interlock with the syncing flag.
594 	 */
595 	if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING))
596 		return(0);
597 	record->flags |= HAMMER_RECF_SYNCING;
598 
599 	/*
600 	 * If someone other then us is referencing the record and not
601 	 * blocking waiting for us, we have to wait until they finish.
602 	 *
603 	 * It is possible the record got destroyed while we were blocked.
604 	 */
605 	if (record->lock.refs > record->blocked + 1) {
606 		while (record->lock.refs > record->blocked + 1) {
607 			record->flags |= HAMMER_RECF_WANTED;
608 			tsleep(record, 0, "hmrrc1", 0);
609 		}
610 		if (record->flags & HAMMER_RECF_DELETED)
611 			return(0);
612 	}
613 
614 	/*
615 	 * Get a cursor
616 	 */
617 	error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
618 				       record->ip->hmp);
619 	if (error)
620 		return(error);
621 	cursor.key_beg = record->rec.base.base;
622 	cursor.flags |= HAMMER_CURSOR_INSERT;
623 
624 	/*
625 	 * Issue a lookup to position the cursor and locate the cluster.  The
626 	 * target key should not exist.  If we are creating a directory entry
627 	 * we may have to iterate the low 32 bits of the key to find an unused
628 	 * key.
629 	 */
630 	for (;;) {
631 		error = hammer_btree_lookup(&cursor);
632 		if (error)
633 			break;
634 		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
635 			kprintf("hammer_ip_sync_record: duplicate rec "
636 				"at (%016llx)\n", record->rec.base.base.key);
637 			Debugger("duplicate record1");
638 			error = EIO;
639 			break;
640 		}
641 		hmp = cursor.node->cluster->volume->hmp;
642 		if (++hmp->namekey_iterator == 0)
643 			++hmp->namekey_iterator;
644 		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
645 		record->rec.base.base.key |= hmp->namekey_iterator;
646 		cursor.key_beg.key = record->rec.base.base.key;
647 	}
648 	if (error != ENOENT)
649 		goto done;
650 
651 	/*
652 	 * Mark the record as undergoing synchronization.  Our cursor is
653 	 * holding a locked B-Tree node for the insertion which interlocks
654 	 * anyone trying to access this record.
655 	 *
656 	 * XXX There is still a race present related to iterations.  An
657 	 * iteration may process the record, a sync may occur, and then
658 	 * later process the B-Tree element for the same record.
659 	 *
660 	 * We do not try to synchronize a deleted record.
661 	 */
662 	if (record->flags & HAMMER_RECF_DELETED) {
663 		error = 0;
664 		goto done;
665 	}
666 
667 	/*
668 	 * Allocate record and data space now that we know which cluster
669 	 * the B-Tree node ended up in.
670 	 */
671 	if (record->data == NULL ||
672 	    (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
673 		bdata = record->data;
674 	} else {
675 		bdata = hammer_alloc_data(cursor.node->cluster,
676 					  record->rec.base.data_len, &error,
677 					  &cursor.data_buffer);
678 		if (bdata == NULL)
679 			goto done;
680 	}
681 	rec = hammer_alloc_record(cursor.node->cluster, &error,
682 				  record->rec.base.base.rec_type,
683 				  &cursor.record_buffer);
684 	if (rec == NULL)
685 		goto fail1;
686 
687 	/*
688 	 * Fill everything in and insert our B-Tree node.
689 	 */
690 	hammer_modify_buffer(cursor.record_buffer);
691 	*rec = record->rec;
692 	if (bdata) {
693 		rec->base.data_crc = crc32(record->data,
694 					   record->rec.base.data_len);
695 		if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
696 			/*
697 			 * Data embedded in record
698 			 */
699 			rec->base.data_offset = ((char *)bdata -
700 						 (char *)&record->rec);
701 			KKASSERT(rec->base.data_offset >= 0 &&
702 				 rec->base.data_offset + rec->base.data_len <=
703 				  sizeof(*rec));
704 			rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
705 		} else {
706 			/*
707 			 * Data separate from record
708 			 */
709 			rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
710 			hammer_modify_buffer(cursor.data_buffer);
711 			bcopy(record->data, bdata, rec->base.data_len);
712 		}
713 	}
714 	rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster);
715 
716 	elm.leaf.base = record->rec.base.base;
717 	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
718 	elm.leaf.data_offset = rec->base.data_offset;
719 	elm.leaf.data_len = rec->base.data_len;
720 	elm.leaf.data_crc = rec->base.data_crc;
721 
722 	error = hammer_btree_insert(&cursor, &elm);
723 
724 	/*
725 	 * Clean up on success, or fall through on error.
726 	 */
727 	if (error == 0) {
728 		record->flags |= HAMMER_RECF_DELETED;
729 		goto done;
730 	}
731 
732 	hammer_free_record_ptr(cursor.record_buffer, rec,
733 			       record->rec.base.base.rec_type);
734 fail1:
735 	if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
736 		hammer_free_data_ptr(cursor.data_buffer, bdata,
737 				     record->rec.base.data_len);
738 	}
739 done:
740 	record->flags &= ~HAMMER_RECF_SYNCING;
741 	/*
742 	 * If ENOSPC in cluster fill in the spike structure and return
743 	 * ENOSPC.
744 	 */
745 	if (error == ENOSPC)
746 		hammer_load_spike(&cursor, spike);
747 	hammer_done_cursor(&cursor);
748 	if (error == EDEADLK)
749 		goto retry;
750 	return(error);
751 }
752 
753 /*
754  * Write out a record using the specified cursor.  The caller does not have
755  * to seek the cursor.  The flags are used to determine whether the data
756  * (if any) is embedded in the record or not.
757  *
758  * The cursor must be initialized, including its key_beg element.  The B-Tree
759  * key may be different from the base element in the record (e.g. for spikes).
760  *
761  * NOTE: This can return EDEADLK, requiring the caller to release its cursor
762  * and retry the operation.
763  */
764 int
765 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
766 		    void *data, int cursor_flags)
767 {
768 	union hammer_btree_elm elm;
769 	hammer_record_ondisk_t nrec;
770 	hammer_cluster_t ncluster;
771 	hammer_volume_t nvolume;
772 	int32_t nrec_offset;
773 	void *bdata;
774 	int error;
775 
776 	KKASSERT(cursor->flags & HAMMER_CURSOR_INSERT);
777 
778 	/*
779 	 * Issue a lookup to position the cursor and locate the cluster.  The
780 	 * target key should not exist.
781 	 *
782 	 * If we run out of space trying to adjust the B-Tree for the
783 	 * insert, re-lookup without the insert flag so the cursor
784 	 * is properly positioned for the spike.
785 	 */
786 	error = hammer_btree_lookup(cursor);
787 	if (error == 0) {
788 		kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
789 			orec->base.base.key);
790 		Debugger("duplicate record2");
791 		error = EIO;
792 	}
793 	if (error != ENOENT)
794 		goto done;
795 
796 	/*
797 	 * Allocate record and data space now that we know which cluster
798 	 * the B-Tree node ended up in.
799 	 */
800 	if (data == NULL ||
801 	    (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
802 		bdata = data;
803 	} else {
804 		bdata = hammer_alloc_data(cursor->node->cluster,
805 					  orec->base.data_len, &error,
806 					  &cursor->data_buffer);
807 		if (bdata == NULL)
808 			goto done;
809 	}
810 	nrec = hammer_alloc_record(cursor->node->cluster, &error,
811 				  orec->base.base.rec_type,
812 				  &cursor->record_buffer);
813 	if (nrec == NULL)
814 		goto fail1;
815 
816 	/*
817 	 * Fill everything in and insert our B-Tree node.
818 	 */
819 	hammer_modify_buffer(cursor->record_buffer);
820 	*nrec = *orec;
821 	nrec->base.data_offset = 0;
822 	if (bdata) {
823 		nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
824 		if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
825 			/*
826 			 * Data embedded in record
827 			 */
828 			nrec->base.data_offset = ((char *)bdata - (char *)orec);
829 			KKASSERT(nrec->base.data_offset >= 0 &&
830 				 nrec->base.data_offset + nrec->base.data_len <
831 				  sizeof(*nrec));
832 			nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
833 		} else {
834 			/*
835 			 * Data separate from record
836 			 */
837 			nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
838 			hammer_modify_buffer(cursor->data_buffer);
839 			bcopy(data, bdata, nrec->base.data_len);
840 		}
841 	}
842 	nrec->base.rec_id = hammer_alloc_recid(cursor->node->cluster);
843 	nrec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
844 
845 	switch(cursor->key_beg.btype) {
846 	case HAMMER_BTREE_TYPE_RECORD:
847 		elm.leaf.base = cursor->key_beg;
848 		elm.leaf.rec_offset = nrec_offset;
849 		elm.leaf.data_offset = nrec->base.data_offset;
850 		elm.leaf.data_len = nrec->base.data_len;
851 		elm.leaf.data_crc = nrec->base.data_crc;
852 		error = hammer_btree_insert(cursor, &elm);
853 		break;
854 	case HAMMER_BTREE_TYPE_SPIKE_BEG:
855 #if 0
856 		Debugger("SpikeSpike");
857 #endif
858 		nvolume = hammer_get_volume(cursor->node->cluster->volume->hmp,
859 					    nrec->spike.vol_no, &error);
860 		if (error)
861 			break;
862 		ncluster = hammer_get_cluster(nvolume, nrec->spike.clu_no,
863 					      &error, GET_CLUSTER_NORECOVER);
864 		hammer_rel_volume(nvolume, 0);
865                 if (error)
866                         break;
867                 hammer_modify_cluster(ncluster);
868 		ncluster->ondisk->clu_btree_parent_offset =
869 			cursor->node->node_offset;
870                 hammer_rel_cluster(ncluster, 0);
871 		error = hammer_btree_insert_cluster(cursor, ncluster,
872 						    nrec_offset);
873 		break;
874 	case HAMMER_BTREE_TYPE_SPIKE_END:
875 		panic("hammer_write_record: cannot write spike-end elms");
876 		break;
877 	default:
878 		panic("hammer_write_record: unknown btype %02x",
879 		      elm.leaf.base.btype);
880 		break;
881 	}
882 	if (error == 0)
883 		goto done;
884 
885 	hammer_free_record_ptr(cursor->record_buffer, nrec,
886 			       orec->base.base.rec_type);
887 fail1:
888 	if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
889 		hammer_free_data_ptr(cursor->data_buffer, bdata,
890 				     orec->base.data_len);
891 	}
892 done:
893 	/* leave cursor intact */
894 	return(error);
895 }
896 
/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 */
static
int
hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
{
	/*
	 * RB_INSERT returns non-NULL on a key collision.  Only directory
	 * entries may retry (by regenerating the low 32 bits of the key);
	 * any other record type fails with EEXIST.
	 */
	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
			record->flags |= HAMMER_RECF_DELETED;
			hammer_rel_mem_record(record);
			return (EEXIST);
		}
		/*
		 * Skip 0 so the low 32 bits are never all-zero.
		 */
		if (++trans->hmp->namekey_iterator == 0)
			++trans->hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= trans->hmp->namekey_iterator;
	}
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}
926 
927 /************************************************************************
928  *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
929  ************************************************************************
930  *
931  * These functions augment the B-Tree scanning functions in hammer_btree.c
932  * by merging in-memory records with on-disk records.
933  */
934 
935 /*
936  * Locate a particular record either in-memory or on-disk.
937  *
938  * NOTE: This is basically a standalone routine, hammer_ip_next() may
939  * NOT be called to iterate results.
940  */
941 int
942 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
943 {
944 	int error;
945 
946 	/*
947 	 * If the element is in-memory return it without searching the
948 	 * on-disk B-Tree
949 	 */
950 	error = hammer_mem_lookup(cursor, ip);
951 	if (error == 0) {
952 		cursor->record = &cursor->iprec->rec;
953 		return(error);
954 	}
955 	if (error != ENOENT)
956 		return(error);
957 
958 	/*
959 	 * If the inode has on-disk components search the on-disk B-Tree.
960 	 */
961 	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
962 		return(error);
963 	error = hammer_btree_lookup(cursor);
964 	if (error == 0)
965 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
966 	return(error);
967 }
968 
969 /*
970  * Locate the first record within the cursor's key_beg/key_end range,
971  * restricted to a particular inode.  0 is returned on success, ENOENT
972  * if no records matched the requested range, or some other error.
973  *
974  * When 0 is returned hammer_ip_next() may be used to iterate additional
975  * records within the requested range.
976  *
977  * This function can return EDEADLK, requiring the caller to terminate
978  * the cursor and try again.
979  */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.  Start pessimistic:
	 * both sources are marked eaten (ATEDISK/ATEMEM) and at EOF
	 * (DISKEOF/MEMEOF); each flag pair is cleared below only when the
	 * corresponding source actually yields a record.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		/* release any in-memory record left over from a prior scan */
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 *
	 * EDEADLK can only occur if the lookup hit an empty internal
	 * element and couldn't delete it.  Since this could only occur
	 * in-range, we can just iterate from the failure point.
	 */
	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT || error == EDEADLK) {
			/* clear ATEDISK so iterate re-tests the current spot */
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			/* disk side has a current record */
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			/* ENOENT: disk side stays at EOF, mark it eaten */
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		/* memory side has a current record */
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record (or ENOENT if both
	 * sources came up empty).
	 */
	return(hammer_ip_next(cursor));
}
1042 
1043 /*
1044  * Retrieve the next record in a merged iteration within the bounds of the
1045  * cursor.  This call may be made multiple times after the cursor has been
1046  * initially searched with hammer_ip_first().
1047  *
1048  * 0 is returned on success, ENOENT if no further records match the
1049  * requested range, or some other error code is returned.
1050  */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec;
	int error;
	int r;

	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one.
	 *
	 * If we deleted the last on-disk record we had scanned ATEDISK will
	 * be clear and DELBTREE will be set, forcing a call to iterate. The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
			if (error == 0)
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			else
				/*
				 * NOTE(review): any iterate error, not only
				 * ENOENT, is folded into disk-EOF here; the
				 * error code itself is not propagated.
				 */
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Get the next in-memory record.  The record can be ripped out
	 * of the RB tree so we maintain a scan_info structure to track
	 * the next node.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			if (cursor->iprec) {
				/* drop the record we just consumed */
				hammer_rel_mem_record(cursor->iprec);
				cursor->iprec = NULL;
			}
			rec = cursor->scan.node;	/* next node */
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (cursor->iprec) {
				/*
				 * The callback accepted a record (and set
				 * cursor->iprec to it); remember its RB
				 * successor for the next call.
				 */
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
				cursor->scan.node =
					hammer_rec_rb_tree_RB_NEXT(rec);
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.  Compare keys: if the disk element
		 * sorts strictly first (r < 0) return it, otherwise fall
		 * through and return the memory record.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_RECORD);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid
		 */
		cursor->record = &cursor->iprec->rec;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->record = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}
1162 
1163 /*
1164  * Resolve the cursor->data pointer for the current cursor position in
1165  * a merged iteration.
1166  */
1167 int
1168 hammer_ip_resolve_data(hammer_cursor_t cursor)
1169 {
1170 	int error;
1171 
1172 	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1173 		cursor->data = cursor->iprec->data;
1174 		error = 0;
1175 	} else {
1176 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1177 	}
1178 	return(error);
1179 }
1180 
1181 /*
1182  * Delete all records within the specified range for inode ip.
1183  *
1184  * NOTE: An unaligned range will cause new records to be added to cover
1185  * the edge cases. (XXX not implemented yet).
1186  *
1187  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1188  *
1189  * NOTE: Record keys for regular file data have to be special-cased since
1190  * they indicate the end of the range (key = base + bytes).
1191  *
1192  * NOTE: The spike structure must be filled in if we return ENOSPC.
1193  */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end,
		       struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int64_t off;

	/*
	 * NOTE(review): 'spike' is not referenced anywhere in this body;
	 * per the header comment it is only filled in on ENOSPC -- confirm
	 * against callers before relying on it.
	 */
retry:
	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		/* DB records are keyed directly by ran_beg..ran_end */
		cursor.key_beg.key = ran_beg;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = ran_end;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		int64_t tmp64;

		cursor.key_beg.key = ran_beg + 1;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

		/*
		 * Clamp the end key if ran_end + MAXPHYS + 1 would wrap
		 * past the maximum positive 64-bit key.
		 */
		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is the offset
		 * of the last byte of the record (base + len - 1), NOT the
		 * base offset.
		 */
#if 0
		kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
		if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
			kprintf("delete_range loop key %016llx\n",
				base->key - rec->base.data_len);
#endif
			/* starting offset of this record's data */
			off = base->key - rec->base.data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg) {
				panic("hammer left edge case %016llx %d\n",
					base->key, rec->base.data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (base->key - 1 > ran_end) {
				if (base->key - rec->base.data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		}

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	/* EDEADLK requires tearing down the cursor and starting over */
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}
1314 
1315 /*
1316  * Delete all records associated with an inode except the inode record
1317  * itself.
1318  */
1319 int
1320 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1321 {
1322 	struct hammer_cursor cursor;
1323 	hammer_record_ondisk_t rec;
1324 	hammer_base_elm_t base;
1325 	int error;
1326 
1327 retry:
1328 	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1329 
1330 	cursor.key_beg.obj_id = ip->obj_id;
1331 	cursor.key_beg.create_tid = 0;
1332 	cursor.key_beg.delete_tid = 0;
1333 	cursor.key_beg.obj_type = 0;
1334 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1335 	cursor.key_beg.key = HAMMER_MIN_KEY;
1336 
1337 	cursor.key_end = cursor.key_beg;
1338 	cursor.key_end.rec_type = 0xFFFF;
1339 	cursor.key_end.key = HAMMER_MAX_KEY;
1340 
1341 	cursor.asof = ip->obj_asof;
1342 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1343 
1344 	error = hammer_ip_first(&cursor, ip);
1345 
1346 	/*
1347 	 * Iterate through matching records and mark them as deleted.
1348 	 */
1349 	while (error == 0) {
1350 		rec = cursor.record;
1351 		base = &rec->base.base;
1352 
1353 		KKASSERT(base->delete_tid == 0);
1354 
1355 		/*
1356 		 * Mark the record and B-Tree entry as deleted.  This will
1357 		 * also physically delete the B-Tree entry, record, and
1358 		 * data if the retention policy dictates.  The function
1359 		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1360 		 * uses to perform a fixup.
1361 		 */
1362 		error = hammer_ip_delete_record(&cursor, trans->tid);
1363 		if (error)
1364 			break;
1365 		error = hammer_ip_next(&cursor);
1366 	}
1367 	hammer_done_cursor(&cursor);
1368 	if (error == EDEADLK)
1369 		goto retry;
1370 	if (error == ENOENT)
1371 		error = 0;
1372 	return(error);
1373 }
1374 
1375 /*
1376  * Delete the record at the current cursor.  On success the cursor will
1377  * be positioned appropriately for an iteration but may no longer be at
1378  * a leaf node.
1379  *
1380  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1381  * cursor and retry.
1382  */
1383 int
1384 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1385 {
1386 	hammer_btree_elm_t elm;
1387 	hammer_mount_t hmp;
1388 	int error;
1389 
1390 	/*
1391 	 * In-memory (unsynchronized) records can simply be freed.
1392 	 */
1393 	if (cursor->record == &cursor->iprec->rec) {
1394 		cursor->iprec->flags |= HAMMER_RECF_DELETED;
1395 		return(0);
1396 	}
1397 
1398 	/*
1399 	 * On-disk records are marked as deleted by updating their delete_tid.
1400 	 * This does not effect their position in the B-Tree (which is based
1401 	 * on their create_tid).
1402 	 */
1403 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1404 	elm = NULL;
1405 	hmp = cursor->node->cluster->volume->hmp;
1406 
1407 	if (error == 0) {
1408 		hammer_modify_buffer(cursor->record_buffer);
1409 		cursor->record->base.base.delete_tid = tid;
1410 
1411 		error = hammer_cursor_upgrade(cursor);
1412 		if (error == 0) {
1413 			hammer_modify_node(cursor->node);
1414 			elm = &cursor->node->ondisk->elms[cursor->index];
1415 			elm->leaf.base.delete_tid = tid;
1416 		}
1417 	}
1418 
1419 	/*
1420 	 * If we were mounted with the nohistory option, we physically
1421 	 * delete the record.
1422 	 */
1423 	if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1424 		int32_t rec_offset;
1425 		int32_t data_offset;
1426 		int32_t data_len;
1427 		u_int8_t rec_type;
1428 		hammer_cluster_t cluster;
1429 
1430 		rec_offset = elm->leaf.rec_offset;
1431 		data_offset = elm->leaf.data_offset;
1432 		data_len = elm->leaf.data_len;
1433 		rec_type = elm->leaf.base.rec_type;
1434 		KKASSERT(rec_type == cursor->record->base.base.rec_type);
1435 
1436 		/*
1437 		 * We must ref the cluster to prevent it from being
1438 		 * freed prior to our freeing the last record.
1439 		 */
1440 		cluster = cursor->node->cluster;
1441 		hammer_ref_cluster(cluster);
1442 
1443 		error = hammer_btree_delete(cursor);
1444 		if (error == 0) {
1445 			/*
1446 			 * This forces a fixup for the iteration because
1447 			 * the cursor is now either sitting at the 'next'
1448 			 * element or sitting at the end of a leaf.
1449 			 */
1450 			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1451 				cursor->flags |= HAMMER_CURSOR_DELBTREE;
1452 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1453 			}
1454 			hammer_free_record(cluster, rec_offset, rec_type);
1455 			if (data_offset && (data_offset - rec_offset < 0 ||
1456 			    data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1457 				hammer_free_data(cluster, data_offset,data_len);
1458 			}
1459 		}
1460 #if 0
1461 		kprintf("hammer_ip_delete_record: %d:%d:%08x %08x/%d "
1462 			"(%d remain in cluster)\n",
1463 			cluster->volume->vol_no, cluster->clu_no,
1464 			rec_offset, data_offset, data_len,
1465 			cluster->ondisk->stat_records);
1466 #endif
1467 		hammer_rel_cluster(cluster, 0);
1468 		if (error) {
1469 			panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1470 			error = 0;
1471 		}
1472 	}
1473 	return(error);
1474 }
1475 
1476 /*
1477  * Determine whether a directory is empty or not.  Returns 0 if the directory
1478  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1479  */
1480 int
1481 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1482 {
1483 	struct hammer_cursor cursor;
1484 	int error;
1485 
1486 	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1487 
1488 	cursor.key_beg.obj_id = ip->obj_id;
1489 	cursor.key_beg.create_tid = 0;
1490 	cursor.key_beg.delete_tid = 0;
1491 	cursor.key_beg.obj_type = 0;
1492 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1493 	cursor.key_beg.key = HAMMER_MIN_KEY;
1494 
1495 	cursor.key_end = cursor.key_beg;
1496 	cursor.key_end.rec_type = 0xFFFF;
1497 	cursor.key_end.key = HAMMER_MAX_KEY;
1498 
1499 	cursor.asof = ip->obj_asof;
1500 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1501 
1502 	error = hammer_ip_first(&cursor, ip);
1503 	if (error == ENOENT)
1504 		error = 0;
1505 	else if (error == 0)
1506 		error = ENOTEMPTY;
1507 	hammer_done_cursor(&cursor);
1508 	return(error);
1509 }
1510 
1511