xref: /dflybsd-src/sys/vfs/hammer/hammer_object.c (revision cb5730a1cf7fac6e64d67b655abc7713e7c33063)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.4 2007/11/20 22:55:40 dillon Exp $
35  */
36 
37 #include "hammer.h"
38 
39 static int hammer_mem_add(hammer_transaction_t trans,
40 			     hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip);
43 
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50 	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51 		return(-1);
52 	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53 		return(1);
54 
55 	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56 		return(-1);
57 	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58 		return(1);
59 
60 	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61 		return(-1);
62 	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63 		return(1);
64         return(0);
65 }
66 
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         /*
71          * A key1->rec_type of 0 matches any record type.
72          */
73         if (info->rec_type) {
74                 if (info->rec_type < rec->rec.base.base.rec_type)
75                         return(-3);
76                 if (info->rec_type > rec->rec.base.base.rec_type)
77                         return(3);
78         }
79 
80         /*
81          * There is no special case for key.  0 means 0.
82          */
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87 
88         /*
89          * This test has a number of special cases.  create_tid in key1 is
90          * the as-of transction id, and delete_tid in key1 is NOT USED.
91          *
92          * A key1->create_tid of 0 matches any record regardles of when
93          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
94          * used to search for the most current state of the object.
95          *
96          * key2->create_tid is a HAMMER record and will never be
97          * 0.   key2->delete_tid is the deletion transaction id or 0 if
98          * the record has not yet been deleted.
99          */
100         if (info->create_tid) {
101                 if (info->create_tid < rec->rec.base.base.create_tid)
102                         return(-1);
103                 if (rec->rec.base.base.delete_tid &&
104 		    info->create_tid >= rec->rec.base.base.delete_tid) {
105                         return(1);
106 		}
107         }
108         return(0);
109 }
110 
111 /*
112  * RB_SCAN comparison code for hammer_mem_search().  The argument order
113  * is reversed so the comparison result has to be negated.  key_beg and
114  * key_end are both inclusive boundaries.
115  */
116 static
117 int
118 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
119 {
120 	hammer_cursor_t cursor = data;
121 	int r;
122 
123 	r = hammer_rec_compare(&cursor->key_beg, rec);
124 	if (r > 0)
125 		return(-1);
126 	if (r == 0)
127 		return(0);
128 	r = hammer_rec_compare(&cursor->key_end, rec);
129 	if (r <= 0)
130 		return(1);
131 	return(0);
132 }
133 
134 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
135 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
136 		    hammer_rec_compare, hammer_base_elm_t);
137 
138 /*
139  * Allocate a record for the caller to finish filling in
140  */
141 hammer_record_t
142 hammer_alloc_mem_record(struct hammer_transaction *trans, hammer_inode_t ip)
143 {
144 	hammer_record_t record;
145 
146 	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
147 	record->ip = ip;
148 	return (record);
149 }
150 
151 /*
152  * Release a memory record.  If the record is marked for defered deletion,
153  * destroy the record when the last reference goes away.
154  */
155 void
156 hammer_rel_mem_record(struct hammer_record **recordp)
157 {
158 	hammer_record_t rec;
159 
160 	if ((rec = *recordp) != NULL) {
161 		if (hammer_islastref(&rec->lock)) {
162 			hammer_unref(&rec->lock);
163 			if (rec->flags & HAMMER_RECF_DELETED)
164 				hammer_free_mem_record(rec);
165 		} else {
166 			hammer_unref(&rec->lock);
167 		}
168 		*recordp = NULL;
169 	}
170 }
171 
172 /*
173  * Free a record.  Clean the structure up even though we are throwing it
174  * away as a sanity check.  The actual free operation is delayed while
175  * the record is referenced.  However, the record is removed from the RB
176  * tree immediately.
177  */
178 void
179 hammer_free_mem_record(hammer_record_t record)
180 {
181 	if (record->flags & HAMMER_RECF_ONRBTREE) {
182 		RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
183 		record->flags &= ~HAMMER_RECF_ONRBTREE;
184 	}
185 	if (record->lock.refs) {
186 		record->flags |= HAMMER_RECF_DELETED;
187 		return;
188 	}
189 	if (record->flags & HAMMER_RECF_ALLOCDATA) {
190 		kfree(record->data, M_HAMMER);
191 		record->flags &= ~HAMMER_RECF_ALLOCDATA;
192 	}
193 	record->data = NULL;
194 	kfree(record, M_HAMMER);
195 }
196 
197 /*
198  * Lookup an in-memory record given the key specified in the cursor.  Works
199  * just like hammer_btree_lookup() but operates on an inode's in-memory
200  * record list.
201  *
202  * The lookup must fail if the record is marked for deferred deletion.
203  */
204 static
205 int
206 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
207 {
208 	int error;
209 
210 	if (cursor->iprec)
211 		hammer_rel_mem_record(&cursor->iprec);
212 	if (cursor->ip) {
213 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
214 						  &cursor->ip->rec_tree);
215 	}
216 	cursor->ip = ip;
217 	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
218 	cursor->scan.node = NULL;
219 	cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
220 				&ip->rec_tree, &cursor->key_beg);
221 	if (cursor->iprec == NULL) {
222 		error = ENOENT;
223 	} else {
224 		hammer_ref(&cursor->iprec->lock);
225 		error = 0;
226 	}
227 	return(error);
228 }
229 
230 /*
231  * hammer_mem_search() - locate the first in-memory record matching the
232  * cursor.
233  *
234  * The RB_SCAN function we use is designed as a callback.  We terminate it
235  * (return -1) as soon as we get a match.
236  */
237 static
238 int
239 hammer_rec_scan_callback(hammer_record_t rec, void *data)
240 {
241 	hammer_cursor_t cursor = data;
242 
243 	if (cursor->iprec == NULL) {
244 		cursor->iprec = rec;
245 		hammer_ref(&rec->lock);
246 		return(-1);
247 	}
248 	return(0);
249 }
250 
251 static
252 int
253 hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip)
254 {
255 	if (cursor->iprec)
256 		hammer_rel_mem_record(&cursor->iprec);
257 	if (cursor->ip) {
258 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
259 						  &cursor->ip->rec_tree);
260 	}
261 	cursor->ip = ip;
262 	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
263 	cursor->scan.node = NULL;
264 	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
265 				   hammer_rec_scan_callback, cursor);
266 	if (cursor->iprec) {
267 		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
268 		return(0);
269 	}
270 	return(ENOENT);
271 }
272 
273 void
274 hammer_mem_done(hammer_cursor_t cursor)
275 {
276 	if (cursor->ip) {
277 		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
278 						  &cursor->ip->rec_tree);
279 		cursor->ip = NULL;
280 	}
281         if (cursor->iprec)
282 		hammer_rel_mem_record(&cursor->iprec);
283 }
284 
285 /************************************************************************
286  *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
287  ************************************************************************
288  *
289  * These functions manipulate in-memory records.  Such records typically
290  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
291  */
292 
293 /*
294  * Add a directory entry (dip,ncp) which references inode (ip).
295  *
296  * Note that the low 32 bits of the namekey are set temporarily to create
297  * a unique in-memory record, and may be modified a second time when the
298  * record is synchronized to disk.  In particular, the low 32 bits cannot be
299  * all 0's when synching to disk, which is not handled here.
300  */
301 int
302 hammer_ip_add_directory(struct hammer_transaction *trans,
303 		     struct hammer_inode *dip, struct namecache *ncp,
304 		     struct hammer_inode *ip)
305 {
306 	hammer_record_t record;
307 	int error;
308 	int bytes;
309 
310 	record = hammer_alloc_mem_record(trans, dip);
311 
312 	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
313 	if (++trans->hmp->namekey_iterator == 0)
314 		++trans->hmp->namekey_iterator;
315 
316 	record->rec.entry.base.base.obj_id = dip->obj_id;
317 	record->rec.entry.base.base.key =
318 		hammer_directory_namekey(ncp->nc_name, bytes);
319 	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
320 	record->rec.entry.base.base.create_tid = trans->tid;
321 	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
322 	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
323 	record->rec.entry.obj_id = ip->obj_id;
324 	if (bytes <= sizeof(record->rec.entry.den_name)) {
325 		record->data = (void *)record->rec.entry.den_name;
326 	} else {
327 		record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
328 		record->flags |= HAMMER_RECF_ALLOCDATA;
329 	}
330 	bcopy(ncp->nc_name, record->data, bytes);
331 	record->rec.entry.base.data_len = bytes;
332 	++ip->ino_rec.ino_nlinks;
333 	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
334 	error = hammer_mem_add(trans, record);
335 	return(error);
336 }
337 
338 /*
339  * Delete the directory entry and update the inode link count.  The
340  * cursor must be seeked to the directory entry record being deleted.
341  *
342  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
343  */
344 int
345 hammer_ip_del_directory(struct hammer_transaction *trans,
346 		     hammer_cursor_t cursor, struct hammer_inode *dip,
347 		     struct hammer_inode *ip)
348 {
349 	int error;
350 
351 	if (cursor->record == &cursor->iprec->rec) {
352 		/*
353 		 * The directory entry was in-memory, just scrap the
354 		 * record.
355 		 */
356 		hammer_free_mem_record(cursor->iprec);
357 		error = 0;
358 	} else {
359 		/*
360 		 * The directory entry was on-disk, mark the record and
361 		 * B-Tree entry as deleted.  The B-Tree entry does not
362 		 * have to be reindexed because a 'current' delete transid
363 		 * will wind up in the same position as the live record.
364 		 */
365 		KKASSERT(ip->flags & HAMMER_INODE_ONDISK);
366 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
367 		if (error == 0) {
368 			cursor->node->ondisk->elms[cursor->index].base.delete_tid = trans->tid;
369 			cursor->record->base.base.delete_tid = trans->tid;
370 			hammer_modify_node(cursor->node);
371 			hammer_modify_buffer(cursor->record_buffer);
372 
373 		}
374 	}
375 
376 	/*
377 	 * One less link.  Mark the inode and all of its records as deleted
378 	 * when the last link goes away.  The inode will be automatically
379 	 * flushed when its last reference goes away.
380 	 */
381 	if (error == 0) {
382 		--ip->ino_rec.ino_nlinks;
383 		if (ip->ino_rec.ino_nlinks == 0)
384 			ip->ino_rec.base.base.delete_tid = trans->tid;
385 		error = hammer_ip_delete_range(trans, ip,
386 					       HAMMER_MIN_KEY, HAMMER_MAX_KEY);
387 		KKASSERT(RB_EMPTY(&ip->rec_tree));
388 		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
389 	}
390 	return(error);
391 }
392 
393 /*
394  * Add a data record to the filesystem.
395  *
396  * This is called via the strategy code, typically when the kernel wants to
397  * flush a buffer cache buffer, so this operation goes directly to the disk.
398  */
399 int
400 hammer_ip_add_data(hammer_transaction_t trans, hammer_inode_t ip,
401 		       int64_t offset, void *data, int bytes)
402 {
403 	panic("hammer_ip_add_data");
404 }
405 
406 /*
407  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
408  * entry's key is used to deal with hash collisions in the upper 32 bits.
409  * A unique 64 bit key is generated in-memory and may be regenerated a
410  * second time when the directory record is flushed to the on-disk B-Tree.
411  */
412 static
413 int
414 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
415 {
416 	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
417 		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
418 			hammer_free_mem_record(record);
419 			return (EEXIST);
420 		}
421 		if (++trans->hmp->namekey_iterator == 0)
422 			++trans->hmp->namekey_iterator;
423 		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
424 		record->rec.base.base.key |= trans->hmp->namekey_iterator;
425 	}
426 	record->flags |= HAMMER_RECF_ONRBTREE;
427 	return(0);
428 }
429 
430 /************************************************************************
431  *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
432  ************************************************************************
433  *
434  * These functions augment the B-Tree scanning functions in hammer_btree.c
435  * by merging in-memory records with on-disk records.
436  */
437 
438 /*
439  * Locate a particular record either in-memory or on-disk.
440  *
441  * NOTE: This is basically a standalone routine, hammer_ip_next() may
442  * NOT be called to iterate results.
443  */
444 int
445 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
446 {
447 	int error;
448 
449 	/*
450 	 * If the element is in-memory return it without searching the
451 	 * on-disk B-Tree
452 	 */
453 	error = hammer_mem_lookup(cursor, ip);
454 	if (error == 0) {
455 		cursor->record = &cursor->iprec->rec;
456 		return(error);
457 	}
458 	if (error != ENOENT)
459 		return(error);
460 
461 	/*
462 	 * If the inode has on-disk components search the on-disk B-Tree.
463 	 */
464 	if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
465 		return(error);
466 	error = hammer_btree_lookup(cursor);
467 	if (error == 0)
468 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
469 	return(error);
470 }
471 
472 /*
473  * Locate the first record within the cursor's key_beg/key_end range,
474  * restricted to a particular inode.  0 is returned on success, ENOENT
475  * if no records matched the requested range, or some other error.
476  *
477  * When 0 is returned hammer_ip_next() may be used to iterate additional
478  * records within the requested range.
479  */
480 int
481 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
482 {
483 	int error;
484 
485 	/*
486 	 * Clean up fields and setup for merged scan
487 	 */
488 	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
489 	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
490 	if (cursor->iprec)
491 		hammer_rel_mem_record(&cursor->iprec);
492 
493 	/*
494 	 * Search the on-disk B-Tree
495 	 */
496 	if (ip->flags & HAMMER_INODE_ONDISK) {
497 		error = hammer_btree_lookup(cursor);
498 		if (error && error != ENOENT)
499 			return(error);
500 		if (error == 0) {
501 			cursor->flags &= ~HAMMER_CURSOR_DISKEOF ;
502 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK ;
503 		}
504 	}
505 
506 	/*
507 	 * Search the in-memory record list (Red-Black tree)
508 	 */
509 	error = hammer_mem_search(cursor, ip);
510 	if (error && error != ENOENT)
511 		return(error);
512 	if (error == 0) {
513 		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
514 		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
515 	}
516 
517 	/*
518 	 * This will return the first matching record.
519 	 */
520 	return(hammer_ip_next(cursor));
521 }
522 
523 /*
524  * Retrieve the next record in a merged iteration within the bounds of the
525  * cursor.  This call may be made multiple times after the cursor has been
526  * initially searched with hammer_ip_first().
527  *
528  * 0 is returned on success, ENOENT if no further records match the
529  * requested range, or some other error code is returned.
530  */
531 int
532 hammer_ip_next(hammer_cursor_t cursor)
533 {
534 	hammer_btree_elm_t elm;
535 	hammer_record_t rec;
536 	int error;
537 	int r;
538 
539 	/*
540 	 * Load the current on-disk and in-memory record.  If we ate any
541 	 * records we have to get the next one.
542 	 *
543 	 * Get the next on-disk record
544 	 */
545 	if (cursor->flags & HAMMER_CURSOR_ATEDISK) {
546 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
547 			error = hammer_btree_iterate(cursor);
548 			if (error == 0)
549 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
550 			else
551 				cursor->flags |= HAMMER_CURSOR_DISKEOF;
552 		}
553 	}
554 
555 	/*
556 	 * Get the next in-memory record.  The record can be ripped out
557 	 * of the RB tree so we maintain a scan_info structure to track
558 	 * the next node.
559 	 */
560 	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
561 		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
562 			rec = cursor->scan.node;	/* next node */
563 			if (rec) {
564 				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
565 				hammer_ref(&rec->lock);
566 				cursor->scan.node =
567 					hammer_rec_rb_tree_RB_NEXT(rec);
568 			} else {
569 				cursor->flags |= HAMMER_CURSOR_MEMEOF;
570 			}
571 			hammer_rel_mem_record(&cursor->iprec);
572 			cursor->iprec = rec;
573 		}
574 	}
575 
576 	/*
577 	 * Extract either the disk or memory record depending on their
578 	 * relative position.
579 	 */
580 	error = 0;
581 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
582 	case 0:
583 		/*
584 		 * Both entries valid
585 		 */
586 		elm = &cursor->node->ondisk->elms[cursor->index];
587 		r = hammer_btree_cmp(&elm->base,
588 				     &cursor->iprec->rec.base.base);
589 		if (r < 0) {
590 			error = hammer_btree_extract(cursor,
591 						     HAMMER_CURSOR_GET_RECORD);
592 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
593 			break;
594 		}
595 		/* fall through to the memory entry */
596 	case HAMMER_CURSOR_ATEDISK:
597 		/*
598 		 * Only the memory entry is valid
599 		 */
600 		cursor->record = &cursor->iprec->rec;
601 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
602 		break;
603 	case HAMMER_CURSOR_ATEMEM:
604 		/*
605 		 * Only the disk entry is valid
606 		 */
607 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
608 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
609 		break;
610 	default:
611 		/*
612 		 * Neither entry is valid
613 		 *
614 		 * XXX error not set properly
615 		 */
616 		cursor->record = NULL;
617 		error = ENOENT;
618 		break;
619 	}
620 	return(error);
621 }
622 
623 /*
624  * Resolve the cursor->data pointer for the current cursor position in
625  * a merged iteration.
626  */
627 int
628 hammer_ip_resolve_data(hammer_cursor_t cursor)
629 {
630 	int error;
631 
632 	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
633 		cursor->data = cursor->iprec->data;
634 		error = 0;
635 	} else {
636 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
637 	}
638 	return(error);
639 }
640 
641 /*
642  * Delete all records within the specified range for inode ip.
643  *
644  * NOTE: An unaligned range will cause new records to be added to cover
645  * the edge cases.
646  *
647  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
648  */
649 int
650 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
651 		       int64_t ran_beg, int64_t ran_end)
652 {
653 	struct hammer_cursor cursor;
654 	hammer_record_ondisk_t rec;
655 	hammer_base_elm_t base;
656 	int error;
657 	int64_t off;
658 
659 	hammer_init_cursor_ip(&cursor, ip);
660 
661 	cursor.key_beg.obj_id = ip->obj_id;
662 	cursor.key_beg.create_tid = ip->obj_asof;
663 	cursor.key_beg.delete_tid = 0;
664 	cursor.key_beg.obj_type = 0;
665 	cursor.key_beg.key = ran_beg;
666 	cursor.key_end = cursor.key_beg;
667 	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
668 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
669 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
670 		cursor.key_end.key = ran_end;
671 	} else {
672 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
673 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
674 		if (ran_end + MAXPHYS < ran_end)
675 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
676 		else
677 			cursor.key_end.key = ran_end + MAXPHYS;
678 	}
679 
680 	error = hammer_ip_first(&cursor, ip);
681 
682 	/*
683 	 * Iterate through matching records and mark them as deleted.
684 	 */
685 	while (error == 0) {
686 		rec = cursor.record;
687 		base = &rec->base.base;
688 
689 		KKASSERT(base->delete_tid == 0);
690 
691 		/*
692 		 * There may be overlap cases for regular file data.  Also
693 		 * remember the key for a regular file record is the offset
694 		 * of the last byte of the record (base + len - 1), NOT the
695 		 * base offset.
696 		 */
697 		if (base->rec_type == HAMMER_RECTYPE_DATA) {
698 			off = base->key - rec->base.data_len + 1;
699 			/*
700 			 * Check the left edge case
701 			 */
702 			if (off < ran_beg) {
703 				panic("hammer left edge case\n");
704 			}
705 
706 			/*
707 			 * Check the right edge case.  Note that the
708 			 * record can be completely out of bounds, which
709 			 * terminates the search.
710 			 *
711 			 * base->key is (base_offset + bytes - 1), ran_end
712 			 * works the same way.
713 			 */
714 			if (base->key > ran_end) {
715 				if (base->key - rec->base.data_len + 1 > ran_end) {
716 					kprintf("right edge OOB\n");
717 					break;
718 				}
719 				panic("hammer right edge case\n");
720 			}
721 		}
722 
723 		/*
724 		 * Mark the record and B-Tree entry as deleted
725 		 */
726 		if (cursor.record == &cursor.iprec->rec) {
727 			hammer_free_mem_record(cursor.iprec);
728 
729 		} else {
730 			cursor.node->ondisk->elms[cursor.index].base.delete_tid = trans->tid;
731 			cursor.record->base.base.delete_tid = trans->tid;
732 			hammer_modify_node(cursor.node);
733 			hammer_modify_buffer(cursor.record_buffer);
734 		}
735 		error = hammer_ip_next(&cursor);
736 	}
737 	hammer_done_cursor(&cursor);
738 	if (error == ENOENT)
739 		error = 0;
740 	return(error);
741 }
742 
743