/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.97 2008/09/23 22:28:56 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_lookup(hammer_cursor_t cursor);
static void hammer_mem_first(hammer_cursor_t cursor);
static int hammer_frontend_trunc_callback(hammer_record_t record,
				void *data __unused);
static int hammer_bulk_scan_callback(hammer_record_t record, void *data);
static int hammer_record_needs_overwrite_delete(hammer_record_t record);
static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
		      hammer_btree_leaf_elm_t leaf);

struct rec_trunc_info {
	u_int16_t	rec_type;
	int64_t		trunc_off;
};

struct hammer_bulk_info {
	hammer_record_t record;
	struct hammer_btree_leaf_elm leaf;
};

/*
 * Red-black tree support.  Comparison code for insertion.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
	if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type)
		return(-1);
	if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type)
		return(1);

	if (rec1->leaf.base.key < rec2->leaf.base.key)
		return(-1);
	if (rec1->leaf.base.key > rec2->leaf.base.key)
		return(1);

	/*
	 * For search & insertion purposes records deleted by the
	 * frontend or deleted/committed by the backend are silently
	 * ignored.  Otherwise pipelined insertions will get messed
	 * up.
	 *
	 * rec1 is greater than rec2 if rec1 is marked deleted.
	 * rec1 is less than rec2 if rec2 is marked deleted.
	 *
	 * Multiple deleted records may be present, do not return 0
	 * if both are marked deleted.
	 */
	if (rec1->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
			   HAMMER_RECF_COMMITTED)) {
		return(1);
	}
	if (rec2->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
			   HAMMER_RECF_COMMITTED)) {
		return(-1);
	}

	return(0);
}

/*
 * Basic record comparison code similar to hammer_btree_cmp().
 */
static int
hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
{
	if (elm->rec_type < rec->leaf.base.rec_type)
		return(-3);
	if (elm->rec_type > rec->leaf.base.rec_type)
		return(3);

	if (elm->key < rec->leaf.base.key)
		return(-2);
	if (elm->key > rec->leaf.base.key)
		return(2);

	/*
	 * Never match against an item deleted by the frontend
	 * or backend, or committed by the backend.
	 *
	 * elm is less than rec if rec is marked deleted.
	 */
	if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
			  HAMMER_RECF_COMMITTED)) {
		return(-1);
	}
	return(0);
}

/*
 * Ranged scan to locate overlapping record(s).  This is used by
 * hammer_ip_get_bulk() to locate an overlapping record.  We have
 * to use a ranged scan because the keys for data records with the
 * same file base offset can be different due to differing data_len's.
 *
 * NOTE: The base file offset of a data record is (key - data_len), not (key).
 */
static int
hammer_rec_overlap_cmp(hammer_record_t rec, void *data)
{
	struct hammer_bulk_info *info = data;
	hammer_btree_leaf_elm_t leaf = &info->leaf;

	if (rec->leaf.base.rec_type < leaf->base.rec_type)
		return(-3);
	if (rec->leaf.base.rec_type > leaf->base.rec_type)
		return(3);

	/*
	 * Overlap compare
	 */
	if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
		/* rec_beg >= leaf_end */
		if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key)
			return(2);
		/* rec_end <= leaf_beg */
		if (rec->leaf.base.key <= leaf->base.key - leaf->data_len)
			return(-2);
	} else {
		if (rec->leaf.base.key < leaf->base.key)
			return(-2);
		if (rec->leaf.base.key > leaf->base.key)
			return(2);
	}

	/*
	 * We have to return 0 at this point, even if DELETED_FE is set,
	 * because returning anything else will cause the scan to ignore
	 * one of the branches when we really want it to check both.
	 */
	return(0);
}
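
/*
 * Hedged illustrative sketch (hypothetical helper, not part of this
 * driver, not compiled): the half-open interval test behind the DATA
 * branch above.  A DATA record covering file bytes [beg, end) is stored
 * with key == end and data_len == end - beg, so beg == key - data_len.
 */
#if 0
static __inline int
example_data_rec_overlaps(int64_t rec_key, int64_t rec_data_len,
			  int64_t leaf_key, int64_t leaf_data_len)
{
	int64_t rec_beg = rec_key - rec_data_len;
	int64_t leaf_beg = leaf_key - leaf_data_len;

	/* disjoint if one interval starts at or after the other ends */
	if (rec_beg >= leaf_key || leaf_beg >= rec_key)
		return(0);
	return(1);
}
#endif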

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_cmp(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	r = hammer_rec_cmp(&cursor->key_end, rec);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_cmp(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * Locate blocks within the truncation range.  Partial blocks do not count.
 */
static
int
hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
{
	struct rec_trunc_info *info = data;

	if (rec->leaf.base.rec_type < info->rec_type)
		return(-1);
	if (rec->leaf.base.rec_type > info->rec_type)
		return(1);

	switch(rec->leaf.base.rec_type) {
	case HAMMER_RECTYPE_DB:
		/*
		 * DB record key is not beyond the truncation point, retain.
		 */
		if (rec->leaf.base.key < info->trunc_off)
			return(-1);
		break;
	case HAMMER_RECTYPE_DATA:
		/*
		 * DATA record offset start is not beyond the truncation point,
		 * retain.
		 */
		if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off)
			return(-1);
		break;
	default:
		panic("hammer_rec_trunc_cmp: unexpected record type");
	}

	/*
	 * The record start is >= the truncation point, return match,
	 * the record should be destroyed.
	 */
	return(0);
}

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
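
/*
 * Hedged sketch of the RB_SCAN pattern used throughout this file
 * (hypothetical names, not part of this driver, not compiled).  The
 * same 'data' pointer is handed to both the range-compare and the
 * callback; the compare returns <0/0/>0 to bound the scan and the
 * callback returns non-zero to stop early.
 */
#if 0
struct example_scan_info {		/* hypothetical */
	int64_t	key_beg;
	int64_t	key_end;		/* range-inclusive */
	int	count;
};

static int
example_scan_cmp(hammer_record_t rec, void *data)
{
	struct example_scan_info *info = data;

	if (rec->leaf.base.key < info->key_beg)
		return(-1);		/* left of the range */
	if (rec->leaf.base.key > info->key_end)
		return(1);		/* right of the range */
	return(0);			/* in range, callback runs */
}

static int
example_scan_callback(hammer_record_t rec, void *data)
{
	struct example_scan_info *info = data;

	++info->count;
	return(0);			/* 0 = keep scanning */
}

/* caller: count in-memory records with keys in [beg, end] */
static int
example_count_range(hammer_inode_t ip, int64_t beg, int64_t end)
{
	struct example_scan_info info = { beg, end, 0 };

	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, example_scan_cmp,
				   example_scan_callback, &info);
	return(info.count);
}
#endif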

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip, int data_len)
{
	hammer_record_t record;
	hammer_mount_t hmp;

	hmp = ip->hmp;
	++hammer_count_records;
	record = kmalloc(sizeof(*record), hmp->m_misc,
			 M_WAITOK | M_ZERO | M_USE_RESERVE);
	record->flush_state = HAMMER_FST_IDLE;
	record->ip = ip;
	record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
	record->leaf.data_len = data_len;
	hammer_ref(&record->lock);

	if (data_len) {
		record->data = kmalloc(data_len, hmp->m_misc, M_WAITOK | M_ZERO);
		record->flags |= HAMMER_RECF_ALLOCDATA;
		++hammer_count_record_datas;
	}

	return (record);
}

/*
 * Block until the record's flush completes.  RECF_WANTED requests a
 * wakeup from the backend when the record leaves HAMMER_FST_FLUSH.
 */
void
hammer_wait_mem_record_ident(hammer_record_t record, const char *ident)
{
	while (record->flush_state == HAMMER_FST_FLUSH) {
		record->flags |= HAMMER_RECF_WANTED;
		tsleep(record, 0, ident, 0);
	}
}
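
/*
 * Hedged sketch of the wakeup half of the WANTED protocol (hypothetical
 * helper, not compiled); the cleanup section of hammer_flush_record_done()
 * below performs exactly this sequence on the backend side.
 */
#if 0
static __inline void
example_record_wakeup(hammer_record_t record)
{
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
}
#endif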

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
	hammer_inode_t target_ip;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

	/*
	 * If an error occurred, the backend was unable to sync the
	 * record to its media.  Leave the record intact.
	 */
	if (error) {
		hammer_critical_error(record->ip->hmp, record->ip, error,
				      "while flushing record");
	}

	--record->flush_group->refs;
	record->flush_group = NULL;

	/*
	 * Adjust the flush state and dependency based on success or
	 * failure.
	 */
	if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
		if ((target_ip = record->target_ip) != NULL) {
			TAILQ_REMOVE(&target_ip->target_list, record,
				     target_entry);
			record->target_ip = NULL;
			hammer_test_inode(target_ip);
		}
		record->flush_state = HAMMER_FST_IDLE;
	} else {
		if (record->target_ip) {
			record->flush_state = HAMMER_FST_SETUP;
			hammer_test_inode(record->ip);
			hammer_test_inode(record->target_ip);
		} else {
			record->flush_state = HAMMER_FST_IDLE;
		}
	}
	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;

	/*
	 * Cleanup
	 */
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
	hammer_rel_mem_record(record);
}

/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_mount_t hmp;
	hammer_reserve_t resv;
	hammer_inode_t ip;
	hammer_inode_t target_ip;
	int diddrop;

	hammer_unref(&record->lock);

	if (record->lock.refs == 0) {
		/*
		 * Upon release of the last reference wakeup any waiters.
		 * The record structure may get destroyed so callers will
		 * loop up and do a relookup.
		 *
		 * WARNING!  Record must be removed from RB-TREE before we
		 * might possibly block.  hammer_test_inode() can block!
		 */
		ip = record->ip;
		hmp = ip->hmp;

		/*
		 * Upon release of the last reference a record marked deleted
		 * by the front or backend, or committed by the backend,
		 * is destroyed.
		 */
		if (record->flags & (HAMMER_RECF_DELETED_FE |
				     HAMMER_RECF_DELETED_BE |
				     HAMMER_RECF_COMMITTED)) {
			KKASSERT(ip->lock.refs > 0);
			KKASSERT(record->flush_state != HAMMER_FST_FLUSH);

			/*
			 * target_ip may have zero refs, we have to ref it
			 * to prevent it from being ripped out from under
			 * us.
			 */
			if ((target_ip = record->target_ip) != NULL) {
				TAILQ_REMOVE(&target_ip->target_list,
					     record, target_entry);
				record->target_ip = NULL;
				hammer_ref(&target_ip->lock);
			}

			/*
			 * Remove the record from the B-Tree
			 */
			if (record->flags & HAMMER_RECF_ONRBTREE) {
				RB_REMOVE(hammer_rec_rb_tree,
					  &record->ip->rec_tree,
					  record);
				record->flags &= ~HAMMER_RECF_ONRBTREE;
				KKASSERT(ip->rsv_recs > 0);
				diddrop = 1;
			} else {
				diddrop = 0;
			}

			/*
			 * We must wait for any direct-IO to complete before
			 * we can destroy the record because the bio may
			 * have a reference to it.
			 */
			if (record->flags &
			   (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) {
				hammer_io_direct_wait(record);
			}

			/*
			 * Account for the completion after the direct IO
			 * has completed.
			 */
			if (diddrop) {
				--hmp->rsv_recs;
				--ip->rsv_recs;
				hmp->rsv_databytes -= record->leaf.data_len;

				if (RB_EMPTY(&record->ip->rec_tree)) {
					record->ip->flags &= ~HAMMER_INODE_XDIRTY;
					record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
					hammer_test_inode(record->ip);
				}
				if (ip->rsv_recs == hammer_limit_inode_recs - 1)
					wakeup(&ip->rsv_recs);
			}

			/*
			 * Do this test after removing record from the B-Tree.
			 */
			if (target_ip) {
				hammer_test_inode(target_ip);
				hammer_rel_inode(target_ip, 0);
			}

			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, hmp->m_misc);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}

			/*
			 * Release the reservation.
			 *
			 * If the record was not committed we can theoretically
			 * undo the reservation.  However, doing so might
			 * create weird edge cases with the ordering of
			 * direct writes because the related buffer cache
			 * elements are per-vnode.  So we don't try.
			 */
			if ((resv = record->resv) != NULL) {
				/* XXX undo leaf.data_offset,leaf.data_len */
				hammer_blockmap_reserve_complete(hmp, resv);
				record->resv = NULL;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, hmp->m_misc);
		}
	}
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.  Backend tests ignore the frontend delete
 * flag.  Frontend tests do NOT ignore the backend delete/commit flags and
 * must also check for commit races.
 *
 * Return non-zero if the record is visible, zero if it isn't.  A record
 * is not visible if it has been deleted, or if it has been committed,
 * in which case it must be located via the media B-Tree instead (unless
 * the special delete-visibility flag is set).
 *
 * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory
 * records to be returned.  This is so pending deletions are detected
 * when using an iterator to locate an unused hash key, or when we need
 * to locate historical records on-disk to destroy.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
	if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY)
		return(1);
	if (cursor->flags & HAMMER_CURSOR_BACKEND) {
		if (record->flags & (HAMMER_RECF_DELETED_BE |
				     HAMMER_RECF_COMMITTED)) {
			return(0);
		}
	} else {
		if (record->flags & (HAMMER_RECF_DELETED_FE |
				     HAMMER_RECF_DELETED_BE |
				     HAMMER_RECF_COMMITTED)) {
			return(0);
		}
	}
	return(1);
}

/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted or committed.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->leaf.base.create_tid)
			return(0);
		if (rec->leaf.base.delete_tid &&
		    cursor->asof >= rec->leaf.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * ref the record.  The record is protected from backend B-Tree
	 * interactions by virtue of the cursor's IP lock.
	 */
	hammer_ref(&rec->lock);

	/*
	 * The record may have been deleted or committed while we
	 * were blocked.  XXX remove?
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record and stop the scan.
	 */
	cursor->iprec = rec;
	return(-1);
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 *
 * The API for mem/btree_lookup() does not mess with the ATE/EOF bits.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor)
{
	KKASSERT(cursor->ip);
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp,
				   hammer_rec_scan_callback, cursor);

	return (cursor->iprec ? 0 : ENOENT);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 *
 * WARNING!  API is slightly different from btree_first().  hammer_mem_first()
 * will set ATEMEM the same as MEMEOF, and does not return any error.
 */
static
void
hammer_mem_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	ip = cursor->ip;
	KKASSERT(ip != NULL);

	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	if (cursor->iprec)
		cursor->flags &= ~(HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM);
	else
		cursor->flags |= HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM;
}

/************************************************************************
 *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * NOTE: bytes does not include any terminating \0 on name, and name might
 * not be terminated.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, const char *name, int bytes,
		     struct hammer_inode *ip)
{
	struct hammer_cursor cursor;
	hammer_record_t record;
	int error;
	u_int32_t max_iterations;

	record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes));

	record->type = HAMMER_MEM_RECORD_ADD;
	record->leaf.base.localization = dip->obj_localization +
					 hammer_dir_localization(dip);
	record->leaf.base.obj_id = dip->obj_id;
	record->leaf.base.key = hammer_directory_namekey(dip, name, bytes,
							 &max_iterations);
	record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	record->data->entry.obj_id = ip->obj_id;
	record->data->entry.localization = ip->obj_localization;
	bcopy(name, record->data->entry.name, bytes);

	++ip->ino_data.nlinks;
	ip->ino_data.ctime = trans->time;
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);

	/*
	 * Find an unused namekey.  Both the in-memory record tree and
	 * the B-Tree are checked.  We do not want historically deleted
	 * names to create a collision as our iteration space may be limited,
	 * and since create_tid wouldn't match anyway, an ASOF search
	 * must be used to locate collisions.
	 *
	 * delete-visibility is set so pending deletions do not give us
	 * a false-negative on our ability to use an iterator.
	 *
	 * The iterator must not roll over the key.  Directory keys only
	 * use the positive key space.
	 */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg = record->leaf.base;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.asof = ip->obj_asof;

	while (hammer_ip_lookup(&cursor) == 0) {
		++record->leaf.base.key;
		KKASSERT(record->leaf.base.key > 0);
		cursor.key_beg.key = record->leaf.base.key;
		if (--max_iterations == 0) {
			hammer_rel_mem_record(record);
			error = ENOSPC;
			goto failed;
		}
	}

	/*
	 * The target inode and the directory entry are bound together.
	 */
	record->target_ip = ip;
	record->flush_state = HAMMER_FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

	/*
	 * The inode now has a dependency and must be taken out of the idle
	 * state.  An inode not in an idle state is given an extra reference.
	 *
	 * When transitioning to a SETUP state, flag the inode for an
	 * automatic reflush once its dependencies are disposed of if
	 * someone is waiting on it.
	 */
	if (ip->flush_state == HAMMER_FST_IDLE) {
		hammer_ref(&ip->lock);
		ip->flush_state = HAMMER_FST_SETUP;
		if (ip->flags & HAMMER_INODE_FLUSHW)
			ip->flags |= HAMMER_INODE_REFLUSH;
	}
	error = hammer_mem_add(record);
	if (error == 0) {
		dip->ino_data.mtime = trans->time;
		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
	}
failed:
	hammer_done_cursor(&cursor);
	return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * The related inode (ip) should be share-locked by the caller.  The
 * caller is on the frontend.  The inode may also be NULL, indicating
 * that the directory entry being removed has no related inode.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * In-memory (unsynchronized) records can simply be freed.
		 *
		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
		 * by the backend, we must still avoid races against the
		 * backend potentially syncing the record to the media.
		 *
		 * We cannot call hammer_ip_delete_record(), that routine may
		 * only be called from the backend.
		 */
		record = cursor->iprec;
		if (record->flags & (HAMMER_RECF_INTERLOCK_BE |
				     HAMMER_RECF_DELETED_BE |
				     HAMMER_RECF_COMMITTED)) {
			KKASSERT(cursor->deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor->deadlk_rec = record;
			error = EDEADLK;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	} else {
		/*
		 * If the record is on-disk we have to queue the deletion by
		 * the record's key.  This also causes lookups to skip the
		 * record.
		 */
		KKASSERT(dip->flags &
			 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
		record = hammer_alloc_mem_record(dip, 0);
		record->type = HAMMER_MEM_RECORD_DEL;
		record->leaf.base = cursor->leaf->base;

		/*
		 * ip may be NULL, indicating the deletion of a directory
		 * entry which has no related inode.
		 */
		record->target_ip = ip;
		if (ip) {
			record->flush_state = HAMMER_FST_SETUP;
			TAILQ_INSERT_TAIL(&ip->target_list, record,
					  target_entry);
		} else {
			record->flush_state = HAMMER_FST_IDLE;
		}

		/*
		 * The inode now has a dependency and must be taken out of
		 * the idle state.  An inode not in an idle state is given
		 * an extra reference.
		 *
		 * When transitioning to a SETUP state, flag the inode for
		 * an automatic reflush once its dependencies are disposed
		 * of if someone is waiting on it.
		 */
		if (ip && ip->flush_state == HAMMER_FST_IDLE) {
			hammer_ref(&ip->lock);
			ip->flush_state = HAMMER_FST_SETUP;
			if (ip->flags & HAMMER_INODE_FLUSHW)
				ip->flags |= HAMMER_INODE_REFLUSH;
		}

		error = hammer_mem_add(record);
	}

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.  XXX this may no longer
	 * be true.
	 *
	 * If nlinks drops to zero and the vnode is inactive (or there is
	 * no vnode), call hammer_inode_unloadable_check() to zonk the
	 * inode.  If we don't do this here the inode will not be destroyed
	 * on-media until we unmount.
	 */
	if (error == 0) {
		if (ip) {
			--ip->ino_data.nlinks;	/* do before we might block */
			ip->ino_data.ctime = trans->time;
		}
		dip->ino_data.mtime = trans->time;
		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
		if (ip) {
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
			if (ip->ino_data.nlinks == 0 &&
			    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
				hammer_done_cursor(cursor);
				hammer_inode_unloadable_check(ip, 1);
				hammer_flush_inode(ip, 0);
			}
		}
	}
	return(error);
}
/*
 * Add a record to an inode.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip)
 * and initialize the following additional fields:
 *
 * record->leaf.base.key
 * record->leaf.base.rec_type
 * record->leaf.base.localization	(asserted non-zero)
 * record->data		(a copy will be kmalloc'd if it cannot be embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;

	KKASSERT(record->leaf.base.localization != 0);
	record->leaf.base.obj_id = ip->obj_id;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	error = hammer_mem_add(record);
	return(error);
}
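
/*
 * Hedged usage sketch (hypothetical helper, not compiled): a frontend
 * caller allocates a record, fills in the key fields listed above, and
 * lets hammer_ip_add_record() consume the reference.  The record type
 * and key below are illustrative choices, not mandated by this file.
 */
#if 0
static int
example_add_misc_record(struct hammer_transaction *trans, hammer_inode_t ip,
			const void *buf, int len)
{
	hammer_record_t record;

	record = hammer_alloc_mem_record(ip, len);	/* returned referenced */
	record->type = HAMMER_MEM_RECORD_GENERAL;
	record->leaf.base.localization = ip->obj_localization +
					 HAMMER_LOCALIZE_MISC;
	record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;	/* illustrative */
	record->leaf.base.key = 0;
	bcopy(buf, record->data, len);

	/* eats the reference on success or failure */
	return(hammer_ip_add_record(trans, record));
}
#endif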

/*
 * Locate a bulk record in-memory.  Bulk records allow disk space to be
 * reserved so the front-end can flush large data writes without having
 * to queue the BIO to the flusher.  Only the related record gets queued
 * to the flusher.
 */
static hammer_record_t
hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
{
	struct hammer_bulk_info info;

	bzero(&info, sizeof(info));
	info.leaf.base.obj_id = ip->obj_id;
	info.leaf.base.key = file_offset + bytes;
	info.leaf.base.create_tid = 0;
	info.leaf.base.delete_tid = 0;
	info.leaf.base.rec_type = HAMMER_RECTYPE_DATA;
	info.leaf.base.obj_type = 0;				/* unused */
	info.leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;	/* unused */
	info.leaf.base.localization = ip->obj_localization +	/* unused */
				      HAMMER_LOCALIZE_MISC;
	info.leaf.data_len = bytes;

	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp,
				   hammer_bulk_scan_callback, &info);

	return(info.record);	/* may be NULL */
}

/*
 * Take records vetted by overlap_cmp.  The first non-deleted record
 * (if any) stops the scan.
 */
static int
hammer_bulk_scan_callback(hammer_record_t record, void *data)
{
	struct hammer_bulk_info *info = data;

	if (record->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
			     HAMMER_RECF_COMMITTED)) {
		return(0);
	}
	hammer_ref(&record->lock);
	info->record = record;
	return(-1);			/* stop scan */
}
936 
937 /*
938  * Reserve blockmap space placemarked with an in-memory record.
939  *
940  * This routine is called by the frontend in order to be able to directly
941  * flush a buffer cache buffer.  The frontend has locked the related buffer
942  * cache buffers and we should be able to manipulate any overlapping
943  * in-memory records.
944  *
945  * The caller is responsible for adding the returned record.
946  */
947 hammer_record_t
948 hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
949 		   int *errorp)
950 {
951 	hammer_record_t record;
952 	hammer_record_t conflict;
953 	int zone;
954 
955 	/*
956 	 * Deal with conflicting in-memory records.  We cannot have multiple
957 	 * in-memory records for the same base offset without seriously
958 	 * confusing the backend, including but not limited to the backend
959 	 * issuing delete-create-delete or create-delete-create sequences
960 	 * and asserting on the delete_tid being the same as the create_tid.
961 	 *
962 	 * If we encounter a record with the backend interlock set we cannot
963 	 * immediately delete it without confusing the backend.
964 	 */
965 	while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) {
966 		if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
967 			conflict->flags |= HAMMER_RECF_WANTED;
968 			tsleep(conflict, 0, "hmrrc3", 0);
969 		} else {
970 			conflict->flags |= HAMMER_RECF_DELETED_FE;
971 		}
972 		hammer_rel_mem_record(conflict);
973 	}
974 
975 	/*
976 	 * Create a record to cover the direct write.  This is called with
977 	 * the related BIO locked so there should be no possible conflict.
978 	 *
979 	 * The backend is responsible for finalizing the space reserved in
980 	 * this record.
981 	 *
982 	 * XXX bytes not aligned, depend on the reservation code to
983 	 * align the reservation.
984 	 */
985 	record = hammer_alloc_mem_record(ip, 0);
986 	zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
987 					   HAMMER_ZONE_SMALL_DATA_INDEX;
988 	record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
989 					       &record->leaf.data_offset,
990 					       errorp);
991 	if (record->resv == NULL) {
992 		kprintf("hammer_ip_add_bulk: reservation failed\n");
993 		hammer_rel_mem_record(record);
994 		return(NULL);
995 	}
996 	record->type = HAMMER_MEM_RECORD_DATA;
997 	record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
998 	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
999 	record->leaf.base.obj_id = ip->obj_id;
1000 	record->leaf.base.key = file_offset + bytes;
1001 	record->leaf.base.localization = ip->obj_localization +
1002 					 HAMMER_LOCALIZE_MISC;
1003 	record->leaf.data_len = bytes;
1004 	hammer_crc_set_leaf(data, &record->leaf);
1005 	KKASSERT(*errorp == 0);
1006 	return(record);
1007 }
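
/*
 * Hedged sketch (hypothetical helper, not compiled) of how a frontend
 * write path might consume the helper above.  The direct I/O itself
 * lives in the strategy-write machinery and is elided here; it targets
 * record->leaf.data_offset.  hammer_mem_add() eats the reference.
 */
#if 0
static int
example_bulk_write(hammer_inode_t ip, off_t file_offset,
		   void *data, int bytes)
{
	hammer_record_t record;
	int error;

	record = hammer_ip_add_bulk(ip, file_offset, data, bytes, &error);
	if (record == NULL)
		return(error);
	/* ... issue the direct write against record->leaf.data_offset ... */
	return(hammer_mem_add(record));
}
#endif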

/*
 * Frontend truncation code.  Scan in-memory records only.  On-disk records
 * and records in a flushing state are handled by the backend.  The vnops
 * setattr code will handle the block containing the truncation point.
 *
 * Partial blocks are not deleted.
 */
int
hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size)
{
	struct rec_trunc_info info;

	switch(ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_REGFILE:
		info.rec_type = HAMMER_RECTYPE_DATA;
		break;
	case HAMMER_OBJTYPE_DBFILE:
		info.rec_type = HAMMER_RECTYPE_DB;
		break;
	default:
		return(EINVAL);
	}
	info.trunc_off = file_size;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp,
				   hammer_frontend_trunc_callback, &info);
	return(0);
}

/*
 * RB_SCAN callback for hammer_ip_frontend_trunc().  Mark in-range,
 * unflushed in-memory records as frontend-deleted.
 */
static int
hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused)
{
	if (record->flags & HAMMER_RECF_DELETED_FE)
		return(0);
	if (record->flush_state == HAMMER_FST_FLUSH)
		return(0);
	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	hammer_ref(&record->lock);
	record->flags |= HAMMER_RECF_DELETED_FE;
	hammer_rel_mem_record(record);
	return(0);
}
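
/*
 * Hedged example (hypothetical helper, not compiled): with 16KB
 * HAMMER_BUFSIZE blocks, truncating to offset 10000 destroys in-memory
 * DATA records based at 16384, 32768, ... but retains the partial block
 * covering [0,16384); the vnops setattr code handles its tail.
 */
#if 0
static int
example_frontend_truncate(hammer_inode_t ip, off_t new_size)
{
	/* records based wholly at or beyond new_size become DELETED_FE */
	return(hammer_ip_frontend_trunc(ip, new_size));
}
#endif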

/*
 * Return 1 if the caller must check for and delete existing records
 * before writing out a new data record.
 *
 * Return 0 if the caller can just insert the record into the B-Tree without
 * checking.
 */
static int
hammer_record_needs_overwrite_delete(hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int64_t file_offset;
	int r;

	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
		file_offset = record->leaf.base.key;
	else
		file_offset = record->leaf.base.key - record->leaf.data_len;
	r = (file_offset < ip->save_trunc_off);
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		if (ip->save_trunc_off <= record->leaf.base.key)
			ip->save_trunc_off = record->leaf.base.key + 1;
	} else {
		if (ip->save_trunc_off < record->leaf.base.key)
			ip->save_trunc_off = record->leaf.base.key;
	}
	return(r);
}

/*
 * Backend code.  Sync a record to the media.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
	hammer_transaction_t trans = cursor->trans;
	int64_t file_offset;
	int bytes;
	void *bdata;
	int error;
	int doprop;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
	KKASSERT(record->leaf.base.localization != 0);

	/*
	 * Any direct-write related to the record must complete before we
	 * can sync the record to the on-disk media.
	 */
	if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL))
		hammer_io_direct_wait(record);

	/*
	 * If this is a bulk-data record placemarker there may be an existing
	 * record on-disk, indicating a data overwrite.  If there is, the
	 * on-disk record must be deleted before we can insert our new record.
	 *
	 * We've synthesized this record and do not know what the create_tid
	 * on-disk is, nor how much data it represents.
	 *
	 * Keep in mind that (key) for data records is (base_offset + len),
	 * not (base_offset).  Also, we only want to get rid of on-disk
	 * records since we are trying to sync our in-memory record, so call
	 * hammer_ip_delete_range() with truncating set to 1 to make sure
	 * it skips in-memory records.
	 *
	 * It is ok for the lookup to return ENOENT.
	 *
	 * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
	 * to call hammer_ip_delete_range() or not.  This also means we must
	 * update sync_trunc_off as we write.
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA &&
	    hammer_record_needs_overwrite_delete(record)) {
		file_offset = record->leaf.base.key - record->leaf.data_len;
		bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
			~HAMMER_BUFMASK;
		KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
		error = hammer_ip_delete_range(
				cursor, record->ip,
				file_offset, file_offset + bytes - 1,
				1);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * If this is a general record there may be an on-disk version
	 * that must be deleted before we can insert the new record.
	 */
	if (record->type == HAMMER_MEM_RECORD_GENERAL) {
		error = hammer_delete_general(cursor, record->ip,
					      &record->leaf);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * Setup the cursor.
	 */
	hammer_normalize_cursor(cursor);
	cursor->key_beg = record->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Records can wind up on-media before the inode itself is on-media.
	 * Flag the case.
	 */
	record->ip->flags |= HAMMER_INODE_DONDISK;

	/*
	 * If we are deleting a directory entry an exact match must be
	 * found on-disk.
	 */
	if (record->type == HAMMER_MEM_RECORD_DEL) {
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			KKASSERT(cursor->iprec == NULL);
			error = hammer_ip_delete_record(cursor, record->ip,
							trans->tid);
			if (error == 0) {
				record->flags |= HAMMER_RECF_DELETED_BE |
						 HAMMER_RECF_COMMITTED;
				++record->ip->rec_generation;
			}
		}
		goto done;
	}

	/*
	 * We are inserting.
	 *
	 * Issue a lookup to position the cursor and locate the insertion
	 * point.  The target key should not exist.  If we are creating a
	 * directory entry we may have to iterate the low 32 bits of the
	 * key to find an unused key.
	 */
	hammer_sync_lock_sh(trans);
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (hammer_debug_inode)
		kprintf("DOINSERT LOOKUP %d\n", error);
	if (error == 0) {
		kprintf("hammer_ip_sync_record: duplicate rec "
			"at (%016llx)\n", (long long)record->leaf.base.key);
		Debugger("duplicate record1");
		error = EIO;
	}
#if 0
	if (record->type == HAMMER_MEM_RECORD_DATA)
		kprintf("sync_record  %016llx ---------------- %016llx %d\n",
			record->leaf.base.key - record->leaf.data_len,
			record->leaf.data_offset, error);
#endif

	if (error != ENOENT)
		goto done_unlock;

	/*
	 * Allocate the record and data.  The result buffers will be
	 * marked as being modified and further calls to
	 * hammer_modify_buffer() will result in unneeded UNDO records.
	 *
	 * Support zero-fill records (data == NULL and data_len != 0)
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA) {
		/*
		 * The data portion of a bulk-data record has already been
		 * committed to disk; we need only adjust the layer2
		 * statistics in the same transaction as our B-Tree insert.
		 */
		KKASSERT(record->leaf.data_offset != 0);
		error = hammer_blockmap_finalize(trans,
						 record->resv,
						 record->leaf.data_offset,
						 record->leaf.data_len);
	} else if (record->data && record->leaf.data_len) {
		/*
		 * Wholly cached record, with data.  Allocate the data.
		 */
		bdata = hammer_alloc_data(trans, record->leaf.data_len,
					  record->leaf.base.rec_type,
					  &record->leaf.data_offset,
					  &cursor->data_buffer,
					  0, &error);
		if (bdata == NULL)
			goto done_unlock;
		hammer_crc_set_leaf(record->data, &record->leaf);
		hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
		bcopy(record->data, bdata, record->leaf.data_len);
		hammer_modify_buffer_done(cursor->data_buffer);
	} else {
		/*
		 * Wholly cached record, without data.
		 */
		record->leaf.data_offset = 0;
		record->leaf.data_crc = 0;
	}

	error = hammer_btree_insert(cursor, &record->leaf, &doprop);
	if (hammer_debug_inode && error) {
		kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
			error,
			(long long)cursor->node->node_offset,
			cursor->index,
			(long long)record->leaf.base.key);
	}

	/*
	 * Our record is on-disk and we normally mark the in-memory version
	 * as having been committed (and not BE-deleted).
	 *
	 * If the record represented a directory deletion but we had to
	 * sync a valid directory entry to disk due to dependencies,
	 * we must convert the record to a covering delete so the
	 * frontend does not have visibility on the synced entry.
	 */
	if (error == 0) {
		if (doprop) {
			hammer_btree_do_propagation(cursor,
						    record->ip->pfsm,
						    &record->leaf);
		}
		if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
			/*
			 * Must convert deleted directory entry add
			 * to a directory entry delete.
			 */
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags &= ~HAMMER_RECF_DELETED_FE;
			record->type = HAMMER_MEM_RECORD_DEL;
			KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
			record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
			KKASSERT((record->flags & (HAMMER_RECF_COMMITTED |
						 HAMMER_RECF_DELETED_BE)) == 0);
			/* converted record is not yet committed */
			/* hammer_flush_record_done takes care of the rest */
		} else {
			/*
			 * Everything went fine and we are now done with
			 * this record.
			 */
			record->flags |= HAMMER_RECF_COMMITTED;
			++record->ip->rec_generation;
		}
	} else {
		if (record->leaf.data_offset) {
			hammer_blockmap_free(trans, record->leaf.data_offset,
					     record->leaf.data_len);
		}
	}
done_unlock:
	hammer_sync_unlock(trans);
done:
	return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key are used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
int
hammer_mem_add(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Make a private copy of record->data
	 */
	if (record->data)
		KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

	/*
	 * Insert into the RB tree.  A unique key should have already
	 * been selected if this is a directory entry.
	 */
	if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
		return (EEXIST);
	}
	++hmp->count_newrecords;
	++hmp->rsv_recs;
	++record->ip->rsv_recs;
	record->ip->hmp->rsv_databytes += record->leaf.data_len;
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}

/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	KKASSERT(cursor->ip);
	error = hammer_mem_lookup(cursor);
	if (error == 0) {
		cursor->leaf = &cursor->iprec->leaf;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 */
	if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
	return(error);
}
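
/*
 * Hedged sketch (hypothetical helper, not compiled): a standalone lookup
 * of one directory entry by namekey, mirroring the cursor setup used by
 * hammer_ip_add_directory() above.  On success cursor.leaf points at
 * either the in-memory or on-disk record.
 */
#if 0
static int
example_lookup_dirent(struct hammer_transaction *trans,
		      hammer_inode_t dip, int64_t namekey)
{
	struct hammer_cursor cursor;
	int error;

	error = hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	if (error)
		return(error);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.asof = dip->obj_asof;

	error = hammer_ip_lookup(&cursor);
	/* ... consume cursor.leaf when error == 0 ... */
	hammer_done_cursor(&cursor);
	return(error);
}
#endif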

/*
 * Helper for hammer_ip_first()/hammer_ip_next()
 *
 * NOTE: Both ATEDISK and DISKEOF will be set the same.  This sets up
 * hammer_ip_first() for calling hammer_ip_next(), and sets up the re-seek
 * state if hammer_ip_next() needs to re-seek.
 */
static __inline
int
_hammer_ip_seek_btree(hammer_cursor_t cursor)
{
	hammer_inode_t ip = cursor->ip;
	int error;

	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT || error == EDEADLK) {
			if (hammer_debug_general & 0x2000) {
				kprintf("error %d node %p %016llx index %d\n",
					error, cursor->node,
					(long long)cursor->node->node_offset,
					cursor->index);
			}
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error == 0) {
			cursor->flags &= ~(HAMMER_CURSOR_DISKEOF |
					   HAMMER_CURSOR_ATEDISK);
		} else {
			cursor->flags |= HAMMER_CURSOR_DISKEOF |
					 HAMMER_CURSOR_ATEDISK;
			if (error == ENOENT)
				error = 0;
		}
	} else {
		cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK;
		error = 0;
	}
	return(error);
}

/*
 * Helper for hammer_ip_next()
 *
 * The caller has determined that the media cursor is further along than the
 * memory cursor and must be reseeked after a generation number change.
 */
static
int
_hammer_ip_reseek(hammer_cursor_t cursor)
{
	struct hammer_base_elm save;
	hammer_btree_elm_t elm;
	int error;
	int r;
	int again = 0;

	/*
	 * Do the re-seek.
	 */
	kprintf("HAMMER: Debug: re-seeked during scan @ino=%016llx\n",
		(long long)cursor->ip->obj_id);
	save = cursor->key_beg;
	cursor->key_beg = cursor->iprec->leaf.base;
	error = _hammer_ip_seek_btree(cursor);
	KKASSERT(error == 0);
	cursor->key_beg = save;

	/*
	 * If the memory record was previously returned to the caller and
	 * the media record matches (-1/+1: only create_tid differs), then
	 * iterate the media record to avoid a double result.
	 */
	if ((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0 &&
	    (cursor->flags & HAMMER_CURSOR_LASTWASMEM)) {
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base,
				     &cursor->iprec->leaf.base);
		if (cursor->flags & HAMMER_CURSOR_ASOF) {
			if (r >= -1 && r <= 1) {
				kprintf("HAMMER: Debug: iterated after "
					"re-seek (asof r=%d)\n", r);
				cursor->flags |= HAMMER_CURSOR_ATEDISK;
				again = 1;
			}
		} else {
			if (r == 0) {
				kprintf("HAMMER: Debug: iterated after "
					"re-seek\n");
				cursor->flags |= HAMMER_CURSOR_ATEDISK;
				again = 1;
			}
		}
	}
	return(again);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip = cursor->ip;
	int error;

	KKASSERT(ip != NULL);

	/*
	 * Clean up fields and setup for merged scan
	 */
	cursor->flags &= ~HAMMER_CURSOR_RETEST;

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 *
	 * This function will setup both ATEMEM and MEMEOF properly for
	 * the ip iteration.  ATEMEM will be set if MEMEOF is set.
	 */
	hammer_mem_first(cursor);

	/*
	 * Detect generation changes during blockages, including
	 * blockages which occur on the initial btree search.
	 */
	cursor->rec_generation = cursor->ip->rec_generation;

	/*
	 * Initial search and result
	 */
	error = _hammer_ip_seek_btree(cursor);
	if (error == 0)
		error = hammer_ip_next(cursor);

	return (error);
}
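
/*
 * Hedged sketch (hypothetical helper, not compiled): the canonical
 * consumer loop for the first/next pair.  Real callers add their own
 * locking and EDEADLK retry logic; ENOENT simply terminates the scan.
 */
#if 0
static int
example_scan_inode(hammer_cursor_t cursor)
{
	int error;

	error = hammer_ip_first(cursor);
	while (error == 0) {
		/* ... consume cursor->leaf here ... */
		error = hammer_ip_next(cursor);
	}
	if (error == ENOENT)
		error = 0;
	return(error);
}
#endif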

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * There are numerous special cases in this code to deal with races between
 * in-memory records and on-media records.
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec;
	hammer_record_t tmprec;
	int error;
	int r;

again:
	/*
	 * Get the next on-disk record
	 *
	 * NOTE: If we deleted the last on-disk record we had scanned
	 * 	 ATEDISK will be clear and RETEST will be set, forcing
	 *	 a call to iterate.  The fact that ATEDISK is clear causes
	 *	 iterate to re-test the 'current' element.  If ATEDISK is
	 *	 set, iterate will skip the 'current' element.
	 */
	error = 0;
	if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
		if (cursor->flags & (HAMMER_CURSOR_ATEDISK |
				     HAMMER_CURSOR_RETEST)) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_RETEST;
			if (error == 0) {
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
				hammer_cache_node(&cursor->ip->cache[1],
						  cursor->node);
			} else if (error == ENOENT) {
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
				error = 0;
			}
		}
	}

	/*
	 * If the generation changed the backend has deleted or committed
	 * one or more memory records since our last check.
	 *
	 * When this case occurs, if the disk cursor is > the current memory
	 * record or the disk cursor is at EOF, we must re-seek the disk
	 * cursor.  Since the disk cursor is ahead it cannot have been eaten
	 * yet (unless it is at EOF anyway).  (XXX data offset case?)
	 *
	 * NOTE: we are not doing a full check here.  That will be handled
	 * later on.
	 *
	 * If we have exhausted all memory records we do not have to do any
	 * further seeks.
	 */
	while (cursor->rec_generation != cursor->ip->rec_generation &&
	       error == 0
	) {
		kprintf("HAMMER: Debug: generation changed during scan @ino=%016llx\n", (long long)cursor->ip->obj_id);
		cursor->rec_generation = cursor->ip->rec_generation;
		if (cursor->flags & HAMMER_CURSOR_MEMEOF)
			break;
		if (cursor->flags & HAMMER_CURSOR_DISKEOF) {
			r = 1;
		} else {
			KKASSERT((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0);
			elm = &cursor->node->ondisk->elms[cursor->index];
			r = hammer_btree_cmp(&elm->base,
					     &cursor->iprec->leaf.base);
		}

		/*
		 * Do we re-seek the media cursor?
		 */
		if (r > 0) {
			if (_hammer_ip_reseek(cursor))
				goto again;
		}
	}

	/*
	 * We can now safely get the next in-memory record.  We cannot
	 * block here.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	tmprec = NULL;
	if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
		/*
		 * If the current memory record was eaten then get the next
		 * one.  Stale records are skipped.
		 */
		if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
			tmprec = cursor->iprec;
			cursor->iprec = NULL;
			rec = hammer_rec_rb_tree_RB_NEXT(tmprec);
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (cursor->iprec) {
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
			cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		}
	}

	/*
	 * MEMORY RECORD VALIDITY TEST
	 *
	 * (We still can't block, which is why tmprec is being held so
	 * long).
	 *
	 * If the memory record is no longer valid we skip it.  It may
	 * have been deleted by the frontend.  If it was deleted or
	 * committed by the backend the generation change re-seeked the
	 * disk cursor and the record will be present there.
	 */
	if (error == 0 && (cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
		KKASSERT(cursor->iprec);
		KKASSERT((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0);
		if (!hammer_ip_iterate_mem_good(cursor, cursor->iprec)) {
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
			if (tmprec)
				hammer_rel_mem_record(tmprec);
			goto again;
		}
	}
	if (tmprec)
		hammer_rel_mem_record(tmprec);
1691 
1692 	/*
1693 	 * Extract either the disk or memory record depending on their
1694 	 * relative position.
1695 	 */
1696 	error = 0;
1697 	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1698 	case 0:
1699 		/*
1700 		 * Both entries valid.   Compare the entries and nominally
1701 		 * return the first one in the sort order.  Numerous cases
1702 		 * require special attention, however.
1703 		 */
1704 		elm = &cursor->node->ondisk->elms[cursor->index];
1705 		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);
1706 
1707 		/*
1708 		 * If the two entries differ only by their key (-2/2) or
1709 		 * create_tid (-1/1), and are DATA records, we may have a
1710 		 * nominal match.  We have to calculate the base file
1711 		 * offset of the data.
1712 		 */
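		/*
		 * Worked example (hypothetical offsets): a media record
		 * with key 16384 and data_len 16384 covers file offsets
		 * [0,16384), so its base is 0.  A bulk in-memory rewrite
		 * with key 8192 and data_len 8192 also has base 0, so
		 * despite their differing keys both records describe data
		 * starting at the same base offset and the test below
		 * forces a match (r = 0).
		 */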
1713 		if (r <= 2 && r >= -2 && r != 0 &&
1714 		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
1715 		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1716 			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
1717 			int64_t base2 = cursor->iprec->leaf.base.key -
1718 					cursor->iprec->leaf.data_len;
1719 			if (base1 == base2)
1720 				r = 0;
1721 		}
1722 
1723 		if (r < 0) {
1724 			error = hammer_btree_extract(cursor,
1725 						     HAMMER_CURSOR_GET_LEAF);
1726 			cursor->flags |= HAMMER_CURSOR_ATEDISK;
1727 			cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
1728 			break;
1729 		}
1730 
1731 		/*
		 * If the entries match exactly, the memory entry is either
1733 		 * an on-disk directory entry deletion or a bulk data
1734 		 * overwrite.  If it is a directory entry deletion we eat
1735 		 * both entries.
1736 		 *
1737 		 * For the bulk-data overwrite case it is possible to have
1738 		 * visibility into both, which simply means the syncer
1739 		 * hasn't gotten around to doing the delete+insert sequence
1740 		 * on the B-Tree.  Use the memory entry and throw away the
1741 		 * on-disk entry.
1742 		 *
		 * If the in-memory record is neither of these we
1744 		 * probably caught the syncer while it was syncing it to
1745 		 * the media.  Since we hold a shared lock on the cursor,
1746 		 * the in-memory record had better be marked deleted at
1747 		 * this point.
1748 		 */
1749 		if (r == 0) {
1750 			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1751 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1752 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1753 					cursor->flags |= HAMMER_CURSOR_ATEMEM;
1754 					goto again;
1755 				}
1756 			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1757 				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1758 					cursor->flags |= HAMMER_CURSOR_ATEDISK;
1759 				}
1760 				/* fall through to memory entry */
1761 			} else {
				panic("hammer_ip_next: duplicate mem/b-tree "
				      "entry %p %d %08x",
				      cursor->iprec, cursor->iprec->type,
				      cursor->iprec->flags);
1763 				cursor->flags |= HAMMER_CURSOR_ATEMEM;
1764 				goto again;
1765 			}
1766 		}
1767 		/* fall through to the memory entry */
1768 	case HAMMER_CURSOR_ATEDISK:
1769 		/*
1770 		 * Only the memory entry is valid.
1771 		 */
1772 		cursor->leaf = &cursor->iprec->leaf;
1773 		cursor->flags |= HAMMER_CURSOR_ATEMEM;
1774 		cursor->flags |= HAMMER_CURSOR_LASTWASMEM;
1775 
1776 		/*
		 * If the memory entry is an on-disk deletion we should
		 * also have found a B-Tree record.  If the backend beat
		 * us to it, it would have interlocked the cursor and we
		 * should have seen the in-memory record marked DELETED_FE.
1781 		 */
1782 		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
1783 		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
			panic("hammer_ip_next: del-on-disk with no b-tree "
			      "entry iprec %p flags %08x",
			      cursor->iprec, cursor->iprec->flags);
1785 		}
1786 		break;
1787 	case HAMMER_CURSOR_ATEMEM:
1788 		/*
		 * Only the disk entry is valid.
1790 		 */
1791 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1792 		cursor->flags |= HAMMER_CURSOR_ATEDISK;
1793 		cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
1794 		break;
1795 	default:
1796 		/*
1797 		 * Neither entry is valid
1798 		 *
1799 		 * XXX error not set properly
1800 		 */
1801 		cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
1802 		cursor->leaf = NULL;
1803 		error = ENOENT;
1804 		break;
1805 	}
1806 	return(error);
1807 }
1808 
1809 /*
1810  * Resolve the cursor->data pointer for the current cursor position in
1811  * a merged iteration.
1812  */
1813 int
1814 hammer_ip_resolve_data(hammer_cursor_t cursor)
1815 {
1816 	hammer_record_t record;
1817 	int error;
1818 
1819 	if (hammer_cursor_inmem(cursor)) {
1820 		/*
1821 		 * The data associated with an in-memory record is usually
1822 		 * kmalloced, but reserve-ahead data records will have an
1823 		 * on-disk reference.
1824 		 *
1825 		 * NOTE: Reserve-ahead data records must be handled in the
1826 		 * context of the related high level buffer cache buffer
1827 		 * to interlock against async writes.
1828 		 */
1829 		record = cursor->iprec;
1830 		cursor->data = record->data;
1831 		error = 0;
1832 		if (cursor->data == NULL) {
1833 			KKASSERT(record->leaf.base.rec_type ==
1834 				 HAMMER_RECTYPE_DATA);
1835 			cursor->data = hammer_bread_ext(cursor->trans->hmp,
1836 						    record->leaf.data_offset,
1837 						    record->leaf.data_len,
1838 						    &error,
1839 						    &cursor->data_buffer);
1840 		}
1841 	} else {
1842 		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
1843 		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1844 	}
1845 	return(error);
1846 }
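
/*
 * Typical caller pattern (a sketch; simplified from the iteration
 * loops used by HAMMER's read and readdir style code):
 *
 *	error = hammer_ip_first(&cursor);
 *	while (error == 0) {
 *		error = hammer_ip_resolve_data(&cursor);
 *		if (error)
 *			break;
 *		(consume cursor->leaf and cursor->data)
 *		error = hammer_ip_next(&cursor);
 *	}
 */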
1847 
1848 /*
1849  * Backend truncation / record replacement - delete records in range.
1850  *
1851  * Delete all records within the specified range for inode ip.  In-memory
1852  * records still associated with the frontend are ignored.
1853  *
 * If truncating is non-zero, in-memory records associated with the
 * back-end are also ignored.  If truncating is > 1 we can return
 * EWOULDBLOCK.
1856  *
1857  * NOTES:
1858  *
1859  *	* An unaligned range will cause new records to be added to cover
1860  *        the edge cases. (XXX not implemented yet).
1861  *
 *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
 *	  also does not deal with unaligned ranges.
1864  *
1865  *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1866  *
1867  *	* Record keys for regular file data have to be special-cased since
1868  * 	  they indicate the end of the range (key = base + bytes).
1869  *
1870  *	* This function may be asked to delete ridiculously huge ranges, for
1871  *	  example if someone truncates or removes a 1TB regular file.  We
1872  *	  must be very careful on restarts and we may have to stop w/
1873  *	  EWOULDBLOCK to avoid blowing out the buffer cache.
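 *
 *	* Usage sketch (hypothetical values): a formal truncation down to
 *	  a 4096-byte boundary might be issued as
 *
 *		error = hammer_ip_delete_range(cursor, ip, 4096,
 *					       0x7FFFFFFFFFFFFFFFLL, 2);
 *
 *	  with an EWOULDBLOCK return causing the flusher to requeue the
 *	  inode and continue the deletion in the next cycle.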
1874  */
1875 int
1876 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
1877 		       int64_t ran_beg, int64_t ran_end, int truncating)
1878 {
1879 	hammer_transaction_t trans = cursor->trans;
1880 	hammer_btree_leaf_elm_t leaf;
1881 	int error;
1882 	int64_t off;
1883 	int64_t tmp64;
1884 
1885 #if 0
	kprintf("delete_range %p %016llx-%016llx\n",
		ip, (long long)ran_beg, (long long)ran_end);
1887 #endif
1888 
1889 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
1890 retry:
1891 	hammer_normalize_cursor(cursor);
1892 	cursor->key_beg.localization = ip->obj_localization +
1893 				       HAMMER_LOCALIZE_MISC;
1894 	cursor->key_beg.obj_id = ip->obj_id;
1895 	cursor->key_beg.create_tid = 0;
1896 	cursor->key_beg.delete_tid = 0;
1897 	cursor->key_beg.obj_type = 0;
1898 
1899 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1900 		cursor->key_beg.key = ran_beg;
1901 		cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
1902 	} else {
1903 		/*
1904 		 * The key in the B-Tree is (base+bytes), so the first possible
1905 		 * matching key is ran_beg + 1.
1906 		 */
1907 		cursor->key_beg.key = ran_beg + 1;
1908 		cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
1909 	}
1910 
1911 	cursor->key_end = cursor->key_beg;
1912 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1913 		cursor->key_end.key = ran_end;
1914 	} else {
1915 		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
1916 		if (tmp64 < ran_end)
1917 			cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1918 		else
1919 			cursor->key_end.key = ran_end + MAXPHYS + 1;
1920 	}
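
	/*
	 * Illustration of the arithmetic above: a data record's key is
	 * (base + len), so a record overlapping ran_end can have a key
	 * up to roughly MAXPHYS beyond it; key_end is therefore pushed
	 * out by MAXPHYS + 1.  If that addition wraps a signed 64 bit
	 * value (tmp64 < ran_end) the key is clamped to the largest
	 * positive offset instead.
	 */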
1921 
1922 	cursor->asof = ip->obj_asof;
1923 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1924 	cursor->flags |= HAMMER_CURSOR_ASOF;
1925 	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1926 	cursor->flags |= HAMMER_CURSOR_BACKEND;
1927 	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;
1928 
1929 	error = hammer_ip_first(cursor);
1930 
1931 	/*
1932 	 * Iterate through matching records and mark them as deleted.
1933 	 */
1934 	while (error == 0) {
1935 		leaf = cursor->leaf;
1936 
1937 		KKASSERT(leaf->base.delete_tid == 0);
1938 		KKASSERT(leaf->base.obj_id == ip->obj_id);
1939 
1940 		/*
1941 		 * There may be overlap cases for regular file data.  Also
1942 		 * remember the key for a regular file record is (base + len),
1943 		 * NOT (base).
1944 		 *
		 * Note that due to duplicates (mem & media) allowed by
		 * DELETE_VISIBILITY, off can wind up less than ran_beg.
1947 		 */
1948 		if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
1949 			off = leaf->base.key - leaf->data_len;
1950 			/*
1951 			 * Check the left edge case.  We currently do not
1952 			 * split existing records.
1953 			 */
1954 			if (off < ran_beg && leaf->base.key > ran_beg) {
1955 				panic("hammer left edge case %016llx %d\n",
1956 					(long long)leaf->base.key,
1957 					leaf->data_len);
1958 			}
1959 
1960 			/*
1961 			 * Check the right edge case.  Note that the
1962 			 * record can be completely out of bounds, which
1963 			 * terminates the search.
1964 			 *
1965 			 * base->key is exclusive of the right edge while
1966 			 * ran_end is inclusive of the right edge.  The
1967 			 * (key - data_len) left boundary is inclusive.
1968 			 *
1969 			 * XXX theory-check this test at some point, are
1970 			 * we missing a + 1 somewhere?  Note that ran_end
1971 			 * could overflow.
1972 			 */
1973 			if (leaf->base.key - 1 > ran_end) {
1974 				if (leaf->base.key - leaf->data_len > ran_end)
1975 					break;
1976 				panic("hammer right edge case\n");
1977 			}
1978 		} else {
1979 			off = leaf->base.key;
1980 		}
1981 
1982 		/*
1983 		 * Delete the record.  When truncating we do not delete
1984 		 * in-memory (data) records because they represent data
1985 		 * written after the truncation.
1986 		 *
1987 		 * This will also physically destroy the B-Tree entry and
1988 		 * data if the retention policy dictates.  The function
1989 		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
1990 		 * to retest the new 'current' element.
1991 		 */
1992 		if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
1993 			error = hammer_ip_delete_record(cursor, ip, trans->tid);
1994 			/*
1995 			 * If we have built up too many meta-buffers we risk
1996 			 * deadlocking the kernel and must stop.  This can
1997 			 * occur when deleting ridiculously huge files.
1998 			 * sync_trunc_off is updated so the next cycle does
1999 			 * not re-iterate records we have already deleted.
2000 			 *
2001 			 * This is only done with formal truncations.
2002 			 */
2003 			if (truncating > 1 && error == 0 &&
2004 			    hammer_flusher_meta_limit(ip->hmp)) {
2005 				ip->sync_trunc_off = off;
2006 				error = EWOULDBLOCK;
2007 			}
2008 		}
2009 		if (error)
2010 			break;
2011 		ran_beg = off;	/* for restart */
2012 		error = hammer_ip_next(cursor);
2013 	}
2014 	if (cursor->node)
2015 		hammer_cache_node(&ip->cache[1], cursor->node);
2016 
2017 	if (error == EDEADLK) {
2018 		hammer_done_cursor(cursor);
2019 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
2020 		if (error == 0)
2021 			goto retry;
2022 	}
2023 	if (error == ENOENT)
2024 		error = 0;
2025 	return(error);
2026 }
2027 
2028 /*
2029  * This backend function deletes the specified record on-disk, similar to
2030  * delete_range but for a specific record.  Unlike the exact deletions
 * used when deleting a directory entry, this function uses an ASOF search
 * like delete_range.
 *
 * This function may be called with ip->obj_asof set for a slave snapshot,
 * so we do not use that field here; we only delete non-historical records.
2036  */
2037 static int
2038 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
2039 		      hammer_btree_leaf_elm_t leaf)
2040 {
2041 	hammer_transaction_t trans = cursor->trans;
2042 	int error;
2043 
2044 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
2045 retry:
2046 	hammer_normalize_cursor(cursor);
2047 	cursor->key_beg = leaf->base;
2048 	cursor->asof = HAMMER_MAX_TID;
2049 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
2050 	cursor->flags |= HAMMER_CURSOR_ASOF;
2051 	cursor->flags |= HAMMER_CURSOR_BACKEND;
2052 	cursor->flags &= ~HAMMER_CURSOR_INSERT;
2053 
2054 	error = hammer_btree_lookup(cursor);
2055 	if (error == 0) {
2056 		error = hammer_ip_delete_record(cursor, ip, trans->tid);
2057 	}
2058 	if (error == EDEADLK) {
2059 		hammer_done_cursor(cursor);
2060 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
2061 		if (error == 0)
2062 			goto retry;
2063 	}
2064 	return(error);
2065 }
2066 
2067 /*
 * This function deletes remaining auxiliary records when an inode is
2069  * being deleted.  This function explicitly does not delete the
2070  * inode record, directory entry, data, or db records.  Those must be
2071  * properly disposed of prior to this call.
2072  */
2073 int
2074 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp)
2075 {
2076 	hammer_transaction_t trans = cursor->trans;
2077 	hammer_btree_leaf_elm_t leaf;
2078 	int error;
2079 
2080 	KKASSERT(trans->type == HAMMER_TRANS_FLS);
2081 retry:
2082 	hammer_normalize_cursor(cursor);
2083 	cursor->key_beg.localization = ip->obj_localization +
2084 				       HAMMER_LOCALIZE_MISC;
2085 	cursor->key_beg.obj_id = ip->obj_id;
2086 	cursor->key_beg.create_tid = 0;
2087 	cursor->key_beg.delete_tid = 0;
2088 	cursor->key_beg.obj_type = 0;
2089 	cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START;
2090 	cursor->key_beg.key = HAMMER_MIN_KEY;
2091 
2092 	cursor->key_end = cursor->key_beg;
2093 	cursor->key_end.rec_type = HAMMER_RECTYPE_MAX;
2094 	cursor->key_end.key = HAMMER_MAX_KEY;
2095 
2096 	cursor->asof = ip->obj_asof;
2097 	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
2098 	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2099 	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
2100 	cursor->flags |= HAMMER_CURSOR_BACKEND;
2101 
2102 	error = hammer_ip_first(cursor);
2103 
2104 	/*
2105 	 * Iterate through matching records and mark them as deleted.
2106 	 */
2107 	while (error == 0) {
2108 		leaf = cursor->leaf;
2109 
2110 		KKASSERT(leaf->base.delete_tid == 0);
2111 
2112 		/*
2113 		 * Mark the record and B-Tree entry as deleted.  This will
2114 		 * also physically delete the B-Tree entry, record, and
2115 		 * data if the retention policy dictates.  The function
2116 		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
2117 		 * to retest the new 'current' element.
2118 		 *
2119 		 * Directory entries (and delete-on-disk directory entries)
2120 		 * must be synced and cannot be deleted.
2121 		 */
2122 		error = hammer_ip_delete_record(cursor, ip, trans->tid);
2123 		++*countp;
2124 		if (error)
2125 			break;
2126 		error = hammer_ip_next(cursor);
2127 	}
2128 	if (cursor->node)
2129 		hammer_cache_node(&ip->cache[1], cursor->node);
2130 	if (error == EDEADLK) {
2131 		hammer_done_cursor(cursor);
2132 		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
2133 		if (error == 0)
2134 			goto retry;
2135 	}
2136 	if (error == ENOENT)
2137 		error = 0;
2138 	return(error);
2139 }
2140 
2141 /*
2142  * Delete the record at the current cursor.  On success the cursor will
2143  * be positioned appropriately for an iteration but may no longer be at
2144  * a leaf node.
2145  *
2146  * This routine is only called from the backend.
2147  *
2148  * NOTE: This can return EDEADLK, requiring the caller to terminate the
2149  * cursor and retry.
2150  */
2151 int
2152 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
2153 			hammer_tid_t tid)
2154 {
2155 	hammer_record_t iprec;
2156 	hammer_mount_t hmp;
2157 	int error;
2158 
2159 	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
2160 	KKASSERT(tid != 0);
2161 	hmp = cursor->node->hmp;
2162 
2163 	/*
2164 	 * In-memory (unsynchronized) records can simply be freed.  This
2165 	 * only occurs in range iterations since all other records are
2166 	 * individually synchronized.  Thus there should be no confusion with
2167 	 * the interlock.
2168 	 *
2169 	 * An in-memory record may be deleted before being committed to disk,
2170 	 * but could have been accessed in the mean time.  The reservation
2171 	 * code will deal with the case.
2172 	 */
2173 	if (hammer_cursor_inmem(cursor)) {
2174 		iprec = cursor->iprec;
		KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2176 		iprec->flags |= HAMMER_RECF_DELETED_FE;
2177 		iprec->flags |= HAMMER_RECF_DELETED_BE;
2178 		KKASSERT(iprec->ip == ip);
2179 		++ip->rec_generation;
2180 		return(0);
2181 	}
2182 
2183 	/*
2184 	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
2186 	 * on their create_tid).
2187 	 *
	 * Frontend B-Tree operations track inodes so we tell
	 * hammer_delete_at_cursor() not to (we pass track == 0 below).
2190 	 */
2191 	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
2192 
2193 	if (error == 0) {
2194 		error = hammer_delete_at_cursor(
2195 				cursor,
2196 				HAMMER_DELETE_ADJUST | hammer_nohistory(ip),
2197 				cursor->trans->tid,
2198 				cursor->trans->time32,
2199 				0, NULL);
2200 	}
2201 	return(error);
2202 }
2203 
2204 /*
2205  * Delete the B-Tree element at the current cursor and do any necessary
2206  * mirror propagation.
2207  *
2208  * The cursor must be properly positioned for an iteration on return but
2209  * may be pointing at an internal element.
2210  *
2211  * An element can be un-deleted by passing a delete_tid of 0 with
2212  * HAMMER_DELETE_ADJUST.
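 *
 * Undelete sketch (argument order per the signature below; the values
 * are illustrative of what a mirroring-style caller might pass):
 *
 *	error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST,
 *					0, 0, 0, NULL);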
2213  */
2214 int
2215 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
2216 			hammer_tid_t delete_tid, u_int32_t delete_ts,
2217 			int track, int64_t *stat_bytes)
2218 {
2219 	struct hammer_btree_leaf_elm save_leaf;
2220 	hammer_transaction_t trans;
2221 	hammer_btree_leaf_elm_t leaf;
2222 	hammer_node_t node;
2223 	hammer_btree_elm_t elm;
2224 	hammer_off_t data_offset;
2225 	int32_t data_len;
2226 	u_int16_t rec_type;
2227 	int error;
2228 	int icount;
2229 	int doprop;
2230 
2231 	error = hammer_cursor_upgrade(cursor);
2232 	if (error)
2233 		return(error);
2234 
2235 	trans = cursor->trans;
2236 	node = cursor->node;
2237 	elm = &node->ondisk->elms[cursor->index];
2238 	leaf = &elm->leaf;
2239 	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
2240 
2241 	hammer_sync_lock_sh(trans);
2242 	doprop = 0;
2243 	icount = 0;
2244 
2245 	/*
2246 	 * Adjust the delete_tid.  Update the mirror_tid propagation field
2247 	 * as well.  delete_tid can be 0 (undelete -- used by mirroring).
2248 	 */
2249 	if (delete_flags & HAMMER_DELETE_ADJUST) {
2250 		if (elm->base.rec_type == HAMMER_RECTYPE_INODE) {
2251 			if (elm->leaf.base.delete_tid == 0 && delete_tid)
2252 				icount = -1;
2253 			if (elm->leaf.base.delete_tid && delete_tid == 0)
2254 				icount = 1;
2255 		}
2256 
2257 		hammer_modify_node(trans, node, elm, sizeof(*elm));
2258 		elm->leaf.base.delete_tid = delete_tid;
2259 		elm->leaf.delete_ts = delete_ts;
2260 		hammer_modify_node_done(node);
2261 
2262 		if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) {
2263 			hammer_modify_node_field(trans, node, mirror_tid);
2264 			node->ondisk->mirror_tid = elm->leaf.base.delete_tid;
2265 			hammer_modify_node_done(node);
2266 			doprop = 1;
2267 			if (hammer_debug_general & 0x0002) {
2268 				kprintf("delete_at_cursor: propagate %016llx"
2269 					" @%016llx\n",
2270 					(long long)elm->leaf.base.delete_tid,
2271 					(long long)node->node_offset);
2272 			}
2273 		}
2274 
2275 		/*
2276 		 * Adjust for the iteration.  We have deleted the current
2277 		 * element and want to clear ATEDISK so the iteration does
2278 		 * not skip the element after, which now becomes the current
2279 		 * element.  This element must be re-tested if doing an
2280 		 * iteration, which is handled by the RETEST flag.
2281 		 */
2282 		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2283 			cursor->flags |= HAMMER_CURSOR_RETEST;
2284 			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2285 		}
2286 
2287 		/*
2288 		 * An on-disk record cannot have the same delete_tid
2289 		 * as its create_tid.  In a chain of record updates
2290 		 * this could result in a duplicate record.
2291 		 */
2292 		KKASSERT(elm->leaf.base.delete_tid !=
2293 			 elm->leaf.base.create_tid);
2294 	}
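
	/*
	 * Visibility note (sketch of the as-of semantics): a record with
	 * create_tid C and non-zero delete_tid D is visible to as-of
	 * queries with C <= asof < D.  Setting delete_tid back to 0
	 * (undelete) makes the record visible to all later queries again.
	 */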
2295 
2296 	/*
2297 	 * Destroy the B-Tree element if asked (typically if a nohistory
2298 	 * file or mount, or when called by the pruning code).
2299 	 *
2300 	 * Adjust the ATEDISK flag to properly support iterations.
2301 	 */
2302 	if (delete_flags & HAMMER_DELETE_DESTROY) {
2303 		data_offset = elm->leaf.data_offset;
2304 		data_len = elm->leaf.data_len;
2305 		rec_type = elm->leaf.base.rec_type;
2306 		if (doprop) {
2307 			save_leaf = elm->leaf;
2308 			leaf = &save_leaf;
2309 		}
2310 		if (elm->base.rec_type == HAMMER_RECTYPE_INODE &&
2311 		    elm->leaf.base.delete_tid == 0) {
2312 			icount = -1;
2313 		}
2314 
2315 		error = hammer_btree_delete(cursor);
2316 		if (error == 0) {
2317 			/*
2318 			 * The deletion moves the next element (if any) to
2319 			 * the current element position.  We must clear
2320 			 * ATEDISK so this element is not skipped and we
2321 			 * must set RETEST to force any iteration to re-test
2322 			 * the element.
2323 			 */
2324 			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2325 				cursor->flags |= HAMMER_CURSOR_RETEST;
2326 				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2327 			}
2328 		}
2329 		if (error == 0) {
2330 			switch(data_offset & HAMMER_OFF_ZONE_MASK) {
2331 			case HAMMER_ZONE_LARGE_DATA:
2332 			case HAMMER_ZONE_SMALL_DATA:
2333 			case HAMMER_ZONE_META:
2334 				hammer_blockmap_free(trans,
2335 						     data_offset, data_len);
2336 				break;
2337 			default:
2338 				break;
2339 			}
2340 		}
2341 	}
2342 
2343 	/*
2344 	 * Track inode count and next_tid.  This is used by the mirroring
2345 	 * and PFS code.  icount can be negative, zero, or positive.
2346 	 */
2347 	if (error == 0 && track) {
2348 		if (icount) {
2349 			hammer_modify_volume_field(trans, trans->rootvol,
2350 						   vol0_stat_inodes);
2351 			trans->rootvol->ondisk->vol0_stat_inodes += icount;
2352 			hammer_modify_volume_done(trans->rootvol);
2353 		}
2354 		if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) {
2355 			hammer_modify_volume(trans, trans->rootvol, NULL, 0);
2356 			trans->rootvol->ondisk->vol0_next_tid = delete_tid;
2357 			hammer_modify_volume_done(trans->rootvol);
2358 		}
2359 	}
2360 
2361 	/*
2362 	 * mirror_tid propagation occurs if the node's mirror_tid had to be
2363 	 * updated while adjusting the delete_tid.
2364 	 *
2365 	 * This occurs when deleting even in nohistory mode, but does not
2366 	 * occur when pruning an already-deleted node.
2367 	 *
2368 	 * cursor->ip is NULL when called from the pruning, mirroring,
2369 	 * and pfs code.  If non-NULL propagation will be conditionalized
2370 	 * on whether the PFS is in no-history mode or not.
2371 	 */
2372 	if (doprop) {
2373 		if (cursor->ip)
2374 			hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
2375 		else
2376 			hammer_btree_do_propagation(cursor, NULL, leaf);
2377 	}
2378 	hammer_sync_unlock(trans);
2379 	return (error);
2380 }
2381 
2382 /*
2383  * Determine whether we can remove a directory.  This routine checks whether
2384  * a directory is empty or not and enforces flush connectivity.
2385  *
2386  * Flush connectivity requires that we block if the target directory is
2387  * currently flushing, otherwise it may not end up in the same flush group.
2388  *
2389  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
2390  */
2391 int
2392 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
2393 {
2394 	struct hammer_cursor cursor;
2395 	int error;
2396 
2397 	/*
2398 	 * Check directory empty
2399 	 */
2400 	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2401 
2402 	cursor.key_beg.localization = ip->obj_localization +
2403 				      hammer_dir_localization(ip);
2404 	cursor.key_beg.obj_id = ip->obj_id;
2405 	cursor.key_beg.create_tid = 0;
2406 	cursor.key_beg.delete_tid = 0;
2407 	cursor.key_beg.obj_type = 0;
2408 	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
2409 	cursor.key_beg.key = HAMMER_MIN_KEY;
2410 
2411 	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = HAMMER_RECTYPE_MAX;
2413 	cursor.key_end.key = HAMMER_MAX_KEY;
2414 
2415 	cursor.asof = ip->obj_asof;
2416 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2417 
2418 	error = hammer_ip_first(&cursor);
2419 	if (error == ENOENT)
2420 		error = 0;
2421 	else if (error == 0)
2422 		error = ENOTEMPTY;
2423 	hammer_done_cursor(&cursor);
2424 	return(error);
2425 }
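
/*
 * Usage sketch (hypothetical caller; the directory removal path makes
 * a check along these lines before deleting the directory inode):
 *
 *	error = hammer_ip_check_directory_empty(trans, dip);
 *	if (error)
 *		return (error);		(typically 0 or ENOTEMPTY)
 */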
2426 
2427